diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..7f220c5e8a --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target +.DS_Store +.idea/ +.vscode diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..6958411537 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,4363 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +dependencies = [ + "const-random", +] + +[[package]] +name = "ahash" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" +dependencies = [ + "getrandom 0.2.3", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ef4730490ad1c4eae5c4325b2a95f521d023e5c885853ff7aca0a6a1631db3" + +[[package]] 
+name = "alloc-stdlib" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "analytic_engine" +version = "0.1.0" +dependencies = [ + "arc-swap 1.4.0", + "arena", + "arrow_deps", + "async-trait", + "base64", + "common_types", + "common_util", + "env_logger", + "futures", + "lazy_static", + "log", + "object_store", + "parquet 0.1.0", + "prometheus 0.12.0", + "proto", + "protobuf", + "serde", + "serde_derive", + "skiplist", + "smallvec", + "snafu", + "table_engine", + "tempfile", + "tokio", + "wal", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28ae2b3dec75a406790005a200b1bd89785afc02517a00ca99ecfe093ee9e6cf" + +[[package]] +name = "arc-swap" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc4662175ead9cd84451d5c35070517777949a2ed84551764129cedb88384841" + +[[package]] +name = "arc-swap" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6df5aef5c5830360ce5218cecb8f018af3438af5686ae945094affc86fdec63" + +[[package]] +name = "arena" +version = "0.1.0" +dependencies = [ + "parking_lot", +] + +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + +[[package]] +name = "arrow" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66ec0a5964feebf378e2fc6db9530e712657b8edf72aa17b1b277b0f52a48e2d" +dependencies = [ + "bitflags", + "chrono", + "comfy-table", + "csv", + "flatbuffers", + "half", + "hex", + "indexmap", + "lazy_static", + "lexical-core", + "multiversion", + "num", + "rand 0.8.4", + "regex", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "arrow-format" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7da2d9660bfaebbdb0a44a33b3bd1dcb5a952fafa02c0dfc6a51ea471fef2a" +dependencies = [ + "flatbuffers", +] + +[[package]] +name = "arrow2" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d873e2775c3d87a4e8d77aa544cbd43f34a0779d5164c59e7c6a1dd0678eb395" +dependencies = [ + "arrow-format", + "base64", + "chrono", + "futures", + "hash_hasher", + "num-traits", + "parquet2", + "simdutf8", +] + +[[package]] +name = "arrow_deps" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "parquet 7.0.0", + "uncover", +] + +[[package]] +name = "async-stream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" +dependencies = [ + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version 
= "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44318e776df68115a881de9a8fd1b9e53368d7a4a5ce4cc48517da3393233a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "avro-rs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece550dd6710221de9bcdc1697424d8eee4fc4ca7e017479ea9d50c348465e37" +dependencies = [ + "byteorder", + "digest 0.9.0", + "lazy_static", + "libflate", + "num-bigint 0.2.6", + "rand 0.7.3", + "serde", + "serde_json", + "strum 0.18.0", + "strum_macros 0.18.0", + "thiserror", + "typed-builder", + "uuid", + "zerocopy", +] + +[[package]] +name = "backtrace" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arena", + "arrow2", + "arrow_deps", + "clap", + "common_types", + "common_util", + "criterion", + "env_logger", + "futures", + "log", + "object_store", + "parquet 0.1.0", + "serde", + "serde_derive", + "table_engine", + "tokio", +] + +[[package]] +name = "bindgen" 
+version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitpacking" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" +dependencies = [ + "crunchy", +] + +[[package]] +name = "blake2" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4e37d16930f5459780f5621038b6382b9bb37c19016f39fb6b5808d831f174" +dependencies = [ + "crypto-mac", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "blake3" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882e99e4a0cb2ae6cb6e442102e8e6b7131718d94110e64c3e6a34ea9b106f37" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.0", + "constant_time_eq", + "digest 0.10.1", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-buffer" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1d36a02058e76b040de25a4464ba1c80935655595b661505c8b39b664828b95" +dependencies = [ + "generic-array", +] + +[[package]] +name = "boringssl-src" +version = "0.3.0+688fc5c" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f901accdf830d2ea2f4e27f923a5e1125cd8b1a39ab578b9db1a42d578a6922b" +dependencies = [ + "cmake", +] + +[[package]] +name = "brotli" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71cb90ade945043d3d53597b2fc359bb063db8ade2bcffe7997351d0756e9d50" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ad2d4653bf5ca36ae797b1f4bb4dbddb60ce49ca4aed8a2ce4829f60425b80" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bstr" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + +[[package]] +name = "bumpalo" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" + +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.1.0" +dependencies = [ + "bytes 1.1.0", + "snafu", +] + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "catalog" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "snafu", + "table_engine", +] + +[[package]] +name = "catalog_impls" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "async-trait", + "catalog", + "common_types", + "common_util", + "log", + "server", + "snafu", + "system_catalog", + "table_engine", + "tokio", +] + +[[package]] +name = "cc" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +dependencies = [ + "jobserver", +] + +[[package]] +name = "ceresdbproto" +version = "0.1.0" +source = "git+https://github.com/CeresDB/ceresdbproto.git#dc8eb387ca66347c2ea9d5b00924ae63e7360be3" +dependencies = [ + "futures", + "grpcio 0.9.1", + "protobuf", + "protobuf-builder", +] + +[[package]] +name = "ceresdbx" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "catalog", + "catalog_impls", + "clap", + "common_util", + "log", + "logger", + "query_engine", + "server", + "signal-hook", + "table_engine", + "tracing_util", + "udf", + "vergen", +] + +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time", + "winapi", +] + +[[package]] +name = "clang-sys" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa66045b9cb23c2e9c1520732030608b02ee07e5cfaa5a521ec15ded7fa24c90" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term 0.11.0", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cmake" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb6210b637171dfba4cda12e579ac6dc73f5165ad56133e5d72ef3131f320855" +dependencies = [ + "cc", +] + +[[package]] +name = "comfy-table" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42350b81f044f576ff88ac750419f914abb46a03831bb1747134344ee7a4e64" +dependencies = [ + "strum 0.22.0", + "strum_macros 0.22.0", + "unicode-width", +] + +[[package]] +name = "common_types" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "byteorder", + "bytes 0.1.0", + "chrono", + "murmur3", + "paste 1.0.5", + "proto", + "serde", + "serde_derive", + "snafu", + "sqlparser", +] + +[[package]] +name = "common_util" +version = "0.1.0" 
+dependencies = [ + "backtrace", + "chrono", + "common_types", + "crossbeam-utils 0.8.5", + "env_logger", + "gag", + "lazy_static", + "libc", + "log", + "logger", + "nix", + "pin-project-lite", + "prometheus 0.12.0", + "proto", + "serde", + "serde_derive", + "slog", + "slog-global 0.1.0 (git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "snafu", + "tempfile", + "time", + "tokio", + "tokio-test", + "toml", +] + +[[package]] +name = "const-random" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f590d95d011aa80b063ffe3253422ed5aa462af4e9867d43ce8337562bac77c4" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615f6e27d000a2bffbc7f2f6a8669179378fa27ee4d0a509e985dfc0a7defb40" +dependencies = [ + "getrandom 0.2.3", + "lazy_static", + "proc-macro-hack", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "core-foundation" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" + +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20ff29ded3204c5106278a81a38f4b482636ed4fa1e6cfbeef193291beb29ed" +dependencies = [ + "crossbeam-epoch 0.8.2", + "crossbeam-utils 0.7.2", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch 0.9.5", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + 
"autocfg", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", + "lazy_static", + "maybe-uninit", + "memoffset 0.5.6", + "scopeguard", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", + "lazy_static", + "memoffset 0.6.4", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if 0.1.10", + "lazy_static", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d6b536309245c849479fba3da410962a43ed8e51c26b729208ec0ac2798d0" +dependencies = [ + "generic-array", +] + +[[package]] +name = "crypto-mac" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b584a330336237c1eecd3e94266efb216c56ed91225d634cb2991c5f3fd1aeab" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = 
"0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "3.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f260e2fc850179ef410018660006951c1b55b79e8087e87111a2c388994b9b5" +dependencies = [ + "ahash 0.3.8", + "cfg-if 0.1.10", + "num_cpus", +] + +[[package]] +name = "datafusion" +version = "6.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=444c153863520072ea22d4f8c498dee39437516d#444c153863520072ea22d4f8c498dee39437516d" +dependencies = [ + "ahash 0.7.4", + "arrow", + "async-trait", + "blake2", + "blake3", + "chrono", + "futures", + "hashbrown", + "lazy_static", + "log", + "md-5", + "num_cpus", + "ordered-float 2.10.0", + "parquet 7.0.0", + "paste 1.0.5", + "pin-project-lite", + "rand 0.8.4", + "regex", + "sha2", + "smallvec", + "sqlparser", + "tempfile", + "tokio", + "tokio-stream", + "unicode-segmentation", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "digest" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b697d66081d42af4fba142d56918a3cb21dc8eb63372c6b85d14f44fb9c5979b" +dependencies = [ + "block-buffer 0.10.0", + "crypto-common", + "generic-array", + "subtle", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if 1.0.0", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "enum-iterator" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eeac5c5edb79e4e39fe8439ef35207780a11f69c52cbe424ce3dfad4cb78de6" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c134c37760b27a871ba422106eedbb8247da973a09e82558bf26d619c882b159" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_logger" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "fail" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" +dependencies = [ + "lazy_static", + "log", + "rand 0.7.3", +] + +[[package]] +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +dependencies = [ + "backtrace", + "failure_derive", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "filedescriptor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +dependencies = [ + "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "flatbuffers" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4c5738bcd7fad10315029c50026f83c9da5e4a21f8ed66826f43e0e2bde5f6" +dependencies = [ + "bitflags", + "smallvec", + "thiserror", +] + +[[package]] +name = "flate2" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" +dependencies = [ + "cfg-if 1.0.0", + "crc32fast", + "libc", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc00f486adfc9ce99f77d717836f0c5aa84965eb0b4f051f4e83f7cab53f8b" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74ed2411805f6e4e3d9bc904c95d5d423b89b3b25dc0250aa74729de20629ff9" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af51b1b4a7fdff033703db39de8802c673eb91855f2e0d47dcf3bf2c0ef01f99" + +[[package]] +name = "futures-executor" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d0d535a57b87e1ae31437b892713aee90cd2d7b0ee48727cd11fc72ef54761c" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b0e06c393068f3a6ef246c75cdca793d6a46347e75286933e5e75fd2fd11582" + +[[package]] +name = "futures-macro" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c54913bae956fb8df7f4dc6fc90362aa72e69148e3f39041fbe8742d21e0ac57" +dependencies = [ + "autocfg", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f30aaa67363d119812743aa5f33c201a7a66329f97d1a887022971feea4b53" + +[[package]] +name = "futures-task" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe54a98670017f3be909561f6ad13e810d9a51f3f061b902062ca3da80799f2" + +[[package]] +name = "futures-util" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eb846bfd58e44a8481a00049e82c43e0ccb5d61f8dc071057cb19249dd4d78" +dependencies = [ + "autocfg", + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", + "tempfile", +] + +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.2+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24b328c01a4d71d2d8173daa93562a73ab0fe85616876f02500f53d82948c504" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "gimli" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" + +[[package]] +name = "git2" +version = "0.13.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "659cd14835e75b64d9dba5b660463506763cf0aa6cb640aeeb0e98d841093490" +dependencies = [ + "bitflags", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "grpcio" +version = "0.1.0" +dependencies = [ + "grpcio 0.9.1", +] + +[[package]] +name = "grpcio" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d99e00eed7e0a04ee2705112e7cfdbe1a3cc771147f22f016a8cd2d002187b" +dependencies = [ + "futures", + "grpcio-sys", + "libc", + "log", + "parking_lot", + "protobuf", +] + +[[package]] +name = "grpcio-compiler" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1f1abac9f330ac9ee0950220c10eea84d66479cede4836f0b924407fecf093c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "grpcio-sys" +version = "0.9.1+1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9447d1a926beeef466606cc45717f80897998b548e7dc622873d453e1ecb4be4" +dependencies = [ + "bindgen", + "boringssl-src", + "cc", + "cmake", 
+ "libc", + "libz-sys", + "pkg-config", + "walkdir", +] + +[[package]] +name = "h2" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" +dependencies = [ + "bytes 1.1.0", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing 0.1.26", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.4", +] + +[[package]] +name = "headers" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" +dependencies = [ + "base64", + "bitflags", + "bytes 1.1.0", + "headers-core", + "http", + "mime", + "sha-1", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +dependencies = [ + "bytes 1.1.0", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" +dependencies = [ + "bytes 1.1.0", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" + +[[package]] +name = "httpdate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" + +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + +[[package]] +name = "hyper" +version = "0.14.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13f67199e765030fa08fe0bd581af683f0d5bc04ea09c2b1102012c5fb90e7fd" +dependencies = [ + "bytes 1.1.0", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing 0.1.26", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes 1.1.0", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "hyperloglog" +version = "1.0.0" +dependencies = [ + "bytecount", + "bytes 0.1.0", + "rand 0.8.4", + "siphasher", + "snafu", +] + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "input_buffer" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f97967975f448f1a7ddb12b0bc41069d09ed6a1c161a92687e057325db35d413" +dependencies = [ + "bytes 1.1.0", +] + +[[package]] +name = "instant" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "integer-encoding" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f" + +[[package]] +name = "integer-encoding" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c11140ffea82edce8dcd74137ce9324ec24b3cf0175fc9d7e29164da9915b8" +dependencies = [ + "async-trait", + "futures-util", +] + +[[package]] +name = "interpreters" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "catalog", + 
"catalog_impls", + "common_types", + "common_util", + "log", + "query_engine", + "snafu", + "sql", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "ipnet" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "jemalloc-ctl" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c502a5ff9dd2924f1ed32ba96e3b65735d837b4bfd978d3161b1702e66aca4b7" +dependencies = [ + "jemalloc-sys", + "libc", + "paste 0.1.18", +] + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4bf49d50e2961077d9c99f4b7997d770a1114f087c3c2e0069b36c13fc2979d" +dependencies = [ + 
"wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical-core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3926d8f156019890be4abe5fd3785e0cff1001e06f59c597641fd513a5a284" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d066d004fa762d9da995ed21aa8845bb9f6e4265f540d716fb4b315197bf0e" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c92badda8cc0fc4f3d3cc1c30aaefafb830510c8781ce4e8669881f3ed53ac" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff669ccaae16ee33af90dc51125755efed17f1309626ba5c12052512b11e291" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5186948c7b297abaaa51560f2581dae625e5ce7dfc2d8fdc56345adb6dc576" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece956492e0e40fd95ef8658a34d53a3b8c2015762fdcaaff2167b28de1f56ef" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" + +[[package]] +name = "libflate" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16364af76ebb39b5869bb32c81fa93573267cd8c62bb3474e28d78fac3fb141e" +dependencies = [ + "adler32", + "crc32fast", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a" +dependencies = [ + "rle-decode-fast", +] + +[[package]] +name = "libgit2-sys" +version = "0.12.22+1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89c53ac117c44f7042ad8d8f5681378dfbc6010e49ec2c0d1f11dfedc7a4a1c3" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "libloading" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afe203d669ec979b7128619bae5a63b7b42e9203c1b29146079ee05e2f604b52" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "cmake", + "libc", + "libtitan_sys", + "libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libtitan_sys" +version = "0.0.1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bzip2-sys", + "cc", + "cmake", + "libc", + 
"libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logger" +version = "0.1.0" +dependencies = [ + "chrono", + "grpcio 0.1.0", + "log", + "slog", + "slog-async", + "slog-global 0.1.0 (git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "slog-term", + "slog_derive", +] + +[[package]] +name = "lru" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c748cfe47cb8da225c37595b3108bea1c198c84aaae8ea0ba76d01dda9fc803" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "lz4" +version = "1.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + 
"regex-automata", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "md-5" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" +dependencies = [ + "block-buffer 0.9.0", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "meta_client" +version = "0.1.0" +dependencies = [ + "async-trait", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "log", + "rand 0.7.3", + "reqwest", + "serde", + "serde_derive", + "serde_json", + "snafu", + "table_engine", + "tokio", + "url", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + +[[package]] +name = "mio" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + +[[package]] +name = "multipart" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +dependencies = [ + "buf_redux", + "httparse", + "log", + "mime", + "mime_guess", + "quick-error", + "rand 0.7.3", + "safemem", + "tempfile", + "twoway", +] + +[[package]] +name = "multiversion" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" +dependencies = [ + "multiversion-macros", +] + +[[package]] +name = "multiversion-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "murmur3" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a198f9589efc03f544388dfc4a19fe8af4323662b62f598b8dcfdac62c14771c" +dependencies = [ + "byteorder", +] + +[[package]] +name = 
"native-tls" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", +] + +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi", +] + +[[package]] +name = "num" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +dependencies = [ + "num-bigint 0.4.1", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76e97c412795abf6c24ba30055a8f20642ea57ca12875220b854cfa501bf1e48" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version 
= "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +dependencies = [ + "autocfg", + "num-bigint 0.4.1", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes 1.1.0", + "common_util", + "futures", + "itertools", + "percent-encoding", + "snafu", + "tempfile", + "tokio", + "tokio-util", + "walkdir", +] + +[[package]] +name = "once_cell" +version 
= "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "openssl" +version = "0.10.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d9facdb76fec0b73c406f125d44d86fdad818d66fef0531eec9233ca425ff4a" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "foreign-types", + "libc", + "once_cell", + "openssl-sys", +] + +[[package]] +name = "openssl-probe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a" + +[[package]] +name = "openssl-sys" +version = "0.9.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1996d2d305e561b70d1ee0c53f1542833f4e1ac6ce9a6708b6ff2738ca67dc82" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ordered-float" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "parquet" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "lru", + "parquet-format", + "thrift", +] + +[[package]] +name = "parquet" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c718575b34e488fa78d4f0286356abb8466573cb17ae8faa96ffd871ca6e8c6" +dependencies = [ + "arrow", + "base64", + "brotli", + "byteorder", + "chrono", + "flate2", + "lz4", + "num-bigint 0.4.1", + "parquet-format", + "rand 0.8.4", + "snap", + "thrift", + "zstd", +] + +[[package]] +name = "parquet-format" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f0c06cdcd5460967c485f9c40a821746f5955ad81990533c7fae95dbd9bc0b5" +dependencies = [ + "thrift", +] + +[[package]] +name = "parquet-format-async-temp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03abc2f9c83fe9ceec83f47c76cc071bfd56caba33794340330f35623ab1f544" +dependencies = [ + "async-trait", + "byteorder", + "futures", + "integer-encoding 3.0.2", + "ordered-float 1.1.1", +] + +[[package]] +name = "parquet2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db82df54cdd88931d29b850190915b9069bb93fba8e1aefc0d59d8ca81603d6d" +dependencies = [ + "async-stream", + "bitpacking", + "futures", + "parquet-format-async-temp", + "streaming-decompression", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "pin-project" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" +dependencies = [ + "pin-project-internal 0.4.28", +] + +[[package]] +name = "pin-project" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08" +dependencies = [ + "pin-project-internal 1.0.8", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro-nested" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" + +[[package]] +name = "proc-macro2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "profile" +version = "0.1.0" +dependencies = [ + "jemalloc-ctl", + "jemalloc-sys", + "jemallocator", + "log", + "tempfile", +] + +[[package]] +name = "prometheus" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d70cf4412832bcac9cffe27906f4a66e450d323525e977168c70d1b36120ae" +dependencies = [ + "cfg-if 0.1.10", + "fnv", + "lazy_static", + "parking_lot", + "regex", + "thiserror", +] + +[[package]] +name = "prometheus" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5986aa8d62380092d2f50f8b1cdba9cb9b6731ffd4b25b51fd126b6c3e05b99c" +dependencies = [ + "cfg-if 1.0.0", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror", +] + +[[package]] +name = "prometheus-static-metric" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8f30cdb09c39930b8fa5e0f23cbb895ab3f766b187403a0ba0956fc1ef4f0e5" +dependencies = [ + "lazy_static", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proto" +version = "0.1.0" +dependencies = [ + 
"protobuf", + "protobuf-builder", +] + +[[package]] +name = "protobuf" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23129d50f2c9355ced935fce8a08bd706ee2e7ce2b3b33bf61dace0e379ac63a" + +[[package]] +name = "protobuf-builder" +version = "0.1.0" +source = "git+https://github.com/CeresDB/protobuf-builder.git?rev=745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2#745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" +dependencies = [ + "protobuf", + "protoc", + "protoc-bin-vendored", + "protoc-grpcio", +] + +[[package]] +name = "protobuf-codegen" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba98ce0dadaa6de1e7f1b6d82a0a73b03e0c049169a167c919d906b0875026c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protoc" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace5c4ea0e4b0381eb37837e070182b7ab491445e2d5ea2201d861f2b2f94f82" +dependencies = [ + "log", + "which", +] + +[[package]] +name = "protoc-bin-vendored" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a56d817108caebed2cfb20931270a6d95dc6e36a0801999eacfbf35c21a5dd" + +[[package]] +name = "protoc-grpcio" +version = "3.0.0" +source = "git+https://github.com/CeresDB/protoc-grpcio.git?rev=fe9664cf003c908528f940d003a9c3e90e522658#fe9664cf003c908528f940d003a9c3e90e522658" +dependencies = [ + "failure", + "grpcio-compiler", + "protobuf", + "protobuf-codegen", + "protoc", + "tempfile", +] + +[[package]] +name = "query_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "snafu", + "sql", + "table_engine", + "udf", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" 
+version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.3", + "rand_hc 0.3.1", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.3", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 
+dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core 0.6.3", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque 0.8.1", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque 0.8.1", + "crossbeam-utils 0.8.5", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom 0.2.3", + "redox_syscall", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "reqwest" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" +dependencies = [ + "base64", + "bytes 1.1.0", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_urlencoded", + "tokio", + "tokio-native-tls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "rle-decode-fast" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" + +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "libc", + "librocksdb_sys", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +dependencies = [ + "lazy_static", + "winapi", +] + +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "security-framework" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c1016a0b396a0e68d6c541a54370e0db49524aead4c9e6aa263d6855d978d78" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "num", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6f179cd85a30f8652b3f8830f73861c76e87e70b939773e72daf38be3afc02" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + +[[package]] +name = "serde" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "server" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "avro-rs", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "http", + "interpreters", + "lazy_static", + "log", + "logger", + "meta_client", + 
"profile", + "prometheus 0.12.0", + "prometheus-static-metric", + "protobuf", + "query_engine", + "serde", + "serde_derive", + "serde_json", + "snafu", + "sql", + "system_catalog", + "table_engine", + "tokio", + "twox-hash", + "udf", + "warp", +] + +[[package]] +name = "sha-1" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + +[[package]] +name = "signal-hook" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970da16e7c682fa90a261cf0724dee241c9f7831635ecc4e988ae8f3b505559" 
+ +[[package]] +name = "siphasher" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "729a25c17d72b06c68cb47955d44fda88ad2d3e7d77e025663fdd69b93dd71a1" + +[[package]] +name = "skiplist" +version = "0.1.0" +dependencies = [ + "arena", + "bytes 1.1.0", + "criterion", + "rand 0.7.3", + "yatp", +] + +[[package]] +name = "slab" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c59b252e62a34651412870ff55d8c4e6d04df19b43eecb2703e417b097ffe" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-term" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76" +dependencies = [ + "atty", + "chrono", + "slog", + "term", + "thread_local", +] + +[[package]] +name = "slog_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a945ec7f7ce853e89ffa36be1e27dce9a43e82ff9093bf3461c30d5da74ed11b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "snafu" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eab12d3c261b2308b0d80c26fffb58d17eba81a4be97890101f416b478c79ca7" +dependencies = [ + "backtrace", + "doc-comment", + "futures-core", + "pin-project 0.4.28", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1508efa03c362e23817f96cde18abed596a25219a8b2c66e8db33c03543d315b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "snap" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" + +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" +dependencies = [ + "cmake", + "libc", + "pkg-config", +] + +[[package]] +name = "socket2" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765f090f0e423d2b55843402a07915add955e7d60657db13707a159727326cad" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "sql" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "log", + "paste 1.0.5", + "regex", + "snafu", + "sqlparser", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "sqlparser" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b9907f54bd0f7b6ce72c2be1e570a614819ee08e3deb66d90480df341d8a12a8" +dependencies = [ + "log", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-decompression" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bc687acd5dc742c4a7094f2927a8614a68e4743ef682e7a2f9f0f711656cc92" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strum" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57bd81eb48f4c437cadc685403cad539345bf703d78e63707418431cecd4522b" + +[[package]] +name = "strum" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7ac893c7d471c8a21f31cfe213ec4f6d9afeed25537c772e08ef3f005f8729e" + +[[package]] +name = "strum_macros" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87c85aa3f8ea653bfd3ddf25f7ee357ee4d204731f6aa9ad04002306f6e2774c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strum_macros" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339f799d8b549e3744c7ac7feb216383e4005d94bdb22561b3ab8f3b808ae9fb" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.75" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f58f7e8eaa0009c5fec437aabf511bd9933e4b2d7407bd05273c01a8906ea7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "synstructure" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "474aaa926faa1603c40b7885a9eaea29b444d1cb2850cb7c0e37bb1a4182f4fa" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "system_catalog" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "catalog", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "snafu", + "table_engine", + "tokio", +] + +[[package]] +name = "table_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "serde", + "serde_derive", + "smallvec", + "snafu", + "tokio", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "tempfile" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "rand 0.8.4", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + 
+[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "283d5230e63df9608ac7d9691adc1dfb6e701225436eb64d0b9a7f0a5a04f6ec" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa3884228611f5cd3608e2d409bf7dce832e4eb3135e3f11addbd7e41bd68e71" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +dependencies = [ + "byteorder", + "integer-encoding 1.1.7", + "log", + "ordered-float 1.1.1", + "threadpool", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + 
+[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848a1e1181b9f6753b5e96a092749e29b11d19ede67dfbbd6c7dc7e0f49b5338" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" +dependencies = [ + "bytes 1.1.0", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "tokio-macros", + "winapi", +] + +[[package]] +name = "tokio-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2f3f698253f03119ac0102beaa64f67a67e08074d03a22d18784104543727f" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-test" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"53474327ae5e166530d17f2d956afcb4f8a004de581b3cae10f12006bc8163e3" +dependencies = [ + "async-stream", + "bytes 1.1.0", + "futures-core", + "tokio", + "tokio-stream", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1a5f475f1b9d077ea1017ecbc60890fda8e54942d680ca0b1d2b47cfa2d861b" +dependencies = [ + "futures-util", + "log", + "pin-project 1.0.8", + "tokio", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.1.0", + "futures-core", + "futures-io", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "trace_examples" +version = "0.1.0" +dependencies = [ + "tracing 0.1.0", + "tracing_util", +] + +[[package]] +name = "tracing" +version = "0.1.0" +dependencies = [ + "tracing 0.1.26", +] + +[[package]] +name = "tracing" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9965507e507f12c8901432a33e31131222abac31edd90cabbcf85cf544b7127a" +dependencies = [ + "chrono", + 
"crossbeam-channel", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +dependencies = [ + "ansi_term 0.12.1", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing 0.1.26", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "tracing_util" +version = "0.1.0" +dependencies = [ + "lazy_static", + "tracing 0.1.26", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "tungstenite" +version = "0.12.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ada8297e8d70872fa9a551d93250a9f407beb9f37ef86494eb20012a2ff7c24" +dependencies = [ + "base64", + "byteorder", + "bytes 1.1.0", + "http", + "httparse", + "input_buffer", + "log", + "rand 0.8.4", + "sha-1", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + +[[package]] +name = "twox-hash" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee73e6e4924fe940354b8d4d98cad5231175d615cd855b758adc658c0aac6a0" +dependencies = [ + "cfg-if 1.0.0", + "rand 0.8.4", + "static_assertions", +] + +[[package]] +name = "typed-builder" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78cea224ddd4282dfc40d1edabbd0c020a12e946e3a48e2c2b8f6ff167ad29fe" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" + +[[package]] +name = "udf" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "base64", + "chrono", + "common_types", + "common_util", + "hyperloglog", + "smallvec", + "snafu", +] + +[[package]] +name = "uncover" +version = "0.1.1" +source = "git+https://github.com/matklad/uncover.git?rev=1d0770d997e29731b287e9e11e4ffbbea5f456da#1d0770d997e29731b287e9e11e4ffbbea5f456da" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.3", + "serde", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "vergen" +version = "5.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "265455aab08c55a1ab13f07c8d5e25c7d46900f4484dd7cbd682e77171f93f3c" +dependencies = [ + "anyhow", + "cfg-if 1.0.0", + "chrono", + "enum-iterator", + "getset", + "git2", + "rustversion", + "thiserror", +] + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "wal" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "rocksdb", + "snafu", + "tempfile", + "tokio", +] + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332d47745e9a0c38636dbd454729b147d16bd1ed08ae67b3ab281c4506771054" +dependencies = [ + "bytes 1.1.0", + "futures", + "headers", + "http", + "hyper", + "log", + "mime", + "mime_guess", + "multipart", + "percent-encoding", + "pin-project 1.0.8", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-stream", + "tokio-tungstenite", + "tokio-util", + "tower-service", + "tracing 0.1.26", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasm-bindgen" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce9b1b516211d33767048e5d47fa2a381ed8b76fc48d2ce4aa39877f9f183e0" +dependencies = [ + "cfg-if 1.0.0", + "serde", + "serde_json", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe8dc78e2326ba5f845f4b5bf548401604fa20b1dd1d365fb73b6c1d6364041" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95fded345a6559c2cfee778d562300c581f7d4ff3edb9b0d230d69800d213972" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44468aa53335841d9d6b6c023eaab07c0cd4bddbcfdee3e2bb1e8d2cb8069fef" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0195807922713af1e67dc66132c7328206ed9766af3858164fb583eedc25fbad" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.76" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdb075a845574a1fa5f09fd77e43f7747599301ea3417a9fbffdeedfc1f4a29" + +[[package]] +name = "web-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224b2f6b67919060055ef1a67807367c2066ed520c3862cc013d26cf893a783c" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "which" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea187a8ef279bc014ec368c27a920da2024d2a711109bfbe3440585d5cf27ad9" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi", +] + +[[package]] +name = "yatp" +version = "0.0.1" +source = 
"git+https://github.com/tikv/yatp.git?rev=4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502#4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" +dependencies = [ + "crossbeam-deque 0.7.4", + "dashmap", + "fail", + "lazy_static", + "num_cpus", + "parking_lot_core", + "prometheus 0.10.0", + "rand 0.7.3", +] + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2", + "syn", + "synstructure", +] + +[[package]] +name = "zstd" +version = "0.9.0+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07749a5dc2cb6b36661290245e350f15ec3bbb304e493db54a1d354480522ccd" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "4.1.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91c90f2c593b003603e5e0493c837088df4469da25aafff8bce42ba48caf079" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.6.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615120c7a2431d16cf1cf979e7fc31ba7a5b5e5707b29c8a99e5dbf8a8392a33" +dependencies = [ + "cc", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..7ad1ca4a7f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,74 @@ +[package] +name = "ceresdbx" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" +resolver = "2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[workspace] +# In alphabetical order +members = [ + "analytic_engine", + "arrow_deps", + 
"benchmarks", + "catalog", + "catalog_impls", + "common_types", + "common_util", + "components/arena", + "components/bytes", + "components/logger", + "components/object_store", + "components/parquet", + "components/profile", + "components/rust-hyperloglog", + "components/skiplist", + "components/tracing", + "components/tracing_util", + "components/tracing_examples", + "grpcio", + "interpreters", + "meta_client", + "proto", + "query_engine", + "server", + "sql", + "system_catalog", + "table_engine", + "udf", + "wal", +] + +[[bin]] +name = "ceresdb-server" + +[dependencies] +# Workspace dependencies, in alphabetical order +analytic_engine = { path = "analytic_engine" } +catalog = { path = "catalog" } +catalog_impls = { path = "catalog_impls" } +clap = "2.0" +common_util = { path = "common_util" } +log = "0.4" +logger = { path = "components/logger" } +query_engine = { path = "query_engine" } +server = { path = "server" } +table_engine = { path = "table_engine" } +tracing_util = { path = "components/tracing_util" } +udf = { path = "udf" } + +# Crates.io dependencies, in alphabetical order +signal-hook = "0.3" + +[build-dependencies] +vergen = { version = "5", default-features = false, features = ["build", "git"] } + +[profile.release] +debug = true +opt-level = 2 +overflow-checks = true + +[profile.bench] +debug = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..37cf72300f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +ARG RUST_VERSION=1.59.0 +FROM rust:${RUST_VERSION}-slim-bullseye as build + +# cache mounts below may already exist and owned by root +USER root + +RUN apt update && apt install --yes gcc g++ libssl-dev pkg-config cmake && rm -rf /var/lib/apt/lists/* + +# Build ceresdb +COPY . 
/ceresdb +WORKDIR /ceresdb + +RUN make build + +FROM ubuntu:20.04 +# create admin user +ARG USER=admin +ARG PASS="1q2w3s" +RUN useradd -m -s /bin/bash $USER && echo "$USER:$PASS" | chpasswd + +COPY --from=build /ceresdb/target/release/ceresdb-server /usr/bin/ceresdb-server + +RUN apt update && apt install --yes curl gdb iotop cron + +ENV RUST_BACKTRACE 1 + +COPY ./docker/entrypoint.py /entrypoint.py +COPY ./docker/supervisor/supervisord.conf /etc/supervisor/supervisord.conf +COPY ./docker/supervisor/conf.d /etc/supervisor/conf.d +COPY ./configs/ceresdb.toml /usr/bin/ + +RUN mkdir -p /etc/ceresdb +RUN chmod +x /usr/bin/ceresdb-server + +COPY ./configs /etc/ceresdb + +COPY ./docker/tini /tini +RUN chmod +x /tini +ENTRYPOINT ["/tini", "--", "/entrypoint.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..dfc71ea25a --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +SHELL = /bin/bash + +DIR=$(shell pwd) + +init: + echo "init" + echo "Git branch: $GITBRANCH" + +build: + ls -alh + cd $(DIR); cargo build --release + +build-asan: + ls -alh + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo build -Zbuild-std --target x86_64-unknown-linux-gnu --release + +build-arm64: + ls -alh + cd $(DIR); cargo build --release --no-default-features + +test: + cd $(DIR); cargo test --workspace -- --test-threads=4 + +# grcov needs build first, then run test +build-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo build -j 4 --workspace + +test-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo test -j 4 --workspace -- -Z unstable-options --format json | tee results.json; \ + cat results.json | cargo2junit > ${WORKSPACE}/testresult/TEST-all.xml + +fmt: + cd $(DIR); cargo fmt -- --check + +clippy: + cd $(DIR); cargo clippy --all-targets --all-features --workspace -- -D warnings + +# test with address sanitizer +asan-test: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with 
address sanitizer under release mode to workaround `attempt to create unaligned or null slice` +# error in parquet crate. +asan-test-release: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --release --workspace + +# test with memory sanitizer +mem-test: + export RUSTFLAGS=-Zsanitizer=memory RUSTDOCFLAGS=-Zsanitizer=memory + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with miri. +# only list packages will be tested. +miri: + cd $(DIR); cargo miri test --package arena diff --git a/README.md b/README.md new file mode 100644 index 0000000000..4e9d118178 --- /dev/null +++ b/README.md @@ -0,0 +1,90 @@ +# ceresdbx + +## Building +Install clang (for rocksdb) + +Install deps (required by rust-rocksdb) +```bash +brew install cmake +brew install lz4 +``` + +Build in debug mode +```bash +cargo build --bin ceresdb-server +``` + +Build in release mode +```bash +cargo build --release --bin ceresdb-server +``` + +## Usage +Run the server +```bash +./ceresdb-server +``` + +## RESTful API +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "your DDL sql" +}' +``` + +Describe a table +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "DESCRIBE TABLE mytest" +}' +``` + +Insert data +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +--data-raw '{ + "query": "INSERT INTO mytest(c1, c2, c3, c4, c5, c6) VALUES(1618310218001, 12.5, '\''hello world'\'', 3.14159265, true, 2147483650)" +}' +``` + +Query +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "SELECT c1, c2, c3, c4, c5, c6 FROM mytest LIMIT 3" +}' +``` + +Query from system tables +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: 
application/json' \ +-d '{ + "query": "SELECT * FROM system.numbers LIMIT 3" +}' +``` + +## Support Data Type +| SQL | CeresDB | Arrow | +| --- | --- | --- | +| null | Null | Null | +| timestamp | Timestamp | Timestamp(TimeUnit::Millisecond, None) | +| double | Double | Float64 | +| float | Float | Float32 | +| string | String | String | +| Varbinary | Varbinary | Binary | +| uint64 | UInt64 | UInt64 | +| uint32 | UInt32 | UInt32 | +| uint16 | UInt16 | UInt16 | +| uint8 | UInt8 | UInt8 | +| int64/bigint | Int64 | Int64 | +| int32/int | Int32 | Int32 | +| int16/smallint | Int16 | Int16 | +| int8/tinyint | Int8 | Int8 | +| boolean | Boolean | Boolean | diff --git a/analytic_engine/Cargo.toml b/analytic_engine/Cargo.toml new file mode 100644 index 0000000000..3be6760574 --- /dev/null +++ b/analytic_engine/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "analytic_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["tempfile"] + +[dependencies] +# In alphabetical order +arc-swap = "1.4.0" +arena = { path = "../components/arena" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +base64 = "0.13" +common_types = { path = "../common_types" } +common_util = { path = "../common_util"} +futures = "0.3" +lazy_static = "1.4.0" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = "../components/parquet" } +prometheus = "0.12" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +skiplist = { path = "../components/skiplist" } +smallvec = "1.6" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync", "time"] } +wal = { path = "../wal" } +tempfile = { version = "3.1.0", optional = true } + +[dev-dependencies] +common_types = { path = "../common_types", 
features = ["test"] } +common_util = { path = "../common_util", features = ["test"] } +env_logger = "0.6" +tempfile = "3.1.0" diff --git a/analytic_engine/src/compaction/metrics.rs b/analytic_engine/src/compaction/metrics.rs new file mode 100644 index 0000000000..61d76453e3 --- /dev/null +++ b/analytic_engine/src/compaction/metrics.rs @@ -0,0 +1,15 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of compaction. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge, IntGauge}; + +lazy_static! { + // Counters: + pub static ref COMPACTION_PENDING_REQUEST_GAUGE: IntGauge = register_int_gauge!( + "compaction_pending_request_gauge", + "Pending request queue length of compaction" + ) + .unwrap(); +} diff --git a/analytic_engine/src/compaction/mod.rs b/analytic_engine/src/compaction/mod.rs new file mode 100644 index 0000000000..a76ce2324a --- /dev/null +++ b/analytic_engine/src/compaction/mod.rs @@ -0,0 +1,494 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_util::config::{ReadableSize, TimeUnit}; +use serde_derive::Deserialize; +use snafu::{ensure, Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::sync::oneshot; + +use crate::{ + compaction::picker::{CommonCompactionPicker, CompactionPickerRef}, + instance::write_worker::CompactionNotifier, + sst::file::{FileHandle, Level}, + table::data::TableDataRef, + table_options::COMPACTION_STRATEGY, +}; + +mod metrics; +pub mod picker; +pub mod scheduler; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unable to parse compaction strategy, value: {}", value))] + ParseStrategy { value: String, backtrace: Backtrace }, + #[snafu(display("Unable to parse float, key: {}, value: {}", key, value))] + ParseFloat { + key: String, + value: String, + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse int, key: {}, value: {}", key, value))] + ParseInt { + key: String, + value: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse readable size, key: {}, value: {}", key, value))] + ParseSize { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse time unit, key: {}, value: {}", key, value))] + ParseTimeUnit { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Invalid compaction option value, err: {}", error))] + InvalidOption { error: String, backtrace: Backtrace }, +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub enum CompactionStrategy { + Default, + TimeWindow(TimeWindowCompactionOptions), + SizeTiered(SizeTieredCompactionOptions), +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct SizeTieredCompactionOptions { + pub bucket_low: f32, + pub bucket_high: f32, + pub min_sstable_size: ReadableSize, + pub min_threshold: usize, + pub max_threshold: usize, +} + 
+#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct TimeWindowCompactionOptions { + pub size_tiered: SizeTieredCompactionOptions, + // TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds resolution. + pub timestamp_resolution: TimeUnit, +} + +impl protobuf::Clear for SizeTieredCompactionOptions { + fn clear(&mut self) { + *self = SizeTieredCompactionOptions::default() + } +} + +impl protobuf::Clear for TimeWindowCompactionOptions { + fn clear(&mut self) { + *self = TimeWindowCompactionOptions::default() + } +} + +impl Default for SizeTieredCompactionOptions { + fn default() -> Self { + Self { + bucket_low: 0.5, + bucket_high: 1.5, + min_sstable_size: ReadableSize::mb(50), + min_threshold: 4, + max_threshold: 16, + } + } +} + +impl Default for TimeWindowCompactionOptions { + fn default() -> Self { + Self { + size_tiered: SizeTieredCompactionOptions::default(), + timestamp_resolution: TimeUnit::Milliseconds, + } + } +} + +impl Default for CompactionStrategy { + fn default() -> Self { + CompactionStrategy::Default + } +} + +const BUCKET_LOW_KEY: &str = "compaction_bucket_low"; +const BUCKET_HIGH_KEY: &str = "compaction_bucket_high"; +const MIN_THRESHOLD_KEY: &str = "compaction_min_threshold"; +const MAX_THRESHOLD_KEY: &str = "compaction_max_threshold"; +const MIN_SSTABLE_SIZE_KEY: &str = "compaction_min_sstable_size"; +const TIMESTAMP_RESOLUTION_KEY: &str = "compaction_timestamp_resolution"; +const DEFAULT_STRATEGY: &str = "default"; +const STC_STRATEGY: &str = "size_tiered"; +const TWC_STRATEGY: &str = "time_window"; + +impl CompactionStrategy { + pub(crate) fn parse_from( + value: &str, + options: &HashMap, + ) -> Result { + match value.trim().to_lowercase().as_str() { + DEFAULT_STRATEGY => Ok(CompactionStrategy::Default), + STC_STRATEGY => Ok(CompactionStrategy::SizeTiered( + SizeTieredCompactionOptions::parse_from(options)?, + )), + TWC_STRATEGY => Ok(CompactionStrategy::TimeWindow( + 
TimeWindowCompactionOptions::parse_from(options)?, + )), + _ => ParseStrategy { + value: value.to_string(), + } + .fail(), + } + } + + pub(crate) fn fill_raw_map(&self, m: &mut HashMap) { + match self { + CompactionStrategy::Default => { + m.insert( + COMPACTION_STRATEGY.to_string(), + DEFAULT_STRATEGY.to_string(), + ); + } + CompactionStrategy::SizeTiered(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), STC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + CompactionStrategy::TimeWindow(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), TWC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + } + } +} + +impl SizeTieredCompactionOptions { + pub(crate) fn validate(&self) -> Result<(), Error> { + ensure!( + self.bucket_high > self.bucket_low, + InvalidOption { + error: format!( + "{} value({}) is less than or equal to the {} value({}) ", + BUCKET_HIGH_KEY, self.bucket_high, BUCKET_LOW_KEY, self.bucket_low + ), + } + ); + + Ok(()) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + m.insert(BUCKET_LOW_KEY.to_string(), format!("{}", self.bucket_low)); + m.insert(BUCKET_HIGH_KEY.to_string(), format!("{}", self.bucket_high)); + m.insert( + MIN_SSTABLE_SIZE_KEY.to_string(), + format!("{}", self.min_sstable_size.0), + ); + m.insert( + MAX_THRESHOLD_KEY.to_string(), + format!("{}", self.max_threshold), + ); + m.insert( + MIN_THRESHOLD_KEY.to_string(), + format!("{}", self.min_threshold), + ); + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = SizeTieredCompactionOptions::default(); + if let Some(v) = options.get(BUCKET_LOW_KEY) { + opts.bucket_low = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, + })?; + } + if let Some(v) = options.get(BUCKET_HIGH_KEY) { + opts.bucket_high = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, + })?; + } + if let Some(v) = options.get(MIN_SSTABLE_SIZE_KEY) { + opts.min_sstable_size = v.parse::().map_err(|err| Error::ParseSize { + key: 
MIN_SSTABLE_SIZE_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + if let Some(v) = options.get(MAX_THRESHOLD_KEY) { + opts.max_threshold = v.parse().context(ParseInt { + key: MAX_THRESHOLD_KEY, + value: v, + })?; + } + if let Some(v) = options.get(MIN_THRESHOLD_KEY) { + opts.min_threshold = v.parse().context(ParseInt { + key: MIN_THRESHOLD_KEY, + value: v, + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +impl TimeWindowCompactionOptions { + /// TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds + /// resolution. + fn valid_timestamp_unit(unit: TimeUnit) -> bool { + matches!( + unit, + TimeUnit::Seconds + | TimeUnit::Milliseconds + | TimeUnit::Microseconds + | TimeUnit::Nanoseconds + ) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + self.size_tiered.fill_raw_map(m); + + m.insert( + TIMESTAMP_RESOLUTION_KEY.to_string(), + format!("{}", self.timestamp_resolution), + ); + } + + pub(crate) fn validate(&self) -> Result<(), Error> { + if !Self::valid_timestamp_unit(self.timestamp_resolution) { + return InvalidOption { + error: format!( + "{:?} is not valid for {}) ", + self.timestamp_resolution, TIMESTAMP_RESOLUTION_KEY + ), + } + .fail(); + } + + Ok(()) + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = TimeWindowCompactionOptions { + size_tiered: SizeTieredCompactionOptions::parse_from(options)?, + ..Default::default() + }; + + if let Some(v) = options.get(TIMESTAMP_RESOLUTION_KEY) { + opts.timestamp_resolution = + v.parse::().map_err(|err| Error::ParseTimeUnit { + key: TIMESTAMP_RESOLUTION_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +#[derive(Debug, Clone)] +pub struct CompactionInputFiles { + /// Level of the files to be compacted. + pub level: Level, + /// Files to be compacted. + pub files: Vec, + /// The output level of the merged file. 
+ pub output_level: Level, +} + +#[derive(Default, Clone)] +pub struct ExpiredFiles { + /// Level of the expired files. + pub level: Level, + /// Expired files. + pub files: Vec, +} + +#[derive(Default, Clone)] +pub struct CompactionTask { + pub compaction_inputs: Vec, + pub expired: Vec, +} + +impl CompactionTask { + pub fn mark_files_being_compacted(&self, being_compacted: bool) { + for input in &self.compaction_inputs { + for file in &input.files { + file.set_being_compacted(being_compacted); + } + } + for expired in &self.expired { + for file in &expired.files { + file.set_being_compacted(being_compacted); + } + } + } +} + +pub struct PickerManager { + default_picker: CompactionPickerRef, + time_window_picker: CompactionPickerRef, + size_tiered_picker: CompactionPickerRef, +} + +impl Default for PickerManager { + fn default() -> Self { + let size_tiered_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::SizeTiered(SizeTieredCompactionOptions::default()), + )); + let time_window_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::TimeWindow(TimeWindowCompactionOptions::default()), + )); + + Self { + default_picker: time_window_picker.clone(), + size_tiered_picker, + time_window_picker, + } + } +} + +impl PickerManager { + pub fn get_picker(&self, strategy: CompactionStrategy) -> CompactionPickerRef { + match strategy { + CompactionStrategy::Default => self.default_picker.clone(), + CompactionStrategy::SizeTiered(_) => self.size_tiered_picker.clone(), + CompactionStrategy::TimeWindow(_) => self.time_window_picker.clone(), + } + } +} + +#[derive(Debug, Snafu)] +pub enum WaitError { + #[snafu(display("The compaction is canceled"))] + Canceled, + + #[snafu(display("Failed to compact, err:{}", source))] + Compaction { + source: Arc, + }, +} + +pub type WaitResult = std::result::Result; + +pub struct WaiterNotifier { + waiter: Option>>, +} + +impl WaiterNotifier { + pub fn new(waiter: Option>>) -> Self { + Self { waiter } + } + + pub 
fn notify_wait_result(mut self, res: WaitResult<()>) { + // Ignore error if failed to send result. + if let Some(waiter) = self.waiter.take() { + let _ = waiter.send(res); + } + } +} + +impl Drop for WaiterNotifier { + fn drop(&mut self) { + if let Some(waiter) = self.waiter.take() { + // The compaction result hasn't been sent before the notifier dropped, we + // send a canceled error to waiter. + let _ = waiter.send(Canceled.fail()); + } + } +} + +/// Request to compact single table. +pub struct TableCompactionRequest { + pub table_data: TableDataRef, + pub compaction_notifier: CompactionNotifier, + pub waiter: Option>>, +} + +impl TableCompactionRequest { + pub fn no_waiter(table_data: TableDataRef, compaction_notifier: CompactionNotifier) -> Self { + TableCompactionRequest { + table_data, + compaction_notifier, + waiter: None, + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + #[test] + fn test_fill_raw_map_then_parse() { + let c = CompactionStrategy::Default; + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(1, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "default"); + assert_eq!(c, CompactionStrategy::parse_from("default", &m).unwrap()); + + let opts = SizeTieredCompactionOptions { + bucket_low: 0.1, + min_sstable_size: ReadableSize(1024), + max_threshold: 10, + ..Default::default() + }; + + let c = CompactionStrategy::SizeTiered(opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(6, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "size_tiered"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!( + c, + CompactionStrategy::parse_from("size_tiered", &m).unwrap() + ); + + let twc_opts = TimeWindowCompactionOptions { + size_tiered: opts, + ..Default::default() + }; + let c = 
CompactionStrategy::TimeWindow(twc_opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + + assert_eq!(7, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "time_window"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!(m[TIMESTAMP_RESOLUTION_KEY], "milliseconds"); + assert_eq!( + c, + CompactionStrategy::parse_from("time_window", &m).unwrap() + ); + } +} diff --git a/analytic_engine/src/compaction/picker.rs b/analytic_engine/src/compaction/picker.rs new file mode 100644 index 0000000000..5cc9f2afc9 --- /dev/null +++ b/analytic_engine/src/compaction/picker.rs @@ -0,0 +1,740 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction picker. + +use std::{ + collections::{BTreeSet, HashMap}, + sync::Arc, + time::Duration, +}; + +use common_types::time::Timestamp; +use common_util::{config::TimeUnit, define_result}; +use log::{debug, info}; +use snafu::Snafu; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionStrategy, CompactionTask, SizeTieredCompactionOptions, + TimeWindowCompactionOptions, + }, + sst::{ + file::{FileHandle, Level}, + manager::LevelsController, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +#[derive(Clone)] +pub struct PickerContext { + pub segment_duration: Duration, + /// The ttl of the data in sst. 
+ pub ttl: Option, + pub strategy: CompactionStrategy, +} + +impl PickerContext { + fn size_tiered_opts(&self) -> SizeTieredCompactionOptions { + match self.strategy { + CompactionStrategy::SizeTiered(opts) => opts, + _ => SizeTieredCompactionOptions::default(), + } + } + + fn time_window_opts(&self) -> TimeWindowCompactionOptions { + match self.strategy { + CompactionStrategy::TimeWindow(opts) => opts, + _ => TimeWindowCompactionOptions::default(), + } + } +} + +pub trait CompactionPicker { + /// Pick candidate files for compaction. + /// + /// Note: files being compacted should be ignored. + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result; +} + +pub type CompactionPickerRef = Arc; + +trait LevelPicker { + /// Pick candidate files for compaction at level + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option>; +} + +type LevelPickerRef = Arc; + +pub struct CommonCompactionPicker { + level_picker: LevelPickerRef, +} + +impl CommonCompactionPicker { + pub fn new(strategy: CompactionStrategy) -> Self { + let level_picker: LevelPickerRef = match strategy { + CompactionStrategy::SizeTiered(_) | CompactionStrategy::Default => { + Arc::new(SizeTieredPicker::default()) + } + CompactionStrategy::TimeWindow(_) => Arc::new(TimeWindowPicker::default()), + }; + Self { level_picker } + } + + fn pick_compact_candidates( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + expire_time: Option, + ) -> Option { + let num_levels = levels_controller.num_levels(); + //TODO(boyan) level compaction strategy + for level in 0..num_levels { + if let Some(files) = self.level_picker.pick_candidates_at_level( + ctx, + levels_controller, + level, + expire_time, + ) { + return Some(CompactionInputFiles { + level, + files, + // Now, we always output to the same level. 
+ output_level: level, + }); + } + } + + None + } +} + +impl CompactionPicker for CommonCompactionPicker { + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result { + let expire_time = ctx.ttl.map(Timestamp::expire_time); + let mut compaction_task = CompactionTask { + expired: levels_controller.expired_ssts(expire_time), + ..Default::default() + }; + + if let Some(input_files) = + self.pick_compact_candidates(&ctx, levels_controller, expire_time) + { + info!( + "Compaction strategy: {:?} picker pick files to compact, input_files:{:?}", + ctx.strategy, input_files + ); + + compaction_task.compaction_inputs = vec![input_files]; + } + + Ok(compaction_task) + } +} + +#[inline] +fn find_uncompact_files( + levels_controller: &LevelsController, + level: Level, + expire_time: Option, +) -> Vec { + levels_controller + .iter_ssts_at_level(level) + // Only use files not being compacted and not expired. + .filter(|file| !file.being_compacted() && !file.time_range().is_expired(expire_time)) + .map(Clone::clone) + .collect() +} + +/// Size tiered compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/SizeTieredCompactionStrategy.java +#[derive(Default)] +pub struct SizeTieredPicker {} + +/// Similar size files group +#[derive(Debug)] +struct Bucket { + pub avg_size: usize, + pub files: Vec, +} + +impl Bucket { + fn with_file(file: &FileHandle) -> Self { + Self { + avg_size: file.size() as usize, + files: vec![file.clone()], + } + } + + fn with_files(files: Vec) -> Self { + let total: usize = files.iter().map(|f| f.size() as usize).sum(); + Self { + avg_size: total / files.len(), + files, + } + } + + fn insert_file(&mut self, file: &FileHandle) { + let total_size = self.files.len() * self.avg_size + file.size() as usize; + self.avg_size = total_size / (self.files.len() + 1); + self.files.push(file.clone()); + } + + fn get_hotness_map(&self) -> HashMap { + 
self.files + .iter() + .map(|f| (f.clone(), Self::hotness(f))) + .collect() + } + + #[inline] + fn hotness(f: &FileHandle) -> f64 { + let row_num = match f.row_num() { + 0 => 1, //prevent NAN hotness + v => v, + }; + f.read_meter().h2_rate() / (row_num as f64) + } +} + +impl LevelPicker for SizeTieredPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let files_by_segment = + Self::files_by_segment(levels_controller, level, ctx.segment_duration, expire_time); + if files_by_segment.is_empty() { + return None; + } + + let all_segments: BTreeSet<_> = files_by_segment.keys().collect(); + let opts = ctx.size_tiered_opts(); + + // Iterate the segment in reverse order, so newest segment is examined first. + for (idx, segment_key) in all_segments.iter().rev().enumerate() { + // segment_key should always exist. + if let Some(segment) = files_by_segment.get(segment_key) { + let buckets = Self::get_buckets( + segment.to_vec(), + opts.bucket_high, + opts.bucket_low, + opts.min_sstable_size.as_bytes() as f32, + ); + + let files = + Self::most_interesting_bucket(buckets, opts.min_threshold, opts.max_threshold); + + if files.is_some() { + info!( + "Compact segment, idx: {}, size:{}, segment_key:{:?}, files:{:?}", + idx, + segment.len(), + segment_key, + segment + ); + return files; + } + debug!( + "No compaction necessary for segment, size:{}, segment_key:{:?}, idx:{}", + segment.len(), + segment_key, + idx + ); + } + } + + None + } +} + +impl SizeTieredPicker { + /// Group files of similar size into buckets. 
+ fn get_buckets( + mut files: Vec, + bucket_high: f32, + bucket_low: f32, + min_sst_size: f32, + ) -> Vec { + // sort by file length + files.sort_unstable_by_key(FileHandle::size); + + let mut buckets: Vec = Vec::new(); + 'outer: for sst in &files { + let size = sst.size() as f32; + // look for a bucket containing similar-sized files: + // group in the same bucket if it's w/in 50% of the average for this bucket, + // or this file and the bucket are all considered "small" (less than + // `min_sst_size`) + for bucket in buckets.iter_mut() { + let old_avg_size = bucket.avg_size as f32; + if (size > (old_avg_size * bucket_low) && size < (old_avg_size * bucket_high)) + || (size < min_sst_size && old_avg_size < min_sst_size) + { + // find a similar file, insert it into bucket + bucket.insert_file(sst); + continue 'outer; + } + } + + // no similar bucket found + // put it in a new bucket + buckets.push(Bucket::with_file(sst)); + } + + debug!("Group files of similar size into buckets: {:?}", buckets); + + buckets + } + + fn most_interesting_bucket( + buckets: Vec, + min_threshold: usize, + max_threshold: usize, + ) -> Option> { + let mut pruned_bucket_and_hotness = Vec::with_capacity(buckets.len()); + // skip buckets containing less than min_threshold sstables, + // and limit other buckets to max_threshold sstables + for bucket in buckets { + let (bucket, hotness) = Self::trim_to_threshold_with_hotness(bucket, max_threshold); + if bucket.files.len() >= min_threshold { + pruned_bucket_and_hotness.push((bucket, hotness)); + } + } + + if pruned_bucket_and_hotness.is_empty() { + return None; + } + + // Find the hotest bucket + if let Some((bucket, hotness)) = + pruned_bucket_and_hotness + .into_iter() + .max_by(|(b1, h1), (b2, h2)| { + let c = h1.partial_cmp(h2).unwrap(); + if !c.is_eq() { + return c; + } + //TODO(boyan), compacting smallest sstables first? 
+ b1.avg_size.cmp(&b2.avg_size) + }) + { + debug!( + "Find the hotest bucket, hotness: {}, bucket: {:?}", + hotness, bucket + ); + Some(bucket.files) + } else { + None + } + } + + fn files_by_segment( + levels_controller: &LevelsController, + level: Level, + segment_duration: Duration, + expire_time: Option, + ) -> HashMap> { + let mut files_by_segment = HashMap::new(); + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + for file in uncompact_files { + // We use the end time of the range to calculate segment. + let segment = file + .time_range() + .exclusive_end() + .truncate_by(segment_duration); + let files = files_by_segment.entry(segment).or_insert_with(Vec::new); + files.push(file); + } + + files_by_segment + } + + fn trim_to_threshold_with_hotness(bucket: Bucket, max_threshold: usize) -> (Bucket, f64) { + let hotness_snapshot = bucket.get_hotness_map(); + + // Sort by sstable hotness (descending). + let mut sorted_files = bucket.files.to_vec(); + sorted_files.sort_unstable_by(|f1, f2| { + hotness_snapshot[f1] + .partial_cmp(&hotness_snapshot[f2]) + .unwrap() + .reverse() + }); + + // and then trim the coldest sstables off the end to meet the max_threshold + let len = sorted_files.len(); + let pruned_bucket: Vec = sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect(); + + // bucket hotness is the sum of the hotness of all sstable members + let bucket_hotness = pruned_bucket.iter().map(Bucket::hotness).sum(); + + (Bucket::with_files(pruned_bucket), bucket_hotness) + } +} + +/// Time window compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/TimeWindowCompactionStrategy.java +#[derive(Default)] +pub struct TimeWindowPicker {} + +impl TimeWindowPicker { + fn get_window_bounds_in_millis(window: &Duration, ts: i64) -> (i64, i64) { + let ts_secs = ts / 1000; + + let size = window.as_secs() as i64; + + let lower = ts_secs - 
(ts_secs % size); + let upper = lower + size - 1; + + (lower * 1000, upper * 1000) + } + + #[inline] + fn resolve_timetamp(ts: i64, timestamp_resolution: TimeUnit) -> i64 { + match timestamp_resolution { + TimeUnit::Microseconds => ts / 1000, + TimeUnit::Nanoseconds => ts / 1000000, + TimeUnit::Seconds => ts * 1000, + TimeUnit::Milliseconds => ts, + // the option is validated before, so it won't reach here + _ => unreachable!(), + } + } + + /// Group files of similar timestamp into buckets. + fn get_buckets( + files: &[FileHandle], + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> (HashMap>, i64) { + let mut max_ts = 0i64; + let mut buckets: HashMap> = HashMap::new(); + for f in files { + let ts = f.time_range_ref().exclusive_end().as_i64(); + + let ts = Self::resolve_timetamp(ts, timestamp_resolution); + + let (left, _) = Self::get_window_bounds_in_millis(window, ts); + + let bucket_files = buckets.entry(left).or_insert_with(Vec::new); + + bucket_files.push(f.clone()); + + if left > max_ts { + max_ts = left; + } + } + + debug!( + "Group files of similar timestamp into buckets: {:?}", + buckets + ); + (buckets, max_ts) + } + + fn newest_bucket( + buckets: HashMap>, + size_tiered_opts: SizeTieredCompactionOptions, + now: i64, + ) -> Option> { + // If the current bucket has at least minThreshold SSTables, choose that one. + // For any other bucket, at least 2 SSTables is enough. + // In any case, limit to max_threshold SSTables. 
+ + let all_keys: BTreeSet<_> = buckets.keys().collect(); + + for key in all_keys.into_iter().rev() { + if let Some(bucket) = buckets.get(key) { + debug!("Key {}, now {}", key, now); + + if bucket.len() >= size_tiered_opts.min_threshold && *key >= now { + // If we're in the newest bucket, we'll use STCS to prioritize sstables + let buckets = SizeTieredPicker::get_buckets( + bucket.to_vec(), + size_tiered_opts.bucket_high, + size_tiered_opts.bucket_low, + size_tiered_opts.min_sstable_size.as_bytes() as f32, + ); + let files = SizeTieredPicker::most_interesting_bucket( + buckets, + size_tiered_opts.min_threshold, + size_tiered_opts.max_threshold, + ); + + if files.is_some() { + return files; + } + } else if bucket.len() >= 2 && *key < now { + debug!("Bucket size {} >= 2 and not in current bucket, compacting what's here: {:?}", bucket.len(), bucket); + return Some(Self::trim_to_threshold( + bucket, + size_tiered_opts.max_threshold, + )); + } else { + debug!( + "No compaction necessary for bucket size {} , key {}, now {}", + bucket.len(), + key, + now + ); + } + } + } + + None + } + + fn trim_to_threshold(files: &[FileHandle], max_threshold: usize) -> Vec { + // Sort by sstable file size + let mut sorted_files = files.to_vec(); + sorted_files.sort_unstable_by_key(FileHandle::size); + + // Trim the largest sstables off the end to meet the maxThreshold + let len = sorted_files.len(); + sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect() + } + + /// Get current window timestamp, the caller MUST ensure the level has ssts, + /// panic otherwise. 
+ fn get_current_window( + levels_controller: &LevelsController, + level: Level, + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> i64 { + // always find the latest sst here + let now = levels_controller + .latest_sst(level) + .unwrap() + .time_range() + .exclusive_end() + .as_i64(); + let now = Self::resolve_timetamp(now, timestamp_resolution); + Self::get_window_bounds_in_millis(window, now).0 + } +} + +impl LevelPicker for TimeWindowPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + + if uncompact_files.is_empty() { + return None; + } + + let opts = ctx.time_window_opts(); + + debug!("TWCS compaction options: {:?}", opts); + + let (buckets, ts) = Self::get_buckets( + &uncompact_files, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + + let now = Self::get_current_window( + levels_controller, + level, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + debug!("now {}, max_ts: {}", now, ts); + assert!(now >= ts); + + Self::newest_bucket(buckets, opts.size_tiered, now) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use common_types::{ + bytes::Bytes, + tests::build_schema, + time::{TimeRange, Timestamp}, + }; + + use crate::{ + compaction::{picker::PickerContext, CompactionStrategy, PickerManager}, + sst::{ + file::SstMetaData, + manager::{tests::LevelsControllerMockBuilder, LevelsController}, + }, + }; + + fn build_sst_meta_data(time_range: TimeRange, size: u64) -> SstMetaData { + SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range, + max_sequence: 200, + schema: build_schema(), + size, + row_num: 2, + } + } + + // testcase 0: file buckets: old bucket:[0,1] newest bucket:[2], expired:[3] + fn build_old_bucket_case(now: i64) -> LevelsController { + let builder = 
LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(100), Timestamp::new(200)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 1: file buckets: old bucket:[0,1] newest bucket:[2,3,4,5] + // default min_threshold=4 + fn build_newest_bucket_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 2: file buckets: old bucket:[0] newest bucket:[1,2,3] + // default min_threshold=4 + fn build_newest_bucket_no_match_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + 
TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + #[test] + fn test_time_window_picker() { + let picker_manager = PickerManager::default(); + let twp = picker_manager.get_picker(CompactionStrategy::Default); + let mut ctx = PickerContext { + segment_duration: Duration::from_millis(1000), + ttl: Some(Duration::from_secs(100000)), + strategy: CompactionStrategy::Default, + }; + let now = Timestamp::now(); + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert_eq!(task.expired[0].files.len(), 1); + assert_eq!(task.expired[0].files[0].id(), 3); + } + + { + let lc = build_newest_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 4); + assert_eq!(task.compaction_inputs[0].files[0].id(), 2); + assert_eq!(task.compaction_inputs[0].files[1].id(), 3); + assert_eq!(task.compaction_inputs[0].files[2].id(), 4); + assert_eq!(task.compaction_inputs[0].files[3].id(), 5); + } + + { + let lc = build_newest_bucket_no_match_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs.len(), 0); + } + + // If ttl is None, then no file is expired. 
+ ctx.ttl = None; + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx, &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert!(task.expired[0].files.is_empty()); + } + } +} diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs new file mode 100644 index 0000000000..d06925d6d2 --- /dev/null +++ b/analytic_engine/src/compaction/scheduler.rs @@ -0,0 +1,595 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Compaction scheduler. + +use std::{ + collections::{HashMap, VecDeque}, + hash::Hash, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, RwLock, + }, + time::Duration, +}; + +use async_trait::async_trait; +use common_types::{request_id::RequestId, time::Timestamp}; +use common_util::{ + config::ReadableDuration, + define_result, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info, warn}; +use object_store::ObjectStore; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::{ + sync::{ + mpsc::{self, Receiver, Sender}, + Mutex, + }, + time, +}; + +use crate::{ + compaction::{ + metrics::COMPACTION_PENDING_REQUEST_GAUGE, picker::PickerContext, CompactionTask, + PickerManager, TableCompactionRequest, WaitError, WaiterNotifier, + }, + instance::SpaceStore, + meta::Manifest, + sst::factory::Factory, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to join compaction schedule worker, err:{}", source))] + JoinWorker { source: common_util::runtime::Error }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchedulerConfig { + pub schedule_channel_len: usize, + pub schedule_interval: ReadableDuration, + pub 
max_ongoing_tasks: usize, +} + +// TODO(boyan), a better default value? +const MAX_GOING_COMPACTION_TASKS: usize = 8; +const MAX_PENDING_COMPACTION_TASKS: usize = 1024; + +impl Default for SchedulerConfig { + fn default() -> Self { + Self { + schedule_channel_len: 16, + // 30 minutes schedule interval. + schedule_interval: ReadableDuration(Duration::from_secs(60 * 30)), + max_ongoing_tasks: MAX_GOING_COMPACTION_TASKS, + } + } +} + +enum ScheduleTask { + Request(TableCompactionRequest), + Schedule, + Exit, +} + +#[async_trait] +pub trait CompactionScheduler { + /// Stop the scheduler. + async fn stop_scheduler(&self) -> Result<()>; + + /// Schedule a compaction job to background workers. + async fn schedule_table_compaction(&self, request: TableCompactionRequest); +} + +// A FIFO queue that remove duplicate values by key. +struct RequestQueue { + keys: VecDeque, + values: HashMap, +} + +impl Default for RequestQueue { + fn default() -> Self { + Self { + keys: VecDeque::default(), + values: HashMap::default(), + } + } +} + +impl RequestQueue { + fn push_back(&mut self, key: K, value: V) -> bool { + if self.values.insert(key.clone(), value).is_none() { + self.keys.push_back(key); + return true; + } + false + } + + fn pop_front(&mut self) -> Option { + if let Some(key) = self.keys.pop_front() { + return self.values.remove(&key); + } + None + } + + #[inline] + fn len(&self) -> usize { + self.values.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.values.is_empty() + } +} + +type RequestBuf = RwLock>; + +struct OngoingTaskLimit { + ongoing_tasks: AtomicUsize, + /// Buffer to hold pending requests + request_buf: RequestBuf, +} + +impl OngoingTaskLimit { + #[inline] + fn start_task(&self) { + self.ongoing_tasks.fetch_add(1, Ordering::SeqCst); + } + + #[inline] + fn finish_task(&self) { + self.ongoing_tasks.fetch_sub(1, Ordering::SeqCst); + } + + #[inline] + fn add_request(&self, request: TableCompactionRequest) { + let mut dropped = 0; + + { + let mut req_buf 
= self.request_buf.write().unwrap(); + + // Remove older requests + if req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + while req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + req_buf.pop_front(); + dropped += 1; + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(dropped) + } + + if req_buf.push_back(request.table_data.id, request) { + COMPACTION_PENDING_REQUEST_GAUGE.add(1) + } + } + + if dropped > 0 { + warn!( + "Too many compaction pending tasks, limit: {}, dropped {} older tasks.", + MAX_PENDING_COMPACTION_TASKS, dropped, + ); + } + } + + fn drain_requests(&self, max_num: usize) -> Vec { + let mut result = Vec::with_capacity(max_num); + let mut req_buf = self.request_buf.write().unwrap(); + + while result.len() < max_num { + if let Some(req) = req_buf.pop_front() { + result.push(req); + } else { + break; + } + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(result.len() as i64); + + result + } + + #[inline] + fn has_pending_requests(&self) -> bool { + !self.request_buf.read().unwrap().is_empty() + } + + #[inline] + fn request_buf_len(&self) -> usize { + self.request_buf.read().unwrap().len() + } + + #[inline] + fn ongoing_tasks(&self) -> usize { + self.ongoing_tasks.load(Ordering::SeqCst) + } +} + +pub type CompactionSchedulerRef = Arc; + +pub struct SchedulerImpl { + sender: Sender, + running: Arc, + handle: Mutex>, +} + +impl SchedulerImpl { + pub fn new< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + >( + space_store: Arc>, + runtime: Arc, + config: SchedulerConfig, + ) -> Self { + let (tx, rx) = mpsc::channel(config.schedule_channel_len); + let running = Arc::new(AtomicBool::new(true)); + + let mut worker = ScheduleWorker { + sender: tx.clone(), + receiver: rx, + space_store, + runtime: runtime.clone(), + schedule_interval: config.schedule_interval.0, + picker_manager: PickerManager::default(), + tables_buf: Vec::new(), + max_ongoing_tasks: 
config.max_ongoing_tasks, + limit: Arc::new(OngoingTaskLimit { + ongoing_tasks: AtomicUsize::new(0), + request_buf: RwLock::new(RequestQueue::default()), + }), + running: running.clone(), + }; + + let handle = runtime.spawn(async move { + worker.schedule_loop().await; + }); + + Self { + sender: tx, + running, + handle: Mutex::new(handle), + } + } +} + +#[async_trait] +impl CompactionScheduler for SchedulerImpl { + async fn stop_scheduler(&self) -> Result<()> { + self.running.store(false, Ordering::Relaxed); + // Wake up the receiver, if the channel is full, the worker should be busy and + // check the running flag later. + let _ = self.sender.try_send(ScheduleTask::Exit); + + let mut handle = self.handle.lock().await; + (&mut *handle).await.context(JoinWorker)?; + + Ok(()) + } + + async fn schedule_table_compaction(&self, request: TableCompactionRequest) { + let send_res = self.sender.send(ScheduleTask::Request(request)).await; + + if let Err(e) = send_res { + error!("Compaction scheduler failed to send request, err:{}", e); + } + } +} + +struct OngoingTask { + limit: Arc, + sender: Sender, +} + +impl OngoingTask { + async fn schedule_worker_if_need(&self) { + if self.limit.has_pending_requests() { + if let Err(e) = self.sender.send(ScheduleTask::Schedule).await { + error!("Fail to schedule worker, err:{}", e); + } + } + } +} + +struct ScheduleWorker { + sender: Sender, + receiver: Receiver, + space_store: Arc>, + runtime: Arc, + schedule_interval: Duration, + picker_manager: PickerManager, + /// Buffer to hold all tables. 
+ tables_buf: Vec, + max_ongoing_tasks: usize, + limit: Arc, + running: Arc, +} + +#[inline] +async fn schedule_table_compaction(sender: Sender, request: TableCompactionRequest) { + if let Err(e) = sender.send(ScheduleTask::Request(request)).await { + error!("Fail to send table compaction request, err:{}", e); + } +} + +impl< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + > ScheduleWorker +{ + async fn schedule_loop(&mut self) { + while self.running.load(Ordering::Relaxed) { + // TODO(yingwen): Maybe add a random offset to the interval. + match time::timeout(self.schedule_interval, self.receiver.recv()).await { + Ok(Some(schedule_task)) => { + self.handle_schedule_task(schedule_task).await; + } + Ok(None) => { + // The channel is disconnected. + info!("Channel disconnected, compaction schedule worker exit"); + break; + } + Err(_) => { + // Timeout. + info!("Periodical compaction schedule start"); + + self.full_ttl_purge(); + + info!("Periodical compaction schedule end"); + } + } + } + + info!("Compaction schedule loop exit"); + } + + // This function is called seqentially, so we can mark files in compaction + // without racy. 
+ async fn handle_schedule_task(&self, schedule_task: ScheduleTask) { + let ongoing = self.limit.ongoing_tasks(); + match schedule_task { + ScheduleTask::Request(compact_req) => { + debug!("Ongoing compaction tasks:{}", ongoing); + if ongoing >= self.max_ongoing_tasks { + self.limit.add_request(compact_req); + warn!( + "Too many compaction ongoing tasks:{}, max:{}, buf_len:{}", + ongoing, + self.max_ongoing_tasks, + self.limit.request_buf_len() + ); + } else { + self.do_table_compaction_request(compact_req).await; + } + } + ScheduleTask::Schedule => { + if self.max_ongoing_tasks > ongoing { + let pending = self.limit.drain_requests(self.max_ongoing_tasks - ongoing); + let len = pending.len(); + for compact_req in pending { + self.do_table_compaction_request(compact_req).await; + } + debug!("Scheduled {} pending compaction tasks.", len); + } + } + ScheduleTask::Exit => (), + }; + } + + async fn do_table_compaction_request(&self, compact_req: TableCompactionRequest) { + let table_data = compact_req.table_data; + let compaction_notifier = compact_req.compaction_notifier; + let waiter_notifier = WaiterNotifier::new(compact_req.waiter); + + let table_options = table_data.table_options(); + let compaction_strategy = table_options.compaction_strategy; + let picker = self.picker_manager.get_picker(compaction_strategy); + let picker_ctx = match new_picker_context(&*table_options) { + Some(v) => v, + None => { + warn!("No valid context can be created, compaction request will be ignored, table_id:{}, table_name:{}", + table_data.id, table_data.name); + return; + } + }; + let version = table_data.current_version(); + + // Pick compaction task. 
+ let compaction_task = version.pick_for_compaction(picker_ctx, &picker); + let compaction_task = match compaction_task { + Ok(v) => v, + Err(e) => { + error!( + "Compaction scheduler failed to pick compaction, table:{}, table_id:{}, err:{}", + table_data.name, table_data.id, e + ); + // Now the error of picking compaction is considered not fatal and not sent to + // compaction notifier. + return; + } + }; + + // Mark files are in compaction. + compaction_task.mark_files_being_compacted(true); + + let keep_scheduling_compaction = !compaction_task.compaction_inputs.is_empty(); + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + self.limit.start_task(); + let task = OngoingTask { + sender: self.sender.clone(), + limit: self.limit.clone(), + }; + + let sender = self.sender.clone(); + let request_id = RequestId::next_id(); + // Do actual costly compact job in background. + self.runtime.spawn(async move { + let res = space_store + .compact_table(runtime, &table_data, request_id, &compaction_task) + .await; + + if let Err(e) = &res { + // Compaction is failed, we need to unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + + error!( + "Failed to compact table, table_name:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + } + + task.limit.finish_task(); + task.schedule_worker_if_need().await; + + // Notify the background compact table result. 
+ match res { + Ok(()) => { + let new_compaction_notifier = compaction_notifier.clone(); + compaction_notifier.notify_ok(); + waiter_notifier.notify_wait_result(Ok(())); + + if keep_scheduling_compaction { + schedule_table_compaction( + sender, + TableCompactionRequest::no_waiter( + table_data.clone(), + new_compaction_notifier, + ), + ) + .await; + } + } + Err(e) => { + let e = Arc::new(e); + compaction_notifier.notify_err(e.clone()); + let wait_err = WaitError::Compaction { source: e }; + waiter_notifier.notify_wait_result(Err(wait_err)); + } + } + }); + } + + fn full_ttl_purge(&mut self) { + self.tables_buf.clear(); + self.space_store.list_all_tables(&mut self.tables_buf); + + let mut to_purge = Vec::new(); + + let now = Timestamp::now(); + for table_data in &self.tables_buf { + let expire_time = table_data + .table_options() + .ttl() + .map(|ttl| now.sub_duration_or_min(ttl.0)); + + let version = table_data.current_version(); + if !version.has_expired_sst(expire_time) { + debug!( + "Table has no expired sst, table:{}, table_id:{}, expire_time:{:?}", + table_data.name, table_data.id, expire_time + ); + + continue; + } + + // Create a compaction task that only purge expired files. + let compaction_task = CompactionTask { + expired: version.expired_ssts(expire_time), + ..Default::default() + }; + + // Marks being compacted. + compaction_task.mark_files_being_compacted(true); + + to_purge.push((table_data.clone(), compaction_task)); + } + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + let request_id = RequestId::next_id(); + // Spawn a background job to purge ssts and avoid schedule thread blocked. 
+ self.runtime.spawn(async move { + for (table_data, compaction_task) in to_purge { + info!("Period purge expired files, table:{}, table_id:{}, request_id:{}", table_data.name, table_data.id, request_id); + + if let Err(e) = space_store + .compact_table(runtime.clone(), &table_data, request_id, &compaction_task) + .await + { + error!( + "Failed to purge expired files of table, table:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + + // Unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + } + } + }); + } +} + +// If segment duration is None, then no compaction should be triggered, but we +// return a None context instead of panic here. +fn new_picker_context(table_opts: &TableOptions) -> Option { + table_opts + .segment_duration() + .map(|segment_duration| PickerContext { + segment_duration, + ttl: table_opts.ttl().map(|ttl| ttl.0), + strategy: table_opts.compaction_strategy, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_request_queue() { + let mut q: RequestQueue = RequestQueue::default(); + assert!(q.is_empty()); + assert_eq!(0, q.len()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + + assert_eq!(3, q.len()); + assert!(!q.is_empty()); + + assert_eq!("task1", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task3", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + assert!(q.is_empty()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + q.push_back(1, "task11".to_string()); + q.push_back(3, "task33".to_string()); + q.push_back(3, "task333".to_string()); + + assert_eq!(3, q.len()); + assert_eq!("task11", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task333", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + 
assert!(q.is_empty()); + assert_eq!(0, q.len()); + } +} diff --git a/analytic_engine/src/context.rs b/analytic_engine/src/context.rs new file mode 100644 index 0000000000..60f2ef17c5 --- /dev/null +++ b/analytic_engine/src/context.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Context for instance + +use std::{fmt, sync::Arc}; + +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::engine::EngineRuntimes; + +use crate::Config; + +/// Common context for instance +pub struct CommonContext { + pub db_write_buffer_size: usize, + pub space_write_buffer_size: usize, +} + +/// Context for instance open +pub struct OpenContext { + /// Engine config + pub config: Config, + + /// Background job runtime + pub runtimes: Arc, + + /// Sst meta data cache. + pub meta_cache: Option, + /// Sst page cache. + pub data_cache: Option, +} + +impl fmt::Debug for OpenContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("OpenContext") + .field("config", &self.config) + .finish() + } +} diff --git a/analytic_engine/src/engine.rs b/analytic_engine/src/engine.rs new file mode 100644 index 0000000000..82e785186b --- /dev/null +++ b/analytic_engine/src/engine.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implements the TableEngine trait + +use std::sync::Arc; + +use async_trait::async_trait; +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::{ + engine::{Close, CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine}, + table::TableRef, + ANALYTIC_ENGINE_TYPE, +}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, instance::InstanceRef, meta::Manifest, space::SpaceName, + sst::factory::Factory, table::TableImpl, +}; + +/// TableEngine implementation +pub struct TableEngineImpl { + /// Instance of the table engine + instance: InstanceRef, +} + +impl Clone for TableEngineImpl { + fn clone(&self) -> Self { + Self { + instance: self.instance.clone(), + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa, + > TableEngineImpl +{ + pub fn new(instance: InstanceRef) -> Self { + Self { instance } + } +} + +impl TableEngineImpl { + pub fn instance(&self) -> InstanceRef { + self.instance.clone() + } +} + +impl Drop for TableEngineImpl { + fn drop(&mut self) { + info!("Table engine dropped"); + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > TableEngine for TableEngineImpl +{ + fn engine_type(&self) -> &str { + ANALYTIC_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + info!("Try to close table engine"); + + // Close the instance. 
+ self.instance + .close() + .await + .map_err(|e| Box::new(e) as _) + .context(Close)?; + + info!("Table engine closed"); + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl create table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = self.instance.create_table(&ctx, &space, request).await?; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(table_impl) + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl drop table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let dropped = self.instance.drop_table(&ctx, &space, request).await?; + Ok(dropped) + } + + async fn open_table(&self, request: OpenTableRequest) -> Result> { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl open table, space:{}, request:{:?}", + space, request + ); + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = match self + .instance + .find_table(&ctx, &space, &request.table_name)? 
+ { + Some(v) => v, + None => return Ok(None), + }; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(Some(table_impl)) + } +} + +/// Build the space name from catalog and schema +// TODO(yingwen): Should we store the => space mapping in the +// system catalog, then put it in the CreateTableRequest, avoid generating space +// name here +fn build_space_name(catalog: &str, schema: &str) -> SpaceName { + // FIXME(yingwen): Find out a better way to create space name + format!("{}/{}", catalog, schema) +} diff --git a/analytic_engine/src/instance/alter.rs b/analytic_engine/src/instance/alter.rs new file mode 100644 index 0000000000..e7ee9f6c42 --- /dev/null +++ b/analytic_engine/src/instance/alter.rs @@ -0,0 +1,289 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alter schema logic of instance + +use std::{collections::HashMap, sync::Arc}; + +use common_types::schema::Version; +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::AlterSchemaRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{AlterOptionsCommand, AlterSchemaCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AlterOptionsMeta, AlterSchemaMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, + table::data::TableDataRef, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to alter schema, source:{}", source,))] + AlterSchema { source: write_worker::Error }, + + #[snafu(display("Failed to alter options, source:{}", source,))] + AlterOptions { source: write_worker::Error }, + + #[snafu(display( + "Try to update schema to elder version, table:{}, current_version:{}, 
given_version:{}.\nBacktrace:\n{}", + table, + current_version, + given_version, + backtrace, + ))] + InvalidSchemaVersion { + table: String, + current_version: Version, + given_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid previous schema version, table:{}, current_version:{}, pre_version:{}.\nBacktrace:\n{}", + table, + current_version, + pre_version, + backtrace, + ))] + InvalidPreVersion { + table: String, + current_version: Version, + pre_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display("Alter schema of a dropped table:{}", table))] + AlterDroppedTable { table: String }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist alter update, err:{}", source))] + PersistAlter { + source: Box, + }, + + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + // Alter schema need to be handled by write worker. + pub async fn alter_schema_of_table( + &self, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + info!( + "Instance alter schema, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. 
+ let (tx, rx) = oneshot::channel(); + let cmd = AlterSchemaCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send alter schema request to write worker, actual work done in + // Self::process_alter_schema_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterSchema) + } + + /// Do the actual alter schema job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_alter_schema_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + // Validate alter schema request. + self.validate_before_alter(table_data, &request)?; + + let opts = TableFlushOptions { + block_on_write_thread: true, + ..Default::default() + }; + // We are in write thread now and there is no write request being processed, but + // we need to trigger a flush to ensure all wal entries with old schema + // are flushed, so we won't need to handle them during replaying wal. + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Now we can persist and update the schema, since this function is called by + // write worker, so there is no other concurrent writer altering the + // schema. + let meta_update = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id: space_table.space().id, + table_id: table_data.id, + schema: request.schema.clone(), + pre_schema_version: request.pre_schema_version, + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + info!( + "Instance update table schema, new_schema:{:?}", + request.schema + ); + + // Update schema in memory. 
+ table_data.set_schema(request.schema); + + Ok(()) + } + + // Most validation should be done by catalog module, so we don't do too much + // duplicate check here, especially the schema compatibility. + fn validate_before_alter( + &self, + table_data: &TableDataRef, + request: &AlterSchemaRequest, + ) -> Result<()> { + ensure!( + !table_data.is_dropped(), + AlterDroppedTable { + table: &table_data.name, + } + ); + + let current_version = table_data.schema_version(); + ensure!( + current_version < request.schema.version(), + InvalidSchemaVersion { + table: &table_data.name, + current_version, + given_version: request.schema.version(), + } + ); + + ensure!( + current_version == request.pre_schema_version, + InvalidPreVersion { + table: &table_data.name, + current_version, + pre_version: request.pre_schema_version, + } + ); + + Ok(()) + } + + pub async fn alter_options_of_table( + &self, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + info!( + "Instance alter options of table, space_table:{:?}, options:{:?}", + space_table, options + ); + + // Create a oneshot channel to send/receive alter options result. + let (tx, rx) = oneshot::channel(); + let cmd = AlterOptionsCommand { + space_table: space_table.clone(), + options, + tx, + }; + + // Send alter options request to write worker, actual works done in + // Self::process_alter_options_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterOptions) + } + + /// Do the actual alter options job, must called by write worker in write + /// thread sequentially. 
+ pub(crate) async fn process_alter_options_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + let table_data = space_table.table_data(); + let current_table_options = table_data.table_options(); + info!( + "Instance alter options, space:{:?}, tables:{:?}, old_table_opts:{:?}, options:{:?}", + space_table.space().name, + space_table.table_data().name, + current_table_options, + options + ); + let mut table_opts = + table_options::merge_table_options_for_alter(&options, &*current_table_options) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &table_data.name, + })?; + table_opts.sanitize(); + + // Now we can persist and update the options, since this function is called by + // write worker, so there is no other concurrent writer altering the + // options. + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: space_table.space().id, + table_id: table_data.id, + options: table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + table_data.set_table_options(worker_local, table_opts); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/close.rs b/analytic_engine/src/instance/close.rs new file mode 100644 index 0000000000..6ae34f4eb5 --- /dev/null +++ b/analytic_engine/src/instance/close.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Close table logic of instance + +use std::sync::Arc; + +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CloseTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{FlushTable, OperateByWriteWorker, Result}, + flush_compaction::TableFlushOptions, + write_worker::{self, CloseTableCommand, WorkerLocal}, + Instance, + }, + meta::Manifest, + space::SpaceRef, + sst::factory::Factory, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Close table need to be handled by write worker. + pub async fn do_close_table(&self, space: SpaceRef, request: CloseTableRequest) -> Result<()> { + info!("Instance close table, request:{:?}", request); + + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => return Ok(()), + }; + + let (tx, rx) = oneshot::channel::>(); + let cmd = CloseTableCommand { space, request, tx }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id: table_data.space_id, + table: &table_data.name, + table_id: table_data.id, + }) + } + + /// Do the actual close table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_close_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space: SpaceRef, + request: CloseTableRequest, + ) -> Result<()> { + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => { + warn!("try to close a closed table, request:{:?}", request); + return Ok(()); + } + }; + + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. 
+ compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, &table_data, opts) + .await + .context(FlushTable { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + // table has been closed so remove it from the space + let removed_table = space.remove_table(&request.table_name); + assert!(removed_table.is_some()); + + info!( + "table:{}-{} has been removed from the space_id:{}", + table_data.name, table_data.id, space.id + ); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/create.rs b/analytic_engine/src/instance/create.rs new file mode 100644 index 0000000000..1597982f27 --- /dev/null +++ b/analytic_engine/src/instance/create.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Create table logic of instance + +use std::sync::Arc; + +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CreateTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{CreateTableData, InvalidOptions, OperateByWriteWorker, Result, WriteManifest}, + write_worker::{self, CreateTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceRef, + sst::factory::Factory, + table::data::{TableData, TableDataRef}, + table_options, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Create table need to be handled by write worker. 
+ pub async fn do_create_table( + &self, + space: SpaceRef, + request: CreateTableRequest, + ) -> Result { + info!("Instance create table, request:{:?}", request); + + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + space_id: space.id, + table: &request.table_name, + table_id: request.table_id, + })?; + // Sanitize options before creating table. + table_opts.sanitize(); + + if let Some(table_data) = space.find_table_by_id(request.table_id) { + return Ok(table_data); + } + + // Choose a write worker for this table + let write_handle = space.write_group.choose_worker(request.table_id); + let (table_name, table_id) = (request.table_name.clone(), request.table_id); + + let table_data = Arc::new( + TableData::new( + space.id, + request, + write_handle, + table_opts, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(CreateTableData { + space_id: space.id, + table: &table_name, + table_id, + })?, + ); + + let space_id = space.id; + let (tx, rx) = oneshot::channel(); + let cmd = CreateTableCommand { + space, + table_data: table_data.clone(), + tx, + }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id, + table: table_name, + table_id: table_data.id, + }) + } + + /// Do the actual create table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_create_table_command( + self: &Arc, + _worker_local: &mut WorkerLocal, + space: SpaceRef, + table_data: TableDataRef, + ) -> Result { + if let Some(table_data) = space.find_table_by_id(table_data.id) { + // Use the table data from the space instead of the table_data in params. 
+ return Ok(table_data); + }; + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: space.id, + table_id: table_data.id, + table_name: table_data.name.clone(), + schema: table_data.schema(), + opts: table_data.table_options().as_ref().clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteManifest { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + space.insert_table(table_data.clone()); + Ok(table_data) + } +} diff --git a/analytic_engine/src/instance/drop.rs b/analytic_engine/src/instance/drop.rs new file mode 100644 index 0000000000..899d937524 --- /dev/null +++ b/analytic_engine/src/instance/drop.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::DropTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker::{self, DropTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{DropTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to drop table space:{}, table:{}, err:{}", + space, + table, + source, + ))] + DropTable { + space: String, + table: String, + source: write_worker::Error, + }, + + #[snafu(display("Flush before drop failed, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist drop table update, err:{}", source))] + PersistDrop { + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 
'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Drop table need to be handled by write worker. + pub async fn do_drop_table( + &self, + space_table: SpaceAndTable, + request: DropTableRequest, + ) -> Result<()> { + info!( + "Instance drop table, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. + let (tx, rx) = oneshot::channel(); + let cmd = DropTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(DropTable { + space: &space_table.space().name, + table: &space_table.table_data().name, + })?; + + Ok(()) + } + + /// Do the actual drop table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_drop_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + _request: DropTableRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + if table_data.is_dropped() { + warn!( + "Process drop table command tries to drop a dropped table, space_table:{:?}", + space_table + ); + return Ok(()); + } + + // Fixme(xikai): Trigger a force flush so that the data of the table in the wal + // is marked for deletable. However, the overhead of the flushing can + // be avoided. + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. 
+ compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Store the dropping information into meta + let update = MetaUpdate::DropTable(DropTableMeta { + space_id: space_table.space().id, + table_id: table_data.id, + table_name: table_data.name.clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistDrop)?; + + // Set the table dropped after finishing flushing and storing drop table meta + // information. + table_data.set_dropped(); + + // Clear the memory status after updating manifest and clearing wal so that + // the drop is retryable if fails to update and clear. + space_table.space().remove_table(&table_data.name); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs new file mode 100644 index 0000000000..a96895070e --- /dev/null +++ b/analytic_engine/src/instance/engine.rs @@ -0,0 +1,230 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table engine logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::{CreateTableRequest, DropTableRequest}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, + instance::{write_worker::WriteGroup, Instance}, + meta::{ + meta_update::{AddSpaceMeta, MetaUpdate}, + Manifest, + }, + space::{Space, SpaceAndTable, SpaceNameRef, SpaceRef}, + sst::factory::Factory, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Space failed to create table, err:{}", source))] + SpaceCreateTable { source: crate::space::Error }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DoDropTable { + source: crate::instance::drop::Error, + }, + + #[snafu(display("Failed to store meta of space, space:{}, err:{}", space, source))] + SpaceWriteMeta { + space: String, + source: Box, + }, + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::SpaceCreateTable { source } => Self::from(source), + + // FIXME(xikai): should map drop table error to a more reasonable table engine error. + Error::DoDropTable { .. } => Self::Unexpected { + source: Box::new(err), + }, + + Error::SpaceWriteMeta { .. } => Self::WriteMeta { + source: Box::new(err), + }, + + Error::InvalidOptions { ref table, .. 
} => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Find space by name, create if the space is not exists + pub async fn find_or_create_space( + self: &Arc, + _ctx: &CommonContext, + space_name: SpaceNameRef<'_>, + ) -> Result { + // Find space first + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + + // Persist space data into meta, done with `meta_state` guarded + let mut meta_state = self.space_store.meta_state.lock().await; + // The space may already been created by other thread + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + // Now we are the one responsible to create and persist the space info into meta + + let space_id = meta_state.alloc_space_id(); + // Create write group for the space + // TODO(yingwen): Expose options + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Create space + let space = Arc::new(Space::new( + space_id, + space_name.to_string(), + self.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + + // Create a meta update and store it + let update = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.to_string(), + }); + info!("Instance create space, update:{:?}", update); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(SpaceWriteMeta { space: space_name })?; + + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_name.to_string(), space.clone()); + // Now we can release the meta state lock + + Ok(space) + } + + /// Find space by name + pub fn find_space( + &self, + _ctx: &CommonContext, + space: SpaceNameRef, + ) -> 
Result> { + let spaces = self.space_store.spaces.read().unwrap(); + Ok(spaces.get_by_name(space).cloned()) + } + + /// Create a table under given space + pub async fn create_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: CreateTableRequest, + ) -> Result { + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &request.table_name, + })?; + // Sanitize options before creating table. + table_opts.sanitize(); + + info!( + "Instance create table, space:{}, request:{:?}, table_opts:{:?}", + space, request, table_opts + ); + + let space = self.find_or_create_space(ctx, space).await?; + + let table_data = space + .create_table( + request, + &self.space_store.manifest, + &table_opts, + &self.file_purger, + ) + .await + .context(SpaceCreateTable)?; + + Ok(SpaceAndTable::new(space, table_data)) + } + + /// Drop a table under given space + pub async fn drop_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: DropTableRequest, + ) -> Result { + info!( + "Instance drop table, space:{}, request:{:?}", + space, request + ); + + let space = match self.find_space(ctx, space)? { + Some(v) => v, + None => return Ok(false), + }; + + // Checks whether the table is exists + let table = match space.find_table(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + let space_table = SpaceAndTable::new(space.clone(), table); + self.do_drop_table(space_table, request) + .await + .context(DoDropTable)?; + + Ok(true) + } + + /// Find the table under given space by its table name + /// + /// Return None if space or table is not found + pub fn find_table( + &self, + ctx: &CommonContext, + space: SpaceNameRef, + table: &str, + ) -> Result> { + let space = match self.find_space(ctx, space)? 
{ + Some(s) => s, + None => return Ok(None), + }; + + let space_table = space + .find_table(table) + .map(|table_data| SpaceAndTable::new(space, table_data)); + + Ok(space_table) + } +} diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs new file mode 100644 index 0000000000..f6fd3debf5 --- /dev/null +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -0,0 +1,1037 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Flush and compaction logic of instance + +use std::{cmp, collections::Bound, sync::Arc}; + +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + time::TimeRange, + SequenceNumber, +}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{ + channel::{mpsc, mpsc::channel}, + future::try_join_all, + stream, SinkExt, TryStreamExt, +}; +use log::{error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{predicate::Predicate, table::Result as TableResult}; +use tokio::sync::oneshot; +use wal::manager::{RegionId, WalManager}; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionTask, ExpiredFiles, TableCompactionRequest, WaitError, + }, + instance::{ + write_worker::{self, CompactTableCommand, FlushTableCommand, WorkerLocal}, + Instance, SpaceStore, + }, + memtable::{ColumnarIterPtr, MemTableRef, ScanContext, ScanRequest}, + meta::{ + meta_update::{AlterOptionsMeta, MetaUpdate, VersionEditMeta}, + Manifest, + }, + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceAndTable, + sst::{ + builder::RecordBatchStream, + factory::{Factory, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FileMeta, SstMetaData}, + }, + table::{ + 
data::{MemTableId, TableData, TableDataRef}, + version::{FlushableMemTables, MemTableState, SamplingMemTable}, + version_edit::{AddFile, DeleteFile, VersionEdit}, + }, +}; + +const DEFAULT_CHANNEL_SIZE: usize = 5; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to store version edit, err:{}", source))] + StoreVersionEdit { + source: Box, + }, + + #[snafu(display("Failed to purge wal, region_id:{}, sequence:{}", region_id, sequence))] + PurgeWal { + region_id: RegionId, + sequence: SequenceNumber, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to build mem table iterator, source:{}", source))] + InvalidMemIter { + source: Box, + }, + + #[snafu(display( + "Sst type is not found, sst_type:{:?}.\nBacktrace:\n{}", + sst_type, + backtrace + ))] + InvalidSstType { + sst_type: SstType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build sst, file_path:{}, source:{}", path, source))] + FailBuildSst { + path: String, + source: Box, + }, + + #[snafu(display("Background flush failed, cannot schedule flush task, err:{}", source))] + BackgroundFlushFailed { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send flush command, err:{}", source))] + SendFlushCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send compact command, err:{}", source))] + SendCompactCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to do manual compaction, err:{}", source))] + ManualCompactFailed { + source: crate::compaction::WaitError, + }, + + #[snafu(display("Failed to split record batch, source:{}", source))] + SplitRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to send to channel, source:{}", source))] + ChannelSend { source: mpsc::SendError }, + + 
#[snafu(display("Runtime join error, source:{}", source))] + RuntimeJoin { source: common_util::runtime::Error }, +} + +define_result!(Error); + +/// Options to flush single table. +#[derive(Debug)] +pub struct TableFlushOptions { + /// Flush result sender. + /// + /// Default is None. + pub res_sender: Option>>, + /// Schedule a compaction request after flush. + /// + /// Default is true. + pub compact_after_flush: bool, + /// Whether to block on write thread. + /// + /// Default is false. + pub block_on_write_thread: bool, +} + +impl Default for TableFlushOptions { + fn default() -> Self { + Self { + res_sender: None, + compact_after_flush: true, + block_on_write_thread: false, + } + } +} + +/// Request to flush single table. +pub struct TableFlushRequest { + /// Table to flush. + pub table_data: TableDataRef, + /// Max id of memtable to flush (inclusive). + pub max_memtable_id: MemTableId, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Flush this table. + pub async fn flush_table( + &self, + space_table: &SpaceAndTable, + flush_opts: TableFlushOptions, + ) -> Result<()> { + info!( + "Instance flush table, space_table:{:?}, flush_opts:{:?}", + space_table, flush_opts + ); + + // Create a oneshot channel to send/receive flush result. + let (tx, rx) = oneshot::channel(); + let cmd = FlushTableCommand { + space_table: space_table.clone(), + flush_opts, + tx, + }; + + // Actual work is done in flush_table_in_worker(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendFlushCmd) + } + + /// Compact the table manually. + pub async fn manual_compact_table(&self, space_table: &SpaceAndTable) -> Result<()> { + info!("Instance compact table, space_table:{:?}", space_table); + + // Create a oneshot channel to send/receive result from write worker. 
+ let (tx, rx) = oneshot::channel(); + let (compact_tx, compact_rx) = oneshot::channel(); + // Create a oneshot channel to send/receive compaction result. + let cmd = CompactTableCommand { + space_table: space_table.clone(), + waiter: Some(compact_tx), + tx, + }; + + // The write worker will call schedule_table_compaction(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendCompactCmd)?; + + // Now wait for compaction done, if the sender has been dropped, we convert it + // into Error::Canceled. + compact_rx + .await + .unwrap_or(Err(WaitError::Canceled)) + .context(ManualCompactFailed) + } + + /// Flush given table in write worker thread. + pub async fn flush_table_in_worker( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + opts: TableFlushOptions, + ) -> Result<()> { + let flush_req = self.preprocess_flush(worker_local, table_data).await?; + + self.schedule_table_flush(worker_local, flush_req, opts) + .await + } + + async fn preprocess_flush( + &self, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result { + let current_version = table_data.current_version(); + let last_sequence = table_data.last_sequence(); + // Switch all mutable memtables + if let Some(suggest_segment_duration) = + current_version.switch_memtables_or_suggest_duration(worker_local) + { + info!("Switch memtable and suggest segment duration, table:{}, table_id:{}, segment_duration:{:?}", table_data.name, table_data.id, suggest_segment_duration); + assert!(suggest_segment_duration.as_millis() > 0); + + let mut new_table_opts = (*table_data.table_options()).clone(); + new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); + + // Now persist the new options, the `worker_local` ensure there is no race + // condition. 
+ let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: table_data.space_id, + table_id: table_data.id, + options: new_table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + table_data.set_table_options(worker_local, new_table_opts); + + // Now the segment duration is applied, we can stop sampling and freeze the + // sampling memtable. + current_version.freeze_sampling(worker_local); + } + + info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{}", + table_data.name, table_data.id, table_data.last_memtable_id(), last_sequence); + + // Try to flush all memtables of current table + Ok(TableFlushRequest { + table_data: table_data.clone(), + max_memtable_id: table_data.last_memtable_id(), + }) + } + + /// Schedule table flush request to background workers + async fn schedule_table_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + flush_req: TableFlushRequest, + opts: TableFlushOptions, + ) -> Result<()> { + // TODO(yingwen): Store pending flush reqs and retry flush on recoverable error, + // or try to recover from background error + let table_data = flush_req.table_data.clone(); + let table = table_data.name.clone(); + + let instance = self.clone(); + let flush_job = async move { instance.flush_memtables_to_outputs(&flush_req).await }; + + let compact_req = TableCompactionRequest::no_waiter( + table_data.clone(), + worker_local.compaction_notifier(), + ); + let instance = self.clone(); + + if opts.compact_after_flush { + // Schedule compaction if flush completed successfully. 
+ let on_flush_success = async move { + instance.schedule_table_compaction(compact_req).await; + }; + + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + on_flush_success, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } else { + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + async {}, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } + } + + /// Caller should guarantee flush of single table is sequential + pub(crate) async fn flush_memtables_to_outputs( + &self, + flush_req: &TableFlushRequest, + ) -> Result<()> { + // TODO(yingwen): Record memtables num to flush as statistics + let TableFlushRequest { + table_data, + max_memtable_id, + } = flush_req; + + let current_version = table_data.current_version(); + let mut mems_to_flush = FlushableMemTables::default(); + + current_version.pick_memtables_to_flush(*max_memtable_id, &mut mems_to_flush); + + if mems_to_flush.is_empty() { + return Ok(()); + } + + let request_id = RequestId::next_id(); + + info!( + "Instance try to flush memtables, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}", + table_data.name, table_data.id, request_id, mems_to_flush + ); + + let local_metrics = table_data.metrics.local_flush_metrics(); + // Start flush duration timer. + let _timer = local_metrics.flush_duration_histogram.start_timer(); + let mut files_to_level0 = Vec::with_capacity(mems_to_flush.memtables.len()); + let mut flushed_sequence = 0; + let mut sst_num = 0; + + if let Some(sampling_mem) = &mems_to_flush.sampling_mem { + if let Some(seq) = self + .flush_sampling_memtable( + &*table_data, + request_id, + sampling_mem, + &mut files_to_level0, + ) + .await? 
+ { + flushed_sequence = seq; + sst_num += files_to_level0.len(); + for add_file in &files_to_level0 { + local_metrics.observe_sst_size(add_file.file.meta.size); + } + } + } + + for mem in &mems_to_flush.memtables { + let file = self + .flush_memtable_to_output(&*table_data, request_id, mem) + .await?; + if let Some(file) = file { + let sst_size = file.meta.size; + files_to_level0.push(AddFile { level: 0, file }); + + // Set flushed sequence to max of the last_sequence of memtables. + flushed_sequence = cmp::max(flushed_sequence, mem.last_sequence()); + + sst_num += 1; + // Collect sst size metrics. + local_metrics.observe_sst_size(sst_size); + } + } + + // Collect sst num metrics. + local_metrics.observe_sst_num(sst_num); + + info!( + "Instance flush memtables to output, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}, files_to_level0:{:?}, flushed_sequence:{}", + table_data.name, + table_data.id, + request_id, + mems_to_flush, + files_to_level0, + flushed_sequence + ); + + // Persist the flush result to manifest. + let edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence, + files_to_add: files_to_level0.clone(), + files_to_delete: Vec::new(), + }; + let meta_update = MetaUpdate::VersionEdit(edit_meta); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. + let mems_to_remove = mems_to_flush.ids(); + let edit = VersionEdit { + flushed_sequence, + mems_to_remove, + files_to_add: files_to_level0, + files_to_delete: Vec::new(), + }; + table_data.current_version().apply_edit(edit); + + // Mark sequence <= flushed_sequence to be deleted. 
+ self.space_store + .wal_manager + .mark_delete_entries_up_to(table_data.wal_region_id(), flushed_sequence) + .await + .context(PurgeWal { + region_id: table_data.wal_region_id(), + sequence: flushed_sequence, + })?; + + info!( + "Instance flush memtables done, table:{}, table_id:{}, request_id:{}", + table_data.name, table_data.id, request_id + ); + + Ok(()) + } + + /// Flush rows in sampling memtable to multiple ssts according to segment + /// duration. + /// + /// Returns flushed sequence. + async fn flush_sampling_memtable( + &self, + table_data: &TableData, + request_id: RequestId, + sampling_mem: &SamplingMemTable, + files_to_level0: &mut Vec, + ) -> Result> { + let (min_key, max_key) = match (sampling_mem.mem.min_key(), sampling_mem.mem.max_key()) { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + + let max_sequence = sampling_mem.mem.last_sequence(); + let time_ranges = sampling_mem.sampler.ranges(); + + info!("Flush sampling memtable, table_id:{:?}, table_name:{:?}, request_id:{}, sampling memtable time_ranges:{:?}", + table_data.id,table_data.name, request_id, time_ranges); + + let mut batch_record_senders = Vec::with_capacity(time_ranges.len()); + let mut sst_handlers = Vec::with_capacity(time_ranges.len()); + let mut file_ids = Vec::with_capacity(time_ranges.len()); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + + for time_range in &time_ranges { + let (batch_record_sender, batch_record_receiver) = + channel::>(DEFAULT_CHANNEL_SIZE); + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + // TODO: min_key max_key set in sst_builder build + let mut sst_meta = SstMetaData 
{ + min_key: min_key.clone(), + max_key: max_key.clone(), + time_range: *time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + let store = self.space_store.clone(); + let sst_builder_options_clone = sst_builder_options.clone(); + let sst_type = table_data.sst_type; + + // spawn build sst + let handler = self.runtimes.bg_runtime.spawn(async move { + let mut builder = store + .sst_factory + .new_sst_builder( + &sst_builder_options_clone, + &sst_file_path, + store.store_ref(), + ) + .context(InvalidSstType { sst_type })?; + + let sst_info = builder + .build( + request_id, + &sst_meta, + Box::new(batch_record_receiver.map_err(|e| Box::new(e) as _)), + ) + .await + .map_err(|e| { + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + Ok(sst_meta) + }); + + batch_record_senders.push(batch_record_sender); + sst_handlers.push(handler); + file_ids.push(file_id); + } + + let iter = build_mem_table_iter(sampling_mem.mem.clone(), table_data)?; + + let timestamp_idx = table_data.schema().timestamp_index(); + + for data in iter { + for (idx, record_batch) in split_record_batch_with_time_ranges( + data.map_err(|e| Box::new(e) as _).context(InvalidMemIter)?, + &time_ranges, + timestamp_idx, + )? 
+ .into_iter() + .enumerate() + { + if !record_batch.is_empty() { + batch_record_senders[idx] + .send(Ok(record_batch)) + .await + .context(ChannelSend)?; + } + } + } + batch_record_senders.clear(); + + let ret = try_join_all(sst_handlers).await; + for (idx, sst_meta) in ret.context(RuntimeJoin)?.into_iter().enumerate() { + files_to_level0.push(AddFile { + level: 0, + file: FileMeta { + id: file_ids[idx], + meta: sst_meta?, + }, + }) + } + + Ok(Some(max_sequence)) + } + + async fn flush_memtable_to_output( + &self, + table_data: &TableData, + request_id: RequestId, + memtable_state: &MemTableState, + ) -> Result> { + let (min_key, max_key) = match (memtable_state.mem.min_key(), memtable_state.mem.max_key()) + { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + let max_sequence = memtable_state.last_sequence(); + let mut sst_meta = SstMetaData { + min_key, + max_key, + time_range: memtable_state.time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + // Alloc file id for next sst file + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + let mut builder = self + .space_store + .sst_factory + .new_sst_builder( + &sst_builder_options, + &sst_file_path, + self.space_store.store_ref(), + ) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let iter = build_mem_table_iter(memtable_state.mem.clone(), table_data)?; + + let record_batch_stream: RecordBatchStream = + Box::new(stream::iter(iter).map_err(|e| Box::new(e) as _)); + + let sst_info = builder + .build(request_id, &sst_meta, 
record_batch_stream) + .await + .map_err(|e| { + // TODO(yingwen): Maybe remove this log. + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + Ok(Some(FileMeta { + id: file_id, + meta: sst_meta, + })) + } + + /// Schedule table compaction request to background workers and return + /// immediately. + pub async fn schedule_table_compaction(&self, compact_req: TableCompactionRequest) { + self.compaction_scheduler + .schedule_table_compaction(compact_req) + .await; + } +} + +impl SpaceStore { + pub(crate) async fn compact_table( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + task: &CompactionTask, + ) -> Result<()> { + let mut edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence: 0, + // Use the number of compaction inputs as the estimated number of files to add. + files_to_add: Vec::with_capacity(task.compaction_inputs.len()), + files_to_delete: Vec::new(), + }; + + if task.expired.is_empty() && task.compaction_inputs.is_empty() { + // Nothing to compact. + return Ok(()); + } + + for files in &task.expired { + self.delete_expired_files(table_data, request_id, files, &mut edit_meta); + } + + for input in &task.compaction_inputs { + self.compact_input_files( + runtime.clone(), + table_data, + request_id, + input, + &mut edit_meta, + ) + .await?; + } + + let meta_update = MetaUpdate::VersionEdit(edit_meta.clone()); + self.manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. 
+ let edit = edit_meta.into_version_edit(); + table_data.current_version().apply_edit(edit); + + Ok(()) + } + + pub(crate) async fn compact_input_files( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + input: &CompactionInputFiles, + edit_meta: &mut VersionEditMeta, + ) -> Result<()> { + if input.files.is_empty() { + return Ok(()); + } + + // metrics + let _timer = table_data + .metrics + .compaction_duration_histogram + .start_timer(); + table_data + .metrics + .compaction_observe_sst_num(input.files.len()); + let mut sst_size = 0; + let mut sst_row_num = 0; + for file in &input.files { + sst_size += file.size(); + sst_row_num += file.row_num(); + } + table_data + .metrics + .compaction_observe_input_sst_size(sst_size); + table_data + .metrics + .compaction_observe_input_sst_row_num(sst_row_num); + + info!( + "Instance try to compact table, table:{}, table_id:{}, request_id:{}, input_files:{:?}", + table_data.name, table_data.id, request_id, input.files, + ); + + // The schema may be modified during compaction, so we acquire it first and use + // the acquired schema as the compacted sst meta. 
+ let schema = table_data.schema(); + let table_options = table_data.table_options(); + + let iter_options = IterOptions::default(); + let merge_iter = { + let space_id = table_data.space_id; + let table_id = table_data.id; + let sequence = table_data.last_sequence(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: runtime.clone(), + }; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory: self.sst_factory.clone(), + sst_reader_options, + store: self.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: false, + }); + // Add all ssts in compaction input to builder. + builder + .mut_ssts_of_level(input.level) + .extend_from_slice(&input.files); + let merge_iter = builder.build().await.context(BuildMergeIterator { + table: table_data.name.clone(), + })?; + merge_iter + }; + + let record_batch_stream = if table_options.need_dedup() { + row_iter::record_batch_with_key_iter_to_stream( + DedupIterator::new(request_id, merge_iter, iter_options), + &runtime, + ) + } else { + row_iter::record_batch_with_key_iter_to_stream(merge_iter, &runtime) + }; + + let mut sst_meta = file::merge_sst_meta(&input.files, schema); + + // Alloc file id for the merged sst. 
+ let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_options.num_rows_per_row_group, + compression: table_options.compression, + }; + let mut sst_builder = self + .sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, self.store_ref()) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let sst_info = sst_builder + .build(request_id, &sst_meta, record_batch_stream) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + table_data + .metrics + .compaction_observe_output_sst_size(sst_meta.size); + table_data + .metrics + .compaction_observe_output_sst_row_num(sst_meta.row_num); + + info!( + "Instance files compacted, table:{}, table_id:{}, request_id:{}, output_path:{}, input_files:{:?}, sst_meta:{:?}", + table_data.name, + table_data.id, + request_id, + sst_file_path.display(), + input.files, + sst_meta + ); + + // Store updates to edit_meta. + edit_meta.files_to_delete.reserve(input.files.len()); + // The compacted file can be deleted later. + for file in &input.files { + edit_meta.files_to_delete.push(DeleteFile { + level: input.level, + file_id: file.id(), + }); + } + // Add the newly created file to meta. 
+ edit_meta.files_to_add.push(AddFile { + level: input.output_level, + file: FileMeta { + id: file_id, + meta: sst_meta, + }, + }); + + Ok(()) + } + + pub(crate) fn delete_expired_files( + &self, + table_data: &TableData, + request_id: RequestId, + expired: &ExpiredFiles, + edit_meta: &mut VersionEditMeta, + ) { + if !expired.files.is_empty() { + info!( + "Instance try to delete expired files, table:{}, table_id:{}, request_id:{}, level:{}, files:{:?}", + table_data.name, table_data.id, request_id, expired.level, expired.files, + ); + } + + let files = &expired.files; + edit_meta.files_to_delete.reserve(files.len()); + for file in files { + edit_meta.files_to_delete.push(DeleteFile { + level: expired.level, + file_id: file.id(), + }); + } + } +} + +fn split_record_batch_with_time_ranges( + record_batch: RecordBatchWithKey, + time_ranges: &[TimeRange], + timestamp_idx: usize, +) -> Result> { + let mut builders: Vec = (0..time_ranges.len()) + .into_iter() + .map(|_| RecordBatchWithKeyBuilder::new(record_batch.schema_with_key().clone())) + .collect(); + + for row_idx in 0..record_batch.num_rows() { + let datum = record_batch.column(timestamp_idx).datum(row_idx); + let timestamp = datum.as_timestamp().unwrap(); + let mut idx = None; + for (i, time_range) in time_ranges.iter().enumerate() { + if time_range.contains(timestamp) { + idx = Some(i); + break; + } + } + + if let Some(idx) = idx { + let view = RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }; + builders[idx] + .append_row_view(&view) + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?; + } else { + panic!( + "Record timestamp is not in time_ranges, timestamp:{:?}, time_ranges:{:?}", + timestamp, time_ranges + ); + } + } + let mut ret = Vec::with_capacity(builders.len()); + for mut builder in builders { + ret.push( + builder + .build() + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?, + ); + } + Ok(ret) +} + +fn build_mem_table_iter(memtable: MemTableRef, table_data: 
&TableData) -> Result { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: ProjectedSchema::no_projection(table_data.schema()), + need_dedup: table_data.dedup(), + reverse: false, + }; + memtable + .scan(scan_ctx, scan_req) + .map_err(|e| Box::new(e) as _) + .context(InvalidMemIter) +} + +#[cfg(test)] +mod tests { + use common_types::{ + tests::{ + build_record_batch_with_key_by_rows, build_row, build_row_opt, + check_record_batch_with_key_with_rows, + }, + time::TimeRange, + }; + + use crate::instance::flush_compaction::split_record_batch_with_time_ranges; + + #[test] + fn test_split_record_batch_with_time_ranges() { + let rows0 = vec![build_row(b"binary key", 20, 10.0, "string value")]; + let rows1 = vec![build_row(b"binary key1", 120, 11.0, "string value 1")]; + let rows2 = vec![ + build_row_opt(b"binary key2", 220, None, Some("string value 2")), + build_row_opt(b"binary key3", 250, Some(13.0), None), + ]; + + let rows = vec![rows0.clone(), rows1.clone(), rows2.clone()] + .into_iter() + .flatten() + .collect(); + let record_batch_with_key = build_record_batch_with_key_by_rows(rows); + let column_num = record_batch_with_key.num_columns(); + let time_ranges = vec![ + TimeRange::new_unchecked_for_test(0, 100), + TimeRange::new_unchecked_for_test(100, 200), + TimeRange::new_unchecked_for_test(200, 300), + ]; + + let timestamp_idx = 1; + let rets = + split_record_batch_with_time_ranges(record_batch_with_key, &time_ranges, timestamp_idx) + .unwrap(); + + check_record_batch_with_key_with_rows(&rets[0], rows0.len(), column_num, rows0); + check_record_batch_with_key_with_rows(&rets[1], rows1.len(), column_num, rows1); + check_record_batch_with_key_with_rows(&rets[2], rows2.len(), column_num, rows2); + } +} diff --git a/analytic_engine/src/instance/mem_collector.rs b/analytic_engine/src/instance/mem_collector.rs new 
file mode 100644 index 0000000000..c686974b34 --- /dev/null +++ b/analytic_engine/src/instance/mem_collector.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arena::{Collector, CollectorRef}; + +/// Space memtable memory usage collector +pub struct MemUsageCollector { + /// Memory size allocated in bytes. + bytes_allocated: AtomicUsize, + /// Memory size used in bytes. + bytes_used: AtomicUsize, + parent: Option, +} + +impl Collector for MemUsageCollector { + fn on_alloc(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_alloc(bytes); + } + } + + fn on_used(&self, bytes: usize) { + self.bytes_used.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_used(bytes); + } + } + + fn on_free(&self, used: usize, allocated: usize) { + self.bytes_allocated.fetch_sub(allocated, Ordering::Relaxed); + self.bytes_used.fetch_sub(used, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_free(used, allocated); + } + } +} + +impl Default for MemUsageCollector { + fn default() -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: None, + } + } +} + +impl MemUsageCollector { + pub fn with_parent(collector: CollectorRef) -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: Some(collector), + } + } + + #[inline] + pub fn total_memory_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{atomic::Ordering, Arc}; + + use super::*; + #[test] + fn test_collector() { + let collector = MemUsageCollector::default(); + + collector.on_alloc(1024); + collector.on_used(128); + assert_eq!(1024, collector.total_memory_allocated()); + assert_eq!(128, collector.bytes_used.load(Ordering::Relaxed)); + + 
collector.on_free(64, 512); + assert_eq!(512, collector.total_memory_allocated()); + assert_eq!(64, collector.bytes_used.load(Ordering::Relaxed)); + collector.on_free(64, 512); + assert_eq!(0, collector.total_memory_allocated()); + assert_eq!(0, collector.bytes_used.load(Ordering::Relaxed)); + } + + #[test] + fn test_collector_with_parent() { + let p = Arc::new(MemUsageCollector::default()); + let c1 = MemUsageCollector::with_parent(p.clone()); + let c2 = MemUsageCollector::with_parent(p.clone()); + + c1.on_alloc(1024); + c1.on_used(128); + c2.on_alloc(1024); + c2.on_used(128); + assert_eq!(1024, c1.total_memory_allocated()); + assert_eq!(128, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, c2.total_memory_allocated()); + assert_eq!(128, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(2048, p.total_memory_allocated()); + assert_eq!(256, p.bytes_used.load(Ordering::Relaxed)); + + c1.on_free(64, 512); + assert_eq!(512, c1.total_memory_allocated()); + assert_eq!(64, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1536, p.total_memory_allocated()); + assert_eq!(192, p.bytes_used.load(Ordering::Relaxed)); + c2.on_free(64, 512); + assert_eq!(512, c2.total_memory_allocated()); + assert_eq!(64, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, p.total_memory_allocated()); + assert_eq!(128, p.bytes_used.load(Ordering::Relaxed)); + } +} diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs new file mode 100644 index 0000000000..07bdcf350b --- /dev/null +++ b/analytic_engine/src/instance/mod.rs @@ -0,0 +1,271 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A table engine instance +//! +//! The root mod only contains common functions of instance, other logics are +//! 
divided into the sub crates + +mod alter; +mod drop; +mod engine; +pub mod flush_compaction; +pub(crate) mod mem_collector; +pub mod open; +mod read; +mod write; +pub mod write_worker; + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use common_util::{define_result, runtime::Runtime}; +use log::info; +use mem_collector::MemUsageCollector; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::Mutex; +use wal::manager::WalManager; + +use crate::{ + compaction::scheduler::CompactionSchedulerRef, + meta::Manifest, + space::{SpaceId, SpaceName, SpaceNameRef, SpaceRef}, + sst::file::FilePurger, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to stop file purger, err:{}", source))] + StopFilePurger { source: crate::sst::file::Error }, + + #[snafu(display("Failed to stop compaction scheduler, err:{}", source))] + StopScheduler { + source: crate::compaction::scheduler::Error, + }, + + #[snafu(display("Failed to close space, name:{}, err:{}", name, source))] + CloseSpace { + name: String, + source: crate::space::Error, + }, +} + +define_result!(Error); + +/// Meta state +#[derive(Debug)] +struct MetaState { + /// Id of the last space + last_space_id: SpaceId, +} + +impl MetaState { + /// Create a new state + fn new() -> Self { + Self { last_space_id: 1 } + } + + /// Acquire next id for a new space + fn alloc_space_id(&mut self) -> SpaceId { + self.last_space_id += 1; + self.last_space_id + } +} + +impl Default for MetaState { + fn default() -> Self { + Self::new() + } +} + +/// Spaces states +#[derive(Default)] +struct Spaces { + /// Name to space + name_to_space: HashMap, + /// Id to space + id_to_space: HashMap, +} + +impl Spaces { + /// Insert space by name, and also insert id to space mapping + fn insert(&mut self, space_name: SpaceName, space: SpaceRef) { + let 
space_id = space.id; + self.name_to_space.insert(space_name, space.clone()); + self.id_to_space.insert(space_id, space); + } + + fn get_by_name(&self, name: SpaceNameRef) -> Option<&SpaceRef> { + self.name_to_space.get(name) + } + + /// List all tables of all spaces + fn list_all_tables(&self, tables: &mut Vec) { + let total_tables = self.id_to_space.values().map(|s| s.table_num()).sum(); + tables.reserve(total_tables); + for space in self.id_to_space.values() { + space.list_all_tables(tables); + } + } + + fn list_all_spaces(&self) -> Vec { + self.id_to_space.values().cloned().collect() + } +} + +pub struct SpaceStore { + /// All spaces of the engine. + spaces: RwLock, + /// Manifest (or meta) stores meta data of the engine instance. + manifest: Meta, + /// Wal of all tables + wal_manager: Wal, + /// Sst storage. + store: Arc, + /// Meta lock protects mutation to meta data of the instance. This lock + /// should be held when persisting mutation of the instance level meta data + /// to the manifest. + /// - add a space + /// - delete a space + /// + /// Mutation to space's meta, like add/delete a table, is protected by + /// space's lock instead of this lock. + meta_state: Mutex, + /// Sst factory. + sst_factory: Fa, + + meta_cache: Option, + data_cache: Option, +} + +impl Drop for SpaceStore { + fn drop(&mut self) { + info!("SpaceStore dropped"); + } +} + +impl SpaceStore { + async fn close(&self) -> Result<()> { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + for space in spaces { + // Close all spaces. + space + .close() + .await + .context(CloseSpace { name: &space.name })?; + } + + Ok(()) + } +} + +impl SpaceStore { + fn store_ref(&self) -> &Store { + &*self.store + } + + /// List all tables of all spaces + pub fn list_all_tables(&self, tables: &mut Vec) { + let spaces = self.spaces.read().unwrap(); + spaces.list_all_tables(tables); + } + + /// Find the space which it's all memtables consumes maximum memory. 
+ #[inline] + fn find_maximum_memory_usage_space(&self) -> Option { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + spaces.into_iter().max_by_key(|t| t.memtable_memory_usage()) + } +} + +/// Table engine instance +/// +/// Manages all spaces, also contains needed resources shared across all table +// TODO(yingwen): Track memory usage of all tables (or tables of space) +pub struct Instance { + /// Space storage + space_store: Arc>, + /// Runtime to execute async tasks. + runtimes: Arc, + /// Global table options, overwrite mutable options in each table's + /// TableOptions. + table_opts: TableOptions, + + // Write group options: + write_group_worker_num: usize, + write_group_command_channel_cap: usize, + // End of write group options. + compaction_scheduler: CompactionSchedulerRef, + file_purger: FilePurger, + + meta_cache: Option, + data_cache: Option, + /// Engine memtable memory usage collector + mem_usage_collector: Arc, + /// Engine write buffer size + pub(crate) db_write_buffer_size: usize, + /// Space write buffer size + pub(crate) space_write_buffer_size: usize, +} + +impl Instance { + /// Close the instance gracefully. 
+ pub async fn close(&self) -> Result<()> { + self.file_purger.stop().await.context(StopFilePurger)?; + + self.space_store.close().await?; + + self.compaction_scheduler + .stop_scheduler() + .await + .context(StopScheduler) + } +} + +// TODO(yingwen): Instance builder +impl + Instance +{ + /// Find space using read lock + fn get_space_by_read_lock(&self, space: SpaceNameRef) -> Option { + let spaces = self.space_store.spaces.read().unwrap(); + spaces.get_by_name(space).cloned() + } + + /// Returns options to create a write group for given space + fn write_group_options(&self, space_id: SpaceId) -> write_worker::Options { + write_worker::Options { + space_id, + worker_num: self.write_group_worker_num, + runtime: self.write_runtime().clone(), + command_channel_capacity: self.write_group_command_channel_cap, + } + } + + /// Returns true when engine instance's total memtable memory usage reaches + /// db_write_buffer_size limit. + #[inline] + fn should_flush_instance(&self) -> bool { + self.db_write_buffer_size > 0 + && self.mem_usage_collector.total_memory_allocated() >= self.db_write_buffer_size + } + + #[inline] + fn read_runtime(&self) -> &Arc { + &self.runtimes.read_runtime + } + + #[inline] + fn write_runtime(&self) -> &Arc { + &self.runtimes.write_runtime + } +} + +/// Instance reference +pub type InstanceRef = Arc>; diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs new file mode 100644 index 0000000000..deb5a047b9 --- /dev/null +++ b/analytic_engine/src/instance/open.rs @@ -0,0 +1,415 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Open logic of instance + +use std::sync::{Arc, RwLock}; + +use common_types::schema::IndexInWriterSchema; +use common_util::define_result; +use log::{debug, error, info, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use tokio::sync::{oneshot, Mutex}; +use wal::{ + log_batch::LogEntry, + manager::{LogIterator, ReadBoundary, ReadContext, ReadRequest, WalManager}, +}; + +use crate::{ + compaction::scheduler::SchedulerImpl, + context::OpenContext, + instance::{ + mem_collector::MemUsageCollector, + write_worker, + write_worker::{RecoverTableCommand, WorkerLocal, WriteGroup}, + Instance, MetaState, SpaceStore, Spaces, + }, + meta::{meta_data::ManifestData, Manifest}, + payload::{ReadPayload, WalDecoder}, + space::{Space, SpaceId}, + sst::{factory::Factory, file::FilePurger}, + table::data::{TableData, TableDataRef}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to read meta update, err:{}", source))] + ReadMetaUpdate { + source: Box, + }, + + #[snafu(display( + "Failed to recover table data, space_id:{}, table:{}, err:{}", + space_id, + table, + source + ))] + RecoverTableData { + space_id: SpaceId, + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to apply log entry to memtable, err:{}", source))] + ApplyMemTable { + source: crate::instance::write::Error, + }, + + #[snafu(display("Failed to recover table, source:{}", source,))] + RecoverTable { source: write_worker::Error }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Open a new instance + pub async fn open( + ctx: OpenContext, + manifest: Meta, + wal_manager: Wal, + store: Store, + sst_factory: Fa, + ) -> Result> { + let store = Arc::new(store); + let space_store = 
Arc::new(SpaceStore { + spaces: RwLock::new(Spaces::default()), + manifest, + wal_manager, + store: store.clone(), + meta_state: Mutex::new(MetaState::default()), + sst_factory, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + }); + + let scheduler_config = ctx.config.compaction_config.clone(); + let bg_runtime = ctx.runtimes.bg_runtime.clone(); + let compaction_scheduler = Arc::new(SchedulerImpl::new( + space_store.clone(), + bg_runtime.clone(), + scheduler_config, + )); + + let file_purger = FilePurger::start(&*bg_runtime, store); + + let instance = Arc::new(Instance { + space_store, + runtimes: ctx.runtimes.clone(), + table_opts: ctx.config.table_opts.clone(), + write_group_worker_num: ctx.config.write_group_worker_num, + write_group_command_channel_cap: ctx.config.write_group_command_channel_cap, + compaction_scheduler, + file_purger, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + mem_usage_collector: Arc::new(MemUsageCollector::default()), + db_write_buffer_size: ctx.config.db_write_buffer_size, + space_write_buffer_size: ctx.config.space_write_buffer_size, + }); + + instance.recover(ctx).await?; + + Ok(instance) + } + + /// Recover the instance + /// + /// Should only called by open() + async fn recover(self: &Arc, ctx: OpenContext) -> Result<()> { + // Recover meta data, such as all spaces and tables + self.recover_from_meta(&ctx).await?; + + // Recover from wal + self.recover_from_wal(&ctx).await?; + + Ok(()) + } + + /// Recover meta data from manifest + async fn recover_from_meta(self: &Arc, ctx: &OpenContext) -> Result<()> { + info!("Instance recover from meta begin"); + + // Load manifest, also create a new snapshot at startup. 
+ let manifest_data = self + .space_store + .manifest + .load_data(true) + .await + .map_err(|e| Box::new(e) as _) + .context(ReadMetaUpdate)?; + + self.apply_manifest_data(manifest_data, ctx).await?; + + info!("Instance recover from meta end"); + + Ok(()) + } + + /// Apply manifest data to instance + async fn apply_manifest_data( + self: &Arc, + manifest_data: ManifestData, + ctx: &OpenContext, + ) -> Result<()> { + // Apply all spaces. + for (space_id, space_meta_data) in manifest_data.spaces { + // Create write group for space. + let space_meta = space_meta_data.space_meta; + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Add this space to instance. + let space = Arc::new(Space::new( + space_id, + space_meta.space_name.clone(), + ctx.config.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + { + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_meta.space_name, space.clone()); + } + + // Add all tables to the space. + for (table_id, table_meta_data) in space_meta_data.tables { + let table_meta = table_meta_data.table_meta; + let table_name = table_meta.table_name.clone(); + // Choose write worker for this table + let write_handle = space.write_group.choose_worker(table_id); + + debug!("Instance apply add table, meta :{:?}", table_meta); + + let table_data = Arc::new( + TableData::recover_from_add( + table_meta, + write_handle, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(RecoverTableData { + space_id, + table: &table_name, + })?, + ); + // Apply version meta to the table. + let version_meta = table_meta_data.version_meta; + let max_file_id = version_meta.max_file_id_to_add(); + table_data.current_version().apply_meta(version_meta); + // In recovery case, we need to maintain last file id of the table manually. 
+ if table_data.last_file_id() < max_file_id { + table_data.set_last_file_id(max_file_id); + } + // Add table to space. + space.insert_table(table_data); + } + } + + // Update meta state. + let mut meta_state = self.space_store.meta_state.lock().await; + meta_state.last_space_id = manifest_data.last_space_id; + + Ok(()) + } + + /// Recover all table data from wal + async fn recover_from_wal(&self, ctx: &OpenContext) -> Result<()> { + // replay_batch_size == 0 causes infinite loop. + assert!(ctx.config.replay_batch_size > 0); + + info!("Instance recover from wal begin, ctx:{:?}", ctx); + + // For each table, recover data of that table + let tables = { + let mut tables = Vec::new(); + self.space_store.list_all_tables(&mut tables); + tables + }; + + let replay_batch_size = ctx.config.max_replay_tables_per_batch; + let mut replaying_rxs = Vec::with_capacity(replay_batch_size); + let mut replaying_tables = Vec::with_capacity(replay_batch_size); + + for table_data in tables { + // Create a oneshot channel to send/recieve recover result + let (tx, rx) = oneshot::channel(); + let cmd = RecoverTableCommand { + table_data: table_data.clone(), + tx, + replay_batch_size: ctx.config.replay_batch_size, + }; + + // Send recover request to write worker, actual works done in + // Self::recover_table_from_wal() + write_worker::send_command_to_write_worker(cmd.into_command(), &table_data).await; + + replaying_rxs.push(rx); + replaying_tables.push(table_data.clone()); + + if replaying_rxs.len() >= replay_batch_size { + // Wait batch done + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + + replaying_rxs = Vec::with_capacity(replay_batch_size); + replaying_tables.clear(); + } + } + + // Don't forget to wait the last batch done. 
+ if !replaying_rxs.is_empty() { + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + } + + info!("Instance recover from wal end"); + + Ok(()) + } + + /// Recover table data from wal + /// + /// Called by write worker + pub(crate) async fn recover_table_from_wal( + &self, + worker_local: &WorkerLocal, + table: TableDataRef, + replay_batch_size: usize, + read_ctx: &ReadContext, + log_entry_buf: &mut Vec>, + ) -> Result<()> { + let decoder = WalDecoder::default(); + + let read_req = ReadRequest { + region_id: table.wal_region_id(), + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + + // Read all wal of current table + let mut log_iter = self + .space_store + .wal_manager + .read(read_ctx, &read_req) + .context(ReadWal)?; + + loop { + // fetch entries to log_entry_buf + let no_more_data = { + log_entry_buf.clear(); + + for _ in 0..replay_batch_size { + if let Some(log_entry) = log_iter.next_log_entry(&decoder).context(ReadWal)? { + log_entry_buf.push(log_entry); + } else { + break; + } + } + + log_entry_buf.len() < replay_batch_size + }; + + // Replay all log entries of current table + self.replay_table_log_entries(worker_local, &*table, log_entry_buf) + .await?; + + // No more entries. 
+ if no_more_data { + break; + } + } + + Ok(()) + } + + /// Replay all log entries into memtable + async fn replay_table_log_entries( + &self, + worker_local: &WorkerLocal, + table_data: &TableData, + log_entries: &mut [LogEntry], + ) -> Result<()> { + if log_entries.is_empty() { + // No data in wal + return Ok(()); + } + + let last_sequence = log_entries.last().unwrap().sequence; + + info!( + "Instance replay table log entries begin, table:{}, table_id:{:?}, sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + // TODO(yingwen): Maybe we need to trigger flush if memtable is full during + // recovery Replay entries + for log_entry in log_entries { + let (sequence, payload) = (log_entry.sequence, &mut log_entry.payload); + + // Apply to memtable + match payload { + ReadPayload::Write { row_group } => { + trace!( + "Instance replay row_group, table:{}, row_group:{:?}", + table_data.name, + row_group + ); + + let table_schema_version = table_data.schema_version(); + if table_schema_version != row_group.schema().version() { + // Data with old schema should already been flushed, but we avoid panic + // here. 
+ error!( + "Ignore data with mismatch schema version during replaying, \ + table:{}, \ + table_id:{:?}, \ + expect:{}, \ + actual:{}, \ + last_sequence:{}, \ + sequence:{}", + table_data.name, + table_data.id, + table_schema_version, + row_group.schema().version(), + last_sequence, + sequence, + ); + + continue; + } + + let index_in_writer = + IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); + Self::write_to_memtable( + worker_local, + table_data, + sequence, + row_group, + index_in_writer, + ) + .context(ApplyMemTable)?; + } + } + } + + info!( + "Instance replay table log entries end, table:{}, table_id:{:?}, last_sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + table_data.set_last_sequence(last_sequence); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs new file mode 100644 index 0000000000..8d47d7d8d3 --- /dev/null +++ b/analytic_engine/src/instance/read.rs @@ -0,0 +1,388 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Read logic of instance + +use std::{ + collections::BTreeMap, + pin::Pin, + task::{Context, Poll}, +}; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatch, schema::RecordSchema, + time::TimeRange, +}; +use common_util::{define_result, runtime::Runtime}; +use futures::stream::Stream; +use log::{debug, error, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{ + stream::{ + self, ErrWithSource, PartitionedStreams, RecordBatchStream, SendableRecordBatchStream, + }, + table::ReadRequest, +}; +use tokio::sync::mpsc::{self, Receiver}; +use wal::manager::WalManager; + +use crate::{ + instance::Instance, + meta::Manifest, + row_iter::{ + chain, + chain::{ChainConfig, ChainIterator}, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig, MergeIterator}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceAndTable, + sst::factory::{Factory, SstReaderOptions}, + table::{ + data::TableData, + version::{ReadView, TableVersion}, + }, + table_options::TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to scan memtable, table:{}, err:{}", table, source))] + ScanMemTable { + table: String, + source: crate::memtable::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to build chain iterator, table:{}, err:{}", table, source))] + BuildChainIterator { + table: String, + source: crate::row_iter::chain::Error, + }, +} + +define_result!(Error); + +const RECORD_BATCH_READ_BUF_SIZE: usize = 1000; + +/// Check whether it needs to apply merge sorting when reading the table with +/// the `table_options` by the `read_request`. 
+fn need_merge_sort_streams(table_options: &TableOptions, read_request: &ReadRequest) -> bool { + table_options.need_dedup() || read_request.order.is_in_order() +} + +impl + Instance +{ + /// Read data in multiple time range from table, and return + /// `read_parallelism` output streams. + pub async fn partitioned_read_from_table( + &self, + space_table: &SpaceAndTable, + request: ReadRequest, + ) -> Result { + debug!( + "Instance read from table, space:{}, table:{}, table_id:{:?}, request:{:?}", + space_table.space().name, + space_table.table_data().name, + space_table.table_data().id, + request + ); + + let table_data = space_table.table_data(); + + // Collect metrics. + table_data.metrics.on_read_request_begin(); + + let iter_options = IterOptions::default(); + let table_options = table_data.table_options(); + + if need_merge_sort_streams(&table_data.table_options(), &request) { + let merge_iters = self + .build_merge_iters(table_data, &request, iter_options, &*table_options) + .await?; + self.build_partitioned_streams(&request, merge_iters) + } else { + let chain_iters = self + .build_chain_iters(table_data, &request, &*table_options) + .await?; + self.build_partitioned_streams(&request, chain_iters) + } + } + + fn build_partitioned_streams( + &self, + request: &ReadRequest, + mut partitioned_iters: Vec, + ) -> Result { + let read_parallelism = request.opts.read_parallelism; + + if read_parallelism == 1 && request.order.is_in_desc_order() { + // TODO(xikai): it seems this can be avoided. + partitioned_iters.reverse(); + }; + + // Split iterators into `read_parallelism` groups. 
+ let mut splited_iters: Vec<_> = std::iter::repeat_with(Vec::new) + .take(read_parallelism) + .collect(); + + for (i, time_aligned_iter) in partitioned_iters.into_iter().enumerate() { + splited_iters[i % read_parallelism].push(time_aligned_iter); + } + + let mut streams = Vec::with_capacity(read_parallelism); + for iters in splited_iters { + let stream = iters_to_stream(iters, self.read_runtime(), &request.projected_schema); + streams.push(stream); + } + + assert_eq!(read_parallelism, streams.len()); + + Ok(PartitionedStreams { streams }) + } + + async fn build_merge_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + iter_options: IterOptions, + table_options: &TableOptions, + ) -> Result>> { + // Current visible sequence + let sequence = table_data.last_sequence(); + let projected_schema = request.projected_schema.clone(); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: request.order.is_in_desc_order(), + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let merge_config = MergeConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + sequence, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_factory: self.space_store.sst_factory.clone(), + sst_reader_options: sst_reader_options.clone(), + store: self.space_store.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: 
request.order.is_in_desc_order(), + }; + + let merge_iter = MergeBuilder::new(merge_config) + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts_of_level(read_view.leveled_ssts) + .build() + .await + .context(BuildMergeIterator { + table: &table_data.name, + })?; + let dedup_iter = + DedupIterator::new(request.request_id, merge_iter, iter_options.clone()); + + iters.push(dedup_iter); + } + + Ok(iters) + } + + async fn build_chain_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + table_options: &TableOptions, + ) -> Result> { + let projected_schema = request.projected_schema.clone(); + + assert!(request.order.is_out_of_order()); + + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + // no need to read in order so just read in asc order by default. + reverse: false, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let chain_config = ChainConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_reader_options: sst_reader_options.clone(), + sst_factory: self.space_store.sst_factory.clone(), + store: self.space_store.store_ref(), + }; + let builder = chain::Builder::new(chain_config); + let chain_iter = builder + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts(read_view.leveled_ssts) + .build() + .await + .context(BuildChainIterator { + 
table: &table_data.name, + })?; + + iters.push(chain_iter); + } + + Ok(iters) + } + + fn partition_ssts_and_memtables( + &self, + time_range: TimeRange, + version: &TableVersion, + table_options: &TableOptions, + ) -> Vec { + let read_view = version.pick_read_view(time_range); + + let segment_duration = match table_options.segment_duration { + Some(v) => v.0, + None => { + // Segment duration is unknown, the table maybe still in sampling phase + // or the segment duration is still not applied to the table options, + // just return one partition. + return vec![read_view]; + } + }; + if read_view.contains_sampling() { + // The table contains sampling memtable, just return one partition. + return vec![read_view]; + } + + // Collect the aligned ssts and memtables into the map. + // {aligned timestamp} => {read view} + let mut read_view_by_time = BTreeMap::new(); + for (level, leveled_ssts) in read_view.leveled_ssts.into_iter().enumerate() { + for file in leveled_ssts { + let aligned_ts = file + .time_range() + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.leveled_ssts[level].push(file); + } + } + + for memtable in read_view.memtables { + let aligned_ts = memtable + .time_range + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.memtables.push(memtable); + } + + read_view_by_time.into_values().collect() + } +} + +// TODO(xikai): this is a hack way to implement SendableRecordBatchStream for +// MergeIterator. 
+fn iters_to_stream( + collection: T, + runtime: &Runtime, + schema: &ProjectedSchema, +) -> SendableRecordBatchStream +where + T: IntoIterator + Send + 'static, + T::Item: RecordBatchWithKeyIterator, + T::IntoIter: Send, +{ + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + let projected_schema = schema.clone(); + + runtime.spawn(async move { + for mut iter in collection { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = + record_batch + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Read record batch", + }); + + // Apply the projection to RecordBatchWithKey and gets the final RecordBatch. + let record_batch = record_batch.and_then(|batch_with_key| { + // TODO(yingwen): Try to use projector to do this, which precompute row + // indexes to project. + batch_with_key + .try_project(&projected_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Project record batch", + }) + }); + + trace!("send next record batch:{:?}", record_batch); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + } + }); + + Box::pin(ChannelledRecordBatchStream { + schema: schema.to_record_schema(), + rx, + }) +} + +pub struct ChannelledRecordBatchStream { + schema: RecordSchema, + rx: Receiver>, +} + +impl Stream for ChannelledRecordBatchStream { + type Item = stream::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +impl RecordBatchStream for ChannelledRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs new file mode 100644 index 0000000000..711e0c9b0d --- /dev/null +++ b/analytic_engine/src/instance/write.rs @@ -0,0 +1,464 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +//! Write logic of instance + +use std::sync::Arc; + +use common_types::{ + bytes::ByteVec, + row::RowGroup, + schema::{IndexInWriterSchema, Schema}, +}; +use common_util::{codec::row, define_result}; +use log::{debug, error, info, trace, warn}; +use object_store::ObjectStore; +use proto::table_requests; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::WriteRequest; +use tokio::sync::oneshot; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{SequenceNumber, WalManager, WriteContext}, +}; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{BackgroundStatus, WorkerLocal, WriteTableCommand}, + Instance, + }, + memtable::{key::KeySequence, PutContext}, + meta::Manifest, + payload::WritePayload, + space::SpaceAndTable, + sst::factory::Factory, + table::{ + data::{TableData, TableDataRef}, + version::MemTableForWrite, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write to wal, table:{}, err:{}", table, source))] + WriteLogBatch { + table: String, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to write to memtable, table:{}, err:{}", table, source))] + WriteMemTable { + table: String, + source: crate::table::version::Error, + }, + + #[snafu(display("Try to write to a dropped table, table:{}", table))] + WriteDroppedTable { table: String }, + + #[snafu(display( + "Too many rows to write (more than {}), table:{}, rows:{}.\nBacktrace:\n{}", + MAX_ROWS_TO_WRITE, + table, + rows, + backtrace, + ))] + TooManyRows { + table: String, + rows: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find mutable memtable, table:{}, err:{}", table, source))] + FindMutableMemTable { + table: String, + source: crate::table::data::Error, + }, + #[snafu(display("Failed to write table, source:{}", source,))] + Write { source: write_worker::Error }, + + #[snafu(display("Failed to flush table, table:{}, 
err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace + ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display("Schema of request is incompatible with table, err:{}", source))] + IncompatSchema { + source: common_types::schema::CompatError, + }, + + #[snafu(display("Failed to encode row group, err:{}", source))] + EncodeRowGroup { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to update sequence of memtable, err:{}", source))] + UpdateMemTableSequence { source: crate::memtable::Error }, +} + +define_result!(Error); + +/// Max rows in a write request, must less than [u32::MAX] +const MAX_ROWS_TO_WRITE: usize = 10_000_000; + +pub struct EncodeContext { + row_group: RowGroup, + index_in_writer: IndexInWriterSchema, + encoded_rows: Vec, +} + +impl EncodeContext { + fn new(row_group: RowGroup) -> Self { + Self { + row_group, + index_in_writer: IndexInWriterSchema::default(), + encoded_rows: Vec::new(), + } + } + + fn encode_rows(&mut self, table_schema: &Schema) -> Result<()> { + // Encode the row group into the buffer, which can be reused to write to + // memtable + row::encode_row_group_for_wal( + &self.row_group, + table_schema, + &self.index_in_writer, + &mut self.encoded_rows, + ) + .context(EncodeRowGroup)?; + + assert_eq!(self.row_group.num_rows(), self.encoded_rows.len()); + + Ok(()) + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Write data to the table under give space. + pub async fn write_to_table( + &self, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + // Collect metrics. 
+ space_table.table_data().metrics.on_write_request_begin(); + + self.validate_before_write(space_table, &request)?; + + // Create a oneshot channel to send/receive write result. + let (tx, rx) = oneshot::channel(); + let cmd = WriteTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send write request to write worker, actual works done in + // Self::process_write_table_command(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(Write) + } + + /// Do the actual write, must called by write worker in write thread + /// sequentially. + pub(crate) async fn process_write_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + let mut encode_ctx = EncodeContext::new(request.row_group); + + self.preprocess_write(worker_local, space_table, &mut encode_ctx) + .await?; + + let table_data = space_table.table_data(); + let schema = table_data.schema(); + encode_ctx.encode_rows(&schema)?; + + let EncodeContext { + row_group, + index_in_writer, + encoded_rows, + } = encode_ctx; + + let sequence = self + .write_to_wal(worker_local, &**table_data, encoded_rows) + .await?; + + Self::write_to_memtable( + worker_local, + &**table_data, + sequence, + &row_group, + index_in_writer, + ) + .map_err(|e| { + error!( + "Failed to write to memtable, space_table:{:?}, err:{}", + space_table, e + ); + e + })?; + + // Failure of writing memtable may cause inconsecutive sequence. 
        if table_data.last_sequence() + 1 != sequence {
            warn!(
                "Sequence must be consecutive, space_table:{:?}, last_sequence:{}, wal_sequence:{}",
                space_table,
                table_data.last_sequence(),
                sequence
            );
        }

        debug!(
            "Instance write finished, update sequence, space_table:{:?}, last_sequence:{}",
            space_table, sequence
        );

        table_data.set_last_sequence(sequence);

        let num_rows = row_group.num_rows();
        // Collect metrics.
        table_data.metrics.on_write_request_done(num_rows);

        Ok(num_rows)
    }

    /// Return Ok if the request is valid; this is done before entering the
    /// write thread.
    fn validate_before_write(
        &self,
        space_table: &SpaceAndTable,
        request: &WriteRequest,
    ) -> Result<()> {
        // Row index is later cast to u32 (see KeySequence in
        // write_to_memtable), so the row count must stay below u32::MAX.
        ensure!(
            request.row_group.num_rows() < MAX_ROWS_TO_WRITE,
            TooManyRows {
                table: &space_table.table_data().name,
                rows: request.row_group.num_rows(),
            }
        );

        Ok(())
    }

    /// Preprocess before write, check:
    /// - whether table is dropped
    /// - memtable capacity and maybe trigger flush
    ///
    /// Fills [common_types::schema::IndexInWriterSchema] in [EncodeContext]
    async fn preprocess_write(
        self: &Arc,
        worker_local: &mut WorkerLocal,
        space_table: &SpaceAndTable,
        encode_ctx: &mut EncodeContext,
    ) -> Result<()> {
        let space = space_table.space();
        let table_data = space_table.table_data();

        // Reject writes to a dropped table explicitly.
        ensure!(
            !table_data.is_dropped(),
            WriteDroppedTable {
                table: &table_data.name,
            }
        );

        // Checks schema compatibility.
        table_data
            .schema()
            .compatible_for_write(
                encode_ctx.row_group.schema(),
                &mut encode_ctx.index_in_writer,
            )
            .context(IncompatSchema)?;

        // TODO(yingwen): Allow write and retry flush.
        // Check background status; if a background error occurred, writing is
        // not allowed again.
        match &*worker_local.background_status() {
            // Compaction error is ignored now.
+ BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + if self.should_flush_instance() { + if let Some(space) = self.space_store.find_maximum_memory_usage_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because engine total memtable memory usage exceeds db_write_buffer_size {}.", + table.name, + table.memtable_memory_usage(), + space.name, + self.db_write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + } + + if space.should_flush_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because space total memtable memory usage exceeds space_write_buffer_size {}.", + table.name, + table.memtable_memory_usage() , + space.name, + space.write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + + if table_data.should_flush_table(worker_local) { + self.handle_memtable_flush(worker_local, table_data).await?; + } + + Ok(()) + } + + /// Write log_batch into wal, return the sequence number of log_batch. 
+ async fn write_to_wal( + &self, + _worker_local: &WorkerLocal, + table_data: &TableData, + encoded_rows: Vec, + ) -> Result { + // Convert into pb + let mut write_req_pb = table_requests::WriteRequest::new(); + // Use the table schema instead of the schema in request to avoid schema + // mismatch during replaying + write_req_pb.set_schema(table_data.schema().into()); + write_req_pb.set_rows(encoded_rows.into()); + + let mut log_batch = LogWriteBatch::new(table_data.wal_region_id()); + // Now we only have one request, so no need to use with_capacity + log_batch.push(LogWriteEntry { + payload: WritePayload::Write(&write_req_pb), + }); + + // Write to wal manager + let write_ctx = WriteContext::default(); + let sequence = self + .space_store + .wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteLogBatch { + table: &table_data.name, + })?; + + Ok(sequence) + } + + // TODO(yingwen): How to trigger flush if we found memtables are full during + // inserting memtable? RocksDB checks memtable size in MemTableInserter + /// Write data into memtable. + /// + /// The data in `encoded_rows` will be moved to memtable. + /// + /// The len of `row_group` and `encoded_rows` must be equal. + pub(crate) fn write_to_memtable( + worker_local: &WorkerLocal, + table_data: &TableData, + sequence: SequenceNumber, + row_group: &RowGroup, + index_in_writer: IndexInWriterSchema, + ) -> Result<()> { + if row_group.is_empty() { + return Ok(()); + } + + let schema = row_group.schema(); + // Store all memtables we wrote and update their last sequence later. 
+ let mut wrote_memtables: SmallVec<[_; 4]> = SmallVec::new(); + let mut last_mutable_mem: Option = None; + + let mut ctx = PutContext::new(index_in_writer); + for (row_idx, row) in row_group.iter().enumerate() { + // TODO(yingwen): Add RowWithSchema and take RowWithSchema as input, then remove + // this unwrap() + let timestamp = row.timestamp(schema).unwrap(); + // skip expired row + if table_data.is_expired(timestamp) { + trace!("Skip expired row when write to memtable, row:{:?}", row); + continue; + } + if last_mutable_mem.is_none() + || !last_mutable_mem + .as_ref() + .unwrap() + .accept_timestamp(timestamp) + { + // The time range is not processed by current memtable, find next one. + let mutable_mem = table_data + .find_or_create_mutable(worker_local, timestamp, schema) + .context(FindMutableMemTable { + table: &table_data.name, + })?; + wrote_memtables.push(mutable_mem.clone()); + last_mutable_mem = Some(mutable_mem); + } + + // We have check the row num is less than `MAX_ROWS_TO_WRITE`, it is safe to + // cast it to u32 here + let key_seq = KeySequence::new(sequence, row_idx as u32); + // TODO(yingwen): Batch sample timestamp in sampling phase. + last_mutable_mem + .as_ref() + .unwrap() + .put(&mut ctx, key_seq, row, schema, timestamp) + .context(WriteMemTable { + table: &table_data.name, + })?; + } + + // Update last sequence of memtable. + for mem_wrote in wrote_memtables { + mem_wrote + .set_last_sequence(sequence) + .context(UpdateMemTableSequence)?; + } + + Ok(()) + } + + /// Flush memtables of table in background. + /// + /// Only flush mutable memtables, assuming all immutable memtables are + /// flushing. + async fn handle_memtable_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result<()> { + let opts = TableFlushOptions::default(); + + // Set `block_on_write_thread` to false and let flush do in background. 
+ self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + }) + } +} diff --git a/analytic_engine/src/instance/write_worker.rs b/analytic_engine/src/instance/write_worker.rs new file mode 100644 index 0000000000..41089a2605 --- /dev/null +++ b/analytic_engine/src/instance/write_worker.rs @@ -0,0 +1,970 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write workers + +use std::{ + collections::HashMap, + future::Future, + sync::{ + atomic::{AtomicBool, AtomicI64, Ordering}, + Arc, + }, + time::Instant, +}; + +use common_util::{ + define_result, + runtime::{JoinHandle, Runtime}, + time::InstantExt, +}; +use futures::future; +use log::{error, info}; +use object_store::ObjectStore; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{ + engine::DropTableRequest, + table::{ + AlterSchemaRequest, Error as TableError, Result as TableResult, TableId, WriteRequest, + }, +}; +use tokio::sync::{mpsc, oneshot, watch, watch::Ref, Mutex, Notify}; +use wal::{ + log_batch::LogEntry, + manager::{ReadContext, WalManager}, +}; + +use crate::{ + compaction::{TableCompactionRequest, WaitResult}, + instance::{ + alter, drop, + flush_compaction::{self, TableFlushOptions}, + open, write, write_worker, InstanceRef, + }, + meta::Manifest, + payload::ReadPayload, + space::{SpaceAndTable, SpaceId}, + sst::factory::Factory, + table::{data::TableDataRef, metrics::Metrics}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to wait flush completed, channel disconnected, err:{}", source))] + WaitFlush { + source: Box, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace + ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to receive cmd result, channel disconnected, table:{}, worker_id:{}.\nBacktrace:\n{}", + table, + worker_id, + backtrace, + ))] + 
ReceiveFromWorker { + table: String, + worker_id: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Channel error, err:{}", source))] + Channel { + source: Box, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub enum BackgroundStatus { + Ok, + FlushFailed(Arc), + CompactionFailed(Arc), +} + +/// Local state of worker +/// +/// The worker is single threaded and holding this is equivalent to holding a +/// write lock +#[derive(Debug)] +pub struct WorkerLocal { + data: Arc, + background_rx: watch::Receiver, +} + +/// Notifier for the write worker when finishing flushing. +struct FlushNotifier(Arc); + +impl FlushNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + /// Mark flush is done and notify the waiter status ok (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_ok(self) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + /// Mark flush is done and notify the waiter error (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_err(self, err: Arc) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. + let _ = self + .0 + .background_tx + .send(BackgroundStatus::FlushFailed(err)); + } +} + +impl Drop for FlushNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +/// Notifier to notify compaction result. If no compaction happened, then the +/// notifier may not be signaled. 
+pub struct CompactionNotifier(Arc); + +impl CompactionNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + pub fn notify_ok(self) { + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + pub fn notify_err(self, err: Arc) { + // Send message to notify waiter, ignore send result. + let _ = self + .0 + .background_tx + .send(BackgroundStatus::CompactionFailed(err)); + } +} + +impl Clone for CompactionNotifier { + fn clone(&self) -> Self { + // It will add num_background_jobs in CompactionNotifier::new, + // so we can't derive Clone for CompactionNotifier. + CompactionNotifier::new(self.0.clone()) + } +} + +impl Drop for CompactionNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +fn send_flush_result(res_sender: Option>>, res: TableResult<()>) { + if let Some(tx) = res_sender { + if let Err(send_res) = tx.send(res) { + error!("Fail to send flush result, send_res: {:?}", send_res); + } + } +} + +impl WorkerLocal { + #[inline] + pub fn background_status(&self) -> Ref<'_, BackgroundStatus> { + self.background_rx.borrow() + } + + /// Control the flush procedure and ensure multiple flush procedures to be + /// sequential. + /// + /// REQUIRE: should only be called by the write thread. + pub async fn flush_sequentially( + &mut self, + table: String, + metrics: &Metrics, + flush_job: F, + on_flush_success: T, + block_on_write_thread: bool, + res_sender: Option>>, + ) -> Result<()> + where + F: Future> + Send + 'static, + T: Future + Send + 'static, + { + // If flush operation is running, then we need to wait for it to complete first. 
+ // Actually, the loop waiting ensures the multiple flush procedures to be + // sequential, that is to say, at most one flush is being executed at + // the same time. + let mut stall_begin = None; + while self.data.is_flushing() { + if stall_begin.is_none() { + stall_begin = Some(Instant::now()); + } + + self.background_rx + .changed() + .await + .map_err(|e| Box::new(e) as _) + .context(WaitFlush)?; + } + assert!(!self.data.is_flushing()); + + // Report write stall. + if let Some(instant) = stall_begin { + metrics.on_write_stall(instant.saturating_elapsed()); + } + + // Check background status, if background error occurred, current flush is not + // allowed. + match &*self.background_status() { + // Now background compaction error is ignored. + BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + // TODO(yingwen): Store pending flush requests and retry flush on recoverable + // error, or try to recover from background error. + + // Mark the worker is flushing. + self.data.set_is_flushing(true); + + let worker_data = self.data.clone(); + // Create a notifier, remember to mark flushed and notify wait when we done! + let notifier = FlushNotifier::new(worker_data); + let task = async move { + let flush_res = flush_job.await; + + match flush_res { + Ok(()) => { + notifier.notify_ok(); + on_flush_success.await; + send_flush_result(res_sender, Ok(())); + } + Err(e) => { + let e = Arc::new(e); + notifier.notify_err(e.clone()); + send_flush_result( + res_sender, + Err(TableError::Flush { + source: Box::new(e), + table, + }), + ); + } + } + }; + + if block_on_write_thread { + task.await; + } else { + self.data.runtime.spawn(task); + } + + Ok(()) + } + + pub fn compaction_notifier(&self) -> CompactionNotifier { + let data = self.data.clone(); + CompactionNotifier::new(data) + } +} + +/// Write table command. 
+pub struct WriteTableCommand { + pub space_table: SpaceAndTable, + pub request: WriteRequest, + /// Sender for the worker to return result of write + pub tx: oneshot::Sender>, +} + +impl WriteTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Write(self) + } +} + +/// Recover table command. +pub struct RecoverTableCommand { + /// Table to recover + pub table_data: TableDataRef, + /// Sender for the worker to return result of recover + pub tx: oneshot::Sender>, + + // Options for recover: + /// Batch size to read records from wal to replay + pub replay_batch_size: usize, +} + +impl RecoverTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Recover(self) + } +} + +/// Drop table command +pub struct DropTableCommand { + pub space_table: SpaceAndTable, + pub request: DropTableRequest, + pub tx: oneshot::Sender>, +} + +impl DropTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Drop(self) + } +} + +/// Alter table command. +pub struct AlterSchemaCommand { + pub space_table: SpaceAndTable, + pub request: AlterSchemaRequest, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterSchemaCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterSchema(self) + } +} + +/// Alter table options command. +pub struct AlterOptionsCommand { + pub space_table: SpaceAndTable, + pub options: HashMap, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterOptionsCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterOptions(self) + } +} + +/// Flush table request. 
+pub struct FlushTableCommand { + pub space_table: SpaceAndTable, + pub flush_opts: TableFlushOptions, + pub tx: oneshot::Sender>, +} + +impl FlushTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Flush(self) + } +} + +/// Compact table request. +pub struct CompactTableCommand { + pub space_table: SpaceAndTable, + pub waiter: Option>>, + pub tx: oneshot::Sender>, +} + +impl CompactTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Compact(self) + } +} + +/// Command sent to write worker +pub enum Command { + /// Write to table + Write(WriteTableCommand), + + /// Drop table + Drop(DropTableCommand), + + /// Recover table + Recover(RecoverTableCommand), + + /// Alter table schema + AlterSchema(AlterSchemaCommand), + + /// Alter table modify setting + AlterOptions(AlterOptionsCommand), + + /// Flush table + Flush(FlushTableCommand), + + /// Compact table + Compact(CompactTableCommand), + + /// Exit the worker + Exit, +} + +/// Write handle hold by a table +#[derive(Debug, Clone)] +pub struct WriteHandle { + worker_data: Arc, +} + +impl WriteHandle { + /// Send command to write worker. + /// + /// Panic if channel is disconnected + pub async fn send_command(&self, cmd: Command) { + if self.worker_data.tx.send(cmd).await.is_err() { + error!( + "Failed to send command to worker, worker_id:{}", + self.worker_id() + ); + + panic!("write worker {} disconnected", self.worker_id()); + } + } + + /// Returns the id of the worker + pub fn worker_id(&self) -> usize { + self.worker_data.id + } +} + +pub async fn send_command_to_write_worker(cmd: Command, table_data: &TableDataRef) { + table_data.write_handle.send_command(cmd).await; +} + +pub async fn process_command_in_write_worker( + cmd: Command, + table_data: &TableDataRef, + rx: oneshot::Receiver>, +) -> Result { + send_command_to_write_worker(cmd, table_data).await; + + // Receive alter options result. 
+ match rx.await { + Ok(res) => res.map_err(|e| Box::new(e) as _).context(Channel), + Err(_) => ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail(), + } +} + +pub async fn join_all( + table_vec: &[TableDataRef], + rx_vec: Vec>>, +) -> Result<()> { + let results = future::join_all(rx_vec).await; + for (pos, res) in results.into_iter().enumerate() { + let table_data = &table_vec[pos]; + match res { + Ok(res) => { + res.map_err(|e| Box::new(e) as _).context(Channel)?; + } + Err(_) => { + return ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail() + } + } + } + + Ok(()) +} + +/// Write group options +pub struct Options { + pub space_id: SpaceId, + pub worker_num: usize, + pub runtime: Arc, + /// Capacity of the command channel for each worker + pub command_channel_capacity: usize, +} + +// TODO(yingwen): Add method to stop all workers +/// Write group manages all write worker of a space +#[derive(Debug)] +pub struct WriteGroup { + /// Space of the write group. + space_id: SpaceId, + /// Shared datas of workers. + worker_datas: Vec>, + /// Join handles of workers. 
+ handles: Mutex>>, +} + +impl WriteGroup { + pub fn new( + opts: Options, + instance: InstanceRef, + ) -> Self + where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + { + let mut worker_datas = Vec::with_capacity(opts.worker_num); + let mut handles = Vec::with_capacity(opts.worker_num); + for id in 0..opts.worker_num { + let (tx, rx) = mpsc::channel(opts.command_channel_capacity); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + + let data = Arc::new(WorkerSharedData { + space_id: opts.space_id, + id, + tx, + is_flushing: AtomicBool::new(false), + background_tx, + runtime: opts.runtime.clone(), + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let mut worker = WriteWorker { + rx, + instance: instance.clone(), + local: WorkerLocal { + data: data.clone(), + background_rx, + }, + log_entry_buf: Vec::new(), + }; + + let space_id = opts.space_id; + // Spawn a task to run the worker + let handle = opts.runtime.spawn(async move { + worker.run().await; + + info!( + "Write worker waiting background jobs, space_id:{}, id:{}", + space_id, id + ); + + worker.wait_background_jobs_done().await; + + info!("Write worker exit, space_id:{}, id:{}", space_id, id); + }); + + worker_datas.push(data); + handles.push(handle); + } + + Self { + space_id: opts.space_id, + worker_datas, + handles: Mutex::new(handles), + } + } + + /// Stop the write group. 
+ pub async fn stop(&self) { + for data in &self.worker_datas { + if data.tx.send(Command::Exit).await.is_err() { + error!( + "Failed to send exit command, space_id:{}, worker_id:{}", + self.space_id, data.id + ); + } + } + + let mut handles = self.handles.lock().await; + for (i, handle) in handles.iter_mut().enumerate() { + if let Err(e) = handle.await { + error!( + "Failed to join handle, space_id:{}, index:{}, err:{}", + self.space_id, i, e + ); + } + } + + // Clear all handles to avoid await again. + handles.clear(); + } + + /// Choose worker for table with `table_id`. The worker chose should be + /// consistent, so the caller can cached the handle of the worker + /// + /// Returns the WriteHandle of the worker + pub fn choose_worker(&self, table_id: TableId) -> WriteHandle { + let index = table_id.as_u64() as usize % self.worker_datas.len(); + let worker_data = self.worker_datas[index].clone(); + + WriteHandle { worker_data } + } +} + +/// Data of write worker +#[derive(Debug)] +struct WorkerSharedData { + /// Space this worker belongs to + space_id: SpaceId, + /// Id of the write worker + id: usize, + /// Sender to send command to this worker + tx: mpsc::Sender, + + /// Whether the flush job is already running + /// + /// When `is_flushing` is true, no more flush job should be scheduled + is_flushing: AtomicBool, + /// Channel to notify background status + background_tx: watch::Sender, + + /// Background job runtime. + runtime: Arc, + /// Numbers of background jobs. + num_background_jobs: AtomicI64, + /// Notify when background job finished. + background_notify: Notify, +} + +impl WorkerSharedData { + fn is_flushing(&self) -> bool { + self.is_flushing.load(Ordering::Relaxed) + } + + fn set_is_flushing(&self, is_flushing: bool) { + self.is_flushing.store(is_flushing, Ordering::Relaxed); + } +} + +/// Table write worker +/// +/// Each table is managed by exactly one write worker. Write request to a table +/// will be sent to this thread and done in this worker. 
+/// +/// The write worker should ensure there is only one flush thread (task) is +/// running. +struct WriteWorker { + /// Command receiver + rx: mpsc::Receiver, + /// Engine instance + instance: InstanceRef, + /// Worker local states + local: WorkerLocal, + /// Log entry buffer for recover + log_entry_buf: Vec>, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > WriteWorker +{ + /// Runs the write loop until stopped + async fn run(&mut self) { + // TODO(yingwen): Maybe batch write tasks to improve performance (group commit) + loop { + let command = match self.rx.recv().await { + Some(cmd) => cmd, + None => { + info!( + "Write worker recv None, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + }; + + match command { + Command::Write(cmd) => { + self.handle_write_table(cmd).await; + } + Command::Drop(cmd) => { + self.handle_drop_table(cmd).await; + } + Command::Recover(cmd) => { + self.handle_recover_table(cmd).await; + } + Command::AlterSchema(cmd) => { + self.handle_alter_schema(cmd).await; + } + Command::AlterOptions(cmd) => { + self.handle_alter_options(cmd).await; + } + Command::Flush(cmd) => { + self.handle_flush_table(cmd).await; + } + Command::Compact(cmd) => { + self.handle_compact_table(cmd).await; + } + Command::Exit => { + info!( + "Write worker recv Command::Exit, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + } + } + } + + async fn wait_background_jobs_done(&self) { + while self.num_background_jobs() > 0 { + self.wait_for_notify().await; + } + } + + async fn handle_write_table(&mut self, cmd: WriteTableCommand) { + let WriteTableCommand { + space_table, + request, + tx, + } = cmd; + + let write_res = self + .instance + .process_write_table_command(&mut self.local, &space_table, request) + .await; + if let Err(res) = tx.send(write_res) { + error!( + "handle write table failed to send 
result, write_res:{:?}",
                res
            );
        }
    }

    /// Handle the recover command: replay the table's data from wal and send
    /// the result back through `cmd.tx`.
    async fn handle_recover_table(&mut self, cmd: RecoverTableCommand) {
        let RecoverTableCommand {
            table_data,
            tx,
            replay_batch_size,
        } = cmd;

        let read_ctx = ReadContext::default();
        // Reuse the worker-local buffer to read log entries in batches.
        self.log_entry_buf.reserve(replay_batch_size);

        let recover_res = self
            .instance
            .recover_table_from_wal(
                &self.local,
                table_data,
                replay_batch_size,
                &read_ctx,
                &mut self.log_entry_buf,
            )
            .await;
        if let Err(res) = tx.send(recover_res) {
            error!(
                "handle recover table failed to send result, recover_res:{:?}",
                res
            );
        }
    }

    /// Handle the drop table command and send the result back through
    /// `cmd.tx`.
    async fn handle_drop_table(&mut self, cmd: DropTableCommand) {
        let DropTableCommand {
            space_table,
            request,
            tx,
        } = cmd;

        let drop_res = self
            .instance
            .process_drop_table_command(&mut self.local, &space_table, request)
            .await;
        if let Err(res) = tx.send(drop_res) {
            error!(
                "handle drop table failed to send result, drop_res:{:?}",
                res
            );
        }
    }

    /// Handle the alter schema command and send the result back through
    /// `cmd.tx`.
    async fn handle_alter_schema(&mut self, cmd: AlterSchemaCommand) {
        let AlterSchemaCommand {
            space_table,
            request,
            tx,
        } = cmd;

        let alter_res = self
            .instance
            .process_alter_schema_command(&mut self.local, &space_table, request)
            .await
            .map_err(|e| Box::new(e) as Box)
            .context(Channel);
        if let Err(res) = tx.send(alter_res) {
            error!(
                "handle alter schema failed to send result, alter_res:{:?}",
                res
            );
        }
    }

    /// Handle the alter options command and send the result back through
    /// `cmd.tx`.
    async fn handle_alter_options(&mut self, cmd: AlterOptionsCommand) {
        let AlterOptionsCommand {
            space_table,
            options,
            tx,
        } = cmd;

        let alter_res = self
            .instance
            .process_alter_options_command(&mut self.local, &space_table, options)
            .await;
        if let Err(res) = tx.send(alter_res) {
            // Fixed copy-paste: this is the alter *options* path, not alter
            // schema.
            error!(
                "handle alter options failed to send result, alter_res:{:?}",
                res
            );
        }
    }

    /// Handle the flush table command and send the result back through
    /// `cmd.tx`.
    async fn handle_flush_table(&mut self, cmd: FlushTableCommand) {
        let FlushTableCommand {
            space_table,
            flush_opts,
            tx,
        } = cmd;

        let flush_res = self
            .instance
+ .flush_table_in_worker(&mut self.local, space_table.table_data(), flush_opts) + .await; + if let Err(res) = tx.send(flush_res) { + error!( + "handle flush table failed to send result, flush_res:{:?}", + res + ); + } + } + + async fn handle_compact_table(&mut self, cmd: CompactTableCommand) { + let CompactTableCommand { + space_table, + waiter, + tx, + } = cmd; + + let request = TableCompactionRequest { + table_data: space_table.table_data().clone(), + compaction_notifier: self.local.compaction_notifier(), + waiter, + }; + + self.instance.schedule_table_compaction(request).await; + if let Err(_res) = tx.send(Ok(())) { + error!("handle compact table failed to send result"); + } + } + + #[inline] + fn space_id(&self) -> SpaceId { + self.local.data.space_id + } + + #[inline] + fn id(&self) -> usize { + self.local.data.id + } + + #[inline] + fn num_background_jobs(&self) -> i64 { + self.local.data.num_background_jobs.load(Ordering::SeqCst) + } + + async fn wait_for_notify(&self) { + self.local.data.background_notify.notified().await; + } +} + +#[cfg(test)] +pub mod tests { + use common_util::runtime; + + use super::*; + + pub struct MockedWriteHandle { + pub write_handle: WriteHandle, + pub rx: mpsc::Receiver, + pub worker_local: WorkerLocal, + } + + pub struct WriteHandleMocker { + space_id: SpaceId, + runtime: Option>, + } + + impl Default for WriteHandleMocker { + fn default() -> Self { + Self { + space_id: 1, + runtime: None, + } + } + } + + impl WriteHandleMocker { + pub fn space_id(mut self, space_id: SpaceId) -> Self { + self.space_id = space_id; + self + } + + pub fn build(self) -> MockedWriteHandle { + let (tx, rx) = mpsc::channel(1); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + let runtime = self.runtime.unwrap_or_else(|| { + let rt = runtime::Builder::default().build().unwrap(); + Arc::new(rt) + }); + + let worker_data = Arc::new(WorkerSharedData { + space_id: self.space_id, + id: 0, + tx, + is_flushing: 
AtomicBool::new(false), + background_tx, + runtime, + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let write_handle = WriteHandle { + worker_data: worker_data.clone(), + }; + + MockedWriteHandle { + write_handle, + rx, + worker_local: WorkerLocal { + data: worker_data, + background_rx, + }, + } + } + } +} diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs new file mode 100644 index 0000000000..a4fc60c14f --- /dev/null +++ b/analytic_engine/src/lib.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Analytic table engine implementations + +mod compaction; +mod context; +mod engine; +mod instance; +pub mod memtable; +mod meta; +mod payload; +pub mod row_iter; +mod sampler; +pub mod setup; +pub mod space; +pub mod sst; +pub mod table; +pub mod table_options; + +#[cfg(any(test, feature = "test"))] +pub mod tests; + +use object_store::disk::File; +use serde_derive::Deserialize; +use wal::rocks_impl::manager::RocksImpl; + +pub use crate::{compaction::scheduler::SchedulerConfig, table_options::TableOptions}; +use crate::{ + engine::TableEngineImpl, + instance::InstanceRef, + meta::details::{ManifestImpl, Options as ManifestOptions}, + sst::factory::FactoryImpl, +}; + +/// Analytic table engine +pub type AnalyticTableEngine = + TableEngineImpl, File, FactoryImpl>; +/// Default instance +pub(crate) type EngineInstance = InstanceRef, File, FactoryImpl>; + +/// Config of analytic engine. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Config { + /// Data path of the engine. + pub data_path: String, + + /// Batch size to read records from wal to replay. + pub replay_batch_size: usize, + /// Batch size to replay tables. + pub max_replay_tables_per_batch: usize, + // Write group options: + pub write_group_worker_num: usize, + pub write_group_command_channel_cap: usize, + // End of write group options. + /// Default options for table. 
+ pub table_opts: TableOptions, + + pub compaction_config: SchedulerConfig, + + /// sst meta cache capacity. + pub sst_meta_cache_cap: Option, + /// sst data cache capacity. + pub sst_data_cache_cap: Option, + + /// Manifest options. + pub manifest: ManifestOptions, + + // Global write buffer options: + /// The maximum write buffer size used for single space. + pub space_write_buffer_size: usize, + /// The maximum size of all Write Buffers across all spaces. + pub db_write_buffer_size: usize, + // End of global write buffer options. +} + +impl Default for Config { + fn default() -> Self { + Self { + data_path: String::from("/tmp/ceresdbx"), + replay_batch_size: 500, + max_replay_tables_per_batch: 64, + write_group_worker_num: 8, + write_group_command_channel_cap: 128, + table_opts: TableOptions::default(), + compaction_config: SchedulerConfig::default(), + sst_meta_cache_cap: Some(1000), + sst_data_cache_cap: Some(1000), + manifest: ManifestOptions::default(), + /// Zero means disabling this param, give a postive value to enable + /// it. + space_write_buffer_size: 0, + /// Zero means disabling this param, give a postive value to enable + /// it. + db_write_buffer_size: 0, + } + } +} diff --git a/analytic_engine/src/memtable/factory.rs b/analytic_engine/src/memtable/factory.rs new file mode 100644 index 0000000000..0867bba2da --- /dev/null +++ b/analytic_engine/src/memtable/factory.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! MemTable factory + +use std::{fmt, sync::Arc}; + +use arena::CollectorRef; +use common_types::{schema::Schema, SequenceNumber}; +use common_util::define_result; +use snafu::Snafu; + +use crate::memtable::MemTableRef; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// MemTable options +pub struct Options { + /// Schema of the skiplist. + pub schema: Schema, + /// Block size of arena in bytes. + pub arena_block_size: u32, + /// Log sequence at the memtable creation. 
+ pub creation_sequence: SequenceNumber, + /// Memory usage colllector + pub collector: CollectorRef, +} + +/// MemTable factory +pub trait Factory: fmt::Debug { + /// Create a new memtable instance + fn create_memtable(&self, opts: Options) -> Result; +} + +/// MemTable Factory reference +pub type FactoryRef = Arc; diff --git a/analytic_engine/src/memtable/key.rs b/analytic_engine/src/memtable/key.rs new file mode 100644 index 0000000000..6c11837028 --- /dev/null +++ b/analytic_engine/src/memtable/key.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memtable key +//! +//! Some concepts: +//! - User key (row key) is a bytes encoded from the key columns of a row +//! - Internal key contains +//! - user key +//! - memtable key sequence +//! - sequence number +//! - index + +use std::mem; + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + row::Row, + schema::Schema, + SequenceNumber, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode key datum, err:{}", source))] + EncodeKeyDatum { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode sequence, err:{}", source))] + EncodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode row index, err:{}", source))] + EncodeIndex { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode sequence, err:{}", source))] + DecodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode row index, err:{}", source))] + DecodeIndex { source: common_types::bytes::Error }, + + #[snafu(display( + "Insufficent internal key length, len:{}.\nBacktrace:\n{}", + len, + backtrace + ))] + InternalKeyLen { len: usize, backtrace: Backtrace }, +} + +define_result!(Error); + +// u64 + u32 
+const KEY_SEQUENCE_BYTES_LEN: usize = 12; + +/// Row index in the batch +pub type RowIndex = u32; + +/// Sequence number of row in memtable +/// +/// Contains: +/// - sequence number in wal (sequence number of the write batch) +/// - unique index of the row in the write batch +/// +/// Ordering: +/// 1. ordered by sequence desc +/// 2. ordered by index desc +/// +/// The desc order is implemented via MAX - seq +/// +/// The index is used to distinguish rows with same key of the same write batch +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct KeySequence(SequenceNumber, RowIndex); + +impl KeySequence { + pub fn new(sequence: SequenceNumber, index: RowIndex) -> Self { + Self(sequence, index) + } + + #[inline] + pub fn sequence(&self) -> SequenceNumber { + self.0 + } + + #[inline] + pub fn row_index(&self) -> RowIndex { + self.1 + } +} + +// TODO(yingwen): We also need opcode (PUT/DELETE), put it in key or row value +/// Comparable internal key encoder +/// +/// Key order: +/// 1. ordered by user key ascend (key parts of a row) +/// 2. 
ordered by sequence descend +/// +/// Encoding: +/// user_key + sequence +/// +/// REQUIRE: The schema of row to encode matches the Self::schema +pub struct ComparableInternalKey<'a> { + /// Sequence number of the row + sequence: KeySequence, + /// Schema of row + schema: &'a Schema, +} + +impl<'a> ComparableInternalKey<'a> { + pub fn new(sequence: KeySequence, schema: &'a Schema) -> Self { + Self { sequence, schema } + } +} + +impl<'a> Encoder for ComparableInternalKey<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemComparable; + for idx in 0..self.schema.num_key_columns() { + // Encode each column in primary key + encoder.encode(buf, &value[idx]).context(EncodeKeyDatum)?; + } + SequenceCodec.encode(buf, &self.sequence)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemComparable; + let mut total_len = 0; + for idx in 0..self.schema.num_key_columns() { + // Size of each column in primary key + total_len += encoder.estimate_encoded_size(&value[idx]); + } + // The size of sequence + total_len += KEY_SEQUENCE_BYTES_LEN; + + total_len + } +} + +struct SequenceCodec; + +impl Encoder for SequenceCodec { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &KeySequence) -> Result<()> { + // Encode sequence number and index in descend order + encode_sequence_number(buf, value.sequence())?; + let reversed_index = RowIndex::MAX - value.row_index(); + buf.write_u32(reversed_index).context(EncodeIndex)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &KeySequence) -> usize { + KEY_SEQUENCE_BYTES_LEN + } +} + +impl Decoder for SequenceCodec { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let sequence = buf.read_u64().context(DecodeSequence)?; + // Reverse sequence + let sequence = SequenceNumber::MAX - sequence; + let row_index = buf.read_u32().context(DecodeIndex)?; + // Reverse row index + let row_index = RowIndex::MAX - 
row_index; + + Ok(KeySequence::new(sequence, row_index)) + } +} + +#[inline] +fn encode_sequence_number(buf: &mut B, sequence: SequenceNumber) -> Result<()> { + // The sequence need to encode in descend order + let reversed_sequence = SequenceNumber::MAX - sequence; + // Encode sequence + buf.write_u64(reversed_sequence).context(EncodeSequence)?; + Ok(()) +} + +// TODO(yingwen): Maybe make decoded internal key a type? + +/// Encode internal key from user key for seek +/// +/// - user_key: the user key to encode +/// - sequence: the sequence number to encode into internal key +/// - scratch: buffer to store the encoded internal key, the scratch will be +/// clear +/// +/// Returns the slice to the encoded internal key +pub fn internal_key_for_seek<'a>( + user_key: &[u8], + sequence: SequenceNumber, + scratch: &'a mut BytesMut, +) -> Result<&'a [u8]> { + scratch.clear(); + + scratch.reserve(user_key.len() + mem::size_of::()); + scratch.extend_from_slice(user_key); + encode_sequence_number(scratch, sequence)?; + + Ok(&scratch[..]) +} + +/// Decode user key and sequence number from the internal key +pub fn user_key_from_internal_key(internal_key: &[u8]) -> Result<(&[u8], KeySequence)> { + // Empty user key is meaningless + ensure!( + internal_key.len() > KEY_SEQUENCE_BYTES_LEN, + InternalKeyLen { + len: internal_key.len(), + } + ); + + let (left, mut right) = internal_key.split_at(internal_key.len() - KEY_SEQUENCE_BYTES_LEN); + // Decode sequence number from right part + let sequence = SequenceCodec.decode(&mut right)?; + + Ok((left, sequence)) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_sequence_codec() { + let codec = SequenceCodec; + + let sequence = KeySequence::new(123, 456); + assert_eq!(12, codec.estimate_encoded_size(&sequence)); + let mut buf = Vec::new(); + codec.encode(&mut buf, &sequence).unwrap(); + assert_eq!(12, buf.len()); + + let mut b = &buf[..]; + let decoded_sequence = codec.decode(&mut b).unwrap(); + + assert_eq!(sequence, 
decoded_sequence); + } +} diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs new file mode 100644 index 0000000000..5074eff34c --- /dev/null +++ b/analytic_engine/src/memtable/mod.rs @@ -0,0 +1,198 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! MemTable + +pub mod factory; +pub mod key; +pub mod skiplist; + +use std::{ops::Bound, sync::Arc}; + +use common_types::{ + bytes::{ByteVec, Bytes}, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + row::Row, + schema::{IndexInWriterSchema, Schema}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{Backtrace, Snafu}; + +use crate::memtable::key::KeySequence; + +const DEFAULT_SCAN_BATCH_SIZE: usize = 500; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode internal key, err:{}", source))] + EncodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode internal key, err:{}", source))] + DecodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to append row to batch builder, err:{}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build record batch, err:{}", source,))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to project memtable schema, err:{}", source))] + ProjectSchema { + source: common_types::projected_schema::Error, + }, + + #[snafu(display( + "Invalid sequence number to put, given:{}, last:{}.\nBacktrace:\n{}", + given, + last, + backtrace + ))] + InvalidPutSequence { + given: SequenceNumber, + last: SequenceNumber, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid row, err:{}", source))] + InvalidRow { + source: Box, + }, + + 
#[snafu(display("Fail to iter in reverse order, err:{}", source))] + IterReverse { + source: Box, + }, +} + +define_result!(Error); + +/// Options for put and context for tracing +pub struct PutContext { + /// Buffer for encoding key, can reuse during put + pub key_buf: ByteVec, + /// Buffer for encoding value, can reuse during put + pub value_buf: ByteVec, + /// Used to encode row. + pub index_in_writer: IndexInWriterSchema, +} + +impl PutContext { + pub fn new(index_in_writer: IndexInWriterSchema) -> Self { + Self { + key_buf: ByteVec::new(), + value_buf: ByteVec::new(), + index_in_writer, + } + } +} + +/// Options for scan and context for tracing +#[derive(Debug, Clone)] +pub struct ScanContext { + /// Suggested row number per batch + pub batch_size: usize, +} + +impl Default for ScanContext { + fn default() -> Self { + Self { + batch_size: DEFAULT_SCAN_BATCH_SIZE, + } + } +} + +/// Scan request +/// +/// Now we only support forward scan. +#[derive(Debug, Clone)] +pub struct ScanRequest { + /// The start key of the encoded user key (without sequence). + pub start_user_key: Bound, + /// The end key of the encoded user key (without sequence). + pub end_user_key: Bound, + /// Max visible sequence (inclusive), row key with sequence <= this can be + /// visible. + pub sequence: SequenceNumber, + /// Schema and projection to read. + pub projected_schema: ProjectedSchema, + pub need_dedup: bool, + pub reverse: bool, +} + +/// In memory storage for table's data. +/// +/// # Concurrency +/// The memtable is designed for single-writer and mutltiple-reader usage, so +/// not all function supports concurrent writer, the caller should guarantee not +/// writing to the memtable concurrrently. +// All operation is done in memory, no need to use async trait +pub trait MemTable { + /// Schema of this memtable + /// + /// The schema of a memtable is not allowed to change now. 
Modifying the + /// schema of a table requires a memtable switch and external + /// synchronization + fn schema(&self) -> &Schema; + + /// Peek the min key of this memtable. + fn min_key(&self) -> Option; + + /// Peek the max key of this memtable. + fn max_key(&self) -> Option; + + /// Insert one row into the memtable. + /// + ///.- ctx: The put context + /// - sequence: The sequence of the row + /// - row: The row to insert + /// - schema: The schema of the row + /// + /// REQUIRE: + /// - The schema of RowGroup must equal to the schema of memtable. How to + /// handle duplicate entries is implementation specific. + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()>; + + /// Scan the memtable. + /// + /// Returns the data in columnar format. The returned rows is guaranteed + /// to be ordered by the primary key. + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result; + + /// Returns an estimate of the number of bytes of data in used + fn approximate_memory_usage(&self) -> usize; + + /// Set last sequence of the memtable, returns error if the given `sequence` + /// is less than existing last sequence. + /// + /// REQUIRE: + /// - External synchronization is required. + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()>; + + /// Returns the last sequence of the memtable. + /// + /// If the memtable is empty, then the last sequence is 0. + fn last_sequence(&self) -> SequenceNumber; +} + +/// A reference to memtable +pub type MemTableRef = Arc; + +/// A pointer to columnar iterator +pub type ColumnarIterPtr = Box> + Send + Sync>; diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs new file mode 100644 index 0000000000..89dd453587 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Skiplist memtable factory + +use std::sync::{atomic::AtomicU64, Arc}; + +use arena::MonoIncArena; +use skiplist::Skiplist; + +use crate::memtable::{ + factory::{Factory, Options, Result}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + MemTableRef, +}; + +/// Factory to create memtable +#[derive(Debug)] +pub struct SkiplistMemTableFactory; + +impl Factory for SkiplistMemTableFactory { + fn create_memtable(&self, opts: Options) -> Result { + let arena = MonoIncArena::with_collector(opts.arena_block_size as usize, opts.collector); + let skiplist = Skiplist::with_arena(BytewiseComparator, arena); + let memtable = Arc::new(SkiplistMemTable { + schema: opts.schema, + skiplist, + last_sequence: AtomicU64::new(opts.creation_sequence), + }); + + Ok(memtable) + } +} diff --git a/analytic_engine/src/memtable/skiplist/iter.rs b/analytic_engine/src/memtable/skiplist/iter.rs new file mode 100644 index 0000000000..0cf60cc90e --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/iter.rs @@ -0,0 +1,346 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Skiplist memtable iterator + +use std::{cmp::Ordering, iter::Rev, ops::Bound}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::{Bytes, BytesMut}, + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::contiguous::{ContiguousRowReader, ProjectedContiguousRow}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::row; +use log::trace; +use skiplist::{ArenaSlice, IterRef, Skiplist}; +use snafu::ResultExt; + +use crate::memtable::{ + key::{self, KeySequence}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + AppendRow, BuildRecordBatch, DecodeInternalKey, EncodeInternalKey, IterReverse, ProjectSchema, + Result, ScanContext, ScanRequest, +}; + +/// Iterator state +#[derive(Debug, PartialEq)] +enum State { + /// The iterator struct is created but not initialized + Uninitialized, + /// The iterator is initialized (seek) + Initialized, + /// No more element the iterator can return + Finished, +} + +/// Columnar iterator for [SkiplistMemTable] +pub struct ColumnarIterImpl + Clone + Sync + Send> { + /// The internal skiplist iter + iter: IterRef, BytewiseComparator, A>, + + // Schema related: + /// Schema of this memtable, used to decode row + memtable_schema: Schema, + /// Projection of schema to read + projected_schema: ProjectedSchema, + projector: RowProjector, + + // Options related: + batch_size: usize, + + start_user_key: Bound, + end_user_key: Bound, + /// Max visible sequence + sequence: SequenceNumber, + /// State of iterator + state: State, + /// Last internal key this iterator returned + // TODO(yingwen): Wrap a internal key struct? 
+ last_internal_key: Option>, + + /// Dedup rows with key + need_dedup: bool, +} + +impl + Clone + Sync + Send> ColumnarIterImpl { + /// Create a new [ColumnarIterImpl] + pub fn new( + memtable: &SkiplistMemTable, + ctx: ScanContext, + request: ScanRequest, + ) -> Result { + // Create projection for the memtable schema + let projector = request + .projected_schema + .try_project_with_key(&memtable.schema) + .context(ProjectSchema)?; + + let iter = memtable.skiplist.iter(); + let mut columnar_iter = Self { + iter, + memtable_schema: memtable.schema.clone(), + projected_schema: request.projected_schema, + projector, + batch_size: ctx.batch_size, + start_user_key: request.start_user_key, + end_user_key: request.end_user_key, + sequence: request.sequence, + state: State::Uninitialized, + last_internal_key: None, + need_dedup: request.need_dedup, + }; + + columnar_iter.init()?; + + Ok(columnar_iter) + } + + /// Init the iterator, will seek to the proper position for first next() + /// call, so the first entry next() returned is after the + /// `start_user_key`, but we still need to check `end_user_key` + fn init(&mut self) -> Result<()> { + match &self.start_user_key { + Bound::Included(user_key) => { + // Construct seek key + let mut key_buf = BytesMut::new(); + let seek_key = key::internal_key_for_seek(user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + self.iter.seek(seek_key); + } + Bound::Excluded(user_key) => { + // Construct seek key, just seek to the key with next prefix, so there is no + // need to skip the key until we meet the first key > + // start_user_key + let next_user_key = row::key_prefix_next(user_key); + let mut key_buf = BytesMut::new(); + let seek_key = + key::internal_key_for_seek(&next_user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + self.iter.seek(seek_key); + } + Bound::Unbounded => self.iter.seek_to_first(), + } + + self.state = State::Initialized; + 
+ Ok(()) + } + + /// Fetch next record batch + fn fetch_next_record_batch(&mut self) -> Result> { + debug_assert_eq!(State::Initialized, self.state); + assert!(self.batch_size > 0); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + self.projected_schema.to_record_schema_with_key(), + self.batch_size, + ); + let mut num_rows = 0; + while self.iter.valid() && num_rows < self.batch_size { + if let Some(row) = self.fetch_next_row()? { + let row_reader = ContiguousRowReader::with_schema(&row, &self.memtable_schema); + let projected_row = ProjectedContiguousRow::new(row_reader, &self.projector); + + trace!("Column iterator fetch next row, row:{:?}", projected_row); + + builder + .append_projected_contiguous_row(&projected_row) + .context(AppendRow)?; + num_rows += 1; + } else { + // There is no more row to fetch + self.finish(); + break; + } + } + + if num_rows > 0 { + let batch = builder.build().context(BuildRecordBatch)?; + trace!("column iterator send one batch:{:?}", batch); + + Ok(Some(batch)) + } else { + // If iter is invalid after seek (nothing matched), then it may not be marked as + // finished yet + self.finish(); + Ok(None) + } + } + + /// Fetch next row matched the given condition, the current entry of iter + /// will be considered + /// + /// REQUIRE: The iter is valid + fn fetch_next_row(&mut self) -> Result>> { + debug_assert_eq!(State::Initialized, self.state); + + // TODO(yingwen): Some operation like delete needs to be considered during + // iterating: we need to ignore this key if found a delete mark + while self.iter.valid() { + // Fetch current entry + let key = self.iter.key(); + let (user_key, sequence) = + key::user_key_from_internal_key(key).context(DecodeInternalKey)?; + + // Check user key is still in range + if self.is_after_end_bound(user_key) { + // Out of bound + self.finish(); + return Ok(None); + } + + if self.need_dedup { + // Whether this user key is already returned + let same_key = match &self.last_internal_key { + 
Some(last_internal_key) => { + // TODO(yingwen): Actually this call wont fail, only valid internal key will + // be set as last_internal_key so maybe we can just + // unwrap it? + let (last_user_key, _) = key::user_key_from_internal_key(last_internal_key) + .context(DecodeInternalKey)?; + user_key == last_user_key + } + // This is the first user key + None => false, + }; + + if same_key { + // We meet duplicate key, move forward and continue to find next user key + self.iter.next(); + continue; + } + // Now this is a new user key + } + + // Check whether this key is visible + if !self.is_visible(sequence) { + // The sequence of this key is not visible, move forward + self.iter.next(); + continue; + } + + // This is the row we want + let row = self.iter.value_with_arena(); + + // Store the last key + self.last_internal_key = Some(self.iter.key_with_arena()); + // Move iter forward + self.iter.next(); + + return Ok(Some(row)); + } + + // No more row in range, we can stop the iterator + self.finish(); + Ok(None) + } + + /// Return true if the sequence is visible + #[inline] + fn is_visible(&self, sequence: KeySequence) -> bool { + sequence.sequence() <= self.sequence + } + + /// Return true if the key is after the `end_user_key` bound + fn is_after_end_bound(&self, key: &[u8]) -> bool { + match &self.end_user_key { + Bound::Included(end) => match key.cmp(end) { + Ordering::Less | Ordering::Equal => false, + Ordering::Greater => true, + }, + Bound::Excluded(end) => match key.cmp(end) { + Ordering::Less => false, + Ordering::Equal | Ordering::Greater => true, + }, + // All key is valid + Bound::Unbounded => false, + } + } + + /// Mark the iterator state to finished and return None + fn finish(&mut self) { + self.state = State::Finished; + } +} + +impl + Clone + Sync + Send> Iterator for ColumnarIterImpl { + type Item = Result; + + fn next(&mut self) -> Option { + if self.state != State::Initialized { + return None; + } + + self.fetch_next_record_batch().transpose() + } 
+} + +/// Reversed columnar iterator. +// TODO(xikai): Now the implementation is not perfect: read all the entries +// into a buffer and reverse read it. The memtable should support scan in +// reverse order naturally. +pub struct ReversedColumnarIterator { + iter: I, + reversed_iter: Option>>>, + num_record_batch: usize, +} + +impl ReversedColumnarIterator +where + I: Iterator>, +{ + pub fn new(iter: I, num_rows: usize, batch_size: usize) -> Self { + Self { + iter, + reversed_iter: None, + num_record_batch: num_rows / batch_size, + } + } + + fn init_if_necessary(&mut self) { + if self.reversed_iter.is_some() { + return; + } + + let mut buf = Vec::with_capacity(self.num_record_batch); + for item in &mut self.iter { + buf.push(item); + } + self.reversed_iter = Some(buf.into_iter().rev()); + } +} + +impl Iterator for ReversedColumnarIterator +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + self.init_if_necessary(); + self.reversed_iter + .as_mut() + .unwrap() + .next() + .map(|v| match v { + Ok(mut batch_with_key) => { + batch_with_key + .reverse_data() + .map_err(|e| Box::new(e) as _) + .context(IterReverse)?; + + Ok(batch_with_key) + } + Err(e) => Err(e), + }) + } +} + +// TODO(yingwen): Test diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs new file mode 100644 index 0000000000..2a1459bc80 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
MemTable based on skiplist + +pub mod factory; +pub mod iter; + +use std::{ + cmp::Ordering, + convert::TryInto, + sync::atomic::{self, AtomicU64}, +}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::Bytes, + row::{contiguous::ContiguousRowWriter, Row}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::Encoder; +use log::{debug, trace}; +use skiplist::{KeyComparator, Skiplist}; +use snafu::{ensure, ResultExt}; + +use crate::memtable::{ + key::{ComparableInternalKey, KeySequence}, + skiplist::iter::{ColumnarIterImpl, ReversedColumnarIterator}, + ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, PutContext, + Result, ScanContext, ScanRequest, +}; + +/// MemTable implementation based on skiplist +pub struct SkiplistMemTable + Clone + Sync + Send> { + /// Schema of this memtable, is immutable. + schema: Schema, + skiplist: Skiplist, + /// The last sequence of the rows in this memtable. Update to this field + /// require external synchronization. + last_sequence: AtomicU64, +} + +impl + Clone + Sync + Send + 'static> MemTable + for SkiplistMemTable +{ + fn schema(&self) -> &Schema { + &self.schema + } + + fn min_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_first(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + fn max_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_last(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + // TODO(yingwen): Encode value if value_buf is not set. + // Now the caller is required to encode the row into the `value_buf` in + // PutContext first. 
+ fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()> { + trace!("skiplist put row, sequence:{:?}, row:{:?}", sequence, row); + + let key_encoder = ComparableInternalKey::new(sequence, schema); + + let internal_key = &mut ctx.key_buf; + // Reset key buffer + internal_key.clear(); + // Reserve capacity for key + internal_key.reserve(key_encoder.estimate_encoded_size(row)); + // Encode key + key_encoder + .encode(internal_key, row) + .context(EncodeInternalKey)?; + + // Encode row value. The ContiguousRowWriter will clear the buf. + let row_value = &mut ctx.value_buf; + let mut row_writer = ContiguousRowWriter::new(row_value, schema, &ctx.index_in_writer); + row_writer + .write_row(row) + .map_err(|e| Box::new(e) as _) + .context(InvalidRow)?; + + self.skiplist.put(internal_key, row_value); + + Ok(()) + } + + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result { + debug!( + "Scan skiplist memtable, ctx:{:?}, request:{:?}", + ctx, request + ); + + let num_rows = self.skiplist.len(); + let (reverse, batch_size) = (request.reverse, ctx.batch_size); + let iter = ColumnarIterImpl::new(self, ctx, request)?; + if reverse { + Ok(Box::new(ReversedColumnarIterator::new( + iter, num_rows, batch_size, + ))) + } else { + Ok(Box::new(iter)) + } + } + + fn approximate_memory_usage(&self) -> usize { + // Mem size of skiplist is u32, need to cast to usize + match self.skiplist.mem_size().try_into() { + Ok(v) => v, + // The skiplist already use bytes larger than usize + Err(_) => usize::MAX, + } + } + + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()> { + let last = self.last_sequence(); + ensure!( + sequence >= last, + InvalidPutSequence { + given: sequence, + last + } + ); + + self.last_sequence + .store(sequence, atomic::Ordering::Relaxed); + + Ok(()) + } + + fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(atomic::Ordering::Relaxed) + } +} + +#[derive(Debug, Clone)] 
+pub struct BytewiseComparator; + +impl KeyComparator for BytewiseComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + lhs.cmp(rhs) + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + lhs == rhs + } +} + +#[cfg(test)] +mod tests { + + use std::{ops::Bound, sync::Arc}; + + use arena::NoopCollector; + use common_types::{ + bytes::ByteVec, + datum::Datum, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + schema::IndexInWriterSchema, + tests::{build_row, build_schema}, + time::Timestamp, + }; + use common_util::codec::memcomparable::MemComparable; + + use super::*; + use crate::memtable::{ + factory::{Factory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }; + + fn test_memtable_scan_for_scan_request( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![ + ( + // limited by sequence + ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + build_row(b"e", 5, 10.0, "v5"), + build_row(b"f", 6, 10.0, "v6"), + ], + ), + ( + // limited by sequence and start/end key + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + ], + ), + ( + // limited by sequence and start/end key + // but seq is one smaller than last case + ScanRequest { + 
start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 1, + projected_schema, + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + ], + ), + ]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + fn test_memtable_scan_for_projection( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..2).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![( + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema, + need_dedup: true, + reverse: false, + }, + vec![ + build_row_for_two_column(b"a", 1), + build_row_for_two_column(b"b", 2), + build_row_for_two_column(b"c", 3), + build_row_for_two_column(b"d", 4), + ], + )]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + #[test] + fn test_memtable_scan() { + let schema = build_schema(); + let factory = SkiplistMemTableFactory; + let memtable = factory + .create_memtable(Options { + schema: schema.clone(), + arena_block_size: 512, + creation_sequence: 1, + collector: Arc::new(NoopCollector {}), + }) + .unwrap(); + + let mut ctx = PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); + let input = vec![ + (KeySequence::new(1, 1), build_row(b"a", 1, 10.0, "v1")), + (KeySequence::new(1, 2), build_row(b"b", 2, 10.0, "v2")), + ( + KeySequence::new(1, 3), + build_row(b"c", 3, 10.0, "primary_key same with next row"), + ), + (KeySequence::new(1, 4), build_row(b"c", 3, 10.0, "v3")), + (KeySequence::new(2, 1), 
build_row(b"d", 4, 10.0, "v4")), + (KeySequence::new(2, 1), build_row(b"e", 5, 10.0, "v5")), + (KeySequence::new(2, 3), build_row(b"f", 6, 10.0, "v6")), + (KeySequence::new(3, 4), build_row(b"g", 7, 10.0, "v7")), + ]; + + for (seq, row) in input { + memtable.put(&mut ctx, seq, &row, &schema).unwrap(); + } + + test_memtable_scan_for_scan_request(schema.clone(), memtable.clone()); + test_memtable_scan_for_projection(schema, memtable); + } + + fn check_iterator>>( + iter: T, + expected_rows: Vec, + ) { + let mut visited_rows = 0; + for batch in iter { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } + + fn build_scan_key(c1: &str, c2: i64) -> Bytes { + let mut buf = ByteVec::new(); + let encoder = MemComparable; + encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); + encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); + + Bytes::from(buf) + } + + pub fn build_row_for_two_column(key1: &[u8], key2: i64) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + ]; + + Row::from_datums(datums) + } +} diff --git a/analytic_engine/src/meta/details.rs b/analytic_engine/src/meta/details.rs new file mode 100644 index 0000000000..ae9c5a1741 --- /dev/null +++ b/analytic_engine/src/meta/details.rs @@ -0,0 +1,1282 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implementation of Manifest + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use async_trait::async_trait; +use common_types; +use common_util::define_result; +use log::{error, info, warn}; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use tokio::sync::Mutex; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{ + LogIterator, ReadBoundary, ReadContext, ReadRequest, RegionId, SequenceNumber, WalManager, + WriteContext, + }, +}; + +use crate::meta::{ + meta_data::ManifestData, + meta_update::{ + MetaUpdate, MetaUpdateDecoder, MetaUpdatePayload, SnapshotManifestMeta, VersionEditMeta, + }, + Manifest, +}; + +/// The region id manifest used. +const MANIFEST_REGION_ID: RegionId = 1; +/// The region id to store snapshot state. +const SNAPSHOT_STATE_REGION_ID: RegionId = 2; +// The first region id of snapshot region. +const FIRST_SNAPSHOT_REGION_ID: RegionId = 3; +// The second region id of snapshot region. +const SECOND_SNAPSHOT_REGION_ID: RegionId = 4; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write update to wal, err:{}", source))] + WriteWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read log entry, err:{}", source))] + ReadEntry { source: wal::manager::Error }, + + #[snafu(display("Failed to apply meta update, err:{}", source))] + ApplyUpdate { + source: crate::meta::meta_data::Error, + }, + + #[snafu(display("Failed to clean wal, err:{}", source))] + CleanWal { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot, region_id:{}, err:{}", region_id, source))] + CleanSnapshot { + region_id: RegionId, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to load sequence of manifest, err:{}", source))] + LoadSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to load sequence of snapshot state, err:{}", source))] 
+ LoadSnapshotMetaSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot state, err:{}", source))] + CleanSnapshotState { source: wal::manager::Error }, +} + +define_result!(Error); + +const STORE_UPDATE_BATCH: usize = 500; + +/// Implementation of [MetaUpdateReader] +#[derive(Debug)] +pub struct MetaUpdateReaderImpl { + iter: W::Iterator, +} + +impl MetaUpdateReaderImpl { + async fn next_update(&mut self) -> Result> { + let decoder = MetaUpdateDecoder; + + match self.iter.next_log_entry(&decoder).context(ReadEntry)? { + Some(entry) => Ok(Some(entry.payload)), + None => Ok(None), + } + } +} + +/// State to track manifest snapshot. +#[derive(Debug, Default)] +struct SnapshotState { + /// Meta data of the snapshot of the manifest, `None` if there is no + /// snapshot. + snapshot_meta: Option, +} + +impl SnapshotState { + fn install_snapshot_meta(&mut self, snapshot_meta: SnapshotManifestMeta) { + self.snapshot_meta = Some(snapshot_meta); + } + + fn next_snapshot_region_id(&self) -> RegionId { + match self.snapshot_meta { + Some(snapshot_meta) => { + if snapshot_meta.snapshot_region_id == FIRST_SNAPSHOT_REGION_ID { + SECOND_SNAPSHOT_REGION_ID + } else { + FIRST_SNAPSHOT_REGION_ID + } + } + None => FIRST_SNAPSHOT_REGION_ID, + } + } +} + +#[derive(Debug, Clone, Deserialize)] +pub struct Options { + pub snapshot_every_n_updates: usize, + pub paranoid_checks: bool, +} + +impl Default for Options { + fn default() -> Self { + Self { + snapshot_every_n_updates: 10_000, + paranoid_checks: true, + } + } +} + +// TODO(yingwen): Wrap into an inner struct if there are too many Arc fields. +/// Implementation of [Manifest]. +#[derive(Debug, Clone)] +pub struct ManifestImpl { + /// Region id for this manifest. + manifest_region_id: RegionId, + /// Wal manager, the manifest use its own wal manager instance. + wal_manager: Arc, + opts: Options, + + // Snapshot related: + /// Region id to store snapshot state. 
+ snapshot_state_region_id: RegionId, + snapshot_state: Arc>, + /// Number of updates wrote to wal since last snapshot. + num_updates_since_snapshot: Arc, +} + +impl ManifestImpl { + pub async fn open(wal_manager: W, opts: Options) -> Result { + let mut manifest = Self { + manifest_region_id: MANIFEST_REGION_ID, + wal_manager: Arc::new(wal_manager), + opts, + snapshot_state_region_id: SNAPSHOT_STATE_REGION_ID, + snapshot_state: Arc::new(Mutex::new(SnapshotState::default())), + num_updates_since_snapshot: Arc::new(AtomicUsize::new(0)), + }; + + manifest.load_snapshot_state().await?; + + Ok(manifest) + } + + async fn load_snapshot_state(&mut self) -> Result<()> { + // Load snapshot state. + let mut reader = self.read_updates_from_region( + self.snapshot_state_region_id, + ReadBoundary::Min, + ReadBoundary::Max, + )?; + + let mut last_snapshot_meta = None; + while let Some(update) = reader.next_update().await? { + // If the entry is a snapshot entry. + if let Some(snapshot_meta) = update.snapshot_manifest_meta() { + last_snapshot_meta = Some(snapshot_meta); + } else { + error!( + "Manifest found non snapshot state entry, entry:{:?}", + update + ); + } + } + + let mut snapshot_state = self.snapshot_state.lock().await; + if let Some(snapshot_meta) = last_snapshot_meta { + // Previous snapshot exists. + snapshot_state.install_snapshot_meta(snapshot_meta); + + info!( + "Manifest found snapshot_meta, snapshot_state:{:?}, last_snapshot_meta:{:?}", + snapshot_state, last_snapshot_meta + ); + } + + Ok(()) + } + + fn read_updates_from_region( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + ) -> Result> { + let request = ReadRequest { + region_id, + start, + end, + }; + let ctx = ReadContext::default(); + + let iter = self.wal_manager.read(&ctx, &request).context(ReadWal)?; + + Ok(MetaUpdateReaderImpl { iter }) + } + + /// Load meta update from region of given `region_id` and apply into + /// `manifest_data`. 
+ async fn load_data_from_region( + &self, + region_id: RegionId, + manifest_data: &mut ManifestData, + ) -> Result<()> { + self.load_data_from_region_in_range( + region_id, + ReadBoundary::Min, + ReadBoundary::Max, + manifest_data, + ) + .await?; + + Ok(()) + } + + /// Load meta update in given range from region of given `region_id` + /// boundary, and apply into `manifest_data`. Returns number of MetaUpdates + /// loaded. + async fn load_data_from_region_in_range( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + manifest_data: &mut ManifestData, + ) -> Result { + let mut reader = self.read_updates_from_region(region_id, start, end)?; + let mut loaded = 0; + + while let Some(update) = reader.next_update().await? { + if let Err(e) = manifest_data.apply_meta_update(update).context(ApplyUpdate) { + if self.opts.paranoid_checks { + return Err(e); + } else { + warn!("Manifest load meta update failed, err:{:?}", e); + continue; + } + } + loaded += 1; + } + Ok(loaded) + } + + /// Load data and create a snapshot. + async fn create_snapshot(&self) -> Result { + info!("Manifest try to create snapshot"); + + // Acquire snapshot lock. + let mut snapshot_state = self.snapshot_state.lock().await; + let last_snapshot_meta = snapshot_state.snapshot_meta; + let next_snapshot_region_id = snapshot_state.next_snapshot_region_id(); + + // Clean next snapshot region. + self.clean_snapshot(next_snapshot_region_id).await?; + + // Load previous snapshot. + let mut manifest_start = ReadBoundary::Min; + let mut manifest_data = ManifestData::default(); + if let Some(snapshot_meta) = last_snapshot_meta { + // Load manifest from last snapshot first. + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Get current sequence, data until this sequence can be loaded to create next + // snapshot. 
+ let snapshot_sequence = self + .wal_manager + .sequence_num(self.manifest_region_id) + .context(LoadSequence)?; + + // Load manifest up to `snapshot_sequence`. + let num_loaded_from_manifest = self + .load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Included(snapshot_sequence), + &mut manifest_data, + ) + .await?; + + // Store snapshot. + self.store_snapshot_to_region(next_snapshot_region_id, &manifest_data) + .await?; + + // Store snapshot state. + let next_snapshot_meta = SnapshotManifestMeta { + snapshot_region_id: next_snapshot_region_id, + sequence: snapshot_sequence, + }; + self.store_snapshot_state(next_snapshot_meta).await?; + + info!( + "Manifest stored snapshot, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state_before_install:{:?}, + num_updates_since_snapshot:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + self.num_updates_since_snapshot() + ); + + // Install new snapshot, also bump next snapshot region id. + snapshot_state.install_snapshot_meta(next_snapshot_meta); + + // Data before sequence of the snapshot can also be removed. 
+ self.wal_manager + .mark_delete_entries_up_to(self.manifest_region_id, snapshot_sequence) + .await + .context(CleanWal)?; + + self.decrease_num_updates(num_loaded_from_manifest); + + info!( + "Manifest create snapshot done, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state:{:?}, + num_loaded_from_manifest:{}, + num_updates:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + num_loaded_from_manifest, + self.num_updates_since_snapshot() + ); + + Ok(manifest_data) + } + + async fn clean_snapshot(&self, snapshot_region_id: RegionId) -> Result<()> { + info!("Clean snapshot, snapshot_region_id:{}", snapshot_region_id); + + self.wal_manager + .mark_delete_entries_up_to(snapshot_region_id, common_types::MAX_SEQUENCE_NUMBER) + .await + .context(CleanSnapshot { + region_id: snapshot_region_id, + }) + .map_err(|e| { + error!( + "Failed to clean snapshot, region_id:{}, err:{}", + snapshot_region_id, e + ); + e + }) + } + + async fn store_snapshot_state(&self, snapshot_meta: SnapshotManifestMeta) -> Result<()> { + // Get current snapshot state sequence. + let snapshot_state_sequence = self + .wal_manager + .sequence_num(self.snapshot_state_region_id) + .context(LoadSnapshotMetaSequence)?; + // Write a snapshot entry into the region. + + self.store_update_to_region( + self.snapshot_state_region_id, + MetaUpdate::SnapshotManifest(snapshot_meta), + ) + .await?; + // Clean old snapshot state. 
+ self.wal_manager + .mark_delete_entries_up_to(self.snapshot_state_region_id, snapshot_state_sequence) + .await + .context(CleanSnapshotState) + } + + async fn store_update_to_region( + &self, + region_id: RegionId, + update: MetaUpdate, + ) -> Result { + info!( + "Manifest impl store update, region_id:{}, update:{:?}", + region_id, update + ); + + let mut log_batch = LogWriteBatch::new(region_id); + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_updates_to_region( + &self, + region_id: RegionId, + updates: &[MetaUpdate], + ) -> Result { + let mut log_batch = LogWriteBatch::new(region_id); + for update in updates { + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + } + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_snapshot_to_region( + &self, + region_id: RegionId, + snapshot: &ManifestData, + ) -> Result<()> { + info!("Manifest store snapshot to region, region_id:{}", region_id); + + let mut meta_updates = Vec::with_capacity(STORE_UPDATE_BATCH); + + // Store all spaces. + for (space_id, space_meta_data) in &snapshot.spaces { + let space_meta = space_meta_data.space_meta.clone(); + // Add this space. + meta_updates.push(MetaUpdate::AddSpace(space_meta)); + + // Add all tables to the space. + for (table_id, table_meta_data) in &space_meta_data.tables { + let table_meta = table_meta_data.table_meta.clone(); + // Store table meta. + meta_updates.push(MetaUpdate::AddTable(table_meta)); + + // Store version edit. 
+ let version_meta = &table_meta_data.version_meta; + let version_edit_meta = VersionEditMeta { + space_id: *space_id, + table_id: *table_id, + flushed_sequence: version_meta.flushed_sequence, + files_to_add: version_meta.ordered_files(), + files_to_delete: Vec::new(), + }; + meta_updates.push(MetaUpdate::VersionEdit(version_edit_meta)); + + if meta_updates.len() >= STORE_UPDATE_BATCH { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + } + } + + if !meta_updates.is_empty() { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + + Ok(()) + } + + #[inline] + fn num_updates_since_snapshot(&self) -> usize { + self.num_updates_since_snapshot.load(Ordering::Relaxed) + } + + // Guarded by snapshot state lock. + #[inline] + fn decrease_num_updates(&self, num: usize) { + if num >= self.num_updates_since_snapshot() { + self.num_updates_since_snapshot.store(0, Ordering::Relaxed); + } else { + self.num_updates_since_snapshot + .fetch_sub(num, Ordering::Relaxed); + } + } +} + +#[async_trait] +impl Manifest for ManifestImpl { + type Error = Error; + + async fn store_update(&self, update: MetaUpdate) -> Result<()> { + self.store_update_to_region(self.manifest_region_id, update) + .await?; + + let num_updates = self + .num_updates_since_snapshot + .fetch_add(1, Ordering::Relaxed); + if num_updates >= self.opts.snapshot_every_n_updates { + info!( + "Enough updates in manifest, trigger snapshot, num_updates:{}", + num_updates + ); + + self.create_snapshot().await?; + } + + Ok(()) + } + + async fn load_data(&self, do_snapshot: bool) -> Result { + if do_snapshot { + let manifest_data = self.create_snapshot().await?; + + Ok(manifest_data) + } else { + let mut manifest_data = ManifestData::default(); + + let last_snapshot_meta = { + let snapshot_state = self.snapshot_state.lock().await; + snapshot_state.snapshot_meta + }; + let mut manifest_start = ReadBoundary::Min; + // Load from snapshot. 
+ if let Some(snapshot_meta) = last_snapshot_meta { + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Load remaining data from wal. + self.load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Max, + &mut manifest_data, + ) + .await?; + + Ok(manifest_data) + } + } +} + +#[cfg(test)] +mod tests { + use std::{path::PathBuf, sync::Arc}; + + use common_types::{column_schema, datum::DatumKind, schema, schema::Schema}; + use common_util::{runtime, runtime::Runtime, tests::init_log_for_test}; + use table_engine::table::TableId; + use wal::rocks_impl::manager::{Builder as WalBuilder, RocksImpl}; + + use super::*; + use crate::{ + meta::{ + details::{ManifestImpl, Options}, + meta_update::{ + AddSpaceMeta, AddTableMeta, AlterOptionsMeta, AlterSchemaMeta, DropTableMeta, + MetaUpdate, VersionEditMeta, + }, + Manifest, + }, + TableOptions, + }; + + fn build_altered_schema(schema: &Schema) -> Schema { + let mut builder = schema::Builder::new().auto_increment_column_id(true); + for column_schema in schema.key_columns() { + builder = builder + .add_key_column(column_schema.clone()) + .expect("should succeed to add key column"); + } + for column_schema in schema.normal_columns() { + builder = builder + .add_normal_column(column_schema.clone()) + .expect("should succeed to add normal column"); + } + builder + .add_normal_column( + column_schema::Builder::new("field3".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_runtime(thread_num: usize) -> Arc { + Arc::new( + runtime::Builder::default() + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap(), + ) + } + + async fn build_manifest( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> ManifestImpl { + let manifest_wal = 
WalBuilder::with_default_rocksdb_config(dir, runtime.clone()) + .build() + .unwrap(); + + ManifestImpl::open(manifest_wal, opts).await.unwrap() + } + + async fn assert_expected( + dir: impl Into, + runtime: Arc, + opts: Options, + expected: &str, + ) -> Result<()> { + let manifest = build_manifest(dir, runtime, opts).await; + let data = manifest.load_data(false).await?; + assert_eq!(format!("{:#?}", data), expected); + Ok(()) + } + + async fn test_manifest_add_space( + dir: impl Into, + runtime: Arc, + opts: Options, + ) { + let space_id = 10; + let space_name = "test".to_string(); + + let manifest = build_manifest(dir, runtime, opts).await; + let add_space = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.clone(), + }); + manifest.store_update(add_space).await.unwrap(); + let data = manifest.load_data(false).await.unwrap(); + assert_eq!(data.spaces.len(), 1); + assert_eq!(data.spaces.get(&10).unwrap().space_meta.space_id, space_id); + assert_eq!( + data.spaces.get(&10).unwrap().space_meta.space_name, + space_name + ); + } + + async fn check_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + 
}, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 0, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::AddTable(AddTableMeta { + space_id, + table_id, + table_name, + schema: common_types::tests::build_schema(), + opts: TableOptions::default(), + }); + manifest.store_update(add_table).await + } + + async fn check_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::DropTable(DropTableMeta { + space_id, + table_id, + table_name, + }); + manifest.store_update(add_table).await + } + + async fn check_version_edit_with_table( + dir: impl Into, + runtime: Arc, + 
opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn check_version_edit_no_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_version_edit( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let 
space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let version_edit = MetaUpdate::VersionEdit(VersionEditMeta { + space_id, + table_id, + flushed_sequence: 3, + files_to_add: Vec::new(), + files_to_delete: Vec::new(), + }); + manifest.store_update(version_edit).await + } + + async fn check_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + 
assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_schema = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id, + table_id, + schema: build_altered_schema(&common_types::tests::build_schema()), + pre_schema_version: 1, + }); + manifest.store_update(alter_schema).await + } + + async fn check_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: false, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 
8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_options = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id, + table_id, + options: TableOptions { + enable_ttl: false, + ..Default::default() + }, + }); + manifest.store_update(alter_options).await + } + + #[test] + fn test_manifest() { + init_log_for_test(); + let dir = tempfile::tempdir().unwrap(); + let runtime = build_runtime(2); + let runtime_clone = runtime.clone(); + runtime.block_on(async move { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: false, + }; + + test_manifest_add_space(dir.path(), runtime_clone.clone(), opts.clone()).await; + + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: true, + }; + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_version_edit_no_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_err() + ); + + 
test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone.clone(), opts) + .await + .is_err()); + } + { + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_version_edit_with_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone, opts) + .await + .is_ok()); + } + }); + } +} diff --git a/analytic_engine/src/meta/meta_data.rs b/analytic_engine/src/meta/meta_data.rs new file mode 100644 index 0000000000..07467d9b9f --- /dev/null +++ b/analytic_engine/src/meta/meta_data.rs @@ -0,0 +1,193 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Meta data of manifest. 
+ +use std::collections::BTreeMap; + +use common_util::define_result; +use log::{debug, info}; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; +use table_engine::table::TableId; + +use crate::{ + meta::meta_update::{AddSpaceMeta, AddTableMeta, MetaUpdate}, + space::SpaceId, + table::version::TableVersionMeta, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Space id corrupted (last >= given), last:{}, given:{}.\nBacktrace:\n{}", + last, + given, + backtrace + ))] + SpaceIdCorrupted { + last: SpaceId, + given: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space of table is missing, maybe corrupted, space_id:{}, table:{}.\nBacktrace:\n{}", + space_id, + table_name, + backtrace, + ))] + TableSpaceMiss { + space_id: SpaceId, + table_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space is missing, maybe corrupted, space_id:{}.\nBacktrace:\n{}", + space_id, + backtrace, + ))] + SpaceMiss { + space_id: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Table is missing, maybe corrupted, space_id:{}, table_id:{}.\nBacktrace:\n{}", + space_id, + table_id, + backtrace, + ))] + TableMiss { + space_id: SpaceId, + table_id: TableId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub struct TableMetaData { + pub table_meta: AddTableMeta, + pub version_meta: TableVersionMeta, +} + +#[derive(Debug)] +pub struct SpaceMetaData { + pub space_meta: AddSpaceMeta, + // Use BTreeMap to order table meta by table id. + pub tables: BTreeMap, +} + +/// Holds the final view of the data in manifest. +#[derive(Debug, Default)] +pub struct ManifestData { + // Use BTreeMap to order space meta by space id, so space with smaller id + // can be processed first. This is necessary especially in creating snapshot. 
+ pub spaces: BTreeMap, + pub last_space_id: SpaceId, +} + +impl ManifestData { + pub fn apply_meta_update(&mut self, update: MetaUpdate) -> Result<()> { + debug!("Apply meta update, update:{:?}", update); + + // TODO(yingwen): Ignore space not found error when we support drop space. + match update { + MetaUpdate::AddSpace(meta) => { + ensure!( + self.last_space_id <= meta.space_id, + SpaceIdCorrupted { + last: self.last_space_id, + given: meta.space_id, + } + ); + + self.last_space_id = meta.space_id; + self.spaces.insert( + meta.space_id, + SpaceMetaData { + space_meta: meta, + tables: BTreeMap::new(), + }, + ); + } + MetaUpdate::AddTable(meta) => { + let space = self + .spaces + .get_mut(&meta.space_id) + .context(TableSpaceMiss { + space_id: meta.space_id, + table_name: &meta.table_name, + })?; + space.tables.insert( + meta.table_id, + TableMetaData { + table_meta: meta, + version_meta: TableVersionMeta::default(), + }, + ); + } + MetaUpdate::VersionEdit(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + // If there is a background compaction/flush job, then version edit + // may be stored after a drop table entry being stored. We ignore + // that case and won't return error if table is not found. + let table = match space.tables.get_mut(&meta.table_id) { + Some(v) => v, + None => { + info!("Table of version edit not found, meta:{:?}", meta); + + return Ok(()); + } + }; + let edit = meta.into_version_edit(); + table.version_meta.apply_edit(edit); + } + MetaUpdate::AlterSchema(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update schema of AddTableMeta. 
+ table.table_meta.schema = meta.schema; + } + MetaUpdate::AlterOptions(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update options of AddTableMeta. + table.table_meta.opts = meta.options; + } + MetaUpdate::DropTable(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + + let removed_table = space.tables.remove(&meta.table_id); + + debug!( + "Apply drop table meta update, removed table:{}, removed:{}", + meta.table_name, + removed_table.is_some() + ); + } + MetaUpdate::SnapshotManifest(_) => { + // A snapshot record, no need to handle this. + } + } + + Ok(()) + } +} diff --git a/analytic_engine/src/meta/meta_update.rs b/analytic_engine/src/meta/meta_update.rs new file mode 100644 index 0000000000..06e8f86099 --- /dev/null +++ b/analytic_engine/src/meta/meta_update.rs @@ -0,0 +1,463 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Update to meta + +use std::convert::{TryFrom, TryInto}; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + schema::{Schema, Version}, + SequenceNumber, +}; +use common_util::define_result; +use proto::{analytic_common, common as common_pb, meta_update as meta_pb}; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::table::TableId; +use wal::{ + log_batch::{Payload, PayloadDecoder}, + manager::RegionId, +}; + +use crate::{ + space::SpaceId, + table::version_edit::{AddFile, DeleteFile, VersionEdit}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert schema, err:{}", source))] + ConvertSchema { source: common_types::schema::Error }, + + #[snafu(display("Empty meta update.\nBacktrace:\n{}", backtrace))] + EmptyMetaUpdate { backtrace: Backtrace }, + + #[snafu(display("Failed to decode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert version edit, err:{}", source))] + ConvertVersionEdit { + source: crate::table::version_edit::Error, + }, +} + +define_result!(Error); + +/// Modifications to meta data in meta +#[derive(Debug, Clone)] +pub enum MetaUpdate { + AddSpace(AddSpaceMeta), + AddTable(AddTableMeta), + DropTable(DropTableMeta), + VersionEdit(VersionEditMeta), + AlterSchema(AlterSchemaMeta), + AlterOptions(AlterOptionsMeta), + SnapshotManifest(SnapshotManifestMeta), +} + +impl MetaUpdate { + pub fn into_pb(self) -> meta_pb::MetaUpdate { + let mut meta_update = meta_pb::MetaUpdate::new(); + + match self { + MetaUpdate::AddSpace(v) => { + meta_update.set_add_space(v.into_pb()); + } + MetaUpdate::AddTable(v) => { + meta_update.set_add_table(v.into_pb()); + } 
+ MetaUpdate::VersionEdit(v) => { + meta_update.set_version_edit(v.into_pb()); + } + MetaUpdate::AlterSchema(v) => { + meta_update.set_alter_schema(v.into_pb()); + } + MetaUpdate::AlterOptions(v) => { + meta_update.set_alter_options(v.into_pb()); + } + MetaUpdate::DropTable(v) => { + meta_update.set_drop_table(v.into_pb()); + } + MetaUpdate::SnapshotManifest(v) => { + meta_update.set_snapshot_manifest(v.into_pb()); + } + } + + meta_update + } + + pub fn snapshot_manifest_meta(&self) -> Option { + if let MetaUpdate::SnapshotManifest(v) = self { + Some(*v) + } else { + None + } + } +} + +impl TryFrom for MetaUpdate { + type Error = Error; + + fn try_from(src: meta_pb::MetaUpdate) -> Result { + let meta_update = match src.meta { + Some(meta_pb::MetaUpdate_oneof_meta::add_space(v)) => { + let add_space = AddSpaceMeta::from(v); + MetaUpdate::AddSpace(add_space) + } + Some(meta_pb::MetaUpdate_oneof_meta::add_table(v)) => { + let add_table = AddTableMeta::try_from(v)?; + MetaUpdate::AddTable(add_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::version_edit(v)) => { + let version_edit = VersionEditMeta::try_from(v)?; + MetaUpdate::VersionEdit(version_edit) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_schema(v)) => { + let alter_schema = AlterSchemaMeta::try_from(v)?; + MetaUpdate::AlterSchema(alter_schema) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_options(v)) => { + let alter_options = AlterOptionsMeta::from(v); + MetaUpdate::AlterOptions(alter_options) + } + Some(meta_pb::MetaUpdate_oneof_meta::drop_table(v)) => { + let drop_table = DropTableMeta::from(v); + MetaUpdate::DropTable(drop_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::snapshot_manifest(v)) => { + let snapshot_manifest = SnapshotManifestMeta::from(v); + MetaUpdate::SnapshotManifest(snapshot_manifest) + } + None => { + // Meta update should not be empty. 
+ return EmptyMetaUpdate.fail(); + } + }; + + Ok(meta_update) + } +} + +/// Meta data for a new space +#[derive(Debug, Clone)] +pub struct AddSpaceMeta { + pub space_id: SpaceId, + pub space_name: String, +} + +impl AddSpaceMeta { + fn into_pb(self) -> meta_pb::AddSpaceMeta { + let mut target = meta_pb::AddSpaceMeta::new(); + target.set_space_id(self.space_id); + target.set_space_name(self.space_name); + + target + } +} + +impl From for AddSpaceMeta { + fn from(src: meta_pb::AddSpaceMeta) -> Self { + Self { + space_id: src.space_id, + space_name: src.space_name, + } + } +} + +/// Meta data for a new table +#[derive(Debug, Clone)] +pub struct AddTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, + /// Schema of the table + pub schema: Schema, + // Options needed to persist + pub opts: TableOptions, +} + +impl AddTableMeta { + fn into_pb(self) -> meta_pb::AddTableMeta { + let mut target = meta_pb::AddTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_options(analytic_common::TableOptions::from(self.opts)); + + target + } +} + +impl TryFrom for AddTableMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AddTableMeta) -> Result { + let table_schema = src.take_schema(); + let opts = src.take_options(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + opts: TableOptions::from(opts), + }) + } +} + +/// Meta data for dropping a table +#[derive(Debug, Clone)] +pub struct DropTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, +} + +impl DropTableMeta { + fn into_pb(self) -> meta_pb::DropTableMeta { + let mut target = 
meta_pb::DropTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + + target + } +} + +impl From for DropTableMeta { + fn from(src: meta_pb::DropTableMeta) -> Self { + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + } + } +} + +/// Meta data of version edit to table +#[derive(Debug, Clone)] +pub struct VersionEditMeta { + pub space_id: SpaceId, + pub table_id: TableId, + /// Sequence number of the flushed data. Set to 0 if this edit is not + /// created by a flush request. + pub flushed_sequence: SequenceNumber, + pub files_to_add: Vec, + pub files_to_delete: Vec, +} + +impl VersionEditMeta { + fn into_pb(self) -> meta_pb::VersionEditMeta { + let mut target = meta_pb::VersionEditMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_flushed_sequence(self.flushed_sequence); + + let mut files_to_add = Vec::with_capacity(self.files_to_add.len()); + for file in self.files_to_add { + files_to_add.push(file.into_pb()); + } + target.files_to_add = files_to_add.into(); + + let mut files_to_delete = Vec::with_capacity(self.files_to_delete.len()); + for file in self.files_to_delete { + files_to_delete.push(file.into_pb()); + } + target.files_to_delete = files_to_delete.into(); + + target + } + + /// Convert into [crate::table::version_edit::VersionEdit]. The + /// `mems_to_remove` field is left empty. 
+ pub fn into_version_edit(self) -> VersionEdit { + VersionEdit { + mems_to_remove: Vec::new(), + flushed_sequence: self.flushed_sequence, + files_to_add: self.files_to_add, + files_to_delete: self.files_to_delete, + } + } +} + +impl TryFrom for VersionEditMeta { + type Error = Error; + + fn try_from(src: meta_pb::VersionEditMeta) -> Result { + let mut files_to_add = Vec::with_capacity(src.files_to_add.len()); + for file_meta in src.files_to_add { + files_to_add.push(AddFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + let mut files_to_delete = Vec::with_capacity(src.files_to_delete.len()); + for file_meta in src.files_to_delete { + files_to_delete.push(DeleteFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + flushed_sequence: src.flushed_sequence, + files_to_add, + files_to_delete, + }) + } +} + +/// Meta data of schema update. +#[derive(Debug, Clone)] +pub struct AlterSchemaMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub schema: Schema, + pub pre_schema_version: Version, +} + +impl AlterSchemaMeta { + fn into_pb(self) -> meta_pb::AlterSchemaMeta { + let mut target = meta_pb::AlterSchemaMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_pre_schema_version(self.pre_schema_version); + + target + } +} + +impl TryFrom for AlterSchemaMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AlterSchemaMeta) -> Result { + let table_schema = src.take_schema(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + pre_schema_version: src.pre_schema_version, + }) + } +} + +/// Meta data of options update. 
+#[derive(Debug, Clone)] +pub struct AlterOptionsMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub options: TableOptions, +} + +impl AlterOptionsMeta { + fn into_pb(self) -> meta_pb::AlterOptionsMeta { + let mut target = meta_pb::AlterOptionsMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_options(analytic_common::TableOptions::from(self.options)); + + target + } +} + +impl From for AlterOptionsMeta { + fn from(mut src: meta_pb::AlterOptionsMeta) -> Self { + let table_options = src.take_options(); + + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + options: TableOptions::from(table_options), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct SnapshotManifestMeta { + pub snapshot_region_id: RegionId, + /// The last sequence (inclusive) of the data in this snapshot. + /// + /// Note that the sequence refers to the manifest region. + pub sequence: SequenceNumber, +} + +impl SnapshotManifestMeta { + fn into_pb(self) -> meta_pb::SnapshotManifestMeta { + let mut target = meta_pb::SnapshotManifestMeta::new(); + target.set_region_id(self.snapshot_region_id); + target.set_sequence(self.sequence); + + target + } +} + +impl From for SnapshotManifestMeta { + fn from(src: meta_pb::SnapshotManifestMeta) -> SnapshotManifestMeta { + Self { + snapshot_region_id: src.region_id, + sequence: src.sequence, + } + } +} + +/// An adapter to implement [wal::log_batch::Payload] for +/// [proto::meta_update::MetaUpdate] +#[derive(Debug)] +pub struct MetaUpdatePayload(meta_pb::MetaUpdate); + +impl From for MetaUpdatePayload { + fn from(src: MetaUpdate) -> Self { + MetaUpdatePayload(src.into_pb()) + } +} + +impl From<&MetaUpdate> for MetaUpdatePayload { + fn from(src: &MetaUpdate) -> Self { + MetaUpdatePayload(src.clone().into_pb()) + } +} + +impl Payload for MetaUpdatePayload { + type Error = Error; + + fn encode_size(&self) -> usize { + 
self.0.compute_size().try_into().unwrap_or(usize::MAX) + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + let mut writer = Writer::new(buf); + self.0 + .write_to_writer(&mut writer) + .context(EncodePayloadPb)?; + Ok(()) + } +} + +/// Decoder to decode MetaUpdate from log entry +pub struct MetaUpdateDecoder; + +impl PayloadDecoder for MetaUpdateDecoder { + type Error = Error; + type Target = MetaUpdate; + + fn decode(&self, buf: &mut B) -> Result { + let meta_update = meta_pb::MetaUpdate::parse_from_bytes(buf.remaining_slice()) + .context(DecodePayloadPb)?; + + let meta_update = MetaUpdate::try_from(meta_update)?; + + Ok(meta_update) + } +} diff --git a/analytic_engine/src/meta/mod.rs b/analytic_engine/src/meta/mod.rs new file mode 100644 index 0000000000..3bea46d26e --- /dev/null +++ b/analytic_engine/src/meta/mod.rs @@ -0,0 +1,29 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Manage meta data of the engine + +pub mod details; +pub mod meta_data; +pub mod meta_update; + +use std::fmt; + +use async_trait::async_trait; + +use crate::meta::{meta_data::ManifestData, meta_update::MetaUpdate}; + +/// Manifest holds meta data of all tables +#[async_trait] +pub trait Manifest: fmt::Debug { + type Error: std::error::Error + Send + Sync + 'static; + + /// Store update to manifest + async fn store_update(&self, update: MetaUpdate) -> Result<(), Self::Error>; + + /// Load all data from manifest. + /// + /// If `do_snapshot` is true, the manifest will try to create a snapshot of + /// the manifest data. The caller should ensure `store_update()` wont be + /// called during loading data. + async fn load_data(&self, do_snapshot: bool) -> Result; +} diff --git a/analytic_engine/src/payload.rs b/analytic_engine/src/payload.rs new file mode 100644 index 0000000000..02cf58fe0a --- /dev/null +++ b/analytic_engine/src/payload.rs @@ -0,0 +1,174 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Payloads to write to wal + +use std::convert::TryInto; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + row::{RowGroup, RowGroupBuilder}, + schema::Schema, +}; +use common_util::{ + codec::{row::WalRowDecoder, Decoder}, + define_result, +}; +use proto::table_requests; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use wal::log_batch::{Payload, PayloadDecoder}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode header, err:{}", source))] + EncodeHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode header, err:{}", source))] + DecodeHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid wal entry header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Failed to decode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode schema, err:{}", source))] + DecodeSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, +} + +define_result!(Error); + +/// Wal entry header +#[derive(Clone, Copy)] +enum Header { + Write = 1, +} + +impl Header { + pub fn to_u8(self) -> u8 { + self as u8 + } + + pub fn from_u8(value: u8) -> Option { + match value { + value if value == Self::Write as u8 => Some(Self::Write), + _ => None, + } + } +} + +fn write_header(header: Header, buf: &mut B) -> Result<()> { + buf.write_u8(header.to_u8()).context(EncodeHeader)?; + Ok(()) +} + +/// Header size in bytes +const HEADER_SIZE: usize = 1; + +/// Write request to persist in wal +#[derive(Debug)] +pub enum 
WritePayload<'a> { + Write(&'a table_requests::WriteRequest), +} + +impl<'a> Payload for WritePayload<'a> { + type Error = Error; + + fn encode_size(&self) -> usize { + let body_size = match self { + WritePayload::Write(req) => req.compute_size(), + }; + + HEADER_SIZE + body_size as usize + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + match self { + WritePayload::Write(req) => { + write_header(Header::Write, buf)?; + let mut writer = Writer::new(buf); + req.write_to_writer(&mut writer).context(EncodeBody)?; + } + } + + Ok(()) + } +} + +/// Payload decoded from wal +#[derive(Debug)] +pub enum ReadPayload { + Write { row_group: RowGroup }, +} + +/// Wal payload decoder +#[derive(Default)] +pub struct WalDecoder; + +impl PayloadDecoder for WalDecoder { + type Error = Error; + type Target = ReadPayload; + + fn decode(&self, buf: &mut B) -> Result { + let header_value = buf.read_u8().context(DecodeHeader)?; + let header = match Header::from_u8(header_value) { + Some(header) => header, + None => { + return InvalidHeader { + value: header_value, + } + .fail() + } + }; + + let payload = match header { + Header::Write => { + let mut write_req_pb: table_requests::WriteRequest = + Message::parse_from_bytes(buf.remaining_slice()).context(DecodeBody)?; + + // Consume and convert schema in pb + let schema: Schema = write_req_pb + .take_schema() + .try_into() + .context(DecodeSchema)?; + + // Consume and convert rows in pb + let encoded_rows = write_req_pb.take_rows().into_vec(); + let mut builder = + RowGroupBuilder::with_capacity(schema.clone(), encoded_rows.len()); + let row_decoder = WalRowDecoder::new(&schema); + for row_bytes in &encoded_rows { + let row = row_decoder + .decode(&mut row_bytes.as_slice()) + .context(DecodeRow)?; + // We skip schema check here + builder.push_checked_row(row); + } + + let row_group = builder.build(); + + ReadPayload::Write { row_group } + } + }; + + Ok(payload) + } +} diff --git a/analytic_engine/src/row_iter/chain.rs 
b/analytic_engine/src/row_iter/chain.rs new file mode 100644 index 0000000000..881c96db3b --- /dev/null +++ b/analytic_engine/src/row_iter/chain.rs @@ -0,0 +1,373 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{fmt, time::Instant}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, request_id::RequestId, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use futures::StreamExt; +use log::debug; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, record_batch_stream::SequencedRecordBatchStream, + RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Fail to build stream from the memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to build stream from the sst file, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to poll next record batch, err:{}", source))] + PollNextRecordBatch { + source: Box, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [Builder]. +#[derive(Clone, Debug)] +pub struct ChainConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, +} + +/// Builder for [ChainIterator]. 
+#[must_use] +pub struct Builder<'a, S, Fa> { + config: ChainConfig<'a, S, Fa>, + /// Sampling memtable to read. + sampling_mem: Option, + memtables: MemTableVec, + ssts: Vec>, +} + +impl<'a, S, Fa> Builder<'a, S, Fa> { + pub fn new(config: ChainConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: Vec::new(), + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } +} + +impl<'a, S: ObjectStore, Fa: Factory> Builder<'a, S, Fa> { + pub async fn build(self) -> Result { + let total_sst_streams: usize = self.ssts.iter().map(|v| v.len()).sum(); + let mut total_streams = self.memtables.len() + total_sst_streams; + if self.sampling_mem.is_some() { + total_streams += 1; + } + let mut streams = Vec::with_capacity(total_streams); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + &v.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + // chain iterator only handle the case reading in no order so just read in asc + // order by default. 
+ &memtable.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for leveled_ssts in &self.ssts { + for sst in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + sst, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + } + } + + debug!( + "Build chain iterator, table_id:{:?}, request_id:{}, memtables:{:?}, ssts:{:?}", + self.config.table_id, self.config.request_id, self.memtables, self.ssts + ); + + Ok(ChainIterator { + space_id: self.config.space_id, + table_id: self.config.table_id, + request_id: self.config.request_id, + schema: self.config.projected_schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(self.memtables.len(), total_sst_streams), + }) + } +} + +/// Metrics for [ChainIterator]. +struct Metrics { + num_memtables: usize, + num_ssts: usize, + /// Total batch fetched. + total_batch_fetched: usize, + /// Total rows fetched. + total_rows_fetched: usize, + /// Create time of the metrics. + create_at: Instant, + /// Inited time of the iterator. 
+ inited_at: Option, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize) -> Self { + Self { + num_memtables, + num_ssts, + total_batch_fetched: 0, + total_rows_fetched: 0, + create_at: Instant::now(), + inited_at: None, + } + } + + fn set_inited_time(&mut self) { + self.inited_at = Some(Instant::now()); + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("total_batch_fetched", &self.total_batch_fetched) + .field("total_rows_fetched", &self.total_rows_fetched) + .field("duration_since_create", &self.create_at.elapsed()) + .field("duration_since_init", &self.inited_at.map(|v| v.elapsed())) + .finish() + } +} + +/// ChainIter chains memtables and ssts and reads the [RecordBatch] from them +/// batch by batch. +/// +/// Note: The chain order is `memtable -> sst level 0 -> sst_level 1`. +pub struct ChainIterator { + space_id: SpaceId, + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + /// The range of the index is [0, streams.len()] and the iterator is + /// exhausted if it reaches `streams.len()`. + next_stream_idx: usize, + inited: bool, + + // metrics for the iterator. 
+ metrics: Metrics, +} + +impl ChainIterator { + fn init_if_necessary(&mut self) { + if self.inited { + return; + } + self.inited = true; + self.metrics.set_inited_time(); + + debug!("Init ChainIterator, space_id:{}, table_id:{:?}, request_id:{}, total_streams:{}, schema:{:?}", + self.space_id, self.table_id, self.request_id, self.streams.len(), self.schema + ); + } +} + +impl Drop for ChainIterator { + fn drop(&mut self) { + debug!( + "Chain iterator dropped, space_id:{}, table_id:{:?}, request_id:{}, metrics:{:?}", + self.space_id, self.table_id, self.request_id, self.metrics, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for ChainIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + self.init_if_necessary(); + + while self.next_stream_idx < self.streams.len() { + let read_stream = &mut self.streams[self.next_stream_idx]; + let sequenced_record_batch = read_stream + .next() + .await + .transpose() + .context(PollNextRecordBatch)?; + + match sequenced_record_batch { + Some(v) => { + self.metrics.total_rows_fetched += v.num_rows(); + self.metrics.total_batch_fetched += 1; + + if v.num_rows() > 0 { + return Ok(Some(v.record_batch)); + } + } + // Fetch next stream only if the current sequence_record_batch is None. 
+ None => self.next_stream_idx += 1, + } + } + + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + row::Row, + tests::{build_row, build_schema}, + SequenceNumber, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + async fn run_and_check(testcases: Vec<(SequenceNumber, Vec)>) { + let schema = build_schema(); + + let expect_rows: Vec<_> = testcases + .iter() + .flat_map(|(_, rows)| rows.clone()) + .collect(); + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + + let mut chain_iter = ChainIterator { + space_id: 0, + table_id: TableId::MIN, + request_id: RequestId::next_id(), + schema: schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(0, 0), + }; + + check_iterator(&mut chain_iter, expect_rows).await; + } + + #[tokio::test] + async fn test_chain_multiple_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"key2", 1000000, 10.0, "v2")]), + (100, vec![build_row(b"key3", 1000000, 10.0, "v3")]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![]), + (20, vec![]), + (100, vec![]), + (1, vec![]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_no_streams() { + let testcases = vec![]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_half_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![]), + (100, vec![]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } +} diff --git a/analytic_engine/src/row_iter/dedup.rs b/analytic_engine/src/row_iter/dedup.rs new file mode 100644 index 
0000000000..cd58b0157f --- /dev/null +++ b/analytic_engine/src/row_iter/dedup.rs @@ -0,0 +1,243 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use async_trait::async_trait; +use common_types::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::{Row, RowViewOnBatch, RowWithMeta}, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use log::{info, trace}; +use snafu::{ResultExt, Snafu}; + +use crate::row_iter::{IterOptions, RecordBatchWithKeyIterator}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to iterate column, error:{:?}", source))] + IterateColumn { source: common_types::row::Error }, + + #[snafu(display("Failed to build record batch, error:{:?}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to read data from the sub iterator, err:{:?}", source))] + ReadFromSubIter { + source: Box, + }, +} + +define_result!(Error); + +/// Dedup the elements from the `iter` by choosing the first one in the +/// duplicate rows. +pub struct DedupIterator { + request_id: RequestId, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + iter: I, + /// Previous row returned. + prev_row: Option, + /// Store which row in record batch is keep, use Vec is a bit faster + /// than a bitmap. 
+ selected_rows: Vec<bool>, + + // Metrics: + total_duplications: usize, + total_selected_rows: usize, +} + +impl<I: RecordBatchWithKeyIterator> DedupIterator<I> { + pub fn new(request_id: RequestId, iter: I, iter_options: IterOptions) -> Self { + let schema = iter.schema(); + + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + request_id, + schema: schema.clone(), + record_batch_builder, + iter, + prev_row: None, + selected_rows: Vec::new(), + total_duplications: 0, + total_selected_rows: 0, + } + } + + fn dedup_batch(&mut self, record_batch: RecordBatchWithKey) -> Result<RecordBatchWithKey> { + self.selected_rows.clear(); + // Ignore all rows by default. + self.selected_rows.resize(record_batch.num_rows(), false); + + if record_batch.is_empty() { + return Ok(record_batch); + } + + // Dedup batch. + for col_idx in 0..self.schema.num_key_columns() { + let column = record_batch.column(col_idx); + + column.dedup(&mut self.selected_rows); + } + + // Dedup first row in record batch with previous row. + if let Some(prev_row) = &self.prev_row { + let prev_row_view = RowWithMeta { + row: prev_row, + schema: &self.schema, + }; + let curr_row_view = RowViewOnBatch { + record_batch: &record_batch, + // First row. + row_idx: 0, + }; + + let is_equal = matches!( + // TODO(yingwen): Compare row needs clone data of row. + self.schema.compare_row(&prev_row_view, &curr_row_view), + Ordering::Equal + ); + + if is_equal { + // Duplicate of previous row. + self.selected_rows[0] = false; + } + } + + let selected_num = self + .selected_rows + .iter() + .map(|v| if *v { 1 } else { 0 }) + .sum(); + + // Even though all rows are duplicates, we can still use the row pointed to by + // prev_row_idx because they have the same row key. + self.prev_row = Some(record_batch.clone_row_at(record_batch.num_rows() - 1)); + + self.filter_batch(record_batch, selected_num) + } + + /// Filter batch by `selected_rows`.
+ fn filter_batch( + &mut self, + record_batch: RecordBatchWithKey, + selected_num: usize, + ) -> Result { + self.total_selected_rows += selected_num; + self.total_duplications += record_batch.num_rows() - selected_num; + + if selected_num == record_batch.num_rows() { + // No duplicate rows in batch. + return Ok(record_batch); + } + + self.record_batch_builder.clear(); + for (row_idx, selected) in self.selected_rows.iter().enumerate() { + if *selected { + self.record_batch_builder + .append_row_view(&RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }) + .context(AppendRow)?; + } + } + + self.record_batch_builder.build().context(BuildRecordBatch) + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for DedupIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + match self + .iter + .next_batch() + .await + .map_err(|e| Box::new(e) as _) + .context(ReadFromSubIter)? + { + Some(record_batch) => { + trace!( + "DedupIterator received next record batch, request_id:{}, batch:{:?}", + self.request_id, + record_batch + ); + + self.dedup_batch(record_batch).map(Some) + } + None => { + info!( + "DedupIterator received none record batch, request_id:{}, total_duplications:{}, total_selected_rows:{}", + self.request_id, self.total_duplications, self.total_selected_rows, + ); + + Ok(None) + } + } + } +} + +#[cfg(test)] +mod tests { + use common_types::tests::{build_row, build_schema}; + + use super::*; + use crate::row_iter::tests::{build_record_batch_with_key, check_iterator, VectorIterator}; + + #[tokio::test] + async fn test_dedup_iterator() { + // first two columns are key columns + let schema = build_schema(); + let iter = VectorIterator::new( + schema.to_record_schema_with_key(), + vec![ + build_record_batch_with_key( + schema.clone(), + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 1, 10.0, "v"), + build_row(b"a", 2, 10.0, "v2"), + ], + ), + 
build_record_batch_with_key( + schema, + vec![ + build_row(b"a", 2, 10.0, "v"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 3, 10.0, "v"), + build_row(b"a", 4, 10.0, "v4"), + ], + ), + ], + ); + + let mut iter = DedupIterator::new(RequestId::next_id(), iter, IterOptions::default()); + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 2, 10.0, "v2"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 4, 10.0, "v4"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/merge.rs b/analytic_engine/src/row_iter/merge.rs new file mode 100644 index 0000000000..49403c90ae --- /dev/null +++ b/analytic_engine/src/row_iter/merge.rs @@ -0,0 +1,957 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, + cmp::Ordering, + collections::BinaryHeap, + fmt, mem, + ops::{Deref, DerefMut}, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + schema::RecordSchemaWithKey, + SequenceNumber, +}; +use common_util::define_result; +use futures::StreamExt; +use log::{debug, info, trace}; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, + record_batch_stream::{SequencedRecordBatch, SequencedRecordBatchStream}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + manager::{FileId, MAX_LEVEL}, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expect the same schema, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + MismatchedSchema { + expect: RecordSchemaWithKey, + given: 
RecordSchemaWithKey, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to pull record batch, error:{}", source))] + PullRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to build record batch, error:{}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build stream from memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Failed to build record batch from sst, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [MergeBuilder] +#[derive(Debug)] +pub struct MergeConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// Max visible sequence (inclusive) + pub sequence: SequenceNumber, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// The predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, + + pub merge_iter_options: IterOptions, + + pub need_dedup: bool, + pub reverse: bool, +} + +/// Builder for building merge stream from memtables and sst files. +#[must_use] +pub struct MergeBuilder<'a, S, Fa> { + config: MergeConfig<'a, S, Fa>, + + /// Sampling memtable to read. + sampling_mem: Option, + /// MemTables to read. + memtables: MemTableVec, + /// Ssts to read of each level. 
+ ssts: Vec>, +} + +impl<'a, S: ObjectStore, Fa: Factory> MergeBuilder<'a, S, Fa> { + pub fn new(config: MergeConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: vec![Vec::new(); MAX_LEVEL], + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts_of_level(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } + + pub fn mut_memtables(&mut self) -> &mut MemTableVec { + &mut self.memtables + } + + /// Returns file handles in `level`, panic if level >= MAX_LEVEL + pub fn mut_ssts_of_level(&mut self, level: u16) -> &mut Vec { + &mut self.ssts[usize::from(level)] + } + + pub async fn build(self) -> Result { + let sst_streams_num: usize = self + .ssts + .iter() + .map(|leveled_ssts| leveled_ssts.len()) + .sum(); + let mut streams_num = sst_streams_num + self.memtables.len(); + if self.sampling_mem.is_some() { + streams_num += 1; + } + let mut streams = Vec::with_capacity(streams_num); + + debug!( + "Build merge iterator, table_id:{:?}, request_id:{}, sampling_mem:{:?}, memtables:{:?}, ssts:{:?}", + self.config.table_id, + self.config.request_id, + self.sampling_mem, + self.memtables, + self.ssts + ); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &v.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &memtable.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + let mut 
sst_ids = Vec::with_capacity(self.ssts.len()); + for leveled_ssts in &self.ssts { + for f in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + f, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + sst_ids.push(f.id()); + } + } + + Ok(MergeIterator::new( + self.config.table_id, + self.config.request_id, + // Use the schema after projection as the schema of the merge iterator. + self.config.projected_schema.to_record_schema_with_key(), + streams, + self.config.merge_iter_options, + self.config.reverse, + Metrics::new(self.memtables.len(), sst_streams_num, sst_ids), + )) + } +} + +struct BufferedStreamState { + /// Buffered record batch. + /// + /// invariant: `buffered_record_batch` is not empty. + buffered_record_batch: SequencedRecordBatch, + /// Cursor for reading buffered record batch. + /// + /// `cursor` increases monotonically from 0 to + /// `buffered_record_batch.num_rows()` and `cursor == + /// buffered_record_batch.num_rows()` means no more buffered rows to read. 
+ cursor: usize, +} + +impl BufferedStreamState { + #[inline] + fn is_valid(&self) -> bool { + self.cursor < self.buffered_record_batch.num_rows() + } + + #[inline] + fn is_empty(&self) -> bool { + self.cursor >= self.buffered_record_batch.num_rows() + } + + #[inline] + fn sequence(&self) -> SequenceNumber { + self.buffered_record_batch.sequence + } + + #[inline] + fn first_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + } + } + + #[inline] + fn last_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.buffered_record_batch.num_rows() - 1, + } + } + + /// Returns the next available row in the buffer and advance the cursor by + /// one step. + fn next_row(&mut self) -> Option> { + if self.cursor < self.buffered_record_batch.num_rows() { + let row_view = RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + }; + self.cursor += 1; + Some(row_view) + } else { + None + } + } + + /// Append `len` rows from cursor to the `builder` and advance the cursor. + /// + /// Returns number of rows added. + fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + let added = builder + .append_batch_range(&self.buffered_record_batch.record_batch, self.cursor, len) + .context(AppendRow)?; + self.cursor += added; + Ok(added) + } + + /// Take record batch slice with at most `len` rows from cursor and advance + /// the cursor. 
+ fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + let len_to_fetch = cmp::min( + self.buffered_record_batch.record_batch.num_rows() - self.cursor, + len, + ); + let record_batch = self + .buffered_record_batch + .record_batch + .slice(self.cursor, len_to_fetch); + self.cursor += record_batch.num_rows(); + record_batch + } + + #[inline] + fn reset(&mut self, record_batch: SequencedRecordBatch) { + self.buffered_record_batch = record_batch; + self.cursor = 0; + } +} + +struct BufferedStream { + schema: RecordSchemaWithKey, + stream: SequencedRecordBatchStream, + /// `None` state means the stream is exhausted. + state: Option, +} + +impl BufferedStream { + async fn build( + schema: RecordSchemaWithKey, + mut stream: SequencedRecordBatchStream, + metrics: &mut Metrics, + ) -> Result { + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let buffered_record_batch = Self::pull_next_non_empty_batch(&mut stream).await?; + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + let state = buffered_record_batch.map(|v| BufferedStreamState { + buffered_record_batch: v, + cursor: 0, + }); + + Ok(Self { + schema, + stream, + state, + }) + } + + fn sequence_in_buffer(&self) -> SequenceNumber { + self.state.as_ref().unwrap().sequence() + } + + /// REQUIRE: the buffer is not exhausted. + fn first_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().first_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn last_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().last_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn next_row_in_buffer(&mut self) -> Option> { + self.state.as_mut().unwrap().next_row() + } + + /// REQUIRE: the buffer is not exhausted. 
+ fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + self.state.as_mut().unwrap().append_rows_to(builder, len) + } + + /// REQUIRE: the buffer is not exhausted. + fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + self.state.as_mut().unwrap().take_record_batch_slice(len) + } + + /// Pull the next non empty record batch. + /// + /// The returned record batch is ensured `num_rows() > 0`. + async fn pull_next_non_empty_batch( + stream: &mut SequencedRecordBatchStream, + ) -> Result> { + loop { + match stream.next().await.transpose().context(PullRecordBatch)? { + Some(record_batch) => { + trace!( + "MergeIterator one record batch is fetched:{:?}", + record_batch + ); + + if record_batch.num_rows() > 0 { + return Ok(Some(record_batch)); + } + } + None => return Ok(None), + } + } + } + + /// Pull the next batch if the stream is not exhausted and the inner state + /// is empty. + async fn pull_next_batch_if_necessary(&mut self, metrics: &mut Metrics) -> Result { + let need_pull_new_batch = !self.is_exhausted() && self.state.as_ref().unwrap().is_empty(); + if !need_pull_new_batch { + return Ok(false); + } + + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let pulled = match Self::pull_next_non_empty_batch(&mut self.stream).await? { + None => { + self.state = None; + Ok(false) + } + Some(record_batch) => { + self.state.as_mut().unwrap().reset(record_batch); + Ok(true) + } + }; + + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + pulled + } + + #[inline] + fn is_exhausted(&self) -> bool { + self.state.is_none() + } + + fn into_heaped(self, reverse: bool) -> HeapBufferedStream { + HeapBufferedStream { + stream: self, + reverse, + } + } + + #[inline] + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } +} + +/// The wrapper struct determines the compare result for the min binary heap. 
+struct HeapBufferedStream { + stream: BufferedStream, + reverse: bool, +} + +impl HeapBufferedStream { + /// Check whether all the buffered rows in the `stream` are after the + /// `boundary_row`. + /// + /// NOTE: + /// - The first row in the stream is actually the max row if in reverse + /// order and should check whether it is smaller than `boundary_row`. + /// - The first row in the stream is actually the min row if in normal + /// order and should check whether it is greater than `boundary_row`. + fn is_after_boundary( + &self, + schema: &RecordSchemaWithKey, + boundary_row: &RowViewOnBatch, + ) -> bool { + if self.reverse { + // Compare the max row (the first row) of the stream with the boundary row. + // The stream is after the boundary if the max row is smaller than boundary. + // is_after: (boundary_row) > [first_row in buffer] + matches!( + schema.compare_row(boundary_row, &self.first_row_in_buffer()), + Ordering::Greater + ) + } else { + // Compare the min row (the first row) of the stream with the boundary row. + // The stream is after the boundary if the min row is greater than boundary.
+ // is_after: (boundary_row) < [first_row in buffer] + matches!( + schema.compare_row(&self.first_row_in_buffer(), boundary_row), + Ordering::Greater + ) + } + } +} + +impl Deref for HeapBufferedStream { + type Target = BufferedStream; + + fn deref(&self) -> &BufferedStream { + &self.stream + } +} + +impl DerefMut for HeapBufferedStream { + fn deref_mut(&mut self) -> &mut BufferedStream { + &mut self.stream + } +} + +impl PartialEq for HeapBufferedStream { + fn eq(&self, other: &Self) -> bool { + let ordering = self + .schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()); + if let Ordering::Equal = ordering { + self.sequence_in_buffer() == other.sequence_in_buffer() + } else { + false + } + } +} + +impl Eq for HeapBufferedStream {} + +impl PartialOrd for HeapBufferedStream { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for HeapBufferedStream { + fn cmp(&self, other: &Self) -> Ordering { + let ordering = if self.reverse { + // keep the original ordering so the greater row comes before the smaller one. + self.schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()) + } else { + // reverse the original ordering so the smaller row comes before the greater + // one. + self.schema + .compare_row(&other.first_row_in_buffer(), &self.first_row_in_buffer()) + }; + + if let Ordering::Equal = ordering { + // The larger sequence number should always comes before the smaller one. + self.sequence_in_buffer().cmp(&other.sequence_in_buffer()) + } else { + ordering + } + } +} + +pub struct Metrics { + num_memtables: usize, + num_ssts: usize, + sst_ids: Vec, + /// Times to fetch rows from one stream. + times_fetch_rows_from_one: usize, + /// Total rows collected using fetch_rows_from_one_stream(). + total_rows_fetch_from_one: usize, + /// Times to fetch one row from multiple stream. + times_fetch_row_from_multiple: usize, + /// Create time of the metrics. 
+ create_at: Instant, + /// Init time cost of the metrics. + init_duration: Duration, + /// Scan time cost of the metrics. + scan_duration: Duration, + /// Scan count + scan_count: usize, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize, sst_ids: Vec) -> Self { + Self { + num_memtables, + num_ssts, + sst_ids, + times_fetch_rows_from_one: 0, + total_rows_fetch_from_one: 0, + times_fetch_row_from_multiple: 0, + create_at: Instant::now(), + init_duration: Duration::default(), + scan_duration: Duration::default(), + scan_count: 0, + } + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("sst_ids", &self.sst_ids) + .field("times_fetch_rows_from_one", &self.times_fetch_rows_from_one) + .field("total_rows_fetch_from_one", &self.total_rows_fetch_from_one) + .field( + "times_fetch_row_from_multiple", + &self.times_fetch_row_from_multiple, + ) + .field("duration_since_create", &self.create_at.elapsed()) + .field("init_duration", &self.init_duration) + .field("scan_duration", &self.scan_duration) + .field("scan_count", &self.scan_count) + .finish() + } +} + +pub struct MergeIterator { + table_id: TableId, + request_id: RequestId, + inited: bool, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + origin_streams: Vec, + /// Any [BufferedStream] in the hot heap is not empty. + hot: BinaryHeap, + /// Any [BufferedStream] in the cold heap is not empty. 
+ cold: BinaryHeap, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, +} + +impl MergeIterator { + pub fn new( + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, + ) -> Self { + let heap_cap = streams.len(); + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + table_id, + request_id, + inited: false, + schema, + record_batch_builder, + origin_streams: streams, + hot: BinaryHeap::with_capacity(heap_cap), + cold: BinaryHeap::with_capacity(heap_cap), + iter_options, + reverse, + metrics, + } + } + + fn merge_window_end(&self) -> Option { + self.hot.peek().as_ref().map(|v| v.last_row_in_buffer()) + } + + async fn init_if_necessary(&mut self) -> Result<()> { + if self.inited { + return Ok(()); + } + + info!( + "Merge iterator init, table_id:{:?}, request_id:{}, schema:{:?}", + self.table_id, self.request_id, self.schema + ); + let init_start = Instant::now(); + + let current_schema = &self.schema; + for stream in mem::take(&mut self.origin_streams) { + let buffered_stream = + BufferedStream::build(self.schema.clone(), stream, &mut self.metrics).await?; + let stream_schema = buffered_stream.schema(); + ensure!( + current_schema == stream_schema, + MismatchedSchema { + expect: current_schema.clone(), + given: stream_schema.clone(), + } + ); + + if !buffered_stream.is_exhausted() { + self.cold.push(buffered_stream.into_heaped(self.reverse)); + } + } + + self.refill_hot(); + + self.inited = true; + self.metrics.init_duration = init_start.elapsed(); + Ok(()) + } + + fn refill_hot(&mut self) { + while !self.cold.is_empty() { + if !self.hot.is_empty() { + let merge_window_end = self.merge_window_end().unwrap(); + let warmest = self.cold.peek().unwrap(); + if warmest.is_after_boundary(&self.schema, &merge_window_end) { + // if the warmest stream in the cold stream sets is totally after 
the + // merge_window_end then no need to add more streams into + // the hot stream sets for merge sorting. + break; + } + } + + let warmest = self.cold.pop().unwrap(); + self.hot.push(warmest); + } + } + + /// Pull the next batch if necessary and rearrange the heap. + async fn reheap(&mut self, mut buffered_stream: HeapBufferedStream) -> Result<()> { + let pulled_new_batch = buffered_stream + .pull_next_batch_if_necessary(&mut self.metrics) + .await?; + + if buffered_stream.is_exhausted() { + self.refill_hot(); + } else if pulled_new_batch { + // TODO(xikai): it seems no need to decide to which heap push the + // `buffered_stream`. Just put the new batch into the cold heap if + // the max bound of the hottest batch is smaller than the min bound + // of new one. + let cold_new_batch = if let Some(hottest) = self.hot.peek() { + buffered_stream.is_after_boundary(&self.schema, &hottest.last_row_in_buffer()) + } else { + false + }; + + if cold_new_batch { + self.cold.push(buffered_stream); + } else { + self.hot.push(buffered_stream); + } + self.refill_hot(); + } else { + // No new batch is pulled and the `buffered_stream` is not exhausted so just put + // it back to the hot heap. + self.hot.push(buffered_stream); + } + + Ok(()) + } + + /// Fetch at most `num_rows_to_fetch` rows from the hottest + /// `BufferedStream`. + /// + /// If the inner builder is empty, returns a slice of the record batch in + /// the stream.
+ async fn fetch_rows_from_one_stream( + &mut self, + num_rows_to_fetch: usize, + ) -> Result> { + assert_eq!(self.hot.len(), 1); + self.metrics.times_fetch_rows_from_one += 1; + + let mut buffered_stream = self.hot.pop().unwrap(); + + let record_batch = if self.record_batch_builder.is_empty() { + let record_batch = buffered_stream.take_record_batch_slice(num_rows_to_fetch); + + self.metrics.total_rows_fetch_from_one += record_batch.num_rows(); + + Some(record_batch) + } else { + let fetched_row_num = buffered_stream + .append_rows_to(&mut self.record_batch_builder, num_rows_to_fetch)?; + + self.metrics.total_rows_fetch_from_one += fetched_row_num; + + None + }; + + self.reheap(buffered_stream).await?; + + Ok(record_batch) + } + + /// Fetch one row from the hottest `BufferedStream`. + /// + /// REQUIRES: `self.hot` is not empty. + async fn fetch_one_row_from_multiple_streams(&mut self) -> Result<()> { + assert!(!self.hot.is_empty()); + self.metrics.times_fetch_row_from_multiple += 1; + + let mut hottest = self.hot.pop().unwrap(); + let row = hottest.next_row_in_buffer().unwrap(); + self.record_batch_builder + .append_row_view(&row) + .context(AppendRow)?; + self.reheap(hottest).await + } + + /// Fetch the next batch from the streams. + /// + /// `init_if_necessary` should be finished before this method. + async fn fetch_next_batch(&mut self) -> Result> { + self.init_if_necessary().await?; + + self.record_batch_builder.clear(); + + while !self.hot.is_empty() && self.record_batch_builder.len() < self.iter_options.batch_size + { + // no need to do merge sort if only one batch in the hot heap. + if self.hot.len() == 1 { + let fetch_row_num = self.iter_options.batch_size - self.record_batch_builder.len(); + + if let Some(record_batch) = self.fetch_rows_from_one_stream(fetch_row_num).await? { + // The builder is empty and we have fetch a record batch from this stream, just + // return that batch. 
+ return Ok(Some(record_batch)); + } + // Else, some rows may have been pushed into the builder. + } else { + self.fetch_one_row_from_multiple_streams().await?; + } + } + + if self.record_batch_builder.is_empty() { + Ok(None) + } else { + let record_batch = self + .record_batch_builder + .build() + .context(BuildRecordBatch)?; + Ok(Some(record_batch)) + } + } +} + +impl Drop for MergeIterator { + fn drop(&mut self) { + info!( + "Merge iterator dropped, table_id:{:?}, request_id:{}, metrics:{:?}, iter_options:{:?},", + self.table_id, self.request_id, self.metrics, self.iter_options, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for MergeIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + let record_batch = self.fetch_next_batch().await?; + + trace!("MergeIterator send next record batch:{:?}", record_batch); + + Ok(record_batch) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + tests::{build_row, build_schema}, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + #[tokio::test] + async fn test_row_merge_iterator() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"y", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + false, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1000000, 10.0, "v1"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 
1000000, 10.0, "v4"), + ], + ) + .await; + } + + #[tokio::test] + async fn test_row_merge_iterator_reverse() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + ( + 10, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v4"), + ], + ), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + true, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 1000000, 10.0, "v4"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"a", 1000000, 10.0, "v1"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/mod.rs b/analytic_engine/src/row_iter/mod.rs new file mode 100644 index 0000000000..8c30523396 --- /dev/null +++ b/analytic_engine/src/row_iter/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Iterators for row. 
+ +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, schema::RecordSchemaWithKey}; +use common_util::runtime::Runtime; +use futures::stream::Stream; +use log::{debug, error}; +use tokio::sync::mpsc::{self, Receiver}; + +use crate::sst::builder::{RecordBatchStream, RecordBatchStreamItem}; + +pub mod chain; +pub mod dedup; +pub mod merge; +pub mod record_batch_stream; +#[cfg(test)] +pub mod tests; + +const RECORD_BATCH_READ_BUF_SIZE: usize = 10; + +#[derive(Debug, Clone)] +pub struct IterOptions { + pub batch_size: usize, +} + +impl Default for IterOptions { + fn default() -> Self { + Self { batch_size: 500 } + } +} + +/// The iterator for reading RecordBatch from a table. +/// +/// The `schema()` should be the same as the RecordBatch from `read()`. +/// The reader is exhausted if the `read()` returns the `Ok(None)`. +#[async_trait] +pub trait RecordBatchWithKeyIterator: Send { + type Error: std::error::Error + Send + Sync + 'static; + + fn schema(&self) -> &RecordSchemaWithKey; + + async fn next_batch(&mut self) -> std::result::Result, Self::Error>; +} + +struct ReceiverStream { + rx: Receiver, +} + +impl Stream for ReceiverStream { + type Item = RecordBatchStreamItem; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +// TODO(yingwen): This is a hack way to convert an async trait to stream. 
+pub fn record_batch_with_key_iter_to_stream( + mut iter: I, + runtime: &Runtime, +) -> RecordBatchStream { + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + runtime.spawn(async move { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = record_batch.map_err(|e| Box::new(e) as _); + + debug!( + "compact table send next record batch, batch:{:?}", + record_batch + ); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + }); + + Box::new(ReceiverStream { rx }) +} diff --git a/analytic_engine/src/row_iter/record_batch_stream.rs b/analytic_engine/src/row_iter/record_batch_stream.rs new file mode 100644 index 0000000000..13cf049b13 --- /dev/null +++ b/analytic_engine/src/row_iter/record_batch_stream.rs @@ -0,0 +1,287 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::ops::Bound; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, SequenceNumber, +}; +use common_util::define_result; +use futures::stream::{self, Stream, StreamExt}; +use log::{error, trace}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + predicate::{filter_record_batch::RecordBatchFilter, Predicate}, + table::TableId, +}; + +use crate::{ + memtable::{MemTableRef, ScanContext, ScanRequest}, + space::SpaceId, + sst, + sst::{factory::SstReaderOptions, file::FileHandle}, + table::sst_util, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "No sst reader found, sst_reader_options:{:?}.\nBacktrace:\n{}", + options, + backtrace + ))] + SstReaderNotFound { + options: SstReaderOptions, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to read sst meta, err:{}", source))] + ReadSstMeta { source: crate::sst::reader::Error }, + + #[snafu(display("Fail to read sst data, err:{}", source))] + ReadSstData { source: crate::sst::reader::Error }, 
+ + #[snafu(display("Fail to scan memtable, err:{}", source))] + ScanMemtable { source: crate::memtable::Error }, +} + +define_result!(Error); + +const REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION: usize = 2; + +// TODO(yingwen): Can we move sequence to RecordBatchWithKey and remove this +// struct? But what is the sequence after merge? +#[derive(Debug)] +pub struct SequencedRecordBatch { + pub record_batch: RecordBatchWithKey, + pub sequence: SequenceNumber, +} + +impl SequencedRecordBatch { + #[inline] + pub fn num_rows(&self) -> usize { + self.record_batch.num_rows() + } +} + +pub type SequencedRecordBatchStream = Box< + dyn Stream< + Item = std::result::Result< + SequencedRecordBatch, + Box, + >, + > + Send + + Unpin, +>; + +/// Filter the `sequenced_record_batch` according to the `filter` if necessary. +/// Returns the original batch if only a small proportion of the batch is +/// filtered out. +/// The `selected_rows_buf` is for reuse. +fn maybe_filter_record_batch( + mut sequenced_record_batch: SequencedRecordBatch, + filter: &RecordBatchFilter, + selected_rows_buf: &mut Vec, +) -> Option { + if filter.is_empty() { + return Some(sequenced_record_batch); + } + + // The filter requires the `selected_rows_buf.len() >= + // sequenced_record_batch.num_rows()`. + selected_rows_buf.resize(sequenced_record_batch.num_rows(), true); + let num_selected_rows = filter.filter( + &sequenced_record_batch.record_batch, + selected_rows_buf.as_mut_slice(), + ); + + trace!( + "filter record batch, selected_rows:{}, origin_rows:{}", + num_selected_rows, + sequenced_record_batch.num_rows() + ); + + // No row is selected. + if num_selected_rows == 0 { + return None; + } + + if num_selected_rows + > sequenced_record_batch.num_rows() / REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION + { + // just use the original record batch because only a small proportion is + // filtered out. + return Some(sequenced_record_batch); + } + + // select the rows according to the filter result. 
+ if let Err(e) = sequenced_record_batch + .record_batch + .select_data(selected_rows_buf.as_slice()) + { + error!( + "Fail to select record batch, data:{:?}, selected_rows:{:?}, err:{}", + sequenced_record_batch, selected_rows_buf, e, + ); + } + + Some(sequenced_record_batch) +} + +/// Filter the sequenced record batch stream by applying the `predicate`. +/// However, the output record batches is not ensured to meet the requirements +/// of the `predicate`. +pub fn filter_stream( + origin_stream: SequencedRecordBatchStream, + predicate: &Predicate, +) -> SequencedRecordBatchStream { + if predicate.exprs.is_empty() { + return origin_stream; + } + + let mut select_row_buf = Vec::new(); + let filter = RecordBatchFilter::from(predicate.exprs.as_slice()); + let stream = origin_stream.filter_map(move |sequence_record_batch| { + let v = match sequence_record_batch { + Ok(v) => maybe_filter_record_batch(v, &filter, &mut select_row_buf).map(Ok), + Err(e) => Some(Err(e)), + }; + + futures::future::ready(v) + }); + + Box::new(stream) +} + +/// Build filtered (by `predicate`) [SequencedRecordBatchStream] from a +/// memtable. +pub fn filtered_stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, + predicate: &Predicate, +) -> Result { + stream_from_memtable(projected_schema, need_dedup, memtable, reverse) + .map(|origin_stream| filter_stream(origin_stream, predicate)) +} + +/// Build [SequencedRecordBatchStream] from a memtable. 
+pub fn stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, +) -> Result { + let scan_ctx = ScanContext::default(); + let max_seq = memtable.last_sequence(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: max_seq, + projected_schema, + need_dedup, + reverse, + }; + + let iter = memtable.scan(scan_ctx, scan_req).context(ScanMemtable)?; + let stream = stream::iter(iter).map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + }); + + Ok(Box::new(stream)) +} + +/// Build the filtered by `sst_read_options.predicate` +/// [SequencedRecordBatchStream] from a sst. +pub async fn filtered_stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + stream_from_sst_file( + space_id, + table_id, + sst_file, + sst_factory, + sst_reader_options, + store, + ) + .await + .map(|origin_stream| filter_stream(origin_stream, sst_reader_options.predicate.as_ref())) +} + +/// Build the [SequencedRecordBatchStream] from a sst. 
+pub async fn stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + sst_file.read_meter().mark(); + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, sst_file.id(), &mut path); + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, &path, store) + .with_context(|| SstReaderNotFound { + options: sst_reader_options.clone(), + })?; + let meta = sst_reader.meta_data().await.context(ReadSstMeta)?; + let max_seq = meta.max_sequence; + let sst_stream = sst_reader.read().await.context(ReadSstData)?; + + let stream = Box::new(sst_stream.map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + })); + + Ok(stream) +} + +#[cfg(test)] +pub mod tests { + use common_types::{row::Row, schema::Schema}; + + use super::*; + use crate::row_iter; + + /// Build [SequencedRecordBatchStream] from the sequenced rows. + pub fn build_sequenced_record_batch_stream( + schema: &Schema, + batches: Vec<(SequenceNumber, Vec)>, + ) -> Vec { + batches + .into_iter() + .map(|(seq, rows)| { + let batch = SequencedRecordBatch { + record_batch: row_iter::tests::build_record_batch_with_key( + schema.clone(), + rows, + ), + sequence: seq, + }; + Box::new(stream::iter(vec![Ok(batch)])) as SequencedRecordBatchStream + }) + .collect() + } +} diff --git a/analytic_engine/src/row_iter/tests.rs b/analytic_engine/src/row_iter/tests.rs new file mode 100644 index 0000000000..ce929b852a --- /dev/null +++ b/analytic_engine/src/row_iter/tests.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema::{IndexInWriterSchema, RecordSchemaWithKey, Schema}, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::row_iter::RecordBatchWithKeyIterator; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct VectorIterator { + schema: RecordSchemaWithKey, + items: Vec>, + idx: usize, +} + +impl VectorIterator { + pub fn new(schema: RecordSchemaWithKey, items: Vec) -> Self { + Self { + schema, + items: items.into_iter().map(Some).collect(), + idx: 0, + } + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for VectorIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + if self.idx == self.items.len() { + return Ok(None); + } + + let ret = Ok(self.items[self.idx].take()); + self.idx += 1; + + ret + } +} + +pub fn build_record_batch_with_key(schema: Schema, rows: Vec) -> RecordBatchWithKey { + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, 
&row_projected_schema); + builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub async fn check_iterator(iter: &mut T, expected_rows: Vec) { + let mut visited_rows = 0; + while let Some(batch) = iter.next_batch().await.unwrap() { + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); +} diff --git a/analytic_engine/src/sampler.rs b/analytic_engine/src/sampler.rs new file mode 100644 index 0000000000..304d052327 --- /dev/null +++ b/analytic_engine/src/sampler.rs @@ -0,0 +1,448 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Segment duration sampler. + +use std::{ + collections::HashSet, + sync::{Arc, Mutex}, + time::Duration, +}; + +use common_types::time::{TimeRange, Timestamp}; +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::table_options; + +/// Initial size of timestamps set. +const INIT_CAPACITY: usize = 1000; +const HOUR_MS: u64 = 3600 * 1000; +const DAY_MS: u64 = 24 * HOUR_MS; +const AVAILABLE_DURATIONS: [u64; 8] = [ + 2 * HOUR_MS, + DAY_MS, + 7 * DAY_MS, + 30 * DAY_MS, + 180 * DAY_MS, + 360 * DAY_MS, + 5 * 360 * DAY_MS, + 10 * 360 * DAY_MS, +]; +const INTERVAL_RATIO: f64 = 0.9; +/// Expected points per timeseries in a segment, used to pick a proper segment +/// duration. +const POINTS_PER_SERIES: u64 = 100; +/// Max timestamp that wont overflow even using max duration. +const MAX_TIMESTAMP_MS_FOR_DURATION: i64 = + i64::MAX - 2 * AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1] as i64; +/// Minimun sample timestamps to compute duration. 
+const MIN_SAMPLES: usize = 2; + +#[derive(Debug, Snafu)] +#[snafu(display( + "Invalid timestamp to collect, timestamp:{:?}.\nBacktrace:\n{}", + timestamp, + backtrace +))] +pub struct Error { + timestamp: Timestamp, + backtrace: Backtrace, +} + +define_result!(Error); + +/// Segment duration sampler. +/// +/// Collects all timestamps and then yield a suggested segment duration to hold +/// all data with similar timestamp interval. +pub trait DurationSampler { + /// Collect a timestamp. + fn collect(&self, timestamp: Timestamp) -> Result<()>; + + /// Returns a suggested duration to partition the timestamps or default + /// duration if no enough timestamp has been sampled. + /// + /// Note that this method may be invoked more than once. + fn suggest_duration(&self) -> Duration; + + /// Returns a vector of time range with suggested duration that can hold all + /// timestamps collected by this sampler. + fn ranges(&self) -> Vec; + + // TODO(yingwen): Memory usage. +} + +pub type SamplerRef = Arc; + +struct State { + /// Deduplicated timestamps. + deduped_timestamps: HashSet, + /// Cached suggested duration. + duration: Option, + /// Sorted timestamps cache, empty if `duration` is None. 
+ sorted_timestamps: Vec, +} + +impl State { + fn clear_cache(&mut self) { + self.duration = None; + self.sorted_timestamps.clear(); + } +} + +pub struct DefaultSampler { + state: Mutex, +} + +impl Default for DefaultSampler { + fn default() -> Self { + Self { + state: Mutex::new(State { + deduped_timestamps: HashSet::with_capacity(INIT_CAPACITY), + duration: None, + sorted_timestamps: Vec::new(), + }), + } + } +} + +impl DurationSampler for DefaultSampler { + fn collect(&self, timestamp: Timestamp) -> Result<()> { + ensure!( + timestamp.as_i64() < MAX_TIMESTAMP_MS_FOR_DURATION, + Context { timestamp } + ); + + let mut state = self.state.lock().unwrap(); + state.deduped_timestamps.insert(timestamp); + state.clear_cache(); + + Ok(()) + } + + fn suggest_duration(&self) -> Duration { + if let Some(v) = self.duration() { + return v; + } + + let timestamps = self.compute_sorted_timestamps(); + let picked = match evaluate_interval(×tamps) { + Some(interval) => pick_duration(interval), + None => table_options::DEFAULT_SEGMENT_DURATION, + }; + + { + // Cache the picked duration. + let mut state = self.state.lock().unwrap(); + state.duration = Some(picked); + state.sorted_timestamps = timestamps; + } + + picked + } + + fn ranges(&self) -> Vec { + let duration = self.suggest_duration(); + let sorted_timestamps = self.cached_sorted_timestamps(); + // This type hint is needed to make `ranges.last()` work. + let mut ranges: Vec = Vec::new(); + + for ts in sorted_timestamps { + if let Some(range) = ranges.last() { + if range.contains(ts) { + continue; + } + } + + // collect() ensures timestamp won't overflow. 
+ let range = TimeRange::bucket_of(ts, duration).unwrap(); + ranges.push(range); + } + + ranges + } +} + +impl DefaultSampler { + fn cached_sorted_timestamps(&self) -> Vec { + self.state.lock().unwrap().sorted_timestamps.clone() + } + + fn compute_sorted_timestamps(&self) -> Vec { + let mut timestamps: Vec<_> = { + let state = self.state.lock().unwrap(); + state.deduped_timestamps.iter().copied().collect() + }; + + timestamps.sort_unstable(); + + timestamps + } + + fn duration(&self) -> Option { + self.state.lock().unwrap().duration + } +} + +fn evaluate_interval(sorted_timestamps: &[Timestamp]) -> Option { + if sorted_timestamps.len() < MIN_SAMPLES { + return None; + } + + let mut intervals = Vec::with_capacity(sorted_timestamps.len()); + for i in 0..sorted_timestamps.len() - 1 { + let current = sorted_timestamps[i]; + let next = sorted_timestamps[i + 1]; + let interval = next.as_i64() - current.as_i64(); + intervals.push(interval); + } + + intervals.sort_unstable(); + + let mut index = (intervals.len() as f64 * INTERVAL_RATIO) as usize; + if index > 1 { + index -= 1; + }; + let selected = intervals[index]; + // Interval should larger than 0. + assert!(selected > 0); + + Some(selected as u64) +} + +fn pick_duration(interval: u64) -> Duration { + let scaled_interval = interval.checked_mul(POINTS_PER_SERIES).unwrap_or(u64::MAX); + for du_ms in AVAILABLE_DURATIONS { + if du_ms > scaled_interval { + return Duration::from_millis(du_ms); + } + } + + // No duration larger than scaled interval, returns the largest duration. 
+ let du_ms = AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1]; + + Duration::from_millis(du_ms) +} + +#[cfg(test)] +mod tests { + use super::*; + + const SEC_MS: u64 = 1000; + const MIN_MS: u64 = 60 * SEC_MS; + + #[test] + fn test_pick_duration() { + let cases = [ + (1, 2 * HOUR_MS), + (5 * SEC_MS, 2 * HOUR_MS), + (15 * SEC_MS, 2 * HOUR_MS), + (MIN_MS, 2 * HOUR_MS), + (5 * MIN_MS, DAY_MS), + (10 * MIN_MS, DAY_MS), + (30 * MIN_MS, 7 * DAY_MS), + (HOUR_MS, 7 * DAY_MS), + (4 * HOUR_MS, 30 * DAY_MS), + (8 * HOUR_MS, 180 * DAY_MS), + (DAY_MS, 180 * DAY_MS), + (3 * DAY_MS, 360 * DAY_MS), + (7 * DAY_MS, 5 * 360 * DAY_MS), + (30 * DAY_MS, 10 * 360 * DAY_MS), + (360 * DAY_MS, 10 * 360 * DAY_MS), + (10 * 360 * DAY_MS, 10 * 360 * DAY_MS), + (20 * 360 * DAY_MS, 10 * 360 * DAY_MS), + ]; + + for (i, (interval, expect)) in cases.iter().enumerate() { + assert_eq!( + *expect, + pick_duration(*interval).as_millis() as u64, + "Case {}", + i + ); + } + } + + #[test] + fn test_empty_sampler() { + let sampler = DefaultSampler::default(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + assert!(sampler.ranges().is_empty()); + } + + #[test] + fn test_one_sample() { + let sampler = DefaultSampler::default(); + + sampler.collect(Timestamp::new(0)).unwrap(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = + TimeRange::bucket_of(Timestamp::new(0), table_options::DEFAULT_SEGMENT_DURATION) + .unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn test_all_sample_same() { + let sampler = DefaultSampler::default(); + + let ts = Timestamp::now(); + for _ in 0..5 { + sampler.collect(ts).unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = TimeRange::bucket_of(ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn 
test_collect_invalid() { + let sampler = DefaultSampler::default(); + + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION - 1)) + .is_ok()); + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION)) + .is_err()); + } + + #[test] + fn test_sampler_cache() { + let sampler = DefaultSampler::default(); + + let ts1 = Timestamp::now(); + for i in 0..3 { + sampler + .collect(Timestamp::new(ts1.as_i64() + i * SEC_MS as i64)) + .unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range1 = + TimeRange::bucket_of(ts1, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1], &sampler.ranges()[..]); + + // A new timestamp is sampled. + let ts2 = Timestamp::new(ts1.as_i64() + DAY_MS as i64); + sampler.collect(ts2).unwrap(); + + assert!(sampler.state.lock().unwrap().duration.is_none()); + assert!(sampler.state.lock().unwrap().sorted_timestamps.is_empty()); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range2 = + TimeRange::bucket_of(ts2, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1, time_range2], &sampler.ranges()[..]); + } + + fn test_suggest_duration_and_ranges_case( + timestamps: &[i64], + duration: u64, + ranges: &[(i64, i64)], + ) { + let sampler = DefaultSampler::default(); + + for ts in timestamps { + sampler.collect(Timestamp::new(*ts)).unwrap(); + } + + assert_eq!(Duration::from_millis(duration), sampler.suggest_duration()); + + let suggested_ranges = sampler.ranges(); + for (range, suggested_range) in ranges.iter().zip(suggested_ranges) { + assert_eq!(range.0, suggested_range.inclusive_start().as_i64()); + assert_eq!(range.1, suggested_range.exclusive_end().as_i64()); + } + } + + #[test] + fn test_suggest_duration_and_ranges() { + test_suggest_duration_and_ranges_case( + // Intervals: 3, 5 + &[100, 103, 108], + 2 * HOUR_MS, + &[(0, 2 * HOUR_MS as i64)], 
+ ); + + let now_ts = Timestamp::now(); + let now = now_ts.as_i64(); + let sec_ms_i64 = SEC_MS as i64; + + let bucket = TimeRange::bucket_of(now_ts, Duration::from_millis(2 * HOUR_MS)).unwrap(); + let expect_range = ( + bucket.inclusive_start().as_i64(), + bucket.exclusive_end().as_i64(), + ); + test_suggest_duration_and_ranges_case( + // Intervals: 5s, 5s, 5s, 5s, 100s, + &[ + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + // Same with previous case, but shuffle the input timestamps. + test_suggest_duration_and_ranges_case( + &[ + now + 3 * 5 * sec_ms_i64, + now, + now + 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + test_suggest_duration_and_ranges_case( + // Intervals: nine 5s and one 8h + &[ + now + 5 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 7 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 6 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 8 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 9 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + ], + 2 * HOUR_MS, + &[ + expect_range, + ( + expect_range.0 + 8 * HOUR_MS as i64, + expect_range.1 + 8 * HOUR_MS as i64, + ), + ], + ); + } +} diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs new file mode 100644 index 0000000000..80e673778a --- /dev/null +++ b/analytic_engine/src/setup.rs @@ -0,0 +1,103 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Setup the analytic engine + +use std::{path::Path, sync::Arc}; + +use common_util::define_result; +use object_store::disk::File; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use wal::{manager, rocks_impl::manager::Builder as WalBuilder}; + +use crate::{ + context::OpenContext, engine::TableEngineImpl, instance::Instance, meta::details::ManifestImpl, + sst::factory::FactoryImpl, AnalyticTableEngine, Config, EngineInstance, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to open engine instance, err:{}", source))] + OpenInstance { + source: crate::instance::open::Error, + }, + + #[snafu(display("Failed to open wal, err:{}", source))] + OpenWal { source: manager::error::Error }, + + #[snafu(display("Failed to open wal for manifest, err:{}", source))] + OpenManifestWal { source: manager::error::Error }, + + #[snafu(display("Failed to open manifest, err:{}", source))] + OpenManifest { source: crate::meta::details::Error }, +} + +define_result!(Error); + +const WAL_DIR_NAME: &str = "wal"; +const MANIFEST_DIR_NAME: &str = "manifest"; +const STORE_DIR_NAME: &str = "store"; + +/// Open an [AnalyticTableEngine] instance +pub async fn open_analytic_table_engine( + config: Config, + engine_runtimes: Arc, +) -> Result { + let instance = open_instance(config.clone(), engine_runtimes).await?; + + Ok(TableEngineImpl::new(instance)) +} + +async fn open_instance( + config: Config, + engine_runtimes: Arc, +) -> Result { + let write_runtime = engine_runtimes.write_runtime.clone(); + let data_path = Path::new(&config.data_path); + let wal_path = data_path.join(WAL_DIR_NAME); + let wal_manager = WalBuilder::with_default_rocksdb_config(wal_path, write_runtime.clone()) + .build() + .context(OpenWal)?; + + let manifest_path = data_path.join(MANIFEST_DIR_NAME); + let manifest_wal = WalBuilder::with_default_rocksdb_config(manifest_path, 
write_runtime) + .build() + .context(OpenManifestWal)?; + + let manifest = ManifestImpl::open(manifest_wal, config.manifest.clone()) + .await + .context(OpenManifest)?; + + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = &config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(*sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = &config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(*sst_data_cache_cap))) + } else { + None + }; + + let sst_path = data_path.join(STORE_DIR_NAME); + let store = File::new(sst_path); + let open_ctx = OpenContext { + config, + runtimes: engine_runtimes, + meta_cache, + data_cache, + }; + + let instance = Instance::open(open_ctx, manifest, wal_manager, store, FactoryImpl) + .await + .context(OpenInstance)?; + + Ok(instance) +} diff --git a/analytic_engine/src/space.rs b/analytic_engine/src/space.rs new file mode 100644 index 0000000000..d7ab539571 --- /dev/null +++ b/analytic_engine/src/space.rs @@ -0,0 +1,305 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table space +//! +//! A table space acts like a namespace of a bunch of tables, tables under +//! 
different space can use same table name + +use std::{ + fmt, + sync::{Arc, RwLock}, +}; + +use arena::CollectorRef; +use common_util::define_result; +use log::info; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use tokio::sync::Mutex; + +use crate::{ + instance::{mem_collector::MemUsageCollector, write_worker::WriteGroup}, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + sst::file::FilePurger, + table::data::{TableData, TableDataRef, TableDataSet}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table data, table:{}, err:{}", table, source))] + CreateTableData { + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to store meta data, err:{}", source))] + WriteMeta { + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::TableExists { table, backtrace } => Self::TableExists { table, backtrace }, + Error::CreateTableData { ref table, .. } => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + Error::WriteMeta { .. 
} => Self::WriteMeta { + source: Box::new(err), + }, + } + } +} + +/// Holds references to the table data and its space +/// +/// REQUIRE: The table must belongs to the space +#[derive(Clone)] +pub struct SpaceAndTable { + /// The space of the table + space: SpaceRef, + /// Data of the table + table_data: TableDataRef, +} + +impl SpaceAndTable { + /// Create SpaceAndTable + /// + /// REQUIRE: The table must belongs to the space + pub fn new(space: SpaceRef, table_data: TableDataRef) -> Self { + // Checks table is in space + debug_assert!(space + .table_datas + .read() + .unwrap() + .find_table(&table_data.name) + .is_some()); + + Self { space, table_data } + } + + /// Get space info + #[inline] + pub fn space(&self) -> &SpaceRef { + &self.space + } + + /// Get table data + #[inline] + pub fn table_data(&self) -> &TableDataRef { + &self.table_data + } +} + +impl fmt::Debug for SpaceAndTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SpaceAndTable") + .field("space_id", &self.space.id) + .field("space_name", &self.space.name) + .field("table_id", &self.table_data.id) + .field("table_name", &self.table_data.name) + .finish() + } +} + +/// Name type of space +// TODO(yingwen): Or use binary string? +pub type SpaceName = String; +/// Reference of space name +pub type SpaceNameRef<'a> = &'a str; +/// Space id +// TODO(yingwen): Or just use something like uuid as space id? 
+pub type SpaceId = u32; + +/// A space can hold mulitple tables +pub struct Space { + /// Space id + pub id: SpaceId, + /// Space name + pub name: SpaceName, + /// Data of tables in this space + /// + /// Adding table into it should acquire the space lock first, then the write + /// lock + table_datas: RwLock, + /// Space lock + /// + /// Persisting meta update of this space is protected by this lock + mutex: Mutex<()>, + + /// Write workers + pub write_group: WriteGroup, + /// Space memtable memory usage collector + pub mem_usage_collector: Arc, + /// The maximum write buffer size used for single space. + pub write_buffer_size: usize, +} + +impl Space { + pub fn new( + id: SpaceId, + name: SpaceName, + write_buffer_size: usize, + write_group: WriteGroup, + engine_mem_collector: CollectorRef, + ) -> Self { + Self { + id, + name, + table_datas: RwLock::new(TableDataSet::new()), + mutex: Mutex::new(()), + write_group, + mem_usage_collector: Arc::new(MemUsageCollector::with_parent(engine_mem_collector)), + write_buffer_size, + } + } + + /// Returns true when space total memtable memory usage reaches + /// space_write_buffer_size limit. + #[inline] + pub fn should_flush_space(&self) -> bool { + self.write_buffer_size > 0 && self.memtable_memory_usage() >= self.write_buffer_size + } + + /// Find the table in space which it's memtable consumes maximum memory. + #[inline] + pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .read() + .unwrap() + .find_maximum_memory_usage_table() + } + + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.mem_usage_collector.total_memory_allocated() + } + + pub async fn close(&self) -> Result<()> { + // Stop the write group. 
+ self.write_group.stop().await; + + Ok(()) + } + + /// Create a table under this space + /// + /// Returns error if the table already exists + pub async fn create_table( + &self, + request: CreateTableRequest, + manifest: &Meta, + table_opts: &TableOptions, + purger: &FilePurger, + ) -> Result { + info!( + "Space create table, space_id:{}, space_name:{}, request:{:?}", + self.id, self.name, request + ); + + // Checks whether the table is exists + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Choose a write worker for this table + let write_handle = self.write_group.choose_worker(request.table_id); + + let _lock = self.mutex.lock().await; + + // Double check for table existence under space lock + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: self.id, + table_id: request.table_id, + table_name: request.table_name.clone(), + schema: request.table_schema.clone(), + opts: table_opts.clone(), + }); + manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteMeta)?; + + // Update memory state + let table_name = request.table_name.clone(); + let table_data = Arc::new( + TableData::new( + self.id, + request, + write_handle, + table_opts.clone(), + purger, + self.mem_usage_collector.clone(), + ) + .context(CreateTableData { table: &table_name })?, + ); + + self.insert_table(table_data.clone()); + + Ok(table_data) + } + + /// Insert table data into space memory state if the table is + /// absent. 
For internal use only + /// + /// Panic if the table is already exists + pub(crate) fn insert_table(&self, table_data: TableDataRef) { + let success = self + .table_datas + .write() + .unwrap() + .insert_if_absent(table_data); + assert!(success); + } + + /// Find table under this space by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.read().unwrap().find_table(table_name) + } + + /// Find table under this space by its id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.table_datas.read().unwrap().find_table_by_id(table_id) + } + + /// Remove table under this space by table name + pub fn remove_table(&self, table_name: &str) -> Option { + self.table_datas.write().unwrap().remove_table(table_name) + } + + /// Returns the total table num in this space + pub fn table_num(&self) -> usize { + self.table_datas.read().unwrap().table_num() + } + + /// List all tables of this space to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + self.table_datas.read().unwrap().list_all_tables(tables) + } +} + +/// A reference to space +pub type SpaceRef = Arc; diff --git a/analytic_engine/src/sst/builder.rs b/analytic_engine/src/sst/builder.rs new file mode 100644 index 0000000000..3eecbcdf2a --- /dev/null +++ b/analytic_engine/src/sst/builder.rs @@ -0,0 +1,76 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Sst builder trait definition + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, request_id::RequestId}; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Failed to persist sst content, path:{}, err:{}", path, source))] + Persist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to encode meta data, err:{}", source))] + EncodeMetaData { + source: Box, + }, + + #[snafu(display("Failed to get sst file size, path:{}", path))] + GetFileSize { path: String }, + + #[snafu(display( + "Failed to encode record batch into sst, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeRecordBatch { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to poll record batch, err:{}", source))] + PollRecordBatch { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +pub type RecordBatchStreamItem = + std::result::Result>; +// TODO(yingwen): SstReader also has a RecordBatchStream, can we use same type? +pub type RecordBatchStream = Box + Send + Unpin>; + +#[derive(Debug, Copy, Clone)] +pub struct SstInfo { + pub file_size: usize, + pub row_num: usize, +} + +/// The builder for sst. +/// +/// The caller provides a stream of [RecordBatch] and the builder takes +/// responsibilities for persisting the records. +#[async_trait] +pub trait SstBuilder { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result; +} diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs new file mode 100644 index 0000000000..f910468515 --- /dev/null +++ b/analytic_engine/src/sst/factory.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Factory for different kinds sst builder and reader. + +use std::{fmt::Debug, sync::Arc}; + +use common_types::projected_schema::ProjectedSchema; +use common_util::runtime::Runtime; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{ + sst::{ + builder::SstBuilder, + parquet::{builder::ParquetSstBuilder, reader::ParquetSstReader}, + reader::SstReader, + }, + table_options::Compression, +}; + +pub trait Factory: Clone { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; +} + +#[derive(Debug, Copy, Clone)] +pub enum SstType { + Parquet, +} + +#[derive(Debug, Clone)] +pub struct SstReaderOptions { + pub sst_type: SstType, + pub read_batch_row_num: usize, + pub reverse: bool, + pub projected_schema: ProjectedSchema, + pub predicate: PredicateRef, + pub meta_cache: Option, + pub data_cache: Option, + pub runtime: Arc, +} + +#[derive(Debug, Clone)] +pub struct SstBuilderOptions { + pub sst_type: SstType, + pub num_rows_per_row_group: usize, + pub compression: Compression, +} + +#[derive(Debug, Clone)] +pub struct FactoryImpl; + +impl Factory for FactoryImpl { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstReader::new(path, storage, options))), + } + } + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstBuilder::new(path, storage, options))), + } + } +} diff --git a/analytic_engine/src/sst/file.rs b/analytic_engine/src/sst/file.rs new file mode 100644 
index 0000000000..00bf345e66 --- /dev/null +++ b/analytic_engine/src/sst/file.rs @@ -0,0 +1,699 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst file and storage info + +use std::{ + borrow::Borrow, + cmp, + collections::{BTreeMap, HashSet}, + convert::TryFrom, + fmt, + fmt::Debug, + hash::{Hash, Hasher}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use common_types::{ + bytes::Bytes, + schema::Schema, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::{ + define_result, + metric::Meter, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use proto::{common::TimeRange as TimeRangePb, sst::SstMetaData as SstMetaDataPb}; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::sync::{ + mpsc::{self, UnboundedReceiver, UnboundedSender}, + Mutex, +}; + +use crate::{space::SpaceId, sst::manager::FileId, table::sst_util}; + +/// Error of sst file. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Failed to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to join purger, err:{}", source))] + StopPurger { source: common_util::runtime::Error }, +} + +define_result!(Error); + +pub type Level = u16; + +// TODO(yingwen): Order or split file by time range to speed up filter (even in +// level 0). +/// Manage files of single level +pub struct LevelHandler { + pub level: Level, + /// All files in current level. 
+ files: FileHandleSet, +} + +impl LevelHandler { + pub fn new(level: u16) -> Self { + Self { + level, + files: FileHandleSet::default(), + } + } + + #[inline] + pub fn insert(&mut self, file: FileHandle) { + self.files.insert(file); + } + + pub fn latest_sst(&self) -> Option { + self.files.latest() + } + + pub fn pick_ssts(&self, time_range: TimeRange) -> Vec { + if self.level == 0 { + self.files.files_by_time_range(time_range) + } else { + Vec::new() + } + } + + #[inline] + pub fn remove_ssts(&mut self, file_ids: &[FileId]) { + self.files.remove_by_ids(file_ids); + } + + pub fn iter_ssts(&self) -> Iter { + let iter = self.files.file_map.values(); + Iter(iter) + } + + #[inline] + pub fn collect_expired( + &self, + expire_time: Option, + expired_files: &mut Vec, + ) { + self.files.collect_expired(expire_time, expired_files); + } + + #[inline] + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.files.has_expired_sst(expire_time) + } +} + +pub struct Iter<'a>(std::collections::btree_map::Values<'a, FileOrdKey, FileHandle>); + +impl<'a> Iterator for Iter<'a> { + type Item = &'a FileHandle; + + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[derive(Clone)] +pub struct FileHandle { + inner: Arc, +} + +impl PartialEq for FileHandle { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for FileHandle {} + +impl Hash for FileHandle { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +impl FileHandle { + pub fn new(meta: FileMeta, purge_queue: FilePurgeQueue) -> Self { + Self { + inner: Arc::new(FileHandleInner { + meta, + purge_queue, + being_compacted: AtomicBool::new(false), + metrics: SstMetrics::default(), + }), + } + } + + #[inline] + pub fn read_meter(&self) -> Arc { + self.inner.metrics.read_meter.clone() + } + + #[inline] + pub fn row_num(&self) -> u64 { + self.inner.meta.meta.row_num + } + + #[inline] + pub fn id(&self) -> FileId { + self.inner.meta.id + } + + #[inline] + pub fn 
id_ref(&self) -> &FileId { + &self.inner.meta.id + } + + #[inline] + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.inner.meta.intersect_with_time_range(time_range) + } + + #[inline] + pub fn min_key(&self) -> Bytes { + self.inner.meta.meta.min_key.clone() + } + + #[inline] + pub fn max_key(&self) -> Bytes { + self.inner.meta.meta.max_key.clone() + } + + #[inline] + pub fn time_range(&self) -> TimeRange { + self.inner.meta.meta.time_range + } + + #[inline] + pub fn time_range_ref(&self) -> &TimeRange { + &self.inner.meta.meta.time_range + } + + #[inline] + pub fn max_sequence(&self) -> SequenceNumber { + self.inner.meta.meta.max_sequence + } + + #[inline] + pub fn being_compacted(&self) -> bool { + self.inner.being_compacted.load(Ordering::Relaxed) + } + + #[inline] + pub fn size(&self) -> u64 { + self.inner.meta.meta.size + } + + #[inline] + pub fn set_being_compacted(&self, value: bool) { + self.inner.being_compacted.store(value, Ordering::Relaxed); + } +} + +impl fmt::Debug for FileHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileHandle") + .field("meta", &self.inner.meta) + .field("being_compacted", &self.being_compacted()) + .field("metrics", &self.inner.metrics) + .finish() + } +} + +struct SstMetrics { + pub read_meter: Arc, + pub key_num: usize, +} + +impl Default for SstMetrics { + fn default() -> Self { + SstMetrics { + read_meter: Arc::new(Meter::new()), + key_num: 0, + } + } +} + +impl fmt::Debug for SstMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SstMetrics") + .field("read_meter", &self.read_meter.h2_rate()) + .field("key_num", &self.key_num) + .finish() + } +} + +struct FileHandleInner { + meta: FileMeta, + purge_queue: FilePurgeQueue, + /// The file is being compacting. 
+ being_compacted: AtomicBool, + metrics: SstMetrics, +} + +impl Drop for FileHandleInner { + fn drop(&mut self) { + debug!("FileHandle is dropped, meta:{:?}", self.meta); + + // Push file cannot block or be async because we are in drop(). + self.purge_queue.push_file(self.meta.id); + } +} + +/// Used to order [FileHandle] by (end_time, start_time, file_id) +#[derive(PartialEq, Eq, PartialOrd, Ord)] +struct FileOrdKey { + exclusive_end: Timestamp, + inclusive_start: Timestamp, + file_id: FileId, +} + +impl FileOrdKey { + fn for_seek(exclusive_end: Timestamp) -> Self { + Self { + exclusive_end, + inclusive_start: Timestamp::MIN, + file_id: 0, + } + } + + fn key_of(file: &FileHandle) -> Self { + Self { + exclusive_end: file.time_range().exclusive_end(), + inclusive_start: file.time_range().inclusive_start(), + file_id: file.id(), + } + } +} + +/// Used to index [FileHandle] by file_id +struct FileHandleHash(FileHandle); + +impl PartialEq for FileHandleHash { + fn eq(&self, other: &Self) -> bool { + self.0.id() == other.0.id() + } +} + +impl Eq for FileHandleHash {} + +impl Hash for FileHandleHash { + fn hash(&self, state: &mut H) { + self.0.id().hash(state); + } +} + +impl Borrow for FileHandleHash { + #[inline] + fn borrow(&self) -> &FileId { + self.0.id_ref() + } +} + +#[derive(Default)] +struct FileHandleSet { + /// Files ordered by time range and id. + file_map: BTreeMap, + /// Files indexed by file id, used to speed up removal. + id_to_files: HashSet, +} + +impl FileHandleSet { + fn latest(&self) -> Option { + if let Some(file) = self.file_map.values().rev().next() { + return Some(file.clone()); + } + None + } + + fn files_by_time_range(&self, time_range: TimeRange) -> Vec { + // Seek to first sst whose end time >= time_range.inclusive_start(). + let seek_key = FileOrdKey::for_seek(time_range.inclusive_start()); + self.file_map + .range(seek_key..) 
+ .into_iter() + .filter_map(|(_key, file)| { + if file.intersect_with_time_range(time_range) { + Some(file.clone()) + } else { + None + } + }) + .collect() + } + + fn insert(&mut self, file: FileHandle) { + self.file_map + .insert(FileOrdKey::key_of(&file), file.clone()); + self.id_to_files.insert(FileHandleHash(file)); + } + + fn remove_by_ids(&mut self, file_ids: &[FileId]) { + for file_id in file_ids { + if let Some(file) = self.id_to_files.take(file_id) { + let key = FileOrdKey::key_of(&file.0); + self.file_map.remove(&key); + } + } + } + + /// Collect ssts with time range is expired. + fn collect_expired(&self, expire_time: Option, expired_files: &mut Vec) { + for file in self.file_map.values() { + if file.time_range().is_expired(expire_time) { + expired_files.push(file.clone()); + } else { + // Files are sorted by end time first, so there is no more file whose end time + // is less than `expire_time`. + break; + } + } + } + + fn has_expired_sst(&self, expire_time: Option) -> bool { + // Files are sorted by end time first, so check first file is enough. 
+ if let Some(file) = self.file_map.values().next() { + return file.time_range().is_expired(expire_time); + } + + false + } +} + +/// Meta of a sst file, immutable once created +#[derive(Debug, Clone)] +pub struct FileMeta { + /// Id of the sst file + pub id: FileId, + pub meta: SstMetaData, +} + +impl FileMeta { + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.meta.time_range.intersect_with(time_range) + } +} + +/// Meta data of a sst file, immutable once created +#[derive(Debug, Clone, PartialEq)] +pub struct SstMetaData { + pub min_key: Bytes, + pub max_key: Bytes, + /// Time Range of the sst + pub time_range: TimeRange, + /// Max sequence number in the sst + pub max_sequence: SequenceNumber, + pub schema: Schema, + /// file size in bytes + pub size: u64, + // total row number + pub row_num: u64, +} + +impl From for SstMetaDataPb { + fn from(src: SstMetaData) -> Self { + let mut target = SstMetaDataPb::default(); + target.set_min_key(src.min_key.to_vec()); + target.set_max_key(src.max_key.to_vec()); + target.set_max_sequence(src.max_sequence); + let time_range = TimeRangePb::from(src.time_range); + target.set_time_range(time_range); + target.set_schema(src.schema.into()); + target.set_size(src.size); + target.set_row_num(src.row_num); + + target + } +} + +impl TryFrom for SstMetaData { + type Error = Error; + + fn try_from(mut src: SstMetaDataPb) -> Result { + let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?; + let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?; + Ok(Self { + min_key: src.min_key.into(), + max_key: src.max_key.into(), + time_range, + max_sequence: src.max_sequence, + schema, + size: src.size, + row_num: src.row_num, + }) + } +} + +// Queue to store files to be deleted for a table. +#[derive(Clone)] +pub struct FilePurgeQueue { + // Wrap a inner struct to avoid storing space/table ids for each file. 
+ inner: Arc, +} + +impl FilePurgeQueue { + pub fn new(space_id: SpaceId, table_id: TableId, sender: UnboundedSender) -> Self { + Self { + inner: Arc::new(FilePurgeQueueInner { + space_id, + table_id, + sender, + closed: AtomicBool::new(false), + }), + } + } + + /// Close the purge queue, then all request pushed to this queue will be + /// ignored. This is mainly used to avoid files being deleted after the + /// db is closed. + pub fn close(&self) { + self.inner.closed.store(true, Ordering::SeqCst); + } + + fn push_file(&self, file_id: FileId) { + if self.inner.closed.load(Ordering::SeqCst) { + return; + } + + // Send the file id via a channel to file purger and delete the file from sst + // store in background. + let request = FilePurgeRequest { + space_id: self.inner.space_id, + table_id: self.inner.table_id, + file_id, + }; + + if let Err(send_res) = self.inner.sender.send(Request::Purge(request)) { + error!( + "Failed to send delete file request, request:{:?}", + send_res.0 + ); + } + } +} + +struct FilePurgeQueueInner { + space_id: SpaceId, + table_id: TableId, + closed: AtomicBool, + sender: UnboundedSender, +} + +#[derive(Debug)] +pub struct FilePurgeRequest { + space_id: SpaceId, + table_id: TableId, + file_id: FileId, +} + +#[derive(Debug)] +pub enum Request { + Purge(FilePurgeRequest), + Exit, +} + +/// Background file purger. +pub struct FilePurger { + sender: UnboundedSender, + handle: Mutex>>, +} + +impl FilePurger { + pub fn start( + runtime: &Runtime, + store: Arc, + ) -> Self { + // We must use unbound channel, so the sender wont block when the handle is + // dropped. + let (tx, rx) = mpsc::unbounded_channel(); + + // Spawn a background job to purge files. 
+ let handle = runtime.spawn(async { + Self::purge_file_loop(store, rx).await; + }); + + Self { + sender: tx, + handle: Mutex::new(Some(handle)), + } + } + + pub async fn stop(&self) -> Result<()> { + info!("Try to stop file purger"); + + if self.sender.send(Request::Exit).is_err() { + error!("File purge task already exited"); + } + + let mut handle = self.handle.lock().await; + // Also clear the handle to avoid await a ready future. + if let Some(h) = handle.take() { + h.await.context(StopPurger)?; + } + + Ok(()) + } + + pub fn create_purge_queue(&self, space_id: SpaceId, table_id: TableId) -> FilePurgeQueue { + FilePurgeQueue::new(space_id, table_id, self.sender.clone()) + } + + async fn purge_file_loop( + store: Arc, + mut receiver: UnboundedReceiver, + ) { + info!("File purger start"); + + while let Some(request) = receiver.recv().await { + match request { + Request::Purge(purge_request) => { + let mut sst_file_path = store.new_path(); + sst_util::set_sst_file_path( + purge_request.space_id, + purge_request.table_id, + purge_request.file_id, + &mut sst_file_path, + ); + + info!( + "File purger delete file, purge_request:{:?}, sst_file_path:{}", + purge_request, + sst_file_path.display() + ); + + if let Err(e) = store.delete(&sst_file_path).await { + error!( + "File purger failed to delete file, sst_file_path:{}, err:{}", + sst_file_path.display(), + e + ); + } + } + Request::Exit => break, + } + } + + info!("File purger exit"); + } +} + +/// Merge sst meta of given `files`, panic if `files` is empty. +/// +/// The size and row_num of the merged meta is initialized to 0. +pub fn merge_sst_meta(files: &[FileHandle], schema: Schema) -> SstMetaData { + let mut min_key = files[0].min_key(); + let mut max_key = files[0].max_key(); + let mut time_range_start = files[0].time_range().inclusive_start(); + let mut time_range_end = files[0].time_range().exclusive_end(); + let mut max_sequence = files[0].max_sequence(); + + if files.len() > 1 { + for file in &files[1..] 
{ + min_key = cmp::min(file.min_key(), min_key); + max_key = cmp::max(file.max_key(), max_key); + time_range_start = cmp::min(file.time_range().inclusive_start(), time_range_start); + time_range_end = cmp::max(file.time_range().exclusive_end(), time_range_end); + max_sequence = cmp::max(file.max_sequence(), max_sequence); + } + } + + SstMetaData { + min_key, + max_key, + time_range: TimeRange::new(time_range_start, time_range_end).unwrap(), + max_sequence, + schema, + // we don't know file size and total row number yet + size: 0, + row_num: 0, + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + + pub struct FilePurgerMocker; + + impl FilePurgerMocker { + pub fn mock() -> FilePurger { + let (sender, _receiver) = mpsc::unbounded_channel(); + + FilePurger { + sender, + handle: Mutex::new(None), + } + } + } + + #[must_use] + pub struct SstMetaDataMocker { + schema: Schema, + time_range: TimeRange, + max_sequence: SequenceNumber, + } + + impl SstMetaDataMocker { + pub fn new(schema: Schema) -> Self { + Self { + schema, + time_range: TimeRange::min_to_max(), + max_sequence: 1, + } + } + + pub fn time_range(mut self, range: TimeRange) -> Self { + self.time_range = range; + self + } + + pub fn max_sequence(mut self, max_sequence: SequenceNumber) -> Self { + self.max_sequence = max_sequence; + self + } + + pub fn build(&self) -> SstMetaData { + SstMetaData { + min_key: Bytes::new(), + max_key: Bytes::new(), + time_range: self.time_range, + max_sequence: self.max_sequence, + schema: self.schema.clone(), + size: 0, + row_num: 0, + } + } + } +} diff --git a/analytic_engine/src/sst/manager.rs b/analytic_engine/src/sst/manager.rs new file mode 100644 index 0000000000..2d64a8fafb --- /dev/null +++ b/analytic_engine/src/sst/manager.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Multi-level SST management + +use common_types::time::{TimeRange, Timestamp}; + +use crate::{ + compaction::ExpiredFiles, + sst::file::{FileHandle, FileMeta, FilePurgeQueue, Iter, Level, LevelHandler}, +}; + +/// Id for a sst file +pub type FileId = u64; +/// We use two level merge tree, the max level should less than u16::MAX +pub const MAX_LEVEL: usize = 2; + +/// A table level manager that manages all the sst files of the table +pub struct LevelsController { + levels: Vec, + purge_queue: FilePurgeQueue, +} + +impl Drop for LevelsController { + fn drop(&mut self) { + // Close the purge queue to avoid files being deleted. + self.purge_queue.close(); + } +} + +impl LevelsController { + /// Create an empty LevelsController + pub fn new(purge_queue: FilePurgeQueue) -> Self { + let mut levels = Vec::with_capacity(MAX_LEVEL); + for level in 0..MAX_LEVEL { + levels.push(LevelHandler::new(level as Level)); + } + + Self { + levels, + purge_queue, + } + } + + /// Add sst file to level + /// + /// Panic: If the level is greater than the max level + pub fn add_sst_to_level(&mut self, level: Level, file_meta: FileMeta) { + let level_handler = &mut self.levels[usize::from(level)]; + let file = FileHandle::new(file_meta, self.purge_queue.clone()); + + level_handler.insert(file); + } + + pub fn latest_sst(&self, level: Level) -> Option { + self.levels[usize::from(level)].latest_sst() + } + + /// Pick the ssts and collect it by `append_sst`. + pub fn pick_ssts( + &self, + time_range: TimeRange, + mut append_sst: impl FnMut(Level, &[FileHandle]), + ) { + for level_handler in self.levels.iter() { + let ssts = level_handler.pick_ssts(time_range); + append_sst(level_handler.level, &ssts); + } + } + + /// Remove sst files from level. 
+ /// + /// Panic: If the level is greater than the max level + pub fn remove_ssts_from_level(&mut self, level: Level, file_ids: &[FileId]) { + let level_handler = &mut self.levels[usize::from(level)]; + level_handler.remove_ssts(file_ids); + } + + /// Total number of levels. + pub fn num_levels(&self) -> Level { + self.levels.len() as Level + } + + /// Iter ssts at given `level`. + /// + /// Panic if level is out of bound. + pub fn iter_ssts_at_level(&self, level: Level) -> Iter { + let level_handler = &self.levels[usize::from(level)]; + level_handler.iter_ssts() + } + + pub fn collect_expired_at_level( + &self, + level: Level, + expire_time: Option, + ) -> Vec { + let level_handler = &self.levels[usize::from(level)]; + let mut expired = Vec::new(); + level_handler.collect_expired(expire_time, &mut expired); + + expired + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.levels + .iter() + .any(|level_handler| level_handler.has_expired_sst(expire_time)) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let mut expired = Vec::new(); + let num_levels = self.num_levels(); + for level in 0..num_levels { + let files = self.collect_expired_at_level(level, expire_time); + expired.push(ExpiredFiles { level, files }); + } + + expired + } +} + +#[cfg(test)] +pub mod tests { + use table_engine::table::TableId; + use tokio::sync::mpsc; + + use crate::sst::{ + file::{FileMeta, FilePurgeQueue, SstMetaData}, + manager::{FileId, LevelsController}, + }; + + #[must_use] + #[derive(Default)] + pub struct LevelsControllerMockBuilder { + sst_meta_vec: Vec, + } + + impl LevelsControllerMockBuilder { + pub fn add_sst(mut self, mut sst_meta: Vec) -> Self { + self.sst_meta_vec.append(&mut sst_meta); + self + } + + pub fn build(self) -> LevelsController { + let (tx, _rx) = mpsc::unbounded_channel(); + let file_purge_queue = FilePurgeQueue::new(100, TableId::from(101), tx); + let mut levels_controller = LevelsController::new(file_purge_queue); + for 
(id, sst_meta) in self.sst_meta_vec.into_iter().enumerate() { + levels_controller.add_sst_to_level( + 0, + FileMeta { + id: id as FileId, + meta: sst_meta, + }, + ); + } + levels_controller + } + } +} diff --git a/analytic_engine/src/sst/mod.rs b/analytic_engine/src/sst/mod.rs new file mode 100644 index 0000000000..a6fec9162b --- /dev/null +++ b/analytic_engine/src/sst/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST (Sorted String Table) file + +pub mod builder; +pub mod factory; +pub mod file; +pub mod manager; +pub mod parquet; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/builder.rs b/analytic_engine/src/sst/parquet/builder.rs new file mode 100644 index 0000000000..8bba10cc79 --- /dev/null +++ b/analytic_engine/src/sst/parquet/builder.rs @@ -0,0 +1,560 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst builder implementation based on parquet. + +use std::{ + io::SeekFrom, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::record_batch::RecordBatch as ArrowRecordBatch, + datafusion::parquet::basic::Compression, + parquet::{ + arrow::ArrowWriter, + file::{properties::WriterProperties, writer::TryClone}, + }, +}; +use async_trait::async_trait; +use common_types::{bytes::BufMut, request_id::RequestId}; +use futures::AsyncRead; +use log::debug; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{ensure, ResultExt}; + +use crate::sst::{ + builder::{RecordBatchStream, SstBuilder, *}, + factory::SstBuilderOptions, + file::SstMetaData, + parquet::encoding, +}; + +/// The implementation of sst based on parquet and object storage. +#[derive(Debug)] +pub struct ParquetSstBuilder<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + /// Max row group size. 
+ num_rows_per_row_group: usize, + compression: Compression, +} + +impl<'a, S: ObjectStore> ParquetSstBuilder<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstBuilderOptions) -> Self { + Self { + path, + storage, + num_rows_per_row_group: options.num_rows_per_row_group, + compression: options.compression.into(), + } + } +} + +/// A memory writer implementing the [ParquetWriter]. +/// +/// The writer accepts the encoded bytes by parquet format and provides the byte +/// stream to the reader. +#[derive(Clone, Debug)] +struct EncodingBuffer { + // In order to reuse the buffer, the buffer must be wrapped in the Arc and the Mutex because + // the writer is consumed when building a ArrowWriter. + inner: Arc>, +} + +impl Default for EncodingBuffer { + fn default() -> Self { + Self { + inner: Arc::new(Mutex::new(EncodingBufferInner { + bytes_written: 0, + read_offset: 0, + buf: Vec::new(), + })), + } + } +} + +impl std::io::Write for EncodingBuffer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + let mut inner = self.inner.lock().unwrap(); + inner.flush() + } +} + +impl std::io::Seek for EncodingBuffer { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.seek(pos) + } +} + +impl TryClone for EncodingBuffer { + fn try_clone(&self) -> std::io::Result { + Ok(self.clone()) + } +} + +impl EncodingBuffer { + fn read(&self, read_buf: &mut [u8]) -> usize { + let mut inner = self.inner.lock().unwrap(); + inner.read(read_buf) + } +} + +/// The underlying buffer implementing [ParquetWriter]. +/// +/// Provides the write function for [ArrowWriter] and read function for +/// [AsyncRead]. 
+#[derive(Clone, Debug)] +struct EncodingBufferInner { + bytes_written: usize, + read_offset: usize, + buf: Vec, +} + +impl std::io::Write for EncodingBufferInner { + /// Write the `buf` to the `self.buf`. + /// + /// The readable bytes should be exhausted before writing new bytes. + /// `self.bytes_written` and `self.read_offset` is updated after writing. + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if self.read_offset != 0 { + assert_eq!(self.buf.len(), self.read_offset); + self.buf.clear(); + self.buf.reserve(buf.len()); + // reset the read offset + self.read_offset = 0; + } + + let bytes_written = self.buf.write(buf)?; + // accumulate the written bytes + self.bytes_written += bytes_written; + + Ok(bytes_written) + } + + /// Actually nothing to flush. + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl std::io::Seek for EncodingBufferInner { + /// Given the assumption that the seek usage of the [ParquetWriter] in the + /// parquet project is just `seek(SeekFrom::Current(0))`, the + /// implementation panics if seek to a different target. + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + if let SeekFrom::Current(offset) = pos { + assert_eq!(offset, 0); + return Ok(self.bytes_written as u64); + } + + unreachable!("Only can handle the case where seek to current(0)") + } +} + +impl EncodingBufferInner { + /// Read the content in `self.buf[self.offset..]` into `read_buf`. + /// + /// When finishing reading, advance the `self.offset`. + fn read(&mut self, mut read_buf: &mut [u8]) -> usize { + if self.read_offset >= self.buf.len() { + return 0; + } + let remaining_size = self.buf.len() - self.read_offset; + + let read_len = remaining_size.min(read_buf.len()); + read_buf.put(&self.buf[self.read_offset..self.read_offset + read_len]); + + self.advance(read_len); + read_len + } + + /// Advance the `self.offset` by `len`. + /// + /// Caller should ensures the advanced offset wont exceed `self.buf.len()`. 
+ fn advance(&mut self, len: usize) { + self.read_offset += len; + + assert!(self.read_offset <= self.buf.len()); + } +} + +/// RecordBytesReader provides AsyncRead implementation for the encoded records +/// by parquet. +struct RecordBytesReader { + request_id: RequestId, + record_stream: RecordBatchStream, + encoding_buffer: EncodingBuffer, + arrow_writer: Mutex>>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: SstMetaData, + total_row_num: Arc, + arrow_record_batch_vec: Vec, + // Whether the underlying `record_stream` is finished + stream_finished: bool, + + fetched_row_num: usize, +} + +/// Build the write properties containing the sst meta data. +fn build_write_properties( + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, +) -> Result { + let meta_data_kv = encoding::encode_sst_meta_data(meta_data.clone()) + .map_err(|e| Box::new(e) as _) + .context(EncodeMetaData)?; + + Ok(WriterProperties::builder() + .set_key_value_metadata(Some(vec![meta_data_kv])) + .set_max_row_group_size(num_rows_per_row_group) + .set_compression(compression) + .build()) +} + +/// Encode the record batch with [ArrowWriter] and the encoded contents is +/// written to the [EncodingBuffer]. 
+// TODO(xikai): too many parameters +fn encode_record_batch( + arrow_writer: &mut Option>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, + mem_buf_writer: EncodingBuffer, + arrow_record_batch_vec: Vec, +) -> Result { + if arrow_record_batch_vec.is_empty() { + return Ok(0); + } + + let arrow_schema = arrow_record_batch_vec[0].schema(); + + // create arrow writer if not exist + if arrow_writer.is_none() { + let write_props = build_write_properties(num_rows_per_row_group, compression, meta_data)?; + let writer = ArrowWriter::try_new(mem_buf_writer, arrow_schema.clone(), Some(write_props)) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + *arrow_writer = Some(writer); + } + + let record_batch = ArrowRecordBatch::concat(&arrow_schema, &arrow_record_batch_vec) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + arrow_writer + .as_mut() + .unwrap() + .write(&record_batch) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + Ok(record_batch.num_rows()) +} + +fn close_writer(arrow_writer: &mut Option>) -> Result<()> { + if let Some(arrow_writer) = arrow_writer { + arrow_writer + .close() + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + } + + Ok(()) +} + +impl AsyncRead for RecordBytesReader { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let mut reader = self.get_mut(); + let size = reader.encoding_buffer.read(buf); + if size > 0 { + return Poll::Ready(Ok(size)); + } + + // The stream is also finished + if reader.stream_finished { + return Poll::Ready(Ok(0)); + } + + // FIXME(xikai): no data may cause empty sst file. + // fetch more rows from the stream. 
+ while reader.fetched_row_num < reader.num_rows_per_row_group { + match Pin::new(reader.record_stream.as_mut()).poll_next(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(v) => match v { + Some(record_batch) => match record_batch.context(PollRecordBatch) { + Ok(record_batch) => { + assert!( + !record_batch.is_empty(), + "found empty record batch, request id:{}", + reader.request_id + ); + + reader.fetched_row_num += record_batch.num_rows(); + reader + .arrow_record_batch_vec + .push(record_batch.into_record_batch().into_arrow_record_batch()); + } + Err(e) => { + return Poll::Ready(Err(std::io::Error::new( + std::io::ErrorKind::Other, + e, + ))) + } + }, + None => { + reader.stream_finished = true; + debug!( + "Record stream finished, request_id:{}, batch_len:{}, fetched_row_num:{}, num_rows_per_row_group:{}", + reader.request_id, + reader.arrow_record_batch_vec.len(), + reader.fetched_row_num, + reader.num_rows_per_row_group, + ); + break; + } + }, + } + } + + assert!(reader.stream_finished || reader.fetched_row_num >= reader.num_rows_per_row_group); + + // Reset fetched row num. 
+ reader.fetched_row_num = 0; + match encode_record_batch( + reader.arrow_writer.get_mut().unwrap(), + reader.num_rows_per_row_group, + reader.compression, + &reader.meta_data, + reader.encoding_buffer.clone(), + std::mem::take(&mut reader.arrow_record_batch_vec), + ) { + Err(e) => return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))), + Ok(row_num) => { + reader.total_row_num.fetch_add(row_num, Ordering::Relaxed); + } + } + + if reader.stream_finished { + if let Err(e) = close_writer(reader.arrow_writer.get_mut().unwrap()) { + return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))); + } + } + + Poll::Ready(Ok(reader.encoding_buffer.read(buf))) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstBuilder for ParquetSstBuilder<'a, S> { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result { + debug!( + "Build parquet file, request_id:{}, meta:{:?}, num_rows_per_row_group:{}", + request_id, meta, self.num_rows_per_row_group + ); + + let total_row_num = Arc::new(AtomicUsize::new(0)); + let reader = RecordBytesReader { + request_id, + record_stream, + encoding_buffer: EncodingBuffer::default(), + arrow_writer: Mutex::new(None), + num_rows_per_row_group: self.num_rows_per_row_group, + compression: self.compression, + total_row_num: total_row_num.clone(), + arrow_record_batch_vec: Vec::new(), + // TODO(xikai): should we avoid this clone? 
+ meta_data: meta.to_owned(), + stream_finished: false, + fetched_row_num: 0, + }; + + self.storage + .put(self.path, reader, None) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + let result = self + .storage + .list_with_delimiter(self.path) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + ensure!( + result.objects.len() == 1, + GetFileSize { + path: self.path.display(), + } + ); + + Ok(SstInfo { + file_size: result.objects[0].size, + row_num: total_row_num.load(Ordering::Relaxed), + }) + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + bytes::Bytes, + projected_schema::ProjectedSchema, + tests::{build_row, build_schema}, + time::{TimeRange, Timestamp}, + }; + use common_util::runtime::{self, Runtime}; + use futures::stream; + use object_store::disk::File; + use table_engine::predicate::Predicate; + use tempfile::tempdir; + + use super::*; + use crate::{ + row_iter::tests::build_record_batch_with_key, + sst::{ + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + parquet::reader::ParquetSstReader, + reader::{tests::check_stream, SstReader}, + }, + table_options, + }; + + // TODO(xikai): add test for reverse reader + + #[test] + fn test_parquet_build_and_read() { + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3]); + // TODO: num_rows should be [4, 4, 4, 3]? + parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 2, 4, 2, 3]); + // TODO: num_rows should be [5, 5, 5]? 
+ parquet_write_and_then_read_back(runtime, 5, vec![5, 1, 5, 1, 3]); + } + + fn parquet_write_and_then_read_back( + runtime: Arc, + num_rows_per_row_group: usize, + expected_num_rows: Vec, + ) { + runtime.block_on(async { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group, + compression: table_options::Compression::Uncompressed, + }; + + let dir = tempdir().unwrap(); + let root = dir.path(); + let store = File::new(root); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name("data.par"); + + let schema = build_schema(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_meta = SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema: schema.clone(), + size: 10, + row_num: 2, + }; + + let mut counter = 10; + let record_batch_stream = Box::new(stream::poll_fn(move |ctx| -> Poll> { + counter -= 1; + if counter == 0 { + return Poll::Ready(None); + } else if counter % 2 == 0 { + ctx.waker().wake_by_ref(); + return Poll::Pending; + } + + // reach here when counter is 9 7 5 3 1 + let ts = 100 + counter; + let rows = vec![ + build_row(b"a", ts, 10.0, "v4"), + build_row(b"b", ts, 10.0, "v4"), + build_row(b"c", ts, 10.0, "v4"), + ]; + let batch = build_record_batch_with_key(schema.clone(), rows); + Poll::Ready(Some(Ok(batch))) + })); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + let sst_info = builder + .build(RequestId::next_id(), &sst_meta, record_batch_stream) + .await + .unwrap(); + + assert_eq!(15, sst_info.row_num); + + // read sst back to test + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 5, + reverse: false, + projected_schema, + predicate: 
Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let mut reader = ParquetSstReader::new(&sst_file_path, &store, &sst_reader_options); + assert_eq!(reader.meta_data().await.unwrap(), &sst_meta); + assert_eq!( + expected_num_rows, + reader + .row_groups() + .await + .iter() + .map(|g| g.num_rows()) + .collect::>() + ); + + let mut stream = reader.read().await.unwrap(); + let mut expect_rows = vec![]; + for counter in &[9, 7, 5, 3, 1] { + expect_rows.push(build_row(b"a", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"b", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"c", 100 + counter, 10.0, "v4")); + } + check_stream(&mut stream, expect_rows).await; + }); + } +} diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs new file mode 100644 index 0000000000..ddb916b14d --- /dev/null +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::convert::TryFrom; + +use arrow_deps::parquet::file::metadata::KeyValue; +use common_types::bytes::{BytesMut, MemBufMut, Writer}; +use common_util::define_result; +use proto::sst::SstMetaData as SstMetaDataPb; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::sst::file::SstMetaData; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to encode sst meta data, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeIntoPb { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode sst meta data, base64 of meta value:{}, err:{}.\nBacktrace:\n{}", + meta_value, + source, + backtrace, + ))] + DecodeFromPb { + meta_value: String, + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta key, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKey { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display("Base64 meta value not found.\nBacktrace:\n{}", backtrace))] + Base64MetaValueNotFound { backtrace: Backtrace }, + + #[snafu(display( + "Invalid base64 meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace, + ))] + InvalidBase64MetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode base64 meta value, base64 of meta value:{}, err:{}", + meta_value, + source + ))] + DecodeBase64MetaValue { + meta_value: String, + source: base64::DecodeError, + }, + + #[snafu(display( + "Invalid meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta value header, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueHeader { + meta_value: String, + backtrace: Backtrace, + }, + + 
#[snafu(display("Failed to convert sst meta data from protobuf, err:{}", source))] + ConvertSstMetaData { source: crate::sst::file::Error }, +} + +define_result!(Error); + +pub const META_KEY: &str = "meta"; +pub const META_VALUE_HEADER: u8 = 0; + +/// Encode the sst meta data into binary key value pair. +pub fn encode_sst_meta_data(meta_data: SstMetaData) -> Result { + let meta_data_pb = SstMetaDataPb::from(meta_data); + + let mut buf = BytesMut::with_capacity(meta_data_pb.compute_size() as usize + 1); + buf.write_u8(META_VALUE_HEADER) + .expect("Should write header into the buffer successfully"); + + // encode the sst meta data into protobuf binary + { + let mut writer = Writer::new(&mut buf); + meta_data_pb + .write_to_writer(&mut writer) + .context(EncodeIntoPb)?; + } + Ok(KeyValue { + key: META_KEY.to_string(), + value: Some(base64::encode(buf.as_ref())), + }) +} + +/// Decode the sst meta data from the binary key value pair. +pub fn decode_sst_meta_data(kv: &KeyValue) -> Result { + ensure!( + kv.key == META_KEY, + InvalidMetaKey { + expect: META_KEY, + given: &kv.key, + } + ); + + let meta_value = kv.value.as_ref().context(Base64MetaValueNotFound)?; + ensure!( + !meta_value.is_empty(), + InvalidBase64MetaValueLen { meta_value } + ); + + let raw_bytes = base64::decode(meta_value).context(DecodeBase64MetaValue { meta_value })?; + + ensure!(!raw_bytes.is_empty(), InvalidMetaValueLen { meta_value }); + + ensure!( + raw_bytes[0] == META_VALUE_HEADER, + InvalidMetaValueHeader { meta_value } + ); + + let meta_data_pb: SstMetaDataPb = + Message::parse_from_bytes(&raw_bytes[1..]).context(DecodeFromPb { meta_value })?; + + SstMetaData::try_from(meta_data_pb).context(ConvertSstMetaData) +} diff --git a/analytic_engine/src/sst/parquet/mod.rs b/analytic_engine/src/sst/parquet/mod.rs new file mode 100644 index 0000000000..aaf82e4671 --- /dev/null +++ b/analytic_engine/src/sst/parquet/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2022 CeresDB Project Authors. 
Licensed under Apache-2.0. + +//! Sst implementation based on parquet. + +pub mod builder; +pub mod encoding; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/reader.rs b/analytic_engine/src/sst/parquet/reader.rs new file mode 100644 index 0000000000..f515855ff7 --- /dev/null +++ b/analytic_engine/src/sst/parquet/reader.rs @@ -0,0 +1,371 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader implementation based on parquet. + +use std::{ + fs::File, + pin::Pin, + sync::Arc, + task::{Context, Poll}, + time::Instant, +}; + +use arrow_deps::{ + arrow::{error::Result as ArrowResult, record_batch::RecordBatch}, + parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{metadata::RowGroupMetaData, reader::FileReader}, + }, +}; +use async_trait::async_trait; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{ArrowRecordBatchProjector, RecordBatchWithKey}, + schema::Schema, +}; +use common_util::runtime::Runtime; +use futures::Stream; +use log::{debug, error, trace}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use parquet::{ + reverse_reader::Builder as ReverseRecordBatchReaderBuilder, CachableSerializedFileReader, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::predicate::PredicateRef; +use tokio::sync::mpsc::{self, Receiver, Sender}; + +use crate::sst::{ + factory::SstReaderOptions, + file::SstMetaData, + parquet::encoding, + reader::{error::*, SstReader}, +}; + +const DEFAULT_CHANNEL_CAP: usize = 1000; + +pub async fn read_sst_meta( + storage: &S, + path: &S::Path, + meta_cache: &Option, + data_cache: &Option, +) -> Result<(CachableSerializedFileReader, SstMetaData)> { + let file = storage + .get(path) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // generate the file reader + let file_reader = CachableSerializedFileReader::new( + path.display(), 
+ file, + meta_cache.clone(), + data_cache.clone(), + ) + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // parse sst meta data + let sst_meta = { + let kv_metas = file_reader + .metadata() + .file_metadata() + .key_value_metadata() + .as_ref() + .context(SstMetaNotFound)?; + + ensure!(!kv_metas.is_empty(), EmptySstMeta); + + encoding::decode_sst_meta_data(&kv_metas[0]) + .map_err(|e| Box::new(e) as _) + .context(DecodeSstMeta)? + }; + + Ok((file_reader, sst_meta)) +} + +/// The implementation of sst based on parquet and object storage. +pub struct ParquetSstReader<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + projected_schema: ProjectedSchema, + predicate: PredicateRef, + meta_data: Option, + file_reader: Option>, + /// The batch of rows in one `record_batch`. + batch_size: usize, + /// Read the rows in reverse order. + reverse: bool, + channel_cap: usize, + + meta_cache: Option, + data_cache: Option, + + runtime: Arc, +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstReaderOptions) -> Self { + Self { + path, + storage, + projected_schema: options.projected_schema.clone(), + predicate: options.predicate.clone(), + meta_data: None, + file_reader: None, + batch_size: options.read_batch_row_num, + reverse: options.reverse, + channel_cap: DEFAULT_CHANNEL_CAP, + meta_cache: options.meta_cache.clone(), + data_cache: options.data_cache.clone(), + runtime: options.runtime.clone(), + } + } +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + async fn init_if_necessary(&mut self) -> Result<()> { + if self.meta_data.is_some() { + return Ok(()); + } + + let (file_reader, sst_meta) = + read_sst_meta(self.storage, self.path, &self.meta_cache, &self.data_cache).await?; + + self.file_reader = Some(file_reader); + self.meta_data = Some(sst_meta); + + 
Ok(()) + } + + fn read_record_batches(&mut self, tx: Sender>) -> Result<()> { + let path = self.path.display(); + ensure!(self.file_reader.is_some(), ReadAgain { path }); + + let file_reader = self.file_reader.take().unwrap(); + let batch_size = self.batch_size; + let schema = { + let meta_data = self.meta_data.as_ref().unwrap(); + meta_data.schema.clone() + }; + let projected_schema = self.projected_schema.clone(); + let row_projector = projected_schema + .try_project_with_key(&schema) + .map_err(|e| Box::new(e) as _) + .context(Projection)?; + let predicate = self.predicate.clone(); + let reverse = self.reverse; + + let _ = self.runtime.spawn_blocking(move || { + debug!( + "begin reading record batch from the sst:{}, predicate:{:?}, projection:{:?}", + path, predicate, projected_schema, + ); + + let mut send_failed = false; + let send = |v| -> Result<()> { + tx.blocking_send(v) + .map_err(|e| { + send_failed = true; + Box::new(e) as _ + }) + .context(Other)?; + Ok(()) + }; + + let reader = ProjectAndFilterReader { + file_path: path.clone(), + file_reader: Some(file_reader), + schema, + projected_schema, + row_projector, + predicate, + batch_size, + reverse, + }; + + let start_fetch = Instant::now(); + match reader.fetch_and_send_record_batch(send) { + Ok(row_num) => { + debug!( + "finish reading record batch({} rows) from the sst:{}, time cost:{:?}", + row_num, + path, + start_fetch.elapsed(), + ); + } + Err(e) => { + if send_failed { + error!("fail to send the fetched record batch result, err:{}", e); + } else { + error!( + "failed to read record batch from the sst:{}, err:{}", + path, e + ); + let _ = tx.blocking_send(Err(e)); + } + } + } + }); + + Ok(()) + } + + #[cfg(test)] + pub(crate) async fn row_groups(&mut self) -> &[RowGroupMetaData] { + self.init_if_necessary().await.unwrap(); + self.file_reader.as_ref().unwrap().metadata().row_groups() + } +} + +/// A reader for projection and filter on the parquet file. 
+struct ProjectAndFilterReader { + file_path: String, + file_reader: Option>, + schema: Schema, + projected_schema: ProjectedSchema, + row_projector: RowProjector, + predicate: PredicateRef, + batch_size: usize, + reverse: bool, +} + +impl ProjectAndFilterReader { + fn build_row_group_predicate(&self) -> Box bool + 'static> { + assert!(self.file_reader.is_some()); + + let row_groups = self.file_reader.as_ref().unwrap().metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + trace!("Finish build row group predicate, predicate:{:?}, schema:{:?}, filter_results:{:?}, row_groups meta data:{:?}", self.predicate, self.schema, filter_results, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } + + /// Generate the reader which has processed projection and filter. + /// This `file_reader` is consumed after calling this method. + fn project_and_filter_reader( + &mut self, + ) -> Result>>> { + assert!(self.file_reader.is_some()); + + let row_group_predicate = self.build_row_group_predicate(); + let mut file_reader = self.file_reader.take().unwrap(); + file_reader.filter_row_groups(&row_group_predicate); + + if self.reverse { + let mut builder = + ReverseRecordBatchReaderBuilder::new(Arc::new(file_reader), self.batch_size); + if !self.projected_schema.is_all_projection() { + builder = builder.projection(Some(self.row_projector.existed_source_projection())); + } + + let reverse_reader = builder + .build() + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reverse_reader)) + } else { + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); + + let reader = if self.projected_schema.is_all_projection() { + arrow_reader.get_record_reader(self.batch_size) + } else { + let projection = self.row_projector.existed_source_projection(); + arrow_reader.get_record_reader_by_columns(projection, self.batch_size) + }; + let reader = reader + .map_err(|e| Box::new(e) 
as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reader)) + } + } + + /// Fetch the record batch from the `reader` and send them. + /// Returns the fetched row number. + fn fetch_and_send_record_batch( + mut self, + mut send: impl FnMut(Result) -> Result<()>, + ) -> Result { + let reader = self.project_and_filter_reader()?; + + let arrow_record_batch_projector = ArrowRecordBatchProjector::from(self.row_projector); + let mut row_num = 0; + for record_batch in reader { + trace!( + "Fetch one record batch from sst:{}, num_rows:{:?}", + self.file_path, + record_batch.as_ref().map(|v| v.num_rows()) + ); + + match record_batch + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch) + { + Ok(record_batch) => { + row_num += record_batch.num_rows(); + + let record_batch_with_key = arrow_record_batch_projector + .project_to_record_batch_with_key(record_batch) + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch); + + send(record_batch_with_key)?; + } + Err(e) => { + send(Err(e))?; + break; + } + }; + } + + Ok(row_num) + } +} + +struct RecordBatchReceiver { + rx: Receiver>, +} + +impl Stream for RecordBatchReceiver { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.as_mut().rx.poll_recv(cx) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstReader for ParquetSstReader<'a, S> { + async fn meta_data(&mut self) -> Result<&SstMetaData> { + self.init_if_necessary().await?; + Ok(self.meta_data.as_ref().unwrap()) + } + + // TODO(yingwen): Project the schema in parquet + async fn read( + &mut self, + ) -> Result> + Send + Unpin>> { + debug!( + "read sst:{}, projected_schema:{:?}, predicate:{:?}", + self.path.display(), + self.projected_schema, + self.predicate + ); + + self.init_if_necessary().await?; + let (tx, rx) = mpsc::channel::>(self.channel_cap); + self.read_record_batches(tx)?; + + Ok(Box::new(RecordBatchReceiver { rx })) + } +} diff --git a/analytic_engine/src/sst/reader.rs 
b/analytic_engine/src/sst/reader.rs new file mode 100644 index 0000000000..ab76c9a044 --- /dev/null +++ b/analytic_engine/src/sst/reader.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader trait definition. + +use async_trait::async_trait; +use common_types::record_batch::RecordBatchWithKey; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))] + ReadAgain { backtrace: Backtrace, path: String }, + + #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))] + ReadPersist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to decode record batch, err:{}", source))] + DecodeRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to decode sst meta data, err:{}", source))] + DecodeSstMeta { + source: Box, + }, + + #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))] + SstMetaNotFound { backtrace: Backtrace }, + + #[snafu(display("Fail to projection, err:{}", source))] + Projection { + source: Box, + }, + + #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))] + EmptySstMeta { backtrace: Backtrace }, + + #[snafu(display("Other kind of error:{}", source))] + Other { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +#[async_trait] +pub trait SstReader { + async fn meta_data(&mut self) -> Result<&SstMetaData>; + + async fn read( + &mut self, + ) -> Result> + Send + Unpin>>; +} + +#[cfg(test)] +pub mod tests { + use common_types::row::Row; + use futures::StreamExt; + + use super::*; + + pub async fn check_stream(stream: &mut S, expected_rows: Vec) + where + S: Stream> + Unpin, + { + let mut visited_rows = 0; + while let Some(batch) = stream.next().await { + let 
batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } +} diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs new file mode 100644 index 0000000000..88dde35166 --- /dev/null +++ b/analytic_engine/src/table/data.rs @@ -0,0 +1,713 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table data + +use std::{ + collections::HashMap, + convert::TryInto, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use arc_swap::ArcSwap; +use arena::CollectorRef; +use common_types::{ + schema::{Schema, Version}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use log::{debug, info}; +use object_store::path::ObjectStorePath; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use wal::manager::RegionId; + +use crate::{ + instance::write_worker::{WorkerLocal, WriteHandle}, + memtable::{ + factory::{FactoryRef as MemTableFactoryRef, Options as MemTableOptions}, + skiplist::factory::SkiplistMemTableFactory, + }, + meta::meta_update::AddTableMeta, + space::SpaceId, + sst::{factory::SstType, file::FilePurger, manager::FileId}, + table::{ + metrics::Metrics, + sst_util, + version::{MemTableForWrite, MemTableState, SamplingMemTable, TableVersion}, + }, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create memtable, err:{}", source))] + CreateMemTable { + source: crate::memtable::factory::Error, + }, + + #[snafu(display( + "Failed to find or create memtable, timestamp overflow, timestamp:{:?}, duration:{:?}.\nBacktrace:\n{}", + timestamp, + duration, + backtrace, + ))] + TimestampOverflow { + timestamp: Timestamp, + duration: Duration, + backtrace: Backtrace, + 
}, + + #[snafu(display("Failed to find memtable for write, err:{}", source))] + FindMemTable { + source: crate::table::version::Error, + }, +} + +define_result!(Error); + +pub type MemTableId = u64; + +/// Data of a table +pub struct TableData { + /// Id of this table + pub id: TableId, + /// Name of this table + pub name: String, + /// Schema of this table + schema: Mutex, + /// Space id of this table + pub space_id: SpaceId, + /// The sst type of this table + pub sst_type: SstType, + + /// Mutable memtable memory size limitation + mutable_limit: AtomicU32, + /// Options of this table. + /// + /// Most modification to `opts` can be done by replacing the old options + /// with a new one. However, altering the segment duration should be done + /// carefully to avoid the reader seeing inconsistent segment duration + /// and memtables/ssts during query/compaction/flush . + opts: ArcSwap, + /// MemTable factory of this table + memtable_factory: MemTableFactoryRef, + /// Space memtable memory usage collector + mem_usage_collector: CollectorRef, + + /// Current table version + current_version: TableVersion, + /// Last sequence visible to the reads + /// + /// Write to last_sequence should be guarded by a mutex and only done by + /// single writer, but reads are allowed to be done concurrently without + /// mutex protected + last_sequence: AtomicU64, + /// Handle to the write worker + pub write_handle: WriteHandle, + /// Auto incremented id to track memtable, reset on engine open + /// + /// Allocating memtable id should be guarded by write lock + last_memtable_id: AtomicU64, + + /// Last id of the sst file + /// + /// Write to last_file_id require external synchronization + last_file_id: AtomicU64, + + /// Flag denoting whether the table is dropped + /// + /// No write/alter is allowed if the table is dropped. + dropped: AtomicBool, + + /// Metrics of this table. 
+ pub metrics: Metrics, +} + +impl Drop for TableData { + fn drop(&mut self) { + debug!("TableData is dropped, id:{}, name:{}", self.id, self.name); + } +} + +#[inline] +fn get_mutable_limit(opts: &TableOptions) -> u32 { + opts.write_buffer_size * 7 / 8 +} + +impl TableData { + /// Create a new TableData + /// + /// This function should only be called when a new table is creating and + /// there is no existing data of the table + pub fn new( + space_id: SpaceId, + request: CreateTableRequest, + write_handle: WriteHandle, + table_opts: TableOptions, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + // FIXME(yingwen): Validate TableOptions, such as bucket_duration >= + // segment_duration and bucket_duration is aligned to segment_duration + + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(space_id, request.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&request.table_name); + + Ok(Self { + id: request.table_id, + name: request.table_name, + schema: Mutex::new(request.table_schema), + space_id, + // TODO(xikai): sst type should be decided by the `request`. 
+ sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&table_opts)), + opts: ArcSwap::new(Arc::new(table_opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Recover table from add table meta + /// + /// This won't recover the sequence number, which will be set after the wal is replayed + pub fn recover_from_add( + add_meta: AddTableMeta, + write_handle: WriteHandle, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(add_meta.space_id, add_meta.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&add_meta.table_name); + + Ok(Self { + id: add_meta.table_id, + name: add_meta.table_name, + schema: Mutex::new(add_meta.schema), + space_id: add_meta.space_id, + // TODO(xikai): it should be recovered from `add_meta` struct. + sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&add_meta.opts)), + opts: ArcSwap::new(Arc::new(add_meta.opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Get current schema of the table. + pub fn schema(&self) -> Schema { + self.schema.lock().unwrap().clone() + } + + /// Set current schema of the table. + pub fn set_schema(&self, schema: Schema) { + *self.schema.lock().unwrap() = schema; + } + + /// Get current version of schema. 
+ pub fn schema_version(&self) -> Version { + self.schema.lock().unwrap().version() + } + + /// Get current table version + #[inline] + pub fn current_version(&self) -> &TableVersion { + &self.current_version + } + + /// Get the wal region id of this table + /// + /// Now we just use table id as region id + #[inline] + pub fn wal_region_id(&self) -> RegionId { + self.id.as_u64() + } + + /// Get last sequence number + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(Ordering::Acquire) + } + + /// Set last sequence number + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) { + self.last_sequence.store(seq, Ordering::Release); + } + + #[inline] + pub fn table_options(&self) -> Arc { + self.opts.load().clone() + } + + /// Update table options. + /// + /// REQUIRE: The write lock is held. + #[inline] + pub fn set_table_options(&self, _write_lock: &WorkerLocal, opts: TableOptions) { + self.mutable_limit + .store(get_mutable_limit(&opts), Ordering::Relaxed); + self.opts.store(Arc::new(opts)) + } + + #[inline] + pub fn is_dropped(&self) -> bool { + self.dropped.load(Ordering::SeqCst) + } + + /// Set the table is dropped and forbid any writes/alter on this table. + #[inline] + pub fn set_dropped(&self) { + self.dropped.store(true, Ordering::SeqCst); + } + + /// Returns total memtable memory usage in bytes. + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.current_version.total_memory_usage() + } + + /// Find memtable for given timestamp to insert, create if not exists + /// + /// If the memtable schema is outdated, switch all memtables and create the + /// needed mutable memtable by current schema. 
The returned memtable is + /// guaranteed to have same schema of current table + /// + /// REQUIRE: The write lock is held + pub fn find_or_create_mutable( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + table_schema: &Schema, + ) -> Result { + let schema_version = table_schema.version(); + let last_sequence = self.last_sequence(); + + if let Some(mem) = self + .current_version + .memtable_for_write(write_lock, timestamp, schema_version) + .context(FindMemTable)? + { + return Ok(mem); + } + + // Mutable memtable for this timestamp not found, need to create a new one. + let table_options = self.table_options(); + let memtable_opts = MemTableOptions { + schema: table_schema.clone(), + arena_block_size: table_options.arena_block_size, + creation_sequence: last_sequence, + collector: self.mem_usage_collector.clone(), + }; + let mem = self + .memtable_factory + .create_memtable(memtable_opts) + .context(CreateMemTable)?; + + match table_options.segment_duration() { + Some(segment_duration) => { + let time_range = TimeRange::bucket_of(timestamp, segment_duration).context( + TimestampOverflow { + timestamp, + duration: segment_duration, + }, + )?; + let mem_state = MemTableState { + mem, + time_range, + id: self.alloc_memtable_id(), + }; + + // Insert memtable into mutable memtables of current version. + self.current_version.insert_mutable(mem_state.clone()); + + Ok(MemTableForWrite::Normal(mem_state)) + } + None => { + let sampling_mem = SamplingMemTable::new(mem, self.alloc_memtable_id()); + + // Set sampling memtables of current version. 
+ self.current_version.set_sampling(sampling_mem.clone()); + + Ok(MemTableForWrite::Sampling(sampling_mem)) + } + } + } + + /// Returns true if the memory usage of this table reaches flush threshold + /// + /// REQUIRE: Do in write worker + pub fn should_flush_table(&self, _worker_local: &WorkerLocal) -> bool { + // Fall back to usize::MAX if it fails to convert write_buffer_size into + // usize (overflow) + let max_write_buffer_size = self + .table_options() + .write_buffer_size + .try_into() + .unwrap_or(usize::MAX); + let mutable_limit = self + .mutable_limit + .load(Ordering::Relaxed) + .try_into() + .unwrap_or(usize::MAX); + + let mutable_usage = self.current_version.mutable_memory_usage(); + let total_usage = self.current_version.total_memory_usage(); + + // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 + if mutable_usage > mutable_limit { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + return true; + } + + // If the memory exceeds the buffer size, we trigger more aggressive + // flush. But if already more than half memory is being flushed, + // triggering more flush may not help. We will hold it instead. 
+ let should_flush = + total_usage >= max_write_buffer_size && mutable_usage >= max_write_buffer_size / 2; + + debug!( + "Check should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + + if should_flush { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + } + + should_flush + } + + /// Set `last_file_id`, mainly used in recover + /// + /// This operation require external synchronization + pub fn set_last_file_id(&self, last_file_id: FileId) { + self.last_file_id.store(last_file_id, Ordering::Relaxed); + } + + /// Returns the last file id + pub fn last_file_id(&self) -> FileId { + self.last_file_id.load(Ordering::Relaxed) + } + + /// Alloc a file id for a new file + pub fn alloc_file_id(&self) -> FileId { + let last = self.last_file_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Set the sst file path into the object storage path. 
+ pub fn set_sst_file_path(&self, file_id: FileId, path: &mut impl ObjectStorePath) { + sst_util::set_sst_file_path(self.space_id, self.id, file_id, path) + } + + /// Allocate next memtable id + fn alloc_memtable_id(&self) -> MemTableId { + let last = self.last_memtable_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Returns last memtable id + pub fn last_memtable_id(&self) -> MemTableId { + self.last_memtable_id.load(Ordering::Relaxed) + } + + pub fn dedup(&self) -> bool { + self.table_options().need_dedup() + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.table_options().is_expired(timestamp) + } +} + +/// Table data reference +pub type TableDataRef = Arc; + +/// Manages TableDataRef +pub struct TableDataSet { + /// Name to table data + table_datas: HashMap, + /// Id to table data + id_to_tables: HashMap, +} + +impl TableDataSet { + /// Create an empty TableDataSet + pub fn new() -> Self { + Self { + table_datas: HashMap::new(), + id_to_tables: HashMap::new(), + } + } + + /// Insert if absent, if successfully inserted, return true and return + /// false if the data already exists + pub fn insert_if_absent(&mut self, table_data_ref: TableDataRef) -> bool { + let table_name = &table_data_ref.name; + if self.table_datas.contains_key(table_name) { + return false; + } + self.table_datas + .insert(table_name.to_string(), table_data_ref.clone()); + self.id_to_tables.insert(table_data_ref.id, table_data_ref); + true + } + + /// Find table by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.get(table_name).cloned() + } + + /// Find table by table id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.id_to_tables.get(&table_id).cloned() + } + + /// Remove table by table name + pub fn remove_table(&mut self, table_name: &str) -> Option { + let table = self.table_datas.remove(table_name)?; + self.id_to_tables.remove(&table.id); + Some(table) + } + + /// Returns the total table num in 
this set + pub fn table_num(&self) -> usize { + self.table_datas.len() + } + + /// Find the table which consumes maximum memtable memory usag. + pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .values() + .max_by_key(|t| t.memtable_memory_usage()) + .cloned() + } + + /// List all tables to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + for table_data in self.table_datas.values().cloned() { + tables.push(table_data); + } + } +} + +impl Default for TableDataSet { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +pub mod tests { + use std::sync::Arc; + + use arena::NoopCollector; + use common_types::datum::DatumKind; + use common_util::config::ReadableDuration; + use table_engine::engine::TableState; + + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + memtable::{factory::Factory, MemTableRef}, + sst::file::tests::FilePurgerMocker, + table_options, + tests::table, + }; + + const DEFAULT_SPACE_ID: SpaceId = 1; + + fn default_schema() -> Schema { + table::create_schema_builder( + &[("key", DatumKind::Timestamp)], + &[("value", DatumKind::Double)], + ) + .build() + .unwrap() + } + + #[derive(Default)] + pub struct MemTableMocker; + + impl MemTableMocker { + pub fn build(&self) -> MemTableRef { + let memtable_opts = MemTableOptions { + schema: default_schema(), + arena_block_size: 1024 * 1024, + creation_sequence: 1000, + collector: Arc::new(NoopCollector), + }; + + let factory = SkiplistMemTableFactory; + factory.create_memtable(memtable_opts).unwrap() + } + } + + #[must_use] + pub struct TableDataMocker { + table_id: TableId, + table_name: String, + write_handle: Option, + } + + impl TableDataMocker { + pub fn table_id(mut self, table_id: TableId) -> Self { + self.table_id = table_id; + self + } + + pub fn table_name(mut self, table_name: String) -> Self { + self.table_name = table_name; + self + } + + pub fn write_handle(mut self, write_handle: WriteHandle) -> Self { + 
self.write_handle = Some(write_handle); + self + } + + pub fn build(self) -> TableData { + let space_id = DEFAULT_SPACE_ID; + let table_schema = default_schema(); + let create_request = CreateTableRequest { + catalog_name: "test_catalog".to_string(), + schema_name: "public".to_string(), + table_id: self.table_id, + table_name: self.table_name, + table_schema, + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }; + + let write_handle = self.write_handle.unwrap_or_else(|| { + let mocked_write_handle = WriteHandleMocker::default().space_id(space_id).build(); + mocked_write_handle.write_handle + }); + let table_opts = TableOptions::default(); + let purger = FilePurgerMocker::mock(); + let collector = Arc::new(NoopCollector); + + TableData::new( + space_id, + create_request, + write_handle, + table_opts, + &purger, + collector, + ) + .unwrap() + } + } + + impl Default for TableDataMocker { + fn default() -> Self { + Self { + table_id: table::new_table_id(2, 1), + table_name: "mocked_table".to_string(), + write_handle: None, + } + } + } + + #[test] + fn test_new_table_data() { + let table_id = table::new_table_id(100, 30); + let table_name = "new_table".to_string(); + let table_data = TableDataMocker::default() + .table_id(table_id) + .table_name(table_name.clone()) + .build(); + + assert_eq!(table_id, table_data.id); + assert_eq!(table_name, table_data.name); + assert_eq!(table_data.id.as_u64(), table_data.wal_region_id()); + assert_eq!(0, table_data.last_sequence()); + assert!(!table_data.is_dropped()); + assert_eq!(0, table_data.last_file_id()); + assert_eq!(0, table_data.last_memtable_id()); + assert!(table_data.dedup()); + } + + #[test] + fn test_find_or_create_mutable() { + let mocked_write_handle = WriteHandleMocker::default() + .space_id(DEFAULT_SPACE_ID) + .build(); + let table_data = TableDataMocker::default() + .write_handle(mocked_write_handle.write_handle) + .build(); + 
let worker_local = mocked_write_handle.worker_local; + let schema = table_data.schema(); + + // Create sampling memtable. + let zero_ts = Timestamp::new(0); + let mutable = table_data + .find_or_create_mutable(&worker_local, zero_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(zero_ts)); + let sampling_mem = mutable.as_sampling(); + let sampling_id = sampling_mem.id; + assert_eq!(1, sampling_id); + + // Test memtable is reused. + let now_ts = Timestamp::now(); + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let sampling_mem = mutable.as_sampling(); + // Use same sampling memtable. + assert_eq!(sampling_id, sampling_mem.id); + + let current_version = table_data.current_version(); + // Set segment duration manually. + let mut table_opts = (*table_data.table_options()).clone(); + table_opts.segment_duration = + Some(ReadableDuration(table_options::DEFAULT_SEGMENT_DURATION)); + table_data.set_table_options(&worker_local, table_opts); + // Freeze sampling memtable. + current_version.freeze_sampling(&worker_local); + + // A new mutable memtable should be created. + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let mem_state = mutable.as_normal(); + assert_eq!(2, mem_state.id); + let time_range = + TimeRange::bucket_of(now_ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(time_range, mem_state.time_range); + } +} diff --git a/analytic_engine/src/table/metrics.rs b/analytic_engine/src/table/metrics.rs new file mode 100644 index 0000000000..0a5d801796 --- /dev/null +++ b/analytic_engine/src/table/metrics.rs @@ -0,0 +1,229 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of table. 
+ +use std::time::Duration; + +use lazy_static::lazy_static; +use prometheus::{ + exponential_buckets, local::LocalHistogram, register_histogram_vec, register_int_counter_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, +}; + +const KB: f64 = 1024.0; + +lazy_static! { + // Counters: + static ref TABLE_WRITE_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_request_counter", + "Write request counter of table", + &["table"] + ) + .unwrap(); + static ref TABLE_WRITE_ROWS_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_rows_counter", + "Number of rows wrote to table", + &["table"] + ) + .unwrap(); + static ref TABLE_READ_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_read_request_counter", + "Read request counter of table", + &["table"] + ) + .unwrap(); + // End of counters. + + // Histograms: + // Buckets: 0, 0.002, .., 0.002 * 4^9 + static ref TABLE_FLUSH_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_duration", + "Histogram for flush duration of the table in seconds", + &["table"], + exponential_buckets(0.002, 4.0, 10).unwrap() + ).unwrap(); + // Buckets: 0, 1, .., 2^7 + static ref TABLE_FLUSH_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_num", + "Histogram for number of ssts flushed by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_FLUSH_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_size", + "Histogram for size of ssts flushed by the table in KB", + &["table"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + + // Buckets: 0, 0.02, .., 0.02 * 4^9 + static ref TABLE_COMPACT_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_duration", + "Histogram for compaction duration of the table in seconds", + &["table"], + exponential_buckets(0.02, 4.0, 10).unwrap() + ).unwrap(); 
+ // Buckets: 0, 1, .., 2^7 + static ref TABLE_COMPACTION_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_num", + "Histogram for number of ssts compacted by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_COMPACTION_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_size", + "Histogram for size of ssts compacted by the table in KB", + &["table", "type"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 10^12 (1 trillion) + static ref TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_row_num", + "Histogram for row num of ssts compacted by the table", + &["table", "type"], + exponential_buckets(1.0, 10.0, 13).unwrap() + ).unwrap(); + + // Buckets: 0, 0.01, .., 0.01 * 2^12 + static ref TABLE_WRITE_STALL_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_write_stall_duration", + "Histogram for write stall duration of the table in seconds", + &["table"], + exponential_buckets(0.01, 2.0, 13).unwrap() + ).unwrap(); + // End of histograms. +} + +/// Table metrics. +/// +/// Now the registered labels won't be removed from the metrics vec to avoid panic +/// on concurrent removal. +pub struct Metrics { + // Counters: + pub write_request_counter: IntCounter, + write_rows_counter: IntCounter, + pub read_request_counter: IntCounter, + // End of counters. + + // Histograms: + pub flush_duration_histogram: Histogram, + flush_sst_num_histogram: Histogram, + flush_sst_size_histogram: Histogram, + + pub compaction_duration_histogram: Histogram, + compaction_sst_num_histogram: Histogram, + compaction_input_sst_size_histogram: Histogram, + compaction_output_sst_size_histogram: Histogram, + compaction_input_sst_row_num_histogram: Histogram, + compaction_output_sst_row_num_histogram: Histogram, + + // Write stall metrics. 
+ write_stall_duration_histogram: Histogram, + // End of histograms. +} + +impl Metrics { + pub fn new(table_name: &str) -> Self { + Self { + write_request_counter: TABLE_WRITE_REQUEST_COUNTER.with_label_values(&[table_name]), + write_rows_counter: TABLE_WRITE_ROWS_COUNTER.with_label_values(&[table_name]), + read_request_counter: TABLE_READ_REQUEST_COUNTER.with_label_values(&[table_name]), + + flush_duration_histogram: TABLE_FLUSH_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + flush_sst_num_histogram: TABLE_FLUSH_SST_NUM_HISTOGRAM.with_label_values(&[table_name]), + flush_sst_size_histogram: TABLE_FLUSH_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name]), + + compaction_duration_histogram: TABLE_COMPACT_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + compaction_sst_num_histogram: TABLE_COMPACTION_SST_NUM_HISTOGRAM + .with_label_values(&[table_name]), + compaction_input_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "output"]), + compaction_input_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "output"]), + + write_stall_duration_histogram: TABLE_WRITE_STALL_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + } + } + + #[inline] + pub fn on_write_request_begin(&self) { + self.write_request_counter.inc(); + } + + #[inline] + pub fn on_write_request_done(&self, num_rows: usize) { + self.write_rows_counter.inc_by(num_rows as u64); + } + + #[inline] + pub fn on_read_request_begin(&self) { + self.read_request_counter.inc(); + } + + #[inline] + pub fn on_write_stall(&self, duration: Duration) { + self.write_stall_duration_histogram + .observe(duration.as_secs_f64()); + } + + pub fn 
local_flush_metrics(&self) -> LocalFlushMetrics { + LocalFlushMetrics { + flush_duration_histogram: self.flush_duration_histogram.local(), + flush_sst_num_histogram: self.flush_sst_num_histogram.local(), + flush_sst_size_histogram: self.flush_sst_size_histogram.local(), + } + } + + pub fn compaction_observe_sst_num(&self, sst_num: usize) { + self.compaction_sst_num_histogram.observe(sst_num as f64); + } + + pub fn compaction_observe_input_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_input_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_output_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_output_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_input_sst_row_num(&self, sst_row_num: u64) { + self.compaction_input_sst_row_num_histogram + .observe(sst_row_num as f64); + } + + pub fn compaction_observe_output_sst_row_num(&self, sst_row_num: u64) { + self.compaction_output_sst_row_num_histogram + .observe(sst_row_num as f64); + } +} + +pub struct LocalFlushMetrics { + pub flush_duration_histogram: LocalHistogram, + flush_sst_num_histogram: LocalHistogram, + flush_sst_size_histogram: LocalHistogram, +} + +impl LocalFlushMetrics { + pub fn observe_sst_num(&self, sst_num: usize) { + self.flush_sst_num_histogram.observe(sst_num as f64); + } + + pub fn observe_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.flush_sst_size_histogram.observe(sst_size as f64 / KB); + } +} diff --git a/analytic_engine/src/table/mod.rs b/analytic_engine/src/table/mod.rs new file mode 100644 index 0000000000..0f5598f0c1 --- /dev/null +++ b/analytic_engine/src/table/mod.rs @@ -0,0 +1,270 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table implementation + +use std::{collections::HashMap, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{Column, Expr}; +use async_trait::async_trait; +use common_types::{row::Row, schema::Schema, time::TimeRange}; +use futures::TryStreamExt; +use object_store::ObjectStore; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + predicate::Predicate, + stream::{PartitionedStreams, SendableRecordBatchStream}, + table::{ + AlterOptions, AlterSchema, AlterSchemaRequest, Compact, Flush, FlushRequest, Get, + GetInvalidPrimaryKey, GetNullPrimaryKey, GetRequest, ReadOptions, ReadOrder, ReadRequest, + Result, Scan, Table, TableId, TableStats, Write, WriteRequest, + }, +}; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{flush_compaction::TableFlushOptions, InstanceRef}, + meta::Manifest, + space::SpaceAndTable, + sst::factory::Factory, +}; + +pub mod data; +pub mod metrics; +pub mod sst_util; +pub mod version; +pub mod version_edit; + +// TODO(yingwen): How to handle drop table? 
+ +/// Table trait implementation +pub struct TableImpl { + /// Space and table info + space_table: SpaceAndTable, + /// Instance + instance: InstanceRef, + /// Engine type + engine_type: String, +} + +impl TableImpl { + pub fn new( + space_table: SpaceAndTable, + instance: InstanceRef, + engine_type: String, + ) -> Self { + Self { + space_table, + instance, + engine_type, + } + } +} + +impl fmt::Debug for TableImpl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TableImpl") + .field("space_table", &self.space_table) + .finish() + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Table for TableImpl +{ + fn name(&self) -> &str { + &self.space_table.table_data().name + } + + fn id(&self) -> TableId { + self.space_table.table_data().id + } + + fn schema(&self) -> Schema { + self.space_table.table_data().schema() + } + + fn options(&self) -> HashMap { + self.space_table.table_data().table_options().to_raw_map() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + let metrics = &self.space_table.table_data().metrics; + + TableStats { + num_write: metrics.write_request_counter.get(), + num_read: metrics.read_request_counter.get(), + num_flush: metrics.flush_duration_histogram.get_sample_count(), + } + } + + async fn write(&self, request: WriteRequest) -> Result { + let num_rows = self + .instance + .write_to_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Write { table: self.name() })?; + Ok(num_rows) + } + + async fn read(&self, mut request: ReadRequest) -> Result { + request.opts.read_parallelism = 1; + let mut streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + assert_eq!(streams.streams.len(), 1); + 
let stream = streams.streams.pop().unwrap(); + + Ok(stream) + } + + async fn get(&self, request: GetRequest) -> Result> { + let schema = request.projected_schema.to_record_schema_with_key(); + let primary_key_columns = schema.key_columns(); + ensure!( + primary_key_columns.len() == request.primary_key.len(), + GetInvalidPrimaryKey { + schema: schema.clone(), + primary_key_columns, + } + ); + + let mut primary_key_exprs: Vec = Vec::with_capacity(request.primary_key.len()); + for (primary_key_value, column_schema) in + request.primary_key.iter().zip(primary_key_columns.iter()) + { + let v = primary_key_value + .as_scalar_value() + .with_context(|| GetNullPrimaryKey { + schema: schema.clone(), + primary_key_columns, + })?; + primary_key_exprs.push( + Expr::Column(Column::from_qualified_name(&column_schema.name)).eq(Expr::Literal(v)), + ); + } + + let read_request = ReadRequest { + request_id: request.request_id, + opts: ReadOptions::default(), + projected_schema: request.projected_schema, + predicate: Arc::new(Predicate { + exprs: primary_key_exprs, + time_range: TimeRange::min_to_max(), + }), + order: ReadOrder::None, + }; + let mut batch_stream = self + .read(read_request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + let mut result_columns = Vec::with_capacity(schema.num_columns()); + + while let Some(batch) = batch_stream + .try_next() + .await + .map_err(|e| Box::new(e) as _) + .context(Get { table: self.name() })? 
+ { + let row_num = batch.num_rows(); + if row_num == 0 { + return Ok(None); + } + for row_idx in 0..row_num { + for col_idx in 0..batch.num_columns() { + let col = batch.column(col_idx); + result_columns.push(col.datum(row_idx)); + } + + if request.primary_key == result_columns[..schema.num_key_columns()] { + return Ok(Some(Row::from_datums(result_columns))); + } + result_columns.clear(); + } + } + + Ok(None) + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + Ok(streams) + } + + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result { + self.instance + .alter_schema_of_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterSchema { table: self.name() })?; + Ok(1) + } + + async fn alter_options(&self, options: HashMap) -> Result { + self.instance + .alter_options_of_table(&self.space_table, options) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterOptions { table: self.name() })?; + Ok(1) + } + + async fn flush(&self, request: FlushRequest) -> Result<()> { + let mut rx_opt = None; + let flush_opts = TableFlushOptions { + compact_after_flush: request.compact_after_flush, + // Never block write thread + block_on_write_thread: false, + res_sender: if request.sync { + let (tx, rx) = oneshot::channel(); + rx_opt = Some(rx); + Some(tx) + } else { + None + }, + }; + + self.instance + .flush_table(&self.space_table, flush_opts) + .await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })?; + if let Some(rx) = rx_opt { + rx.await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })??; + } + Ok(()) + } + + async fn compact(&self) -> Result<()> { + self.instance + .manual_compact_table(&self.space_table) + .await + .map_err(|e| Box::new(e) as _) + .context(Compact { table: 
self.name() })?; + Ok(()) + } +} diff --git a/analytic_engine/src/table/sst_util.rs b/analytic_engine/src/table/sst_util.rs new file mode 100644 index 0000000000..b5d760a079 --- /dev/null +++ b/analytic_engine/src/table/sst_util.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for sst. + +use object_store::path::ObjectStorePath; +use table_engine::table::TableId; + +use crate::{space::SpaceId, sst::manager::FileId}; + +const SST_FILE_SUFFIX: &str = "sst"; + +#[inline] +/// Generate the sst file name. +pub fn sst_file_name(id: FileId) -> String { + format!("{}.{}", id, SST_FILE_SUFFIX) +} + +/// Set the sst file path. +pub fn set_sst_file_path( + space_id: SpaceId, + table_id: TableId, + file_id: FileId, + path: &mut P, +) { + path.push_all_dirs([space_id.to_string().as_str(), table_id.to_string().as_str()]); + path.set_file_name(sst_file_name(file_id)); +} diff --git a/analytic_engine/src/table/version.rs b/analytic_engine/src/table/version.rs new file mode 100644 index 0000000000..b0e4e2b977 --- /dev/null +++ b/analytic_engine/src/table/version.rs @@ -0,0 +1,1096 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table version + +use std::{ + cmp, + collections::{BTreeMap, HashMap}, + fmt, + ops::Bound, + sync::{Arc, RwLock}, + time::Duration, +}; + +use common_types::{ + row::Row, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + compaction::{ + picker::{self, CompactionPickerRef, PickerContext}, + CompactionTask, ExpiredFiles, + }, + instance::write_worker::WorkerLocal, + memtable::{self, key::KeySequence, MemTableRef, PutContext}, + sampler::{DefaultSampler, SamplerRef}, + sst::{ + file::{FileHandle, FilePurgeQueue}, + manager::{FileId, LevelsController, MAX_LEVEL}, + }, + table::{ + data::MemTableId, + version_edit::{AddFile, VersionEdit}, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Schema mismatch, memtable_version:{}, given:{}.\nBacktrace:\n{}", + memtable_version, + given, + backtrace + ))] + SchemaMismatch { + memtable_version: schema::Version, + given: schema::Version, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to put memtable, err:{}", source))] + PutMemTable { source: crate::memtable::Error }, + + #[snafu(display("Failed to collect timestamp, err:{}", source))] + CollectTimestamp { source: crate::sampler::Error }, +} + +define_result!(Error); + +/// Memtable for sampling timestamp. +#[derive(Clone)] +pub struct SamplingMemTable { + pub mem: MemTableRef, + pub id: MemTableId, + /// If freezed is true, the sampling is finished and no more data should be + /// inserted into this memtable. Otherwise, the memtable is active and all + /// data should ONLY write to this memtable instead of mutable memtable. 
+ pub freezed: bool, + pub sampler: SamplerRef, +} + +impl SamplingMemTable { + pub fn new(memtable: MemTableRef, id: MemTableId) -> Self { + SamplingMemTable { + mem: memtable, + id, + freezed: false, + sampler: Arc::new(DefaultSampler::default()), + } + } + + fn memory_usage(&self) -> usize { + self.mem.approximate_memory_usage() + } + + /// Suggest segment duration, if there is no sampled timestamp, returns + /// default segment duration. + fn suggest_segment_duration(&self) -> Duration { + self.sampler.suggest_duration() + } +} + +impl fmt::Debug for SamplingMemTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SamplingMemTable") + .field("id", &self.id) + .field("freezed", &self.freezed) + .finish() + } +} + +/// Memtable with additional meta data +#[derive(Clone)] +pub struct MemTableState { + /// The mutable memtable + pub mem: MemTableRef, + /// The `time_range` is estimated via the time range of the first row group + /// write to this memtable and is aligned to segment size + pub time_range: TimeRange, + /// Id of the memtable, newer memtable has greater id + pub id: MemTableId, +} + +impl MemTableState { + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.mem.last_sequence() + } +} + +impl fmt::Debug for MemTableState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemTableState") + .field("time_range", &self.time_range) + .field("id", &self.id) + .field("last_sequence", &self.mem.last_sequence()) + .finish() + } +} + +// TODO(yingwen): Replace by Either. 
+#[derive(Clone)] +pub enum MemTableForWrite { + Sampling(SamplingMemTable), + Normal(MemTableState), +} + +impl MemTableForWrite { + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) -> memtable::Result<()> { + self.memtable().set_last_sequence(seq) + } + + #[inline] + pub fn accept_timestamp(&self, timestamp: Timestamp) -> bool { + match self { + MemTableForWrite::Sampling(_) => true, + MemTableForWrite::Normal(v) => v.time_range.contains(timestamp), + } + } + + #[inline] + pub fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + timestamp: Timestamp, + ) -> Result<()> { + match self { + MemTableForWrite::Sampling(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable)?; + + // Collect the timstamp of this row. + v.sampler.collect(timestamp).context(CollectTimestamp)?; + + Ok(()) + } + MemTableForWrite::Normal(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable) + } + } + } + + #[inline] + fn memtable(&self) -> &MemTableRef { + match self { + MemTableForWrite::Sampling(v) => &v.mem, + MemTableForWrite::Normal(v) => &v.mem, + } + } + + #[cfg(test)] + pub fn as_sampling(&self) -> &SamplingMemTable { + match self { + MemTableForWrite::Sampling(v) => v, + MemTableForWrite::Normal(_) => panic!(), + } + } + + #[cfg(test)] + pub fn as_normal(&self) -> &MemTableState { + match self { + MemTableForWrite::Sampling(_) => panic!(), + MemTableForWrite::Normal(v) => v, + } + } +} + +#[derive(Debug, Default)] +pub struct FlushableMemTables { + pub sampling_mem: Option, + pub memtables: MemTableVec, +} + +impl FlushableMemTables { + #[inline] + pub fn is_empty(&self) -> bool { + self.sampling_mem.is_none() && self.memtables.is_empty() + } + + pub fn ids(&self) -> Vec { + let mut memtable_ids = Vec::with_capacity(self.memtables.len() + 1); + if let Some(v) = &self.sampling_mem { + memtable_ids.push(v.id); + } + for mem in &self.memtables { + memtable_ids.push(mem.id); + } + + memtable_ids + 
} +} + +/// Vec to store memtables +pub type MemTableVec = Vec; + +/// MemTableView holds all memtables of the table +#[derive(Debug)] +struct MemTableView { + /// The memtable for sampling timestamp to suggest segment duration. + /// + /// This memtable is special and may contains data in differnt segment, so + /// can not be moved into immutable memtable set. + sampling_mem: Option, + /// Mutable memtables arranged by its time range. + mutables: MutableMemTableSet, + /// Immutable memtables set, lookup by memtable id is fast. + immutables: ImmutableMemTableSet, +} + +impl MemTableView { + fn new() -> Self { + Self { + sampling_mem: None, + mutables: MutableMemTableSet::new(), + immutables: ImmutableMemTableSet(BTreeMap::new()), + } + } + + /// Get the memory usage of mutable memtables. + fn mutable_memory_usage(&self) -> usize { + self.mutables.memory_usage() + + self + .sampling_mem + .as_ref() + .map(|v| v.memory_usage()) + .unwrap_or(0) + } + + /// Get the total memory usage of mutable and immutable memtables. + fn total_memory_usage(&self) -> usize { + let mutable_usage = self.mutable_memory_usage(); + let immutable_usage = self.immutables.memory_usage(); + + mutable_usage + immutable_usage + } + + /// Switch all memtables or just sample the segment duration. + /// + /// If the sampling memtable is still active, return the suggested segment + /// duration or move all mutable memtables into immutable memtables if + /// the sampling memtable is freezed and returns None. + /// + /// Instead of replace the old memtable by a new memtable, we just move the + /// old memtable to immutable memtables and left mutable memtables + /// empty. New mutable memtable will be constructed via put request. + fn switch_memtables_or_suggest_duration(&mut self) -> Option { + if let Some(v) = &mut self.sampling_mem { + if !v.freezed { + // Other memtable should be empty during sampling phase. 
+ assert!(self.mutables.is_empty()); + assert!(self.immutables.is_empty()); + + // The sampling memtable is still active, we need to compute the + // segment duration and then freeze the memtable. + let segment_duration = v.suggest_segment_duration(); + + // But we cannot freeze the sampling memtable now, because the + // segment duration may not yet have been persisted. + return Some(segment_duration); + } + } + + self.mutables.move_to_inmem(&mut self.immutables); + + None + } + + fn freeze_sampling_memtable(&mut self) { + if let Some(v) = &mut self.sampling_mem { + v.freezed = true; + } + } + + /// Returns the memtables that need to be flushed. + /// - Id of returned memtables are no greater than `max_memtable_id`. + /// - The last sequences of the returned memtables are continuous and can + /// be used as flushed sequence. + /// - All memtables with same last sequence must be picked to the same + /// MemTableVec, so we can update flushed sequence safely (The + /// `max_memtable_id` should also guarantee this). + /// - If a frozen memtable exists, that memtable will be returned if memtable + /// id is no greater than `max_memtable_id` (The memtable id should always + /// be less than `max_memtable_id`). + /// + /// Now the returned memtables are also ordered by memtable id, but this may + /// change in the future. + fn pick_memtables_to_flush(&self, max_memtable_id: MemTableId, mems: &mut FlushableMemTables) { + if let Some(v) = &self.sampling_mem { + if v.id <= max_memtable_id { + mems.sampling_mem = Some(v.clone()); + } + } + + for mem in self.immutables.0.values() { + if mem.id <= max_memtable_id { + mems.memtables.push(mem.clone()); + } + } + } + + /// Remove memtable from immutables or sampling memtable. 
+ #[inline] + fn remove_immutable_or_sampling(&mut self, id: MemTableId) { + if let Some(v) = &self.sampling_mem { + if v.id == id { + self.sampling_mem = None; + return; + } + } + + self.immutables.0.remove(&id); + } + + /// Collect memtables itersect with `time_range` + fn memtables_for_read( + &self, + time_range: TimeRange, + mems: &mut MemTableVec, + sampling_mem: &mut Option, + ) { + self.mutables.memtables_for_read(time_range, mems); + + self.immutables.memtables_for_read(time_range, mems); + + *sampling_mem = self.sampling_mem.clone(); + } +} + +/// Mutable memtables +/// +/// All mutable memtables ordered by their end time (exclusive), their time +/// range may overlaps if `alter segment duration` is supported +/// +/// We choose end time so we can use BTreeMap::range to find the first range +/// that may contains a given timestamp (end >= timestamp) +#[derive(Debug)] +struct MutableMemTableSet(BTreeMap); + +impl MutableMemTableSet { + fn new() -> Self { + Self(BTreeMap::new()) + } + + /// Get memtale by timestamp for write + fn memtable_for_write(&self, timestamp: Timestamp) -> Option<&MemTableState> { + // Find the first memtable whose end time (exclusive) > timestamp + if let Some((_, memtable)) = self + .0 + .range((Bound::Excluded(timestamp), Bound::Unbounded)) + .next() + { + if memtable.time_range.contains(timestamp) { + return Some(memtable); + } + } + + None + } + + /// Insert memtable, the caller should guarantee the key of memtable is not + /// present. + fn insert(&mut self, memtable: MemTableState) -> Option { + // Use end time of time range as key + let end = memtable.time_range.exclusive_end(); + self.0.insert(end, memtable) + } + + fn memory_usage(&self) -> usize { + self.0 + .values() + .map(|m| m.mem.approximate_memory_usage()) + .sum() + } + + /// Move all mutable memtables to immutable memtables. 
+ fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) { + for m in self.0.values() { + let state = m.clone(); + + immem.0.insert(m.id, state); + } + self.0.clear(); + } + + fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { + // Seek to first memtable whose end time (exclusive) > time_range.start + let inclusive_start = time_range.inclusive_start(); + let iter = self + .0 + .range((Bound::Excluded(inclusive_start), Bound::Unbounded)); + for (_end_ts, mem) in iter { + // We need to iterate all candidate memtables as their start time is unspecific + if mem.time_range.intersect_with(time_range) { + mems.push(mem.clone()); + } + } + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +/// Immutable memtables set +/// +/// MemTables are ordered by memtable id, so lookup by memtable id is fast +#[derive(Debug)] +struct ImmutableMemTableSet(BTreeMap); + +impl ImmutableMemTableSet { + /// Memory used by all immutable memtables + fn memory_usage(&self) -> usize { + self.0 + .values() + .map(|m| m.mem.approximate_memory_usage()) + .sum() + } + + fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { + for mem in self.0.values() { + if mem.time_range.intersect_with(time_range) { + mems.push(mem.clone()); + } + } + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +pub type LeveledFiles = Vec>; + +/// Memtable/sst to read for given time range. +pub struct ReadView { + pub sampling_mem: Option, + pub memtables: MemTableVec, + /// Ssts to read in each level. + /// + /// The `ReadView` MUST ensure the length of `leveled_ssts` >= MAX_LEVEL. 
+ pub leveled_ssts: LeveledFiles, +} + +impl Default for ReadView { + fn default() -> Self { + Self { + sampling_mem: None, + memtables: Vec::new(), + leveled_ssts: vec![Vec::new(); MAX_LEVEL], + } + } +} + +impl ReadView { + pub fn contains_sampling(&self) -> bool { + self.sampling_mem.is_some() + } +} + +/// Data of TableVersion +struct TableVersionInner { + /// All memtables + memtable_view: MemTableView, + /// All ssts + levels: LevelsController, + + /// The earliest sequence number of the entries already flushed (inclusive). + /// All log entry with sequence <= `flushed_sequence` can be deleted + flushed_sequence: SequenceNumber, +} + +impl TableVersionInner { + fn memtable_for_write( + &self, + _write_lock: &WorkerLocal, + timestamp: Timestamp, + ) -> Option { + if let Some(mem) = self.memtable_view.sampling_mem.clone() { + if !mem.freezed { + // If sampling memtable is not freezed. + return Some(MemTableForWrite::Sampling(mem)); + } + } + + self.memtable_view + .mutables + .memtable_for_write(timestamp) + .cloned() + .map(MemTableForWrite::Normal) + } +} + +// TODO(yingwen): How to support snapshot? +/// Table version +/// +/// Holds memtables and sst meta data of a table +/// +/// Switching memtable, memtable to level 0 file, addition/deletion to files +/// should be done atomically. 
+pub struct TableVersion { + inner: RwLock, +} + +impl TableVersion { + /// Create an empty table version + pub fn new(purge_queue: FilePurgeQueue) -> Self { + Self { + inner: RwLock::new(TableVersionInner { + memtable_view: MemTableView::new(), + levels: LevelsController::new(purge_queue), + flushed_sequence: 0, + }), + } + } + + /// See [MemTableView::mutable_memory_usage] + pub fn mutable_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .mutable_memory_usage() + } + + /// See [MemTableView::total_memory_usage] + pub fn total_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .total_memory_usage() + } + + /// Switch all mutable memtables or just return the suggested segment + /// duration if sampling memtable is still active. + /// + /// Returns a duration if a sampled segment duration needs to be persisted. + /// + /// REQUIRE: Do in write worker + pub fn switch_memtables_or_suggest_duration( + &self, + _worker_local: &WorkerLocal, + ) -> Option { + self.inner + .write() + .unwrap() + .memtable_view + .switch_memtables_or_suggest_duration() + } + + /// Stop timestamp sampling and freezed the sampling memtable. + /// + /// REQUIRE: Do in write worker + pub fn freeze_sampling(&self, _worker_local: &WorkerLocal) { + self.inner + .write() + .unwrap() + .memtable_view + .freeze_sampling_memtable(); + } + + /// See [MemTableView::pick_memtables_to_flush] + pub fn pick_memtables_to_flush( + &self, + max_memtable_id: MemTableId, + mems: &mut FlushableMemTables, + ) { + self.inner + .read() + .unwrap() + .memtable_view + .pick_memtables_to_flush(max_memtable_id, mems); + } + + /// Get memtable by timestamp for write. + /// + /// The returned schema is guaranteed to have schema with same version as + /// `schema_version`. Return None if the schema of existing memtable has + /// different schema. 
+ pub fn memtable_for_write( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + schema_version: schema::Version, + ) -> Result> { + // Find memtable by timestamp + let mutable = { + let inner = self.inner.read().unwrap(); + match inner.memtable_for_write(write_lock, timestamp) { + Some(v) => v, + None => return Ok(None), + } + }; + + // We consider the schemas are same if they have the same version. + ensure!( + mutable.memtable().schema().version() == schema_version, + SchemaMismatch { + memtable_version: mutable.memtable().schema().version(), + given: schema_version, + } + ); + + Ok(Some(mutable)) + } + + /// Insert memtable into mutable memtable set. + pub fn insert_mutable(&self, mem_state: MemTableState) { + let mut inner = self.inner.write().unwrap(); + let old_memtable = inner.memtable_view.mutables.insert(mem_state.clone()); + assert!( + old_memtable.is_none(), + "Find a duplicate memtable, new_memtable:{:?}, old_memtable:{:?}, memtable_view:{:#?}", + mem_state, + old_memtable, + inner.memtable_view + ); + } + + /// Set sampling memtable. + /// + /// Panic if the sampling memtable of this version is not None. + pub fn set_sampling(&self, sampling_mem: SamplingMemTable) { + let mut inner = self.inner.write().unwrap(); + assert!(inner.memtable_view.sampling_mem.is_none()); + inner.memtable_view.sampling_mem = Some(sampling_mem); + } + + /// Atomically apply the edit to the version. + pub fn apply_edit(&self, edit: VersionEdit) { + let mut inner = self.inner.write().unwrap(); + + // TODO(yingwen): else, log warning + inner.flushed_sequence = cmp::max(inner.flushed_sequence, edit.flushed_sequence); + + // Add sst files to level first. + for add_file in edit.files_to_add { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + + // Remove ssts from level. + for delete_file in edit.files_to_delete { + inner + .levels + .remove_ssts_from_level(delete_file.level, &[delete_file.file_id]); + } + + // Remove immutable memtables. 
+ for mem_id in edit.mems_to_remove { + inner.memtable_view.remove_immutable_or_sampling(mem_id); + } + } + + /// Atomically apply the meta to the version, useful in recover. + pub fn apply_meta(&self, meta: TableVersionMeta) { + let mut inner = self.inner.write().unwrap(); + + inner.flushed_sequence = cmp::max(inner.flushed_sequence, meta.flushed_sequence); + + for add_file in meta.files.into_values() { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + } + + pub fn pick_read_view(&self, time_range: TimeRange) -> ReadView { + let mut sampling_mem = None; + let mut memtables = MemTableVec::new(); + let mut leveled_ssts = vec![Vec::new(); MAX_LEVEL]; + + { + // Pick memtables for read. + let inner = self.inner.read().unwrap(); + + inner + .memtable_view + .memtables_for_read(time_range, &mut memtables, &mut sampling_mem); + + // Pick ssts for read. + inner.levels.pick_ssts(time_range, |level, ssts| { + leveled_ssts[level as usize].extend_from_slice(ssts) + }); + } + + ReadView { + sampling_mem, + memtables, + leveled_ssts, + } + } + + /// Pick ssts for compaction using given `picker`. + pub fn pick_for_compaction( + &self, + picker_ctx: PickerContext, + picker: &CompactionPickerRef, + ) -> picker::Result { + let inner = self.inner.read().unwrap(); + + picker.pick_compaction(picker_ctx, &inner.levels) + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + let inner = self.inner.read().unwrap(); + + inner.levels.has_expired_sst(expire_time) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let inner = self.inner.read().unwrap(); + + inner.levels.expired_ssts(expire_time) + } +} + +/// During recovery, we apply all version edit to [TableVersionMeta] first, then +/// apply the version meta to the table, so we can avoid adding removed ssts to +/// the version. 
+#[derive(Debug, Default)] +pub struct TableVersionMeta { + pub flushed_sequence: SequenceNumber, + files: HashMap, + max_file_id: FileId, +} + +impl TableVersionMeta { + pub fn apply_edit(&mut self, edit: VersionEdit) { + self.flushed_sequence = cmp::max(self.flushed_sequence, edit.flushed_sequence); + + for add_file in edit.files_to_add { + self.max_file_id = cmp::max(self.max_file_id, add_file.file.id); + + self.files.insert(add_file.file.id, add_file); + } + + for delete_file in edit.files_to_delete { + self.files.remove(&delete_file.file_id); + } + } + + /// Returns the max file id in the files to add. + pub fn max_file_id_to_add(&self) -> FileId { + self.max_file_id + } + + pub fn ordered_files(&self) -> Vec { + let mut files_vec: Vec<_> = self.files.values().cloned().collect(); + files_vec.sort_unstable_by_key(|file| file.file.id); + + files_vec + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + sst::file::tests::{FilePurgerMocker, SstMetaDataMocker}, + table::{data::tests::MemTableMocker, version_edit::tests::AddFileMocker}, + table_options, + tests::table, + }; + + fn new_table_version() -> TableVersion { + let purger = FilePurgerMocker::mock(); + let queue = purger.create_purge_queue(1, table::new_table_id(2, 2)); + TableVersion::new(queue) + } + + #[test] + fn test_empty_table_version() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let ts = Timestamp::now(); + assert!(!version.has_expired_sst(None)); + assert!(!version.has_expired_sst(Some(ts))); + + assert_eq!(0, version.mutable_memory_usage()); + assert_eq!(0, version.total_memory_usage()); + + { + let inner = version.inner.read().unwrap(); + let memtable_view = &inner.memtable_view; + assert!(memtable_view.sampling_mem.is_none()); + assert!(memtable_view.mutables.is_empty()); + 
assert!(memtable_view.immutables.is_empty()); + } + + let mut flushable_mems = FlushableMemTables::default(); + let max_memtable_id = 1000; + version.pick_memtables_to_flush(max_memtable_id, &mut flushable_mems); + assert!(flushable_mems.is_empty()); + + let read_view = version.pick_read_view(TimeRange::min_to_max()); + assert!(!read_view.contains_sampling()); + + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + for ssts in read_view.leveled_ssts { + assert!(ssts.is_empty()); + } + + let now = Timestamp::now(); + let mutable = version.memtable_for_write(&worker_local, now, 1).unwrap(); + assert!(mutable.is_none()); + + // Nothing to switch. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + } + + fn check_flushable_mem_with_sampling( + flushable_mems: &FlushableMemTables, + memtable_id: MemTableId, + ) { + assert!(!flushable_mems.is_empty()); + assert_eq!( + memtable_id, + flushable_mems.sampling_mem.as_ref().unwrap().id + ); + assert!(flushable_mems.memtables.is_empty()); + } + + #[test] + fn test_table_version_sampling() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + // Should write to sampling memtable. 
+ let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + let mutable = version + .memtable_for_write(&worker_local, Timestamp::new(1234), schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Sampling memtable should always be read. + let read_view = version.pick_read_view(TimeRange::new(0.into(), 123.into()).unwrap()); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id, read_view.sampling_mem.unwrap().id); + + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_switch() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + + // Write to memtable after switch and before freezed. + let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + // Still write to sampling memtable. 
+ let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Switch still return duration before freezed. + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable before sampling + // memtable is freezed. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_freeze() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + version.set_sampling(sampling_mem); + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap() + ); + + // Freeze the sampling memtable. + version.freeze_sampling(&worker_local); + + // No memtable after switch and freezed. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Still flushable after freezed. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id1, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Sampling memtable still readable after freezed. 
+ let read_view = version.pick_read_view(time_range); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert!(read_view.sampling_mem.unwrap().freezed); + + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Write to mutable memtable. + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let mutable = mutable.as_normal(); + assert_eq!(time_range, mutable.time_range); + assert_eq!(memtable_id2, mutable.id); + + // Need to read sampling memtable and mutable memtable. + let read_view = version.pick_read_view(time_range); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert_eq!(1, read_view.memtables.len()); + assert_eq!(memtable_id2, read_view.memtables[0].id); + + // Switch mutable memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + // No memtable after switch. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Two memtables to flush. 
+ let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id2, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + assert_eq!(1, flushable_mems.memtables.len()); + assert_eq!(memtable_id2, flushable_mems.memtables[0].id); + } + + #[test] + fn test_table_version_sampling_apply_edit() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + // Prepare sampling memtable. + version.set_sampling(sampling_mem); + version.freeze_sampling(&worker_local); + + let now = Timestamp::now(); + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Prepare mutable memtable. + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Switch memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + + let max_sequence = 120; + let file_id = 13; + // TO simplify test, we only create one sst. + let sst_meta = SstMetaDataMocker::new(schema) + .time_range(time_range) + .max_sequence(max_sequence) + .build(); + let add_file = AddFileMocker::new(sst_meta).file_id(file_id).build(); + let edit = VersionEdit { + flushed_sequence: max_sequence, + mems_to_remove: vec![memtable_id1, memtable_id2], + files_to_add: vec![add_file], + files_to_delete: vec![], + }; + version.apply_edit(edit); + + // Only pick ssts after flushed. 
+ let read_view = version.pick_read_view(time_range); + assert!(!read_view.contains_sampling()); + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + assert_eq!(1, read_view.leveled_ssts[0].len()); + assert_eq!(file_id, read_view.leveled_ssts[0][0].id()); + } +} diff --git a/analytic_engine/src/table/version_edit.rs b/analytic_engine/src/table/version_edit.rs new file mode 100644 index 0000000000..97f09e5454 --- /dev/null +++ b/analytic_engine/src/table/version_edit.rs @@ -0,0 +1,176 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Version edits + +use std::convert::{TryFrom, TryInto}; + +use common_types::{bytes::Bytes, schema::Schema, time::TimeRange, SequenceNumber}; +use common_util::define_result; +use proto::meta_update as meta_pb; +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::{ + sst::{ + file::{FileMeta, SstMetaData}, + manager::FileId, + }, + table::data::MemTableId, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid level:{}, err:{}.\nBacktrace:\n{}", level, source, backtrace))] + InvalidLevel { + level: u32, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Fail to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, +} + +define_result!(Error); + +/// Meta data of a new file. +#[derive(Debug, Clone)] +pub struct AddFile { + /// The level of the file intended to add. + pub level: u16, + /// Meta data of the file to add. 
+ pub file: FileMeta, +} + +impl AddFile { + /// Convert into protobuf struct + pub fn into_pb(self) -> meta_pb::AddFileMeta { + let mut target = meta_pb::AddFileMeta::new(); + target.set_level(self.level.into()); + target.set_file_id(self.file.id); + target.set_min_key(self.file.meta.min_key.to_vec()); + target.set_max_key(self.file.meta.max_key.to_vec()); + target.set_time_range(self.file.meta.time_range.into()); + target.set_max_seq(self.file.meta.max_sequence); + target.set_schema(self.file.meta.schema.into()); + target.set_size(self.file.meta.size); + target.set_row_num(self.file.meta.row_num); + + target + } +} + +impl TryFrom for AddFile { + type Error = Error; + + fn try_from(mut src: meta_pb::AddFileMeta) -> Result { + let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?; + let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?; + Ok(Self { + level: src + .level + .try_into() + .context(InvalidLevel { level: src.level })?, + file: FileMeta { + id: src.file_id, + meta: SstMetaData { + min_key: Bytes::from(src.min_key), + max_key: Bytes::from(src.max_key), + time_range, + max_sequence: src.max_seq, + schema, + size: src.size, + row_num: src.row_num, + }, + }, + }) + } +} + +/// Meta data of the file to delete. +#[derive(Debug, Clone)] +pub struct DeleteFile { + /// The level of the file intended to delete. + pub level: u16, + /// Id of the file to delete. 
+ pub file_id: FileId, +} + +impl DeleteFile { + /// Convert into protobuf struct + pub fn into_pb(self) -> meta_pb::DeleteFileMeta { + let mut target = meta_pb::DeleteFileMeta::new(); + target.set_level(self.level.into()); + target.set_file_id(self.file_id); + + target + } +} + +impl TryFrom for DeleteFile { + type Error = Error; + + fn try_from(src: meta_pb::DeleteFileMeta) -> Result { + let level = src + .level + .try_into() + .context(InvalidLevel { level: src.level })?; + + Ok(Self { + level, + file_id: src.file_id, + }) + } +} + +/// Edit to the [TableVersion], which should be done atomically +#[derive(Debug)] +pub struct VersionEdit { + /// The last sequence already flushed. This field is not guaranteed to be + /// set if the version edit is created by a non-flush operation (such as + /// compaction). + pub flushed_sequence: SequenceNumber, + /// Id of memtables to remove from immutable memtable lists. + pub mems_to_remove: Vec, + /// Sst files to add. + pub files_to_add: Vec, + /// Sst files to delete. + pub files_to_delete: Vec, +} + +#[cfg(test)] +pub mod tests { + use super::*; + + #[must_use] + pub struct AddFileMocker { + file_id: FileId, + sst_meta: SstMetaData, + } + + impl AddFileMocker { + pub fn new(sst_meta: SstMetaData) -> Self { + Self { + file_id: 1, + sst_meta, + } + } + + pub fn file_id(mut self, file_id: FileId) -> Self { + self.file_id = file_id; + self + } + + pub fn build(&self) -> AddFile { + AddFile { + level: 0, + file: FileMeta { + id: self.file_id, + meta: self.sst_meta.clone(), + }, + } + } + } +} diff --git a/analytic_engine/src/table_options.rs b/analytic_engine/src/table_options.rs new file mode 100644 index 0000000000..badac47830 --- /dev/null +++ b/analytic_engine/src/table_options.rs @@ -0,0 +1,553 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Constants for table options. 
+ +use std::{collections::HashMap, string::ToString, time::Duration}; + +use arrow_deps::datafusion::parquet::basic::Compression as ParquetCompression; +use common_types::time::Timestamp; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + define_result, + time::DurationExt, +}; +use proto::analytic_common::{ + CompactionOptions as CompactionOptionsPb, CompactionStrategy as CompactionStrategyPb, + Compression as CompressionPb, TableOptions as TableOptionsPb, UpdateMode as UpdateModePb, +}; +use serde_derive::Deserialize; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use table_engine::OPTION_KEY_ENABLE_TTL; + +use crate::compaction::{ + CompactionStrategy, SizeTieredCompactionOptions, TimeWindowCompactionOptions, +}; + +pub const SEGMENT_DURATION: &str = "segment_duration"; +pub const ENABLE_TTL: &str = OPTION_KEY_ENABLE_TTL; +pub const TTL: &str = "ttl"; +pub const ARENA_BLOCK_SIZE: &str = "arena_block_size"; +pub const WRITE_BUFFER_SIZE: &str = "write_buffer_size"; +pub const COMPACTION_STRATEGY: &str = "compaction_strategy"; +pub const NUM_ROWS_PER_ROW_GROUP: &str = "num_rows_per_row_group"; +pub const UPDATE_MODE: &str = "update_mode"; +pub const COMPRESSION: &str = "compression"; + +const UPDATE_MODE_OVERWRITE: &str = "OVERWRITE"; +const UPDATE_MODE_APPEND: &str = "APPEND"; +const COMPRESSION_UNCOMPRESSED: &str = "UNCOMPRESSED"; +const COMPRESSION_LZ4: &str = "LZ4"; +const COMPRESSION_SNAPPY: &str = "SNAPPY"; +const COMPRESSION_ZSTD: &str = "ZSTD"; +const AT_LEAST_OPTIONS_NUM: usize = 9; + +/// Default bucket duration (1d) +const BUCKET_DURATION_1D: Duration = Duration::from_secs(24 * 60 * 60); +/// Default duration of a segment (2h). +pub const DEFAULT_SEGMENT_DURATION: Duration = Duration::from_secs(60 * 60 * 2); +/// Default arena block size (2M). +const DEFAULT_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024; +/// Default write buffer size (32M). +const DEFAULT_WRITE_BUFFER_SIZE: u32 = 32 * 1024 * 1024; +/// Default ttl of table (7d). 
+const DEFAULT_TTL: Duration = Duration::from_secs(7 * 24 * 60 * 60); +/// Default row number of a row group. +const DEFAULT_NUM_ROW_PER_ROW_GROUP: usize = 8192; + +/// Max arena block size (2G) +const MAX_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024 * 1024; +/// Min arena block size (1K) +const MIN_ARENA_BLOCK_SIZE: u32 = 1024; +const MIN_NUM_ROWS_PER_ROW_GROUP: usize = 100; +const MAX_NUM_ROWS_PER_ROW_GROUP: usize = 10_000_000; + +#[derive(Debug, Snafu)] +#[allow(clippy::enum_variant_names)] +pub enum Error { + #[snafu(display("Failed to parse duration, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseDuration { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse size, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseSize { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse compaction strategy: {}, err: {}", value, source))] + ParseStrategy { + value: String, + source: crate::compaction::Error, + }, + #[snafu(display("Failed to parse int, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Failed to parse bool, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseBool { + source: std::str::ParseBoolError, + backtrace: Backtrace, + }, + #[snafu(display( + "Failed to parse update mode, raw str:{}.\nBacktrace:\n{}", + s, + backtrace + ))] + ParseUpdateMode { s: String, backtrace: Backtrace }, + #[snafu(display( + "Failed to parse compression, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ParseCompressionName { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +pub enum UpdateMode { + Overwrite, + Append, +} + +impl UpdateMode { + pub fn parse_from(s: &str) -> Result { + if s.eq_ignore_ascii_case(UPDATE_MODE_OVERWRITE) { + Ok(UpdateMode::Overwrite) + } else if s.eq_ignore_ascii_case(UPDATE_MODE_APPEND) { + Ok(UpdateMode::Append) + } else { + ParseUpdateMode { s 
}.fail() + } + } +} + +impl ToString for UpdateMode { + fn to_string(&self) -> String { + match self { + UpdateMode::Append => UPDATE_MODE_APPEND.to_string(), + UpdateMode::Overwrite => UPDATE_MODE_OVERWRITE.to_string(), + } + } +} + +#[derive(Debug, Clone, Copy, Deserialize)] +pub enum Compression { + Uncompressed, + Lz4, + Snappy, + Zstd, +} + +impl Compression { + pub fn parse_from(name: &str) -> Result { + if name.eq_ignore_ascii_case(COMPRESSION_UNCOMPRESSED) { + Ok(Compression::Uncompressed) + } else if name.eq_ignore_ascii_case(COMPRESSION_LZ4) { + Ok(Compression::Lz4) + } else if name.eq_ignore_ascii_case(COMPRESSION_SNAPPY) { + Ok(Compression::Snappy) + } else if name.eq_ignore_ascii_case(COMPRESSION_ZSTD) { + Ok(Compression::Zstd) + } else { + ParseCompressionName { name }.fail() + } + } +} + +impl ToString for Compression { + fn to_string(&self) -> String { + match self { + Compression::Uncompressed => COMPRESSION_UNCOMPRESSED.to_string(), + Compression::Lz4 => COMPRESSION_LZ4.to_string(), + Compression::Snappy => COMPRESSION_SNAPPY.to_string(), + Compression::Zstd => COMPRESSION_ZSTD.to_string(), + } + } +} + +impl From for CompressionPb { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => CompressionPb::UNCOMPRESSED, + Compression::Lz4 => CompressionPb::LZ4, + Compression::Snappy => CompressionPb::SNAPPY, + Compression::Zstd => CompressionPb::ZSTD, + } + } +} + +impl From for Compression { + fn from(compression: CompressionPb) -> Self { + match compression { + CompressionPb::UNCOMPRESSED => Compression::Uncompressed, + CompressionPb::LZ4 => Compression::Lz4, + CompressionPb::SNAPPY => Compression::Snappy, + CompressionPb::ZSTD => Compression::Zstd, + } + } +} + +impl From for ParquetCompression { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => ParquetCompression::UNCOMPRESSED, + Compression::Lz4 => ParquetCompression::LZ4, + Compression::Snappy => 
ParquetCompression::SNAPPY, + Compression::Zstd => ParquetCompression::ZSTD, + } + } +} + +/// Options for a table. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct TableOptions { + // The following options are immutable once table was created. + /// Segment duration of the table. + /// + /// `None` means the table is doing the segment duration sampling and + /// the actual duration is still unknown. + pub segment_duration: Option, + /// Table update mode, now support Overwrite(Default) and Append + pub update_mode: UpdateMode, + + // The following options can be altered. + /// Enable ttl + pub enable_ttl: bool, + /// Time-to-live of the data. + pub ttl: ReadableDuration, + /// Arena block size of memtable. + pub arena_block_size: u32, + /// Write buffer size of memtable. + pub write_buffer_size: u32, + /// Compaction strategy of the table. + pub compaction_strategy: CompactionStrategy, + /// Row number in a row group. + pub num_rows_per_row_group: usize, + /// Table Compression + pub compression: Compression, +} + +impl TableOptions { + #[inline] + pub fn segment_duration(&self) -> Option { + self.segment_duration.map(|v| v.0) + } + + #[inline] + pub fn ttl(&self) -> Option { + if self.enable_ttl { + Some(self.ttl) + } else { + None + } + } + + // for show create table + pub fn to_raw_map(&self) -> HashMap { + let mut m = HashMap::with_capacity(AT_LEAST_OPTIONS_NUM); + m.insert( + SEGMENT_DURATION.to_string(), + self.segment_duration + .map(|v| v.to_string()) + .unwrap_or_else(String::new), + ); + m.insert(UPDATE_MODE.to_string(), self.update_mode.to_string()); + m.insert(ENABLE_TTL.to_string(), self.enable_ttl.to_string()); + m.insert(TTL.to_string(), format!("{}", self.ttl)); + m.insert( + ARENA_BLOCK_SIZE.to_string(), + format!("{}", self.arena_block_size), + ); + m.insert( + WRITE_BUFFER_SIZE.to_string(), + format!("{}", self.write_buffer_size), + ); + self.compaction_strategy.fill_raw_map(&mut m); + m.insert( + 
NUM_ROWS_PER_ROW_GROUP.to_string(), + format!("{}", self.num_rows_per_row_group), + ); + m.insert(COMPRESSION.to_string(), self.compression.to_string()); + + assert!(m.len() >= AT_LEAST_OPTIONS_NUM); + + m + } + + /// Sanitize options silently. + pub fn sanitize(&mut self) { + let one_day_secs = BUCKET_DURATION_1D.as_secs(); + + if let Some(segment_duration) = self.segment_duration { + let mut segment_duration_secs = segment_duration.as_secs(); + if segment_duration_secs == 0 { + segment_duration_secs = DEFAULT_SEGMENT_DURATION.as_secs() + }; + self.segment_duration = Some(ReadableDuration::secs(segment_duration_secs)); + } + + let ttl_secs = self.ttl.as_secs(); + // Ttl must align to day. + let ttl_secs = ttl_secs / one_day_secs * one_day_secs; + self.ttl = ReadableDuration::secs(ttl_secs); + + if self.arena_block_size < MIN_ARENA_BLOCK_SIZE { + self.arena_block_size = MIN_ARENA_BLOCK_SIZE; + } + + if self.arena_block_size > MAX_ARENA_BLOCK_SIZE { + self.arena_block_size = MAX_ARENA_BLOCK_SIZE; + } + + if self.num_rows_per_row_group < MIN_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MIN_NUM_ROWS_PER_ROW_GROUP; + } + + if self.num_rows_per_row_group > MAX_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MAX_NUM_ROWS_PER_ROW_GROUP; + } + } + + pub fn need_dedup(&self) -> bool { + match self.update_mode { + UpdateMode::Overwrite => true, + UpdateMode::Append => false, + } + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.enable_ttl && timestamp.is_expired(Timestamp::expire_time(self.ttl.0)) + } +} + +impl From for CompactionOptionsPb { + fn from(opts: SizeTieredCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.bucket_low); + target.set_bucket_high(opts.bucket_high); + target.set_min_sstable_size(opts.min_sstable_size.0 as u32); + target.set_max_threshold(opts.max_threshold as u32); + target.set_min_threshold(opts.min_threshold as u32); + + target + } +} + +impl From for 
SizeTieredCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + Self { + bucket_low: opts.bucket_low, + bucket_high: opts.bucket_high, + min_sstable_size: ReadableSize(opts.min_sstable_size.into()), + min_threshold: opts.min_threshold as usize, + max_threshold: opts.max_threshold as usize, + } + } +} + +impl From for CompactionOptionsPb { + fn from(opts: TimeWindowCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.size_tiered.bucket_low); + target.set_bucket_high(opts.size_tiered.bucket_high); + target.set_min_sstable_size(opts.size_tiered.min_sstable_size.0 as u32); + target.set_min_threshold(opts.size_tiered.min_threshold as u32); + target.set_max_threshold(opts.size_tiered.max_threshold as u32); + target.set_timestamp_resolution(opts.timestamp_resolution.into()); + + target + } +} + +impl From for TimeWindowCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + let size_tiered: SizeTieredCompactionOptions = opts.clone().into(); + + Self { + size_tiered, + timestamp_resolution: opts.timestamp_resolution.into(), + } + } +} + +impl From for TableOptionsPb { + fn from(opts: TableOptions) -> Self { + let mut target = TableOptionsPb::new(); + if let Some(segment_duration) = opts.segment_duration { + target.set_segment_duration(segment_duration.0.as_millis_u64()); + target.set_sampling_segment_duration(false); + } else { + // The segment duration is unknown. 
+ target.set_sampling_segment_duration(true); + } + target.set_enable_ttl(opts.enable_ttl); + target.set_ttl(opts.ttl.0.as_millis_u64()); + target.set_arena_block_size(opts.arena_block_size); + target.set_num_rows_per_row_group(opts.num_rows_per_row_group as u64); + + match opts.compaction_strategy { + CompactionStrategy::Default => { + target.set_compaction_strategy(CompactionStrategyPb::DEFAULT); + } + CompactionStrategy::SizeTiered(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::SIZE_TIERED); + target.set_compaction_options(opts.into()); + } + CompactionStrategy::TimeWindow(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::TIME_WINDOW); + target.set_compaction_options(opts.into()); + } + } + + match opts.update_mode { + UpdateMode::Overwrite => { + target.set_update_mode(UpdateModePb::Overwrite); + } + UpdateMode::Append => { + target.set_update_mode(UpdateModePb::Append); + } + } + + target.set_write_buffer_size(opts.write_buffer_size); + target.set_compression(opts.compression.into()); + + target + } +} + +impl From for TableOptions { + fn from(opts: TableOptionsPb) -> Self { + let compaction_strategy = match opts.compaction_strategy { + CompactionStrategyPb::DEFAULT => CompactionStrategy::default(), + CompactionStrategyPb::SIZE_TIERED => { + let opts = opts + .compaction_options + .map(SizeTieredCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::SizeTiered(opts) + } + CompactionStrategyPb::TIME_WINDOW => { + let opts = opts + .compaction_options + .map(TimeWindowCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::TimeWindow(opts) + } + }; + + let update_mode = match opts.update_mode { + UpdateModePb::Overwrite => UpdateMode::Overwrite, + UpdateModePb::Append => UpdateMode::Append, + }; + let segment_duration = if opts.sampling_segment_duration { + None + } else if opts.segment_duration == 0 { + // If segment duration is still zero. 
If the data had been used by an elder + // version release that not yet support sampling, the + // `sampling_segment_duration` flag would be truncated after + // manifest snapshot, but left segment duration zero. + Some(DEFAULT_SEGMENT_DURATION.into()) + } else { + Some(Duration::from_millis(opts.segment_duration).into()) + }; + + Self { + segment_duration, + enable_ttl: opts.enable_ttl, + ttl: Duration::from_millis(opts.ttl).into(), + arena_block_size: opts.arena_block_size, + compaction_strategy, + num_rows_per_row_group: opts.num_rows_per_row_group as usize, + update_mode, + write_buffer_size: opts.write_buffer_size, + compression: opts.compression.into(), + } + } +} + +impl Default for TableOptions { + fn default() -> Self { + Self { + segment_duration: None, + enable_ttl: true, + ttl: DEFAULT_TTL.into(), + arena_block_size: DEFAULT_ARENA_BLOCK_SIZE, + compaction_strategy: CompactionStrategy::default(), + num_rows_per_row_group: DEFAULT_NUM_ROW_PER_ROW_GROUP, + update_mode: UpdateMode::Overwrite, + write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, + compression: Compression::Zstd, + } + } +} + +pub fn merge_table_options_for_create( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, true) +} + +pub fn merge_table_options_for_alter( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, false) +} + +/// The options will override the old options. 
+fn merge_table_options( + options: &HashMap, + table_old_opts: &TableOptions, + is_create: bool, +) -> Result { + let mut table_opts = table_old_opts.clone(); + if is_create { + if let Some(v) = options.get(SEGMENT_DURATION) { + table_opts.segment_duration = Some(parse_duration(v)?); + } + if let Some(v) = options.get(UPDATE_MODE) { + table_opts.update_mode = UpdateMode::parse_from(v)?; + } + } + + if let Some(v) = options.get(TTL) { + table_opts.ttl = parse_duration(v)?; + } + if let Some(v) = options.get(OPTION_KEY_ENABLE_TTL) { + table_opts.enable_ttl = v.parse::().context(ParseBool)?; + } + if let Some(v) = options.get(ARENA_BLOCK_SIZE) { + let size = parse_size(v)?; + table_opts.arena_block_size = size.0 as u32; + } + if let Some(v) = options.get(WRITE_BUFFER_SIZE) { + let size = parse_size(v)?; + table_opts.write_buffer_size = size.0 as u32; + } + if let Some(v) = options.get(COMPACTION_STRATEGY) { + table_opts.compaction_strategy = + CompactionStrategy::parse_from(v, options).context(ParseStrategy { value: v })?; + } + if let Some(v) = options.get(NUM_ROWS_PER_ROW_GROUP) { + table_opts.num_rows_per_row_group = v.parse().context(ParseInt)?; + } + if let Some(v) = options.get(COMPRESSION) { + table_opts.compression = Compression::parse_from(v)?; + } + Ok(table_opts) +} + +fn parse_duration(v: &str) -> Result { + v.parse::() + .map_err(|err| Error::ParseDuration { + err, + backtrace: Backtrace::generate(), + }) +} + +fn parse_size(v: &str) -> Result { + v.parse::().map_err(|err| Error::ParseSize { + err, + backtrace: Backtrace::generate(), + }) +} diff --git a/analytic_engine/src/tests/alter_test.rs b/analytic_engine/src/tests/alter_test.rs new file mode 100644 index 0000000000..2bdc74f50b --- /dev/null +++ b/analytic_engine/src/tests/alter_test.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Alter test + +use std::collections::{BTreeMap, HashMap}; + +use common_types::{ + column_schema, + datum::DatumKind, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use log::info; +use table_engine::table::AlterSchemaRequest; + +use crate::{ + table_options::TableOptions, + tests::{ + row_util, + table::{self, FixedSchemaTable}, + util::{Null, TestContext, TestEnv}, + }, +}; + +#[test] +fn test_alter_table_add_column() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + // Write data to table. + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + alter_schema_same_schema_version_case(&test_ctx, test_table1).await; + + alter_schema_old_pre_version_case(&test_ctx, test_table1).await; + + alter_schema_add_column_case(&mut test_ctx, test_table1, start_ms, false).await; + + // Prepare another table for alter. 
+ let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table2, row_group).await; + + alter_schema_add_column_case(&mut test_ctx, test_table2, start_ms, true).await; + }); +} + +// Add two columns: +// - add_string +// - add_double +fn add_columns(schema_builder: schema::Builder) -> schema::Builder { + schema_builder + .add_normal_column( + column_schema::Builder::new("add_string".to_string(), DatumKind::String) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +async fn alter_schema_same_schema_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_same_schema_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + let new_schema = schema_builder.build().unwrap(); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version(), + }; + + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_old_pre_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_old_pre_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version() - 1, + }; 
+ + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_add_column_case( + test_ctx: &mut TestContext, + table_name: &str, + start_ms: i64, + flush: bool, +) { + info!( + "test alter_schema_add_column_case, table_name:{}", + table_name + ); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let old_schema = test_ctx.table(table_name).schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + + let affected = test_ctx + .try_alter_schema(table_name, request) + .await + .unwrap(); + assert_eq!(1, affected); + + let rows = [ + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + ), + ]; + let rows_vec = row_util::new_rows_8(&rows); + let row_group = RowGroupBuilder::with_rows(new_schema.clone(), rows_vec) + .unwrap() + .build(); + + // Write data with new schema. + test_ctx.write_to_table(table_name, row_group).await; + + if flush { + test_ctx.flush_table(table_name).await; + } + + let new_schema_rows = [ + // We need to check null datum, so tuples have different types and we need to + // convert it into row first. 
+ row_util::new_row_8(( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + Null, + Null, + )), + row_util::new_row_8(( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + Null, + Null, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + )), + ]; + let new_schema_row_group = + RowGroupBuilder::with_rows(new_schema.clone(), new_schema_rows.to_vec()) + .unwrap() + .build(); + + // Read data using new schema. + check_read_row_group( + test_ctx, + "Test read new schema after add columns", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; + + let old_schema_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + let old_schema_rows_vec = row_util::new_rows_6(&old_schema_rows); + let old_schema_row_group = RowGroupBuilder::with_rows(old_schema.clone(), old_schema_rows_vec) + .unwrap() + .build(); + + // Read data using old schema. + check_read_row_group( + test_ctx, + "Test read old schema after add columns", + table_name, + &old_schema, + &old_schema_row_group, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[table_name]).await; + + // Read again after reopen. 
+ check_read_row_group( + test_ctx, + "Test read after reopen", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; +} + +async fn check_read_row_group( + test_ctx: &TestContext, + msg: &str, + table_name: &str, + schema: &Schema, + row_group: &RowGroup, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + table::new_read_all_request(schema.clone(), read_opts), + ) + .await; + + table::assert_batch_eq_to_row_group(&record_batches, row_group); + } +} + +#[test] +fn test_alter_table_options() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + let opts = test_ctx.table(test_table1).options(); + + let default_opts_map = default_options(); + + assert_options_eq(&default_opts_map, &opts); + + alter_immutable_option_case(&test_ctx, test_table1, "segment_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "bucket_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "update_mode", "Append").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "false").await; + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "true").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "arena_block_size", "10240").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "write_buffer_size", "1024000").await; + + alter_mutable_option_case( + &mut test_ctx, + test_table1, + "num_rows_per_row_group", + "10000", + ) + .await; + }); +} + +async fn alter_immutable_option_case( + test_ctx: &TestContext, + table_name: &str, + opt_key: &str, + opt_value: &str, +) { + let old_opts = test_ctx.table(table_name).options(); + + let mut new_opts = HashMap::new(); + 
new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&old_opts, &opts_after_alter); +} + +async fn alter_mutable_option_case( + test_ctx: &mut TestContext, + table_name: &str, + opt_key: &str, + opt_value: &str, +) { + let mut expect_opts = test_ctx.table(table_name).options(); + expect_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let mut new_opts = HashMap::new(); + new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); + + // Reopen table. + test_ctx.reopen_with_tables(&[table_name]).await; + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); +} + +fn assert_options_eq(left: &HashMap, right: &HashMap) { + let sorted_left: BTreeMap<_, _> = left.iter().collect(); + let sorted_right: BTreeMap<_, _> = right.iter().collect(); + + assert_eq!(sorted_left, sorted_right); +} + +fn default_options() -> HashMap { + let table_opts = TableOptions::default(); + + table_opts.to_raw_map() +} diff --git a/analytic_engine/src/tests/compaction_test.rs b/analytic_engine/src/tests/compaction_test.rs new file mode 100644 index 0000000000..6a5b300eb3 --- /dev/null +++ b/analytic_engine/src/tests/compaction_test.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction integration tests. 
+ +use common_types::time::Timestamp; +use table_engine::table::FlushRequest; + +use crate::{ + compaction::SizeTieredCompactionOptions, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_table_compact_current_segment() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let default_opts = SizeTieredCompactionOptions::default(); + + let mut expect_rows = Vec::new(); + + let start_ms = test_ctx.start_ms(); + // Write more than ensure compaction will be triggered. + for offset in 0..default_opts.max_threshold as i64 * 2 { + let rows = [ + ( + "key1", + Timestamp::new(start_ms + offset), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms + offset), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + expect_rows.extend_from_slice(&rows); + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + + // Flush table and generate sst. + test_ctx + .flush_table_with_request( + test_table1, + FlushRequest { + // Don't trigger a compaction. + compact_after_flush: false, + sync: true, + }, + ) + .await; + } + + expect_rows.sort_unstable_by_key(|row_tuple| (row_tuple.0, row_tuple.1)); + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after flush", + test_table1, + &expect_rows, + ) + .await; + + // Trigger a compaction. + test_ctx.compact_table(test_table1).await; + + // Check read after compaction. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after compaction", + test_table1, + &expect_rows, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/drop_test.rs b/analytic_engine/src/tests/drop_test.rs new file mode 100644 index 0000000000..7d12baa536 --- /dev/null +++ b/analytic_engine/src/tests/drop_test.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table tests + +use std::collections::HashMap; + +use common_types::{column_schema, datum::DatumKind, time::Timestamp}; +use table_engine::table::AlterSchemaRequest; + +use crate::tests::{ + table::FixedSchemaTable, + util::{self, TestEnv}, +}; + +#[test] +fn test_drop_table_once() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_table_again() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + assert!(!test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_create_table_mixed() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + 
test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + // Create another table after dropped. + let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + // Table 2 is still exists. + assert!(test_ctx + .try_open_table(test_table2) + .await + .unwrap() + .is_some()); + }); +} + +fn test_drop_create_same_table_case(flush: bool) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + // Write data to table1. + let start_ms = test_ctx.start_ms(); + let rows = [( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + )]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + if flush { + test_ctx.flush_table(test_table1).await; + } + + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // No data exists. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table", + test_table1, + &[], + ) + .await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + + // No data exists. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table after reopen", + test_table1, + &[], + ) + .await; + }); +} + +#[test] +fn test_drop_create_same_table() { + test_drop_create_same_table_case(false); + + test_drop_create_same_table_case(true); +} + +#[test] +fn test_alter_schema_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter schema. + let old_schema = test_ctx.table(test_table1).schema(); + let schema_builder = FixedSchemaTable::default_schema_builder() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap(); + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + let affected = test_ctx + .try_alter_schema(test_table1, request) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} + +#[test] +fn test_alter_options_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter options. + let mut new_opts = HashMap::new(); + new_opts.insert("arena_block_size".to_string(), "10240".to_string()); + + let affected = test_ctx + .try_alter_options(test_table1, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. 
+ assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} diff --git a/analytic_engine/src/tests/mod.rs b/analytic_engine/src/tests/mod.rs new file mode 100644 index 0000000000..3ed5f527e0 --- /dev/null +++ b/analytic_engine/src/tests/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test suits and intergration tests. + +#[cfg(test)] +mod alter_test; +#[cfg(test)] +mod compaction_test; +#[cfg(test)] +mod drop_test; +#[cfg(test)] +mod open_test; +#[cfg(test)] +mod read_write_test; +pub mod row_util; +pub mod table; +pub mod util; diff --git a/analytic_engine/src/tests/open_test.rs b/analytic_engine/src/tests/open_test.rs new file mode 100644 index 0000000000..6c3afc0578 --- /dev/null +++ b/analytic_engine/src/tests/open_test.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Engine open test. + +use crate::tests::util::TestEnv; + +#[test] +fn test_open_engine() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + // Reopen engine. + test_ctx.reopen().await; + }); +} diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs new file mode 100644 index 0000000000..c190817470 --- /dev/null +++ b/analytic_engine/src/tests/read_write_test.rs @@ -0,0 +1,735 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Read write test. 
+ +use std::{thread, time}; + +use common_types::time::Timestamp; +use log::info; +use table_engine::table::ReadOrder; + +use crate::{ + table_options, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_multi_table_read_write() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_multi_table_read_write1"; + let test_table2 = "test_multi_table_read_write2"; + let test_table3 = "test_multi_table_read_write3"; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let _ = test_ctx.create_fixed_schema_table(test_table2).await; + let _ = test_ctx.create_fixed_schema_table(test_table3).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + // One bucket. + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key3", + Timestamp::new(start_ms + 2), + "tag1-4", + 13.0, + 110.0, + "tag2-4", + ), + ( + "key4", + Timestamp::new(start_ms + 3), + "tag1-5", + 13.0, + 110.0, + "tag2-5", + ), + // Next bucket. + ( + "key5", + Timestamp::new( + start_ms + 1 + 2 * table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64, + ), + "tag-5-3", + 33.0, + 310.0, + "tag-5-3", + ), + ]; + + // Write data to table. + let row_group1 = fixed_schema_table.rows_to_row_group(&rows); + let row_group2 = fixed_schema_table.rows_to_row_group(&rows); + let row_group3 = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group1).await; + test_ctx.write_to_table(test_table2, row_group2).await; + test_ctx.write_to_table(test_table3, row_group3).await; + + // Read with different opts. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1", + test_table1, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2", + test_table2, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3", + test_table3, + &rows, + ) + .await; + + // Reopen db. + test_ctx + .reopen_with_tables(&[test_table1, test_table2, test_table3]) + .await; + + // Read with different opts again. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1 after reopen", + test_table1, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2 after reopen", + test_table2, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3 after reopen", + test_table3, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table1, + &rows, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[test_table1]).await; + + // Read with different opts again. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get_override() { + test_table_write_get_override_case(FlushPoint::NoFlush); + + test_table_write_get_override_case(FlushPoint::AfterFirstWrite); + + test_table_write_get_override_case(FlushPoint::AfterOverwrite); + + test_table_write_get_override_case(FlushPoint::FirstAndOverwrite); +} + +#[derive(Debug)] +enum FlushPoint { + NoFlush, + AfterFirstWrite, + AfterOverwrite, + FirstAndOverwrite, +} + +fn test_table_write_get_override_case(flush_point: FlushPoint) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + info!( + "test_table_write_get_override_case, flush_point:{:?}", + flush_point + ); + + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + { + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. 
+ test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterFirstWrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + // Override some rows + { + let rows = [ + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterOverwrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + let expect_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &expect_rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &expect_rows, + ) + .await; + }); +} + +#[test] +fn test_db_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.db_write_buffer_size = 1; + test_write_buffer_size_overflow("db_write_buffer_size_test", env); +} + +#[test] +fn test_space_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.space_write_buffer_size = 1; + test_write_buffer_size_overflow("space_write_buffer_size_test", env); +} + +fn test_write_buffer_size_overflow(test_table_name: &str, env: TestEnv) { + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table_name).await; + + let table = test_ctx.table(test_table_name); + let old_stats = table.stats(); + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows1); + // Write rows1 to table. + test_ctx.write_to_table(test_table_name, row_group).await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read, stats.num_read); + assert_eq!(old_stats.num_write + 1, stats.num_write); + assert_eq!(old_stats.num_flush, stats.num_flush); + + let rows2 = [ + ( + "key4", + Timestamp::new(start_ms + 2), + "tag1-4", + 11.0, + 110.0, + "tag2-4", + ), + ( + "key5", + Timestamp::new(start_ms + 3), + "tag1-5", + 12.0, + 110.0, + "tag2-5", + ), + ]; + + let row_group = fixed_schema_table.rows_to_row_group(&rows2); + // Write rowss2 to table. 
+ test_ctx.write_to_table(test_table_name, row_group).await; + + let mut rows = Vec::new(); + rows.extend_from_slice(&rows1); + rows.extend_from_slice(&rows2); + + // TODO(boyan) a better way to wait table flushing finishes. + thread::sleep(time::Duration::from_millis(500)); + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table_name, + &rows, + ) + .await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read + 5, stats.num_read); + assert_eq!(old_stats.num_write + 2, stats.num_write); + // Flush when reaches (db/space) write_buffer size limitation. + assert_eq!(old_stats.num_flush + 1, stats.num_flush); + + drop(table); + // Reopen db. + test_ctx.reopen_with_tables(&[test_table_name]).await; + + // Read with different opts again. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table_name, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let expect_reversed_rows = vec![rows[4], rows[3], rows[2], rows[1]]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. 
+ test_ctx.write_to_table(test_table, row_group).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse_after_flush() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + let rows2 = vec![ + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + let expect_reversed_rows = vec![rows1[2], rows1[1], rows2[1], rows2[0]]; + let row_group1 = fixed_schema_table.rows_to_row_group(&rows1); + // Write data to table and flush + test_ctx.write_to_table(test_table, row_group1).await; + test_ctx.flush_table(test_table).await; + + let row_group2 = fixed_schema_table.rows_to_row_group(&rows2); + // Write data to table and not flush + test_ctx.write_to_table(test_table, row_group2).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/row_util.rs b/analytic_engine/src/tests/row_util.rs new file mode 100644 index 0000000000..eaf7b592ed --- /dev/null +++ b/analytic_engine/src/tests/row_util.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB 
Project Authors. Licensed under Apache-2.0. + +//! Row utils + +use common_types::{datum::Datum, row::Row}; + +pub fn new_row_6(data: (C0, C1, C2, C3, C4, C5)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + ]; + + Row::from_datums(cols) +} + +pub fn assert_row_eq_6(data: (C0, C1, C2, C3, C4, C5), row: Row) +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let expect_row = new_row_6(data); + assert_eq!(expect_row, row); +} + +pub fn new_row_8(data: (C0, C1, C2, C3, C4, C5, C6, C7)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, + C6: Into, + C7: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + data.6.into(), + data.7.into(), + ]; + + Row::from_datums(cols) +} + +pub fn new_rows_6(data: &[(C0, C1, C2, C3, C4, C5)]) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, +{ + data.iter().cloned().map(new_row_6).collect() +} + +#[allow(clippy::type_complexity)] +pub fn new_rows_8( + data: &[(C0, C1, C2, C3, C4, C5, C6, C7)], +) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, + C6: Into + Clone, + C7: Into + Clone, +{ + data.iter().cloned().map(new_row_8).collect() +} diff --git a/analytic_engine/src/tests/table.rs b/analytic_engine/src/tests/table.rs new file mode 100644 index 0000000000..8d3d7a83e1 --- /dev/null +++ b/analytic_engine/src/tests/table.rs @@ -0,0 +1,331 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utils to create table. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, +}; +use common_util::config::ReadableDuration; +use table_engine::{ + self, + engine::{CreateTableRequest, TableState}, + predicate::Predicate, + table::{GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableSeq}, +}; + +use crate::{table_options, tests::row_util}; + +pub fn new_table_id(schema_id: u16, table_seq: u32) -> TableId { + TableId::new(SchemaId::from(schema_id), TableSeq::from(table_seq)) +} + +pub type RowTuple<'a> = (&'a str, Timestamp, &'a str, f64, f64, &'a str); +pub type RowTupleOpt<'a> = ( + &'a str, + Timestamp, + Option<&'a str>, + Option, + Option, + Option<&'a str>, +); +pub type KeyTuple<'a> = (&'a str, Timestamp); + +pub struct FixedSchemaTable { + create_request: CreateTableRequest, +} + +impl FixedSchemaTable { + pub fn builder() -> Builder { + Builder::default() + } + + fn default_schema() -> Schema { + Self::default_schema_builder().build().unwrap() + } + + pub fn default_schema_builder() -> schema::Builder { + create_schema_builder( + // Key columns + &[("key", DatumKind::String), ("ts", DatumKind::Timestamp)], + // Normal columns + &[ + ("string_tag", DatumKind::String), + ("double_field1", DatumKind::Double), + ("double_field2", DatumKind::Double), + ("string_field2", DatumKind::String), + ], + ) + } + + #[inline] + pub fn create_request(&self) -> &CreateTableRequest { + &self.create_request + } + + #[inline] + pub fn segment_duration_ms(&self) -> i64 { + table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64 + } + + // Format of data: (key string, timestamp, string_tag, double_field1, + // double_field2, string_field2) + fn new_row(data: RowTuple) -> Row { + row_util::new_row_6(data) + } + + pub fn 
rows_to_row_group(&self, data: &[RowTuple]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row) + .collect(); + + self.new_row_group(rows) + } + + pub fn rows_opt_to_row_group(&self, data: &[RowTupleOpt]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row_opt) + .collect(); + + self.new_row_group(rows) + } + + fn new_row_group(&self, rows: Vec) -> RowGroup { + RowGroupBuilder::with_rows(self.create_request.table_schema.clone(), rows) + .unwrap() + .build() + } + + fn new_row_opt(data: RowTupleOpt) -> Row { + row_util::new_row_6(data) + } + + pub fn new_read_all_request(&self, opts: ReadOptions, read_order: ReadOrder) -> ReadRequest { + new_read_all_request_with_order(self.create_request.table_schema.clone(), opts, read_order) + } + + pub fn new_get_request(&self, key: KeyTuple) -> GetRequest { + let primary_key = vec![key.0.into(), key.1.into()]; + + GetRequest { + request_id: RequestId::next_id(), + projected_schema: ProjectedSchema::no_projection( + self.create_request.table_schema.clone(), + ), + primary_key, + } + } + + pub fn new_get_request_from_row(&self, data: RowTuple) -> GetRequest { + self.new_get_request((data.0, data.1)) + } + + pub fn assert_batch_eq_to_rows(&self, record_batches: &[RecordBatch], rows: &[RowTuple]) { + let row_group = self.rows_to_row_group(rows); + assert_batch_eq_to_row_group(record_batches, &row_group); + } + + pub fn assert_row_eq(&self, data: RowTuple, row: Row) { + row_util::assert_row_eq_6(data, row); + } +} + +pub fn read_opts_list() -> Vec { + vec![ + ReadOptions::default(), + ReadOptions { + batch_size: 1, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 1, + read_parallelism: 4, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 4, + }, + ] +} + +pub fn new_read_all_request_with_order( + schema: Schema, + opts: ReadOptions, + order: ReadOrder, +) -> ReadRequest { + 
ReadRequest { + request_id: RequestId::next_id(), + opts, + projected_schema: ProjectedSchema::no_projection(schema), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + order, + } +} + +pub fn new_read_all_request(schema: Schema, opts: ReadOptions) -> ReadRequest { + new_read_all_request_with_order(schema, opts, ReadOrder::None) +} + +pub fn assert_batch_eq_to_row_group(record_batches: &[RecordBatch], row_group: &RowGroup) { + if record_batches.is_empty() { + assert!(row_group.is_empty()); + } + + for record_batch in record_batches { + assert_eq!( + record_batch.schema().columns(), + row_group.schema().columns() + ); + } + + let mut cursor = RecordBatchesCursor::new(record_batches); + + for row in row_group.iter() { + for (column_idx, datum) in row.iter().enumerate() { + assert_eq!( + &cursor.datum(column_idx), + datum, + "record_batches:{:?}, row_group:{:?}", + record_batches, + row_group + ); + } + cursor.step(); + } +} + +struct RecordBatchesCursor<'a> { + record_batches: &'a [RecordBatch], + batch_idx: usize, + row_idx_in_batch: usize, +} + +impl<'a> RecordBatchesCursor<'a> { + fn new(record_batches: &[RecordBatch]) -> RecordBatchesCursor { + RecordBatchesCursor { + record_batches, + batch_idx: 0, + row_idx_in_batch: 0, + } + } + + fn step(&mut self) { + if self.batch_idx >= self.record_batches.len() { + return; + } + + self.row_idx_in_batch += 1; + if self.row_idx_in_batch >= self.record_batches[self.batch_idx].num_rows() { + self.batch_idx += 1; + self.row_idx_in_batch = 0; + } + } + + fn datum(&self, column_idx: usize) -> Datum { + let record_batch = &self.record_batches[self.batch_idx]; + let column_in_batch = record_batch.column(column_idx); + column_in_batch.datum(self.row_idx_in_batch) + } +} + +#[must_use] +pub struct Builder { + create_request: CreateTableRequest, +} + +impl Builder { + pub fn table_name(mut self, table_name: String) -> Self { + self.create_request.table_name = table_name; + self + } + + pub fn table_id(mut self, 
table_id: TableId) -> Self { + self.create_request.table_id = table_id; + self + } + + pub fn enable_ttl(mut self, enable_ttl: bool) -> Self { + self.create_request.options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + enable_ttl.to_string(), + ); + self + } + + pub fn ttl(mut self, duration: ReadableDuration) -> Self { + self.create_request + .options + .insert(table_options::TTL.to_string(), duration.to_string()); + self + } + + pub fn build_fixed(self) -> FixedSchemaTable { + FixedSchemaTable { + create_request: self.create_request, + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { + create_request: CreateTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_id: new_table_id(2, 1), + table_name: "test_table".to_string(), + table_schema: FixedSchemaTable::default_schema(), + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }, + } + } +} + +// Format of input slice: &[ ( column name, column type ) ] +pub fn create_schema_builder( + key_tuples: &[(&str, DatumKind)], + normal_tuples: &[(&str, DatumKind)], +) -> schema::Builder { + assert!(!key_tuples.is_empty()); + + let mut schema_builder = schema::Builder::with_capacity(key_tuples.len() + normal_tuples.len()) + .auto_increment_column_id(true); + + for tuple in key_tuples { + // Key column is not nullable. 
+ let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(false) + .build() + .expect("Should succeed to build key column schema"); + schema_builder = schema_builder.add_key_column(column_schema).unwrap(); + } + + for tuple in normal_tuples { + let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(true) + .build() + .expect("Should succeed to build normal column schema"); + schema_builder = schema_builder.add_normal_column(column_schema).unwrap(); + } + + schema_builder +} diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs new file mode 100644 index 0000000000..31afc1b582 --- /dev/null +++ b/analytic_engine/src/tests/util.rs @@ -0,0 +1,404 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test utils. + +use std::{collections::HashMap, future::Future, sync::Arc}; + +use common_types::{ + datum::Datum, + record_batch::RecordBatch, + row::{Row, RowGroup}, + time::Timestamp, +}; +use common_util::{config::ReadableDuration, runtime}; +use futures::stream::StreamExt; +use log::info; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, EngineRuntimes, OpenTableRequest, + Result as EngineResult, TableEngine, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadOrder, ReadRequest, Result, TableId, + TableRef, WriteRequest, + }, +}; +use tempfile::TempDir; + +use crate::{ + setup, + tests::table::{self, FixedSchemaTable, RowTuple}, + AnalyticTableEngine, Config, EngineInstance, +}; + +const DAY_MS: i64 = 24 * 60 * 60 * 1000; + +/// Helper struct to create a null datum. 
+pub struct Null; + +impl From for Datum { + fn from(_data: Null) -> Datum { + Datum::Null + } +} + +pub async fn check_read_with_order( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], + read_order: ReadOrder, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + fixed_schema_table.new_read_all_request(read_opts, read_order), + ) + .await; + + fixed_schema_table.assert_batch_eq_to_rows(&record_batches, rows); + } +} + +pub async fn check_read( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + check_read_with_order( + test_ctx, + fixed_schema_table, + msg, + table_name, + rows, + ReadOrder::None, + ) + .await +} + +pub async fn check_get( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + for row_data in rows { + let request = fixed_schema_table.new_get_request_from_row(*row_data); + + info!("{}, request:{:?}, row_data:{:?}", msg, request, row_data); + + let row = test_ctx.get_from_table(table_name, request).await.unwrap(); + + fixed_schema_table.assert_row_eq(*row_data, row); + } +} + +pub struct TestContext { + pub config: Config, + runtimes: Arc, + pub engine: Option, + last_table_seq: u32, + + name_to_tables: HashMap, +} + +impl TestContext { + pub async fn open(&mut self) { + let engine = setup::open_analytic_table_engine(self.config.clone(), self.runtimes.clone()) + .await + .unwrap(); + + self.engine = Some(engine); + } + + pub async fn reopen(&mut self) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. 
+ let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + } + + pub async fn reopen_with_tables(&mut self, tables: &[&str]) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. + let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + + for name in tables { + self.open_table(name).await; + } + } + + async fn open_table(&mut self, table_name: &str) { + let table = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await + .unwrap() + .unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn try_open_table(&mut self, table_name: &str) -> EngineResult> { + let table_opt = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await?; + + let table = match table_opt { + Some(v) => v, + None => return Ok(None), + }; + + self.name_to_tables + .insert(table_name.to_string(), table.clone()); + + Ok(Some(table)) + } + + pub async fn drop_table(&mut self, table_name: &str) -> bool { + let request = DropTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }; + + let ret = self.engine().drop_table(request).await.unwrap(); + + self.name_to_tables.remove(table_name); + + ret + } + + /// 3 days ago. 
+ pub fn start_ms(&self) -> i64 { + Timestamp::now().as_i64() - 3 * DAY_MS + } + + pub async fn create_fixed_schema_table(&mut self, table_name: &str) -> FixedSchemaTable { + let fixed_schema_table = FixedSchemaTable::builder() + .table_name(table_name.to_string()) + .table_id(self.next_table_id()) + .ttl("7d".parse::().unwrap()) + .build_fixed(); + + self.create_table(fixed_schema_table.create_request().clone()) + .await; + + fixed_schema_table + } + + async fn create_table(&mut self, create_request: CreateTableRequest) { + let table_name = create_request.table_name.clone(); + let table = self.engine().create_table(create_request).await.unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn write_to_table(&self, table_name: &str, row_group: RowGroup) { + let table = self.table(table_name); + + table.write(WriteRequest { row_group }).await.unwrap(); + } + + pub async fn read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let mut stream = table.read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + + record_batches + } + + pub async fn partitioned_read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let streams = table.partitioned_read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + + for mut stream in streams.streams { + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + } + + record_batches + } + + pub async fn get_from_table(&self, table_name: &str, request: GetRequest) -> Option { + let table = self.table(table_name); + + table.get(request).await.unwrap() + } + + pub async fn flush_table(&self, table_name: &str) { + let table = self.table(table_name); + + 
table.flush(FlushRequest::default()).await.unwrap(); + } + + pub async fn flush_table_with_request(&self, table_name: &str, request: FlushRequest) { + let table = self.table(table_name); + + table.flush(request).await.unwrap(); + } + + pub async fn compact_table(&self, table_name: &str) { + let table = self.table(table_name); + + table.compact().await.unwrap(); + } + + pub async fn try_alter_schema( + &self, + table_name: &str, + request: AlterSchemaRequest, + ) -> Result { + let table = self.table(table_name); + + table.alter_schema(request).await + } + + pub async fn try_alter_options( + &self, + table_name: &str, + opts: HashMap, + ) -> Result { + let table = self.table(table_name); + + table.alter_options(opts).await + } + + pub fn table(&self, table_name: &str) -> TableRef { + self.name_to_tables.get(table_name).cloned().unwrap() + } + + #[inline] + pub fn engine(&self) -> AnalyticTableEngine { + self.engine.clone().unwrap() + } + + #[inline] + pub fn instance(&self) -> EngineInstance { + self.engine().instance() + } + + fn next_table_id(&mut self) -> TableId { + self.last_table_seq += 1; + table::new_table_id(2, self.last_table_seq) + } +} + +pub struct TestEnv { + _dir: TempDir, + pub config: Config, + pub runtimes: Arc, +} + +impl TestEnv { + pub fn builder() -> Builder { + Builder::default() + } + + pub fn new_context(&self) -> TestContext { + TestContext { + config: self.config.clone(), + runtimes: self.runtimes.clone(), + engine: None, + last_table_seq: 1, + name_to_tables: HashMap::new(), + } + } + + pub fn block_on(&self, future: F) -> F::Output { + self.runtimes.bg_runtime.block_on(future) + } +} + +pub struct Builder { + num_workers: usize, +} + +impl Builder { + pub fn build(self) -> TestEnv { + // Init log for test. 
+ common_util::tests::init_log_for_test(); + + let dir = tempfile::tempdir().unwrap(); + + let config = Config { + data_path: dir.path().to_str().unwrap().to_string(), + ..Default::default() + }; + + let runtime = Arc::new( + runtime::Builder::default() + .worker_threads(self.num_workers) + .enable_all() + .build() + .unwrap(), + ); + + TestEnv { + _dir: dir, + config, + runtimes: Arc::new(EngineRuntimes { + read_runtime: runtime.clone(), + write_runtime: runtime.clone(), + bg_runtime: runtime, + }), + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { num_workers: 2 } + } +} diff --git a/arrow_deps/Cargo.toml b/arrow_deps/Cargo.toml new file mode 100644 index 0000000000..e7cac70aa2 --- /dev/null +++ b/arrow_deps/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "arrow_deps" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow = "7.0.0" +parquet = "7.0.0" + +[dependencies.uncover] +git = "https://github.com/matklad/uncover.git" +rev = "1d0770d997e29731b287e9e11e4ffbbea5f456da" + +[dependencies.datafusion] +git = "https://github.com/apache/arrow-datafusion.git" +rev = "444c153863520072ea22d4f8c498dee39437516d" diff --git a/arrow_deps/src/display.rs b/arrow_deps/src/display.rs new file mode 100644 index 0000000000..be037d882e --- /dev/null +++ b/arrow_deps/src/display.rs @@ -0,0 +1,428 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Functions for printing array values, as strings, for debugging +//! purposes. See the `pretty` crate for additional functions for +//! record batch pretty printing. + +// Copy from arrow + +use std::sync::Arc; + +use arrow::{ + array::{self, Array, DictionaryArray}, + datatypes::{ + ArrowNativeType, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, + IntervalUnit, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + }, + error::{ArrowError, Result}, +}; + +macro_rules! make_string { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array.value($row).to_string() + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_year_month { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let interval = array.value($row) as f64; + let years = (interval / 12_f64).floor(); + let month = interval - (years * 12_f64); + + format!( + "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", + years, month, + ) + }; + + Ok(s) + }}; +} + +macro_rules! 
make_string_interval_day_time { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u64 = array.value($row) as u64; + + let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; + let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; + + let secs = milliseconds_part / 1000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years 0 mons {} days {} hours {} mins {}.{:02} secs", + days_parts, + hours, + mins, + secs, + (milliseconds_part % 1000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_month_day_nano { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u128 = array.value($row) as u128; + + let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_date { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_date($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! 
make_string_time { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_time($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_datetime { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_datetime($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +// It's not possible to do array.value($row).to_string() for &[u8], let's format +// it as hex +macro_rules! make_string_hex { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + let mut tmp = "".to_string(); + + for character in array.value($row) { + tmp += &format!("{:02x}", character); + } + + tmp + }; + + Ok(s) + }}; +} + +macro_rules! make_string_from_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? + .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +macro_rules! make_string_from_fixed_size_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? 
+ .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +#[inline(always)] +pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result { + let array = column + .as_any() + .downcast_ref::() + .unwrap(); + + let formatted_decimal = array.value_as_string(row); + Ok(formatted_decimal) +} + +fn append_struct_field_string( + target: &mut String, + name: &str, + field_col: &Arc, + row: usize, +) -> Result<()> { + target.push('"'); + target.push_str(name); + target.push_str("\": "); + + if field_col.is_null(row) { + target.push_str("null"); + } else { + match field_col.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + target.push('"'); + target.push_str(array_value_to_string(field_col, row)?.as_str()); + target.push('"'); + } + _ => { + target.push_str(array_value_to_string(field_col, row)?.as_str()); + } + } + } + + Ok(()) +} + +/// Get the value at the given row in an array as a String. +/// +/// Note this function is quite inefficient and is unlikely to be +/// suitable for converting large arrays or record batches. 
+pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { + if column.is_null(row) { + return Ok("".to_string()); + } + match column.data_type() { + DataType::Utf8 => make_string!(array::StringArray, column, row), + DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), + DataType::Binary => make_string_hex!(array::BinaryArray, column, row), + DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), + DataType::FixedSizeBinary(_) => { + make_string_hex!(array::FixedSizeBinaryArray, column, row) + } + DataType::Boolean => make_string!(array::BooleanArray, column, row), + DataType::Int8 => make_string!(array::Int8Array, column, row), + DataType::Int16 => make_string!(array::Int16Array, column, row), + DataType::Int32 => make_string!(array::Int32Array, column, row), + DataType::Int64 => make_string!(array::Int64Array, column, row), + DataType::UInt8 => make_string!(array::UInt8Array, column, row), + DataType::UInt16 => make_string!(array::UInt16Array, column, row), + DataType::UInt32 => make_string!(array::UInt32Array, column, row), + DataType::UInt64 => make_string!(array::UInt64Array, column, row), + DataType::Float16 => make_string!(array::Float16Array, column, row), + DataType::Float32 => make_string!(array::Float32Array, column, row), + DataType::Float64 => make_string!(array::Float64Array, column, row), + DataType::Decimal(..) 
=> make_string_from_decimal(column, row), + DataType::Timestamp(unit, _) if *unit == TimeUnit::Second => { + make_string_datetime!(array::TimestampSecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Millisecond => { + make_string_datetime!(array::TimestampMillisecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Microsecond => { + make_string_datetime!(array::TimestampMicrosecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Nanosecond => { + make_string_datetime!(array::TimestampNanosecondArray, column, row) + } + DataType::Date32 => make_string_date!(array::Date32Array, column, row), + DataType::Date64 => make_string_date!(array::Date64Array, column, row), + DataType::Time32(unit) if *unit == TimeUnit::Second => { + make_string_time!(array::Time32SecondArray, column, row) + } + DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { + make_string_time!(array::Time32MillisecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { + make_string_time!(array::Time64MicrosecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { + make_string_time!(array::Time64NanosecondArray, column, row) + } + DataType::Interval(unit) => match unit { + IntervalUnit::DayTime => { + make_string_interval_day_time!(column, row) + } + IntervalUnit::YearMonth => { + make_string_interval_year_month!(column, row) + } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } + }, + DataType::List(_) => make_string_from_list!(column, row), + DataType::Dictionary(index_type, _value_type) => match **index_type { + DataType::Int8 => dict_array_value_to_string::(column, row), + DataType::Int16 => dict_array_value_to_string::(column, row), + DataType::Int32 => dict_array_value_to_string::(column, row), + DataType::Int64 => dict_array_value_to_string::(column, row), + DataType::UInt8 => 
dict_array_value_to_string::(column, row), + DataType::UInt16 => dict_array_value_to_string::(column, row), + DataType::UInt32 => dict_array_value_to_string::(column, row), + DataType::UInt64 => dict_array_value_to_string::(column, row), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not supported for {:?} due to index type", + column.data_type() + ))), + }, + DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), + DataType::Struct(_) => { + let st = column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert struct column to struct array.".to_string(), + ) + })?; + + let mut s = String::new(); + s.push('{'); + let mut kv_iter = st.columns().into_iter().zip(st.column_names().into_iter()); + if let Some((col, name)) = kv_iter.next() { + append_struct_field_string(&mut s, name, col, row)?; + } + for (col, name) in kv_iter { + s.push_str(", "); + append_struct_field_string(&mut s, name, col, row)?; + } + s.push('}'); + + Ok(s) + } + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not implemented for {:?} type", + column.data_type() + ))), + } +} + +/// Converts the value of the dictionary array at `row` to a String +fn dict_array_value_to_string( + colum: &array::ArrayRef, + row: usize, +) -> Result { + let dict_array = colum.as_any().downcast_ref::>().unwrap(); + + let keys_array = dict_array.keys(); + + if keys_array.is_null(row) { + return Ok(String::from("")); + } + + let dict_index = keys_array.value(row).to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Can not convert value {:?} at index {:?} to usize for string conversion.", + keys_array.value(row), + row + )) + })?; + + array_value_to_string(dict_array.values(), dict_index) +} diff --git a/arrow_deps/src/lib.rs b/arrow_deps/src/lib.rs new file mode 100644 index 0000000000..b1ead9249e --- /dev/null +++ b/arrow_deps/src/lib.rs @@ -0,0 +1,14 @@ +// 
Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This crate exists to add a dependency on (likely as yet +//! unpublished) versions of arrow / datafusion so we can +//! manage the version used by ceresdbx in a single crate. + +pub mod display; +pub mod util; + +// export arrow and datafusion publically so we can have a single +// reference in cargo +pub use arrow; +pub use datafusion; +pub use parquet; diff --git a/arrow_deps/src/util.rs b/arrow_deps/src/util.rs new file mode 100644 index 0000000000..661fa919dd --- /dev/null +++ b/arrow_deps/src/util.rs @@ -0,0 +1,133 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for manipulating arrow/parquet/datafusion data structures. + +use std::convert::TryFrom; + +use arrow::{ + array::UInt32Array, + compute, + error::{ArrowError, Result}, + record_batch::RecordBatch, +}; + +/// Select the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn select_record_batch(batch: &RecordBatch, selected_rows: &[bool]) -> Result { + assert_eq!(batch.num_rows(), selected_rows.len()); + let selected_columns = { + // ensure the the selected_rows.len() is not greater than u32::MAX. + let _ = u32::try_from(selected_rows.len()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + + let selected_index_iter = selected_rows + .iter() + .enumerate() + .filter_map(|(idx, selected)| if *selected { Some(idx as u32) } else { None }); + // TODO(xikai): avoid this memory allocation. 
+ let indices = UInt32Array::from_iter_values(selected_index_iter); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), selected_columns) +} + +/// Reverse the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn reverse_record_batch(batch: &RecordBatch) -> Result { + let reversed_columns = { + let num_rows = u32::try_from(batch.num_rows()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + // TODO(xikai): avoid this memory allocation. + let indices = UInt32Array::from_iter_values((0..num_rows).into_iter().rev()); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), reversed_columns) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{ + array::Int32Array, + datatypes::{DataType, Field, Schema}, + }; + + use super::*; + + #[test] + fn test_reverse_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, reverse_ids) = { + let mut source = vec![1, 2, 3, 4, 5]; + let arr = Int32Array::from(source.clone()); + source.reverse(); + let reversed_arr = Int32Array::from(source); + (arr, reversed_arr) + }; + + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let expect_reversed_batch = + RecordBatch::try_new(schema, vec![Arc::new(reverse_ids)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + 
assert_eq!(expect_reversed_batch, reversed_batch); + } + + #[test] + fn test_reverse_empty_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let arr = Int32Array::from(Vec::::new()); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + assert_eq!(batch, reversed_batch); + } + + #[test] + fn test_select_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, expect_selected_ids, selected_rows) = { + let arr = Int32Array::from(vec![1, 2, 3, 4, 5]); + let selected_arr = Int32Array::from(vec![2, 3, 5]); + (arr, selected_arr, vec![false, true, true, false, true]) + }; + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let selected_batch = + select_record_batch(&batch, &selected_rows).expect("select record batch"); + let expect_selected_batch = + RecordBatch::try_new(schema, vec![Arc::new(expect_selected_ids)]) + .expect("build record batch"); + + assert_eq!(selected_batch, expect_selected_batch); + } +} diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 0000000000..e453bd8eb6 --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arena = { path = "../components/arena" } +arrow2 = { version = "0.7.0", features = [ "io_parquet" ] } +arrow_deps = { path = "../arrow_deps" } +analytic_engine = { path = "../analytic_engine" } +clap = "2.0" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +env_logger = "0.6" +futures = "0.3" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = 
"../components/parquet"} +serde = "1.0" +serde_derive = "1.0" +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench" +harness = false + +[[bin]] +name = "sst-tools" diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..65cc001e80 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +# Benchmarks + +## Test Data +todo + +## Config +A config template can be found in `config/bench.toml`. + +## Run benchmarks +In root directory of `ceresdbx` (not this directory `ceresdbx/benchmarks`), run the following command: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Print logs: +```bash +RUST_LOG=info ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Run specific bench: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks -- read_parquet +``` diff --git a/benchmarks/bench.toml b/benchmarks/bench.toml new file mode 100644 index 0000000000..e182151bdb --- /dev/null +++ b/benchmarks/bench.toml @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +[sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +# store_path = "/Users/yingwen.yyw/data/antmonitor_mid_table_4022" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +max_projections = 5 +read_batch_row_num = 500 +sst_meta_cache_cap = 1000 +sst_data_cache_cap = 10000 + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "120s" +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[wal_row_bench] +rows_num = 100_0000 +test_num = 3 \ No newline at end of file diff --git a/benchmarks/benches/bench.rs b/benchmarks/benches/bench.rs new file mode 100644 index 0000000000..26ee634424 --- /dev/null +++ b/benchmarks/benches/bench.rs @@ -0,0 +1,208 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Benchmarks + +use std::sync::Once; + +use benchmarks::{ + arrow2_bench::Arrow2Bench, + config::{self, BenchConfig}, + merge_memtable_bench::MergeMemTableBench, + merge_sst_bench::MergeSstBench, + parquet_bench::ParquetBench, + scan_memtable_bench::ScanMemTableBench, + sst_bench::SstBench, +}; +use criterion::*; + +static INIT_LOG: Once = Once::new(); + +pub fn init_bench() -> BenchConfig { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + + config::bench_config_from_env() +} + +fn bench_read_sst_iter(b: &mut Bencher<'_>, bench: &SstBench) { + b.iter(|| { + bench.run_bench(); + }) +} + +fn bench_read_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_sst"); + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = SstBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_sst", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_read_sst_iter, + ); + } + + group.finish(); +} + +fn bench_merge_sst_iter(b: &mut Bencher<'_>, bench: &MergeSstBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_sst"); + + group.measurement_time(config.merge_sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let sst_file_ids = format!("{:?}", config.merge_sst_bench.sst_file_ids); + let mut bench = MergeSstBench::new(config.merge_sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_sst_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/no-dedup", sst_file_ids, i)), + &bench, + 
bench_merge_sst_iter, + ); + } + + group.finish(); +} + +fn bench_parquet_iter(b: &mut Bencher<'_>, bench: &ParquetBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_parquet(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_parquet"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = ParquetBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_parquet", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_parquet_iter, + ); + } + + group.finish(); +} + +fn bench_scan_memtable_iter(b: &mut Bencher<'_>, bench: &ScanMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_scan_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("scan_memtable"); + + let mut bench = ScanMemTableBench::new(config.scan_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("scan_memtable", i), + &bench, + bench_scan_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_merge_memtable_iter(b: &mut Bencher<'_>, bench: &MergeMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_memtable"); + + let sst_file_ids = format!("{:?}", config.merge_memtable_bench.sst_file_ids); + let mut bench = MergeMemTableBench::new(config.merge_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_memtable_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/no-dedup", sst_file_ids, i)), + 
&bench, + bench_merge_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_arrow2_iter(b: &mut Bencher<'_>, bench: &Arrow2Bench) { + b.iter(|| bench.run_bench()) +} + +fn bench_arrow2(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_arrow2"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = Arrow2Bench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_arrow2", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_arrow2_iter, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_read_sst, + bench_merge_sst, + bench_parquet, + bench_scan_memtable, + bench_merge_memtable, + bench_arrow2, +); +criterion_main!(benches); diff --git a/benchmarks/config/bench.toml b/benchmarks/config/bench.toml new file mode 100644 index 0000000000..ba73090b77 --- /dev/null +++ b/benchmarks/config/bench.toml @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +[sst_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 +reverse = false + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[merge_memtable_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 37 ] +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" diff --git a/benchmarks/config/sst.toml b/benchmarks/config/sst.toml new file mode 100644 index 0000000000..5758df2459 --- /dev/null +++ b/benchmarks/config/sst.toml @@ -0,0 +1,33 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +runtime_thread_num = 4 + + [rebuild_sst] + store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks" + input_file_name = "898.sst" + # read_batch_row_num = 500 + # read_batch_row_num = 4096 + read_batch_row_num = 8192 +# read_batch_row_num = 16384 + output_file_name = "tt_t.sst" + num_rows_per_row_group = 8192 +compression = "SNAPPY" + + [rebuild_sst.predicate] + start_time_ms = 0 + end_time_ms = 0 + +#[merge_sst] +#store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks/2199023255564" +#space_id = 1 +#table_id = 1 +#sst_file_ids = [1, 17, 19, 24, 31, 37, 43, 45, 9, 14, 18, 21, 27, 34, 40, 44, 5] +#dedup = true +#read_batch_row_num = 16384 +#output_store_path = "/Users/yingwen.yyw/data/1/1" +#output_file_name = "16384-all.sst" +#num_rows_per_row_group = 16384 +# +#[merge_sst.predicate] +#start_time_ms = 0 +#end_time_ms = 0 diff --git a/benchmarks/src/arrow2_bench.rs b/benchmarks/src/arrow2_bench.rs new file mode 100644 index 0000000000..e51e96fe4d --- /dev/null +++ b/benchmarks/src/arrow2_bench.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Arrow 2 bench. 
+ +use std::{fs::File, io::BufReader, path::Path, sync::Arc, time::Instant}; + +use arrow2::io::parquet::read; +use common_util::runtime::Runtime; +use log::info; + +use crate::{config::SstBenchConfig, util}; + +pub struct Arrow2Bench { + store_path: String, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + runtime: Arc, +} + +impl Arrow2Bench { + pub fn new(config: SstBenchConfig) -> Self { + let runtime = util::new_runtime(config.runtime_thread_num); + + Arrow2Bench { + store_path: config.store_path, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let sst_path = Path::new(&self.store_path).join(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = BufReader::new(File::open(sst_path).unwrap()); + + let record_reader = if self.projection.is_empty() { + read::RecordReader::try_new(file, None, None, None, None).unwrap() + } else { + read::RecordReader::try_new(file, Some(self.projection.clone()), None, None, None).unwrap() + }; + let open_cost = open_instant.elapsed(); + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + iter_begin_instant.elapsed(), + ); + }); + } +} diff --git 
a/benchmarks/src/bin/sst-tools.rs b/benchmarks/src/bin/sst-tools.rs new file mode 100644 index 0000000000..ab1a6e91be --- /dev/null +++ b/benchmarks/src/bin/sst-tools.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use benchmarks::{ + sst_tools::{self, MergeSstConfig, RebuildSstConfig}, + util, +}; +use clap::{App, Arg}; +use common_util::toml; +use log::info; +use serde_derive::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(default)] +struct Config { + runtime_thread_num: usize, + rebuild_sst: Option, + merge_sst: Option, +} + +impl Default for Config { + fn default() -> Config { + Self { + runtime_thread_num: 1, + rebuild_sst: None, + merge_sst: None, + } + } +} + +fn config_from_path(path: &str) -> Config { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") +} + +fn main() { + env_logger::init(); + + let matches = App::new("SST Tools") + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(true) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let config_path = matches + .value_of("config") + .expect("Config file is required."); + let config = config_from_path(config_path); + + info!("sst tools start, config:{:?}", config); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let rt = runtime.clone(); + runtime.block_on(async { + if let Some(rebuild_sst) = config.rebuild_sst { + sst_tools::rebuild_sst(rebuild_sst, rt.clone()).await; + } + + if let Some(merge_sst) = config.merge_sst { + sst_tools::merge_sst(merge_sst, rt).await; + } + }); +} diff --git a/benchmarks/src/config.rs b/benchmarks/src/config.rs new file mode 100644 index 0000000000..a66cfa1163 --- /dev/null +++ b/benchmarks/src/config.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Benchmark configs. 
+ +use std::env; + +use analytic_engine::{space::SpaceId, sst::manager::FileId}; +use common_types::time::{TimeRange, Timestamp}; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + toml, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; + +const BENCH_CONFIG_PATH_KEY: &str = "ANALYTIC_BENCH_CONFIG_PATH"; + +#[derive(Deserialize)] +pub struct BenchConfig { + pub sst_bench: SstBenchConfig, + pub merge_sst_bench: MergeSstBenchConfig, + pub scan_memtable_bench: ScanMemTableBenchConfig, + pub merge_memtable_bench: MergeMemTableBenchConfig, +} + +// TODO(yingwen): Maybe we can use layze static to load config first. +pub fn bench_config_from_env() -> BenchConfig { + let path = match env::var(BENCH_CONFIG_PATH_KEY) { + Ok(v) => v, + Err(e) => panic!( + "Env {} is required to run benches, err:{}.", + BENCH_CONFIG_PATH_KEY, e + ), + }; + + let mut toml_buf = String::new(); + toml::parse_toml_from_path(&path, &mut toml_buf).expect("Failed to parse config.") +} + +#[derive(Deserialize)] +pub struct SstBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. + pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, + pub sst_meta_cache_cap: Option, + pub sst_data_cache_cap: Option, + pub reverse: bool, +} + +#[derive(Deserialize)] +pub struct MergeSstBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. 
+ pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, +} + +#[derive(Deserialize)] +pub struct ScanMemTableBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} + +#[derive(Debug, Deserialize)] +pub struct BenchPredicate { + /// Inclusive start time in millis. + start_time_ms: i64, + /// Exclusive end time in millis. + /// + /// Set to current time millis if start_time_ms == end_time_ms. + end_time_ms: i64, +} + +impl BenchPredicate { + pub fn into_predicate(self) -> Predicate { + let start = Timestamp::new(self.start_time_ms); + let end = if self.start_time_ms == self.end_time_ms { + Timestamp::now() + } else { + Timestamp::new(self.end_time_ms) + }; + let time_range = TimeRange::new(start, end).unwrap(); + + Predicate::new(time_range) + } +} + +#[derive(Deserialize)] +pub struct MergeMemTableBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 0000000000..526d028021 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities for benchmarks. 
+ +use common_types::SequenceNumber; + +pub mod arrow2_bench; +pub mod config; +pub mod merge_memtable_bench; +pub mod merge_sst_bench; +pub mod parquet_bench; +pub mod scan_memtable_bench; +pub mod sst_bench; +pub mod sst_tools; +pub mod util; + +pub(crate) const INIT_SEQUENCE: SequenceNumber = 1; diff --git a/benchmarks/src/merge_memtable_bench.rs b/benchmarks/src/merge_memtable_bench.rs new file mode 100644 index 0000000000..7596576aa6 --- /dev/null +++ b/benchmarks/src/merge_memtable_bench.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge memtable bench. + +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }, + row_iter::{ + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::factory::{FactoryImpl, SstReaderOptions, SstType}, + table::{ + sst_util, + version::{MemTableState, MemTableVec}, + }, +}; +use arena::NoopCollector; +use common_types::{ + projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema, time::TimeRange, +}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +use crate::{config::MergeMemTableBenchConfig, util}; + +pub struct MergeMemTableBench { + store: File, + memtables: MemTableVec, + max_projections: usize, + schema: Schema, + projected_schema: ProjectedSchema, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + dedup: bool, + sst_reader_options: SstReaderOptions, +} + +impl MergeMemTableBench { + pub fn new(config: MergeMemTableBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = 
Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let meta_cache: Option = None; + let data_cache: Option = None; + + // Use first sst's schema. + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let mut memtables = Vec::with_capacity(config.sst_file_ids.len()); + for id in &config.sst_file_ids { + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *id, &mut sst_path); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nMergeMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + memtables.push(MemTableState { + mem: memtable, + time_range: TimeRange::min_to_max(), + id: *id, + }); + } + let sst_reader_options = mock_sst_reader_options(projected_schema.clone(), runtime.clone()); + + MergeMemTableBench { + store, + memtables, + max_projections, + schema, + projected_schema, + runtime, + space_id, + table_id, + dedup: true, + sst_reader_options, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.projected_schema = projected_schema; + self.dedup = dedup; + } + + // TODO(xikai): add benchmark for merge in reverse order. + pub fn run_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder.mut_memtables().extend_from_slice(&self.memtables); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut merge_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + if self.dedup { + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } else { + while let Some(batch) = merge_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } + + info!( + "\nMergeMemTableBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} + +fn mock_sst_reader_options( + projected_schema: ProjectedSchema, + runtime: Arc, +) -> SstReaderOptions { + SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + 
projected_schema, + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + } +} diff --git a/benchmarks/src/merge_sst_bench.rs b/benchmarks/src/merge_sst_bench.rs new file mode 100644 index 0000000000..a0ccab50d5 --- /dev/null +++ b/benchmarks/src/merge_sst_bench.rs @@ -0,0 +1,225 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge SST bench. + +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + row_iter::{ + chain, + chain::ChainConfig, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FilePurgeQueue, Request}, + }, + table::sst_util, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc::{self, UnboundedReceiver}; + +use crate::{config::MergeSstBenchConfig, util}; + +pub struct MergeSstBench { + store: File, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + file_handles: Vec, + _receiver: UnboundedReceiver, + dedup: bool, +} + +impl MergeSstBench { + pub fn new(config: MergeSstBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = 
runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(predicate), + meta_cache: meta_cache.clone(), + data_cache: data_cache.clone(), + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let (tx, rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = runtime.block_on(util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &meta_cache, + &data_cache, + )); + + MergeSstBench { + store, + max_projections, + schema, + sst_reader_options, + runtime, + space_id, + table_id, + file_handles, + _receiver: rx, + dedup: true, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + self.dedup = dedup; + } + + fn run_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder + .mut_ssts_of_level(0) + .extend_from_slice(&self.file_handles); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let merge_iter = builder.build().await.unwrap(); + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + fn run_no_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let builder = chain::Builder::new(ChainConfig { + request_id, + space_id, + table_id, + projected_schema, + predicate: Arc::new(Predicate::empty()), + 
sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + }) + .ssts(vec![self.file_handles.clone()]); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut chain_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = chain_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + pub fn run_bench(&self) { + if self.dedup { + self.run_dedup_bench(); + } else { + self.run_no_dedup_bench(); + } + } +} + +impl Drop for MergeSstBench { + fn drop(&mut self) { + self.file_handles.clear(); + } +} diff --git a/benchmarks/src/parquet_bench.rs b/benchmarks/src/parquet_bench.rs new file mode 100644 index 0000000000..b52c84f7e1 --- /dev/null +++ b/benchmarks/src/parquet_bench.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Parquet bench. 
+ +use std::{sync::Arc, time::Instant}; + +use arrow_deps::parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{ + metadata::RowGroupMetaData, reader::FileReader, serialized_reader::SerializedFileReader, + }, +}; +use common_types::schema::Schema; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{config::SstBenchConfig, util}; + +type RowGroupPredicate = Box bool + 'static>; + +pub struct ParquetBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + schema: Schema, + predicate: PredicateRef, + batch_size: usize, + runtime: Arc, +} + +impl ParquetBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = util::new_runtime(config.runtime_thread_num); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = Arc::new(config.predicate.into_predicate()); + + ParquetBench { + store, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + schema, + predicate, + batch_size: config.read_batch_row_num, + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = self.store.get(&sst_path).await.unwrap(); + let mut file_reader = SerializedFileReader::new(file).unwrap(); + let open_cost = open_instant.elapsed(); + + let filter_begin_instant = Instant::now(); + let row_group_predicate = self.build_row_group_predicate(&file_reader); + let mut arrow_reader = { + file_reader.filter_row_groups(&row_group_predicate); + ParquetFileArrowReader::new(Arc::new(file_reader)) + }; + let filter_cost = filter_begin_instant.elapsed(); + + let record_reader = if self.projection.is_empty() { + arrow_reader.get_record_reader(self.batch_size).unwrap() + } else { + arrow_reader + .get_record_reader_by_columns(self.projection.clone(), self.batch_size) + .unwrap() + }; + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, filter cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + filter_cost, + iter_begin_instant.elapsed(), + ); + }); + } + + fn build_row_group_predicate( + &self, + file_reader: &SerializedFileReader, + ) -> RowGroupPredicate { + let row_groups = file_reader.metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } +} diff --git a/benchmarks/src/scan_memtable_bench.rs b/benchmarks/src/scan_memtable_bench.rs new 
file mode 100644 index 0000000000..424e1886e8 --- /dev/null +++ b/benchmarks/src/scan_memtable_bench.rs @@ -0,0 +1,111 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scan memtable bench. + +use std::{collections::Bound, sync::Arc}; + +use analytic_engine::memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + MemTableRef, ScanContext, ScanRequest, +}; +use arena::NoopCollector; +use common_types::projected_schema::ProjectedSchema; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; + +use crate::{config::ScanMemTableBenchConfig, util}; + +pub struct ScanMemTableBench { + memtable: MemTableRef, + projected_schema: ProjectedSchema, + max_projections: usize, +} + +impl ScanMemTableBench { + pub fn new(config: ScanMemTableBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let meta_cache: Option = None; + let data_cache: Option = None; + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nScanMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + Self { + memtable, + projected_schema, + 
max_projections: config.max_projections, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(self.memtable.schema(), i, self.max_projections); + + self.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: self.projected_schema.clone(), + need_dedup: true, + reverse: false, + }; + + let iter = self.memtable.scan(scan_ctx, scan_req).unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + for batch in iter { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nScanMemTableBench total rows of memtable: {}, total batch num: {}", + total_rows, batch_num, + ); + } +} diff --git a/benchmarks/src/sst_bench.rs b/benchmarks/src/sst_bench.rs new file mode 100644 index 0000000000..882e40b1fa --- /dev/null +++ b/benchmarks/src/sst_bench.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST bench. 
+ +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::sst::factory::{Factory, FactoryImpl, SstReaderOptions, SstType}; +use common_types::{projected_schema::ProjectedSchema, schema::Schema}; +use common_util::runtime::Runtime; +use futures::stream::StreamExt; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; + +use crate::{config::SstBenchConfig, util}; + +pub struct SstBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, +} + +impl SstBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(sst_data_cache_cap))) + } else { + None + }; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: config.reverse, + projected_schema, + predicate: Arc::new(predicate), + meta_cache, + data_cache, + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + SstBench { + store, + sst_file_name: config.sst_file_name, + max_projections, + schema, + sst_reader_options, + runtime, 
+ } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&self.sst_reader_options, &sst_path, &self.store) + .unwrap(); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + let mut sst_stream = sst_reader.read().await.unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + while let Some(batch) = sst_stream.next().await { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs new file mode 100644 index 0000000000..666722d91b --- /dev/null +++ b/benchmarks/src/sst_tools.rs @@ -0,0 +1,257 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tools to generate SST. 
+ +use std::sync::Arc; + +use analytic_engine::{ + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceId, + sst::{ + builder::RecordBatchStream, + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FilePurgeQueue, SstMetaData}, + manager::FileId, + }, + table::sst_util, + table_options::Compression, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId}; +use common_util::runtime::Runtime; +use futures::TryStreamExt; +use log::info; +use object_store::{ + disk::File, + path::{file::FilePath, ObjectStorePath}, + ObjectStore, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc; + +use crate::{config::BenchPredicate, util}; + +#[derive(Debug)] +struct SstConfig { + sst_meta: SstMetaData, + store_path: String, + sst_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +async fn create_sst_from_stream(config: SstConfig, record_batch_stream: RecordBatchStream) { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + info!( + "create sst from stream, config:{:?}, sst_builder_options:{:?}", + config, sst_builder_options + ); + + let store = File::new(config.store_path); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name(&config.sst_file_name); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + builder + .build(RequestId::next_id(), &config.sst_meta, record_batch_stream) + .await + .unwrap(); +} + +#[derive(Debug, Deserialize)] +pub struct RebuildSstConfig { + store_path: String, + input_file_name: String, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + 
output_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +pub async fn rebuild_sst(config: RebuildSstConfig, runtime: Arc) { + info!("Start rebuild sst, config:{:?}", config); + + let store = File::new(config.store_path.clone()); + + let mut input_path = store.new_path(); + input_path.set_file_name(&config.input_file_name); + + let sst_meta = util::meta_from_sst(&store, &input_path, &None, &None).await; + + let projected_schema = ProjectedSchema::no_projection(sst_meta.schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime, + }; + + let record_batch_stream = + sst_to_record_batch_stream(&sst_reader_options, &input_path, &store).await; + + let output_sst_config = SstConfig { + sst_meta, + store_path: config.store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Start rebuild sst done"); +} + +async fn sst_to_record_batch_stream( + sst_reader_options: &SstReaderOptions, + input_path: &FilePath, + store: &File, +) -> RecordBatchStream { + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, input_path, store) + .unwrap(); + + let sst_stream = sst_reader.read().await.unwrap(); + + Box::new(sst_stream.map_err(|e| Box::new(e) as _)) +} + +#[derive(Debug, Deserialize)] +pub struct MergeSstConfig { + store_path: String, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: Vec, + dedup: bool, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + output_store_path: String, + output_file_name: String, + num_rows_per_row_group: usize, + compression: 
Compression, +} + +pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { + if config.sst_file_ids.is_empty() { + info!("No input files to merge"); + return; + } + + info!("Merge sst begin, config:{:?}", config); + + let space_id = config.space_id; + let table_id = config.table_id; + let store = File::new(config.store_path.clone()); + let (tx, _rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &None, + &None, + ) + .await; + let max_sequence = file_handles + .iter() + .map(|file| file.max_sequence()) + .max() + .unwrap(); + + let mut first_sst_path = store.new_path(); + sst_util::set_sst_file_path( + space_id, + table_id, + config.sst_file_ids[0], + &mut first_sst_path, + ); + let schema = util::schema_from_sst(&store, &first_sst_path, &None, &None).await; + let iter_options = IterOptions { + batch_size: config.read_batch_row_num, + }; + + let request_id = RequestId::next_id(); + let iter = { + let space_id = config.space_id; + let table_id = config.table_id; + let sequence = max_sequence + 1; + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let sst_factory = FactoryImpl; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options, + store: &store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + builder + .mut_ssts_of_level(0) + .extend_from_slice(&file_handles); 
+ + builder.build().await.unwrap() + }; + + let record_batch_stream = if config.dedup { + let iter = DedupIterator::new(request_id, iter, iter_options); + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + } else { + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + }; + + let sst_meta = file::merge_sst_meta(&file_handles, schema); + let output_sst_config = SstConfig { + sst_meta, + store_path: config.output_store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Merge sst done"); +} diff --git a/benchmarks/src/util.rs b/benchmarks/src/util.rs new file mode 100644 index 0000000000..639c3da19b --- /dev/null +++ b/benchmarks/src/util.rs @@ -0,0 +1,146 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities. + +use std::sync::Arc; + +use analytic_engine::{ + memtable::{key::KeySequence, MemTableRef, PutContext}, + space::SpaceId, + sst::{ + factory::{Factory, FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FileMeta, FilePurgeQueue, SstMetaData}, + manager::FileId, + parquet::reader, + }, + table::sst_util, +}; +use common_types::{ + projected_schema::ProjectedSchema, + schema::{IndexInWriterSchema, Schema}, + time::TimeRange, +}; +use common_util::runtime::{self, Runtime}; +use futures::stream::StreamExt; +use object_store::{disk::File, path::file::FilePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +pub fn new_runtime(thread_num: usize) -> Runtime { + runtime::Builder::default() + .thread_name("engine_bench") + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap() +} + +pub async fn meta_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> SstMetaData { + let (_, sst_meta) = 
reader::read_sst_meta(store, sst_path, meta_cache, data_cache) + .await + .unwrap(); + + sst_meta +} + +pub async fn schema_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> Schema { + let sst_meta = meta_from_sst(store, sst_path, meta_cache, data_cache).await; + + sst_meta.schema +} + +pub fn projected_schema_by_number( + schema: &Schema, + num_columns: usize, + max_projections: usize, +) -> ProjectedSchema { + if num_columns < max_projections { + let projection = (0..num_columns + 1).into_iter().collect(); + + ProjectedSchema::new(schema.clone(), Some(projection)).unwrap() + } else { + ProjectedSchema::no_projection(schema.clone()) + } +} + +pub async fn load_sst_to_memtable( + store: &File, + sst_path: &FilePath, + schema: &Schema, + memtable: &MemTableRef, + runtime: Arc, +) { + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + projected_schema: ProjectedSchema::no_projection(schema.clone()), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + }; + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&sst_reader_options, sst_path, store) + .unwrap(); + + let mut sst_stream = sst_reader.read().await.unwrap(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let mut ctx = PutContext::new(index_in_writer); + + let mut sequence = crate::INIT_SEQUENCE; + + while let Some(batch) = sst_stream.next().await { + let batch = batch.unwrap(); + + for i in 0..batch.num_rows() { + let row = batch.clone_row_at(i); + + let key_seq = KeySequence::new(sequence, i as u32); + + memtable.put(&mut ctx, key_seq, &row, schema).unwrap(); + + sequence += 1; + } + } +} + +pub async fn file_handles_from_ssts( + store: &File, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: &[FileId], + purge_queue: FilePurgeQueue, + meta_cache: 
&Option, + data_cache: &Option, +) -> Vec { + let mut file_handles = Vec::with_capacity(sst_file_ids.len()); + + for file_id in sst_file_ids.iter() { + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *file_id, &mut path); + + let sst_meta = meta_from_sst(store, &path, meta_cache, data_cache).await; + let file_meta = FileMeta { + id: *file_id, + meta: sst_meta, + }; + + let handle = FileHandle::new(file_meta, purge_queue.clone()); + + file_handles.push(handle); + } + + file_handles +} diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..ce2a0fb668 --- /dev/null +++ b/build.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Build script + +use std::env; + +use vergen::{vergen, Config, ShaKind}; + +fn main() { + // Generate the default 'cargo:' instruction output + let mut config = Config::default(); + // Change the SHA output to the short variant + *config.git_mut().sha_kind_mut() = ShaKind::Short; + // Override git branch by env if provided. 
+ if let Some(branch) = env::var_os("GITBRANCH") { + let branch = branch + .into_string() + .expect("Convert git branch env to string"); + if !branch.is_empty() { + *config.git_mut().branch_mut() = false; + println!("cargo:rustc-env=VERGEN_GIT_BRANCH={}", branch); + } + } + + vergen(config).expect("Vergen failed to generate config"); +} diff --git a/catalog/Cargo.toml b/catalog/Cargo.toml new file mode 100644 index 0000000000..14e3eb5c67 --- /dev/null +++ b/catalog/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +snafu = { version ="0.6.10", features = ["backtraces"]} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +table_engine = { path = "../table_engine" } diff --git a/catalog/src/consts.rs b/catalog/src/consts.rs new file mode 100644 index 0000000000..ebac82873c --- /dev/null +++ b/catalog/src/consts.rs @@ -0,0 +1,12 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog constants + +/// Default catalog name +pub const DEFAULT_CATALOG: &str = "ceresdb"; +/// Default schema name +pub const DEFAULT_SCHEMA: &str = "public"; +/// Catalog name of the sys catalog +pub const SYSTEM_CATALOG: &str = "system"; +/// Schema name of the sys catalog +pub const SYSTEM_CATALOG_SCHEMA: &str = "public"; diff --git a/catalog/src/lib.rs b/catalog/src/lib.rs new file mode 100644 index 0000000000..90799b9205 --- /dev/null +++ b/catalog/src/lib.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Common traits and types about catalog (schema) + +#[macro_use] +extern crate common_util; + +pub mod consts; +pub mod manager; +pub mod schema; + +use std::sync::Arc; + +use async_trait::async_trait; +use snafu::{Backtrace, Snafu}; + +use crate::schema::{NameRef, SchemaRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: Box, + }, + + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// Catalog manage schemas +// TODO(yingwen): Maybe use async trait? +// TODO(yingwen): Provide a context +// TODO(yingwen): Catalog id? +#[async_trait] +pub trait Catalog { + /// Get the catalog name + fn name(&self) -> NameRef; + + /// Find schema by name + fn schema_by_name(&self, name: NameRef) -> Result>; + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> Result<()>; + + /// All schemas + fn all_schemas(&self) -> Result>; +} + +/// A reference counted catalog pointer +pub type CatalogRef = Arc; diff --git a/catalog/src/manager.rs b/catalog/src/manager.rs new file mode 100644 index 0000000000..fb10637750 --- /dev/null +++ b/catalog/src/manager.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog manager + +use snafu::Snafu; + +use crate::{schema::NameRef, CatalogRef}; + +#[derive(Debug, Snafu)] +pub struct Error; + +define_result!(Error); + +/// Catalog manager abstraction +/// +/// Tracks meta data of databases/tables +// TODO(yingwen): Maybe use async trait? 
+// TODO(yingwen): Provide a context + +pub trait Manager: Clone + Send + Sync { + /// Get the default catalog name + fn default_catalog_name(&self) -> NameRef; + + /// Get the default schema name + fn default_schema_name(&self) -> NameRef; + + /// Find the catalog by name + fn catalog_by_name(&self, name: NameRef) -> Result>; + + /// All catalogs + fn all_catalogs(&self) -> Result>; +} diff --git a/catalog/src/schema.rs b/catalog/src/schema.rs new file mode 100644 index 0000000000..49c2f6c462 --- /dev/null +++ b/catalog/src/schema.rs @@ -0,0 +1,169 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Schema contains one or more tables + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::column_schema::ColumnSchema; +use snafu::{Backtrace, Snafu}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableEngineRef}, + table::{TableId, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, err:{}", source))] + CreateTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Failed to create table, table already exists, table:{}.\nBacktrace:\n{}", + table, + backtrace + ))] + CreateExistTable { table: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to create table, cannot persist meta, table:{}, err:{}", + table, + source + ))] + WriteTableMeta { + table: String, + source: Box, + }, + + #[snafu(display( + "Catalog mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + CatalogMismatch { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Schema mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + SchemaMismatch { + expect: String, + given: String, + backtrace: 
Backtrace, + }, + + #[snafu(display( + "Invalid table id, msg:{}, table_id:{}.\nBacktrace:\n{}", + msg, + table_id, + backtrace + ))] + InvalidTableId { + msg: &'static str, + table_id: TableId, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find table, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to alter table, err:{}", source))] + AlterTable { + source: Box, + }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DropTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Too many table, cannot create table, schema:{}, table:{}.\nBacktrace:\n{}", + schema, + table, + backtrace + ))] + TooManyTable { + schema: String, + table: String, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create table options. +#[derive(Clone)] +pub struct CreateOptions { + /// Table engine + // FIXME(yingwen): We have engine type in create request, remove this + pub table_engine: TableEngineRef, + /// Create if not exists, if table already exists, wont return error + // TODO(yingwen): Maybe remove this? + pub create_if_not_exists: bool, +} + +/// Drop table options. +#[derive(Clone)] +pub struct DropOptions { + /// Table engine + pub table_engine: TableEngineRef, +} + +/// Alter table operations. +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add column operation, the column id in [ColumnSchema] will be ignored. + /// Primary key column is not allowed to be added, so all columns will + /// be added as normal columns. + AddColumn(ColumnSchema), +} + +/// Alter table request. +#[derive(Debug)] +pub struct AlterTableRequest { + pub table_name: String, + pub operations: Vec, +} + +/// Schema manage tables. +#[async_trait] +pub trait Schema { + /// Get schema name. + fn name(&self) -> NameRef; + + /// Find table by name. + fn table_by_name(&self, name: NameRef) -> Result>; + + /// Allocate a table id for given table. 
+ fn alloc_table_id(&self, name: NameRef) -> Result; + + /// Create table according to `request`. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> Result; + + /// Drop table according to `request`. + /// + /// Returns true if the table is really dropped. + async fn drop_table(&self, request: DropTableRequest, opts: DropOptions) -> Result; + + /// All tables + fn all_tables(&self) -> Result>; +} + +/// A name reference +pub type NameRef<'a> = &'a str; +/// A reference counted schema pointer +// TODO(yingwen): This name is conflict with [table_engine::schema::SchemaRef]. +pub type SchemaRef = Arc; diff --git a/catalog_impls/Cargo.toml b/catalog_impls/Cargo.toml new file mode 100644 index 0000000000..ddcbdcdeec --- /dev/null +++ b/catalog_impls/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_impls" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +server = { path = "../server" } diff --git a/catalog_impls/src/lib.rs b/catalog_impls/src/lib.rs new file mode 100644 index 0000000000..6f4ca69947 --- /dev/null +++ b/catalog_impls/src/lib.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use catalog::{consts::SYSTEM_CATALOG, manager::Manager, schema::NameRef, CatalogRef}; +use system_catalog::{tables::Tables, SystemTableAdapter}; + +use crate::system_tables::{SystemTables, SystemTablesBuilder}; + +pub mod memory; +mod system_tables; +pub mod table_based; + +/// CatalogManagerImpl is a wrapper for system and user tables +#[derive(Clone)] +pub struct CatalogManagerImpl { + system_tables: SystemTables, + user_catalog_manager: M, +} + +impl CatalogManagerImpl { + pub fn new(manager: M) -> Self { + let mut system_tables_builder = SystemTablesBuilder::new(); + system_tables_builder = system_tables_builder + .insert_table(SystemTableAdapter::new(Tables::new(manager.clone()))); + Self { + system_tables: system_tables_builder.build(), + user_catalog_manager: manager, + } + } +} + +impl Manager for CatalogManagerImpl { + fn default_catalog_name(&self) -> NameRef { + self.user_catalog_manager.default_catalog_name() + } + + fn default_schema_name(&self) -> NameRef { + self.user_catalog_manager.default_schema_name() + } + + fn catalog_by_name(&self, name: NameRef) -> catalog::manager::Result> { + match name { + SYSTEM_CATALOG => Ok(Some(Arc::new(self.system_tables.clone()))), + _ => self.user_catalog_manager.catalog_by_name(name), + } + } + + fn all_catalogs(&self) -> catalog::manager::Result> { + self.user_catalog_manager.all_catalogs() + } +} diff --git a/catalog_impls/src/memory.rs b/catalog_impls/src/memory.rs new file mode 100644 index 0000000000..e8ab37bb26 --- /dev/null +++ b/catalog_impls/src/memory.rs @@ -0,0 +1,260 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A memory catalog implementation +//! +//! 
Mainly for test + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateOptions, CreateTable, DropOptions, NameRef, Schema, + SchemaMismatch, SchemaRef, TooManyTable, UnSupported, + }, + Catalog, CatalogRef, +}; +use log::info; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{SchemaId, SchemaIdGenerator, TableId, TableRef, TableSeqGenerator}, +}; + +struct ManagerImplInner { + catalogs: HashMap, +} + +/// In-memory catalog manager +#[derive(Clone)] +pub struct ManagerImpl { + inner: Arc, +} + +impl Default for ManagerImpl { + fn default() -> Self { + let schema_id_generator = SchemaIdGenerator::default(); + let schema_id = schema_id_generator.alloc_schema_id().unwrap(); + + // Register default schema + let default_schema: SchemaRef = Arc::new(SchemaImpl::new( + consts::DEFAULT_CATALOG.to_string(), + consts::DEFAULT_SCHEMA.to_string(), + schema_id, + )); + let mut schemas = HashMap::new(); + schemas.insert(consts::DEFAULT_SCHEMA.to_string(), default_schema); + + // Use above schemas to create a default catalog + let default_catalog: CatalogRef = Arc::new(CatalogImpl { + name: consts::DEFAULT_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator: Arc::new(schema_id_generator), + }); + // Register default catalog + let mut catalogs = HashMap::new(); + catalogs.insert(consts::DEFAULT_CATALOG.to_string(), default_catalog); + + Self { + inner: Arc::new(ManagerImplInner { catalogs }), + } + } +} + +impl Manager for ManagerImpl { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned(); + Ok(catalog) + } + + fn 
all_catalogs(&self) -> manager::Result> { + Ok(self.inner.catalogs.iter().map(|(_, v)| v.clone()).collect()) + } +} + +/// In-memory catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + schemas: RwLock>, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schema = self.schemas.read().unwrap().get(name).cloned(); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + let mut schemas = self.schemas.write().unwrap(); + + if schemas.get(name).is_some() { + return Ok(()); + } + + let schema_id = self.schema_id_generator.alloc_schema_id().unwrap(); + + let schema: SchemaRef = Arc::new(SchemaImpl::new( + self.name.to_string(), + name.to_string(), + schema_id, + )); + + schemas.insert(name.to_string(), schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +/// In-memory schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Tables of schema + tables: RwLock>, + schema_id: SchemaId, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new(catalog_name: String, schema_name: String, schema_id: SchemaId) -> Self { + Self { + catalog_name, + schema_name, + tables: RwLock::new(HashMap::new()), + schema_id, + table_seq_generator: TableSeqGenerator::default(), + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self.tables.read().unwrap().get(name).cloned(); + 
Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // In memory schema does not support persisting table info + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + + { + // Check table existence + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.get(&request.table_name) { + return Ok(table.clone()); + } + } + + // Table engine handles duplicate table creation + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request) + .await + .context(CreateTable)?; + + { + // Now the table engine have create the table, but we may not be the + // creator thread + let mut tables = self.tables.write().unwrap(); + tables.entry(table_name).or_insert_with(|| table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + request: DropTableRequest, + _opts: DropOptions, + ) -> schema::Result { + UnSupported { + msg: format!( + "Dropping table is not supported by memory catalog, request:{:?}", + request + ), + } + .fail() + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} diff --git a/catalog_impls/src/system_tables.rs b/catalog_impls/src/system_tables.rs new file mode 100644 index 0000000000..672f3fa8f6 --- /dev/null +++ b/catalog_impls/src/system_tables.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +//! Contains System tables, such as system.public.tables + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use catalog::{ + consts::{SYSTEM_CATALOG, SYSTEM_CATALOG_SCHEMA}, + schema::{CreateOptions, DropOptions, NameRef, Schema, SchemaRef}, + Catalog, +}; +use system_catalog::SystemTableAdapter; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{Table, TableId, TableRef}, +}; + +const UNSUPPORTED_MSG: &str = "system tables not supported"; + +pub struct SystemTablesBuilder { + tables: HashMap>, +} + +impl SystemTablesBuilder { + pub fn new() -> Self { + Self { + tables: HashMap::new(), + } + } + + pub fn insert_table(mut self, table: SystemTableAdapter) -> Self { + self.tables + .insert(table.name().to_string(), Arc::new(table)); + self + } + + pub fn build(self) -> SystemTables { + SystemTables::new(self.tables) + } +} + +#[derive(Clone)] +pub struct SystemTables { + tables: Arc>>, +} + +impl SystemTables { + pub fn new(tables: HashMap>) -> Self { + Self { + tables: Arc::new(tables), + } + } +} + +#[async_trait] +impl Schema for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG_SCHEMA + } + + fn table_by_name(&self, name: NameRef) -> catalog::schema::Result> { + Ok(self.tables.get(name).map(|v| v.clone() as TableRef)) + } + + fn alloc_table_id(&self, _name: NameRef) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn create_table( + &self, + _request: CreateTableRequest, + _opts: CreateOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn drop_table( + &self, + _request: DropTableRequest, + _opts: DropOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_tables(&self) -> catalog::schema::Result> { + Ok(self + .tables + .iter() + .map(|(_, v)| v.clone() as TableRef) + .collect()) 
+ } +} + +#[async_trait] +impl Catalog for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + if name == SYSTEM_CATALOG_SCHEMA { + Ok(Some(Arc::new(self.clone()))) + } else { + Ok(None) + } + } + + async fn create_schema<'a>(&'a self, _name: NameRef<'a>) -> catalog::Result<()> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_schemas(&self) -> catalog::Result> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } +} diff --git a/catalog_impls/src/table_based.rs b/catalog_impls/src/table_based.rs new file mode 100644 index 0000000000..60c578a530 --- /dev/null +++ b/catalog_impls/src/table_based.rs @@ -0,0 +1,1126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table based catalog implementation + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateExistTable, CreateOptions, CreateTable, DropOptions, + DropTable, InvalidTableId, NameRef, Schema, SchemaMismatch, SchemaRef, TooManyTable, + WriteTableMeta, + }, + Catalog, CatalogRef, +}; +use common_util::define_result; +use log::{debug, error, info}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use system_catalog::sys_catalog_table::{ + self, CreateCatalogRequest, CreateSchemaRequest, SysCatalogTable, Visitor, + VisitorCatalogNotFound, VisitorOpenTable, VisitorSchemaNotFound, +}; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableEngineRef, + TableState, + }, + table::{ + ReadOptions, SchemaId, SchemaIdGenerator, TableId, TableInfo, TableRef, TableSeqGenerator, + }, +}; +use tokio::sync::Mutex; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build sys catalog table, err:{}", source))] + BuildSysCatalog { + source: 
system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display("Failed to visit sys catalog table, err:{}", source))] + VisitSysCatalog { + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to find table to update, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + UpdateTableNotFound { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create catalog, catalog:{}, err:{}", catalog, source))] + CreateCatalog { + catalog: String, + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: system_catalog::sys_catalog_table::Error, + }, +} + +define_result!(Error); + +/// Table based catalog manager +#[derive(Clone)] +pub struct TableBasedManager { + inner: Arc, +} + +impl Manager for TableBasedManager { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned().map(|v| v as _); + Ok(catalog) + } + + fn all_catalogs(&self) -> manager::Result> { + Ok(self + .inner + .catalogs + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +impl TableBasedManager { + /// Create and init the TableBasedManager. + // TODO(yingwen): Define all constants in catalog crate. + pub async fn new(backend: &T, engine_proxy: TableEngineRef) -> Result { + // Create or open sys_catalog table, will also create a space (catalog + schema) + // for system catalog. 
+ let catalog_table = SysCatalogTable::new(backend) + .await + .context(BuildSysCatalog)?; + + let mut inner = Inner { + catalog_table: Arc::new(catalog_table), + catalogs: HashMap::new(), + engine_proxy, + schema_id_generator: Arc::new(SchemaIdGenerator::default()), + }; + + inner.init().await?; + + Ok(Self { + inner: Arc::new(inner), + }) + } + + #[cfg(test)] + pub fn get_engine_proxy(&self) -> TableEngineRef { + self.inner.engine_proxy.clone() + } +} + +type CatalogMap = HashMap>; + +/// Inner state of TableBasedManager +struct Inner { + /// Sys catalog table + catalog_table: Arc, + catalogs: CatalogMap, + /// Table engine proxy + engine_proxy: TableEngineRef, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +impl Inner { + /// Load all data from sys catalog table. + async fn init(&mut self) -> Result<()> { + // The system catalog and schema in it is not persisted, so we add it manually. + self.load_system_catalog(); + + let mut visitor = VisitorImpl { + catalog_table: self.catalog_table.clone(), + catalogs: &mut self.catalogs, + engine_proxy: self.engine_proxy.clone(), + schema_id_generator: self.schema_id_generator.clone(), + }; + + // Load all existent catalog/schema/tables from catalog_table. + let opts = ReadOptions::default(); + self.catalog_table + .visit(opts, &mut visitor) + .await + .context(VisitSysCatalog)?; + + // Create default catalog if it is not exists. + self.maybe_create_default_catalog().await?; + + Ok(()) + } + + fn load_system_catalog(&mut self) { + // Get the `sys_catalog` table and add it to tables. + let table = self.catalog_table.inner_table(); + let mut tables = SchemaTables::default(); + tables.insert(self.catalog_table.table_id(), table); + + // Use schema id of schema `system/public` as last schema id. + let schema_id = sys_catalog_table::SCHEMA_ID; + self.schema_id_generator.set_last_schema_id(schema_id); + + // Create the default schema in system catalog. 
+ let schema = Arc::new(SchemaImpl { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + schema_id, + tables: RwLock::new(tables), + mutex: Mutex::new(()), + catalog_table: self.catalog_table.clone(), + table_seq_generator: TableSeqGenerator::default(), + }); + // Use table seq of `sys_catalog` table as last table seq. + schema + .table_seq_generator + .set_last_table_seq(sys_catalog_table::TABLE_SEQ); + + let mut schemas = HashMap::new(); + schemas.insert(schema.name().to_string(), schema); + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + // Create the system catalog. + let catalog = Arc::new(CatalogImpl { + name: consts::SYSTEM_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog.name().to_string(), catalog); + } + + async fn maybe_create_default_catalog(&mut self) -> Result<()> { + // Try to get default catalog, create it if not exists. + let catalog = match self.catalogs.get(consts::DEFAULT_CATALOG) { + Some(v) => v.clone(), + None => { + // Only system catalog should exists. + assert_eq!(1, self.catalogs.len()); + + // Default catalog is not exists, create and store it. + let default_catalog = self + .create_catalog(CreateCatalogRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + }) + .await?; + + default_catalog + } + }; + + // Create default schema if not exists. + if catalog.find_schema(consts::DEFAULT_SCHEMA).is_none() { + // Allocate schema id. 
+ let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + self.add_schema_to_catalog( + CreateSchemaRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + schema_name: consts::DEFAULT_SCHEMA.to_string(), + schema_id, + }, + &*catalog, + ) + .await?; + } + + Ok(()) + } + + async fn create_catalog(&mut self, request: CreateCatalogRequest) -> Result> { + let catalog_name = request.catalog_name.clone(); + + self.catalog_table + .create_catalog(request) + .await + .context(CreateCatalog { + catalog: &catalog_name, + })?; + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + let catalog = Arc::new(CatalogImpl { + name: catalog_name.clone(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog_name, catalog.clone()); + + Ok(catalog) + } + + async fn add_schema_to_catalog( + &mut self, + request: CreateSchemaRequest, + catalog: &CatalogImpl, + ) -> Result> { + let schema_name = request.schema_name.clone(); + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .context(CreateSchema { + catalog: &catalog.name, + schema: &schema_name, + })?; + + let schema = Arc::new(SchemaImpl::new( + &catalog.name, + &schema_name, + schema_id, + self.catalog_table.clone(), + )); + + catalog.insert_schema_into_memory(schema.clone()); + + Ok(schema) + } +} + +/// Sys catalog visitor implementation, used to load catalog info +struct VisitorImpl<'a> { + catalog_table: Arc, + catalogs: &'a mut CatalogMap, + engine_proxy: TableEngineRef, + schema_id_generator: Arc, +} + +#[async_trait] +impl<'a> Visitor for VisitorImpl<'a> { + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit catalog, request:{:?}", request); + let schema_id_generator = 
self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + + let catalog = CatalogImpl { + name: request.catalog_name.to_string(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }; + + // Register catalog. + self.catalogs + .insert(request.catalog_name, Arc::new(catalog)); + + Ok(()) + } + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit schema, request:{:?}", request); + + let catalog = + self.catalogs + .get_mut(&request.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &request.catalog_name, + })?; + + let schema_id = request.schema_id; + let schema = Arc::new(SchemaImpl::new( + &request.catalog_name, + &request.schema_name, + schema_id, + self.catalog_table.clone(), + )); + + // If schema exists, we overwrite it. + catalog.insert_schema_into_memory(schema); + + // Update last schema id. + if self.schema_id_generator.last_schema_id_u32() < schema_id.as_u32() { + self.schema_id_generator.set_last_schema_id(schema_id); + } + + Ok(()) + } + + async fn visit_tables(&mut self, table_info: TableInfo) -> sys_catalog_table::Result<()> { + debug!("Visitor visit tables, table_info:{:?}", table_info); + + let catalog = + self.catalogs + .get_mut(&table_info.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &table_info.catalog_name, + })?; + let schema = + catalog + .find_schema(&table_info.schema_name) + .context(VisitorSchemaNotFound { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + })?; + + // Update max table sequence of the schema. + let table_id = table_info.table_id; + let table_seq = table_id.table_seq(); + if table_seq.as_u64() >= schema.table_seq_generator.last_table_seq_u64() { + schema.table_seq_generator.set_last_table_seq(table_seq); + } + + // Only the stable/altering table can be opened. 
+ if !matches!(table_info.state, TableState::Stable) { + debug!( + "Visitor visit a unstable table, table_info:{:?}", + table_info + ); + return Ok(()); + } + + let open_request = OpenTableRequest::from(table_info); + let table_name = open_request.table_name.clone(); + let table_opt = self + .engine_proxy + .open_table(open_request) + .await + .context(VisitorOpenTable)?; + + match table_opt { + Some(table) => { + schema.insert_table_into_memory(table_id, table); + } + None => { + // Now we ignore the error that table not in engine but in catalog. + error!( + "Visitor found table not in engine, table_name:{:?}", + table_name + ); + } + } + + Ok(()) + } +} + +type SchemaMap = HashMap>; + +/// Table based catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + // Now the Schema trait does not support create schema, so we use impl type here + schemas: RwLock, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, + /// Sys catalog table + catalog_table: Arc, + /// Mutex + /// + /// Protects: + /// - create schema + /// - persist to default catalog + mutex: Mutex<()>, +} + +impl CatalogImpl { + /// Insert schema + fn insert_schema_into_memory(&self, schema: Arc) { + let mut schemas = self.schemas.write().unwrap(); + schemas.insert(schema.name().to_string(), schema); + } + + fn find_schema(&self, schema_name: &str) -> Option> { + let schemas = self.schemas.read().unwrap(); + schemas.get(schema_name).cloned() + } +} + +// TODO(yingwen): Support add schema (with options to control schema +// persistence) +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schemas = self.schemas.read().unwrap(); + let schema = schemas.get(name).cloned().map(|v| v as _); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + // Check schema existence + 
if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Lock schema and persist schema to default catalog + let _lock = self.mutex.lock().await; + // Check again + if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Allocate schema id. + let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + let request = CreateSchemaRequest { + catalog_name: self.name.to_string(), + schema_name: name.to_string(), + schema_id, + }; + + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .map_err(|e| Box::new(e) as _) + .context(catalog::CreateSchema { + catalog: &self.name, + schema: &name.to_string(), + })?; + + let schema = Arc::new(SchemaImpl::new( + &self.name, + name, + schema_id, + self.catalog_table.clone(), + )); + + self.insert_schema_into_memory(schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +/// Table based schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Schema id + schema_id: SchemaId, + /// Tables of schema + tables: RwLock, + /// Mutex + /// + /// Protects: + /// - add/drop/alter table + /// - persist to sys catalog table + mutex: Mutex<()>, + /// Sys catalog table + catalog_table: Arc, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new( + catalog_name: &str, + schema_name: &str, + schema_id: SchemaId, + catalog_table: Arc, + ) -> Self { + Self { + catalog_name: catalog_name.to_string(), + schema_name: schema_name.to_string(), + schema_id, + tables: RwLock::new(SchemaTables::default()), + mutex: Mutex::new(()), + catalog_table, + table_seq_generator: TableSeqGenerator::default(), + } + } + + /// Insert table into memory, 
wont check existence + fn insert_table_into_memory(&self, table_id: TableId, table: TableRef) { + let mut tables = self.tables.write().unwrap(); + tables.insert(table_id, table); + } + + /// Check table existence in read lock + /// + /// If table exists: + /// - if create_if_not_exists is true, return Ok + /// - if create_if_not_exists is false, return Error + fn check_create_table_read( + &self, + request: &CreateTableRequest, + create_if_not_exists: bool, + ) -> schema::Result> { + let table_id = request.table_id; + ensure!( + self.schema_id == table_id.schema_id(), + InvalidTableId { + msg: "schema id unmatch", + table_id, + } + ); + + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.tables_by_name.get(&request.table_name) { + // Already exists + if create_if_not_exists { + // Create if not exists is set + return Ok(Some(table.clone())); + } + // Create if not exists is not set, need to return error + return CreateExistTable { + table: &request.table_name, + } + .fail(); + } + + // Table is not exists, check whether table id is unique under this schema. 
+ let table_by_id = tables.tables_by_id.get(&request.table_id); + ensure!( + table_by_id.is_none(), + InvalidTableId { + msg: "table with given id already exists", + table_id, + } + ); + + Ok(None) + } + + fn find_table_by_name(&self, name: NameRef) -> Option { + self.tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned() + } +} + +#[derive(Default)] +struct SchemaTables { + tables_by_name: HashMap, + tables_by_id: HashMap, +} + +impl SchemaTables { + fn insert(&mut self, table_id: TableId, table: TableRef) { + self.tables_by_name + .insert(table.name().to_string(), table.clone()); + self.tables_by_id.insert(table_id, table); + } + + fn remove(&mut self, name: NameRef) { + if let Some(table) = self.tables_by_name.remove(name) { + self.tables_by_id.remove(&table.id()); + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self + .tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned(); + Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // TODO(yingwen): Do not persist if engine is memory engine. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + info!( + "Table based catalog manager create table, request:{:?}", + request + ); + + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + // TODO(yingwen): Validate table id is unique. 
+ + // Check table existence + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Lock schema and persist table to sys catalog table + let _lock = self.mutex.lock().await; + // Check again + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Create table + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request.clone()) + .await + .context(CreateTable)?; + assert_eq!(table_name, table.name()); + + self.catalog_table + .create_table(request.clone().into()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + // Insert into memory + let mut tables = self.tables.write().unwrap(); + tables.insert(request.table_id, table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + mut request: DropTableRequest, + opts: DropOptions, + ) -> schema::Result { + info!( + "Table based catalog manager drop table, request:{:?}", + request + ); + + if self.find_table_by_name(&request.table_name).is_none() { + return Ok(false); + }; + + let _lock = self.mutex.lock().await; + // double check whether the table to drop exists. + let table = match self.find_table_by_name(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + // Determine the real engine type of the table to drop. + // FIXME(xikai): the engine should not be part of the DropRequest. + request.engine = table.engine_type().to_string(); + + // Prepare to drop table info in the sys_catalog. 
+ self.catalog_table + .prepare_drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + let dropped = opts + .table_engine + .drop_table(request.clone()) + .await + .context(DropTable)?; + + info!( + "Table engine drop table successfully, request:{:?}, dropped:{}", + request, dropped + ); + + // Update the drop table record into the sys_catalog_table. + self.catalog_table + .drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + let mut tables = self.tables.write().unwrap(); + tables.remove(&request.table_name); + }; + + info!( + "Table based catalog manager drop table successfully, request:{:?}", + request + ); + + return Ok(true); + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .tables_by_name + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +#[cfg(any(test, feature = "test"))] +mod tests { + use std::{collections::HashMap, sync::Arc}; + + use analytic_engine::{tests::util::TestEnv, AnalyticTableEngine}; + use catalog::{ + consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}, + manager::Manager, + schema::{CreateOptions, DropOptions, SchemaRef}, + }; + use server::table_engine::{MemoryTableEngine, TableEngineProxy}; + use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableState}, + ANALYTIC_ENGINE_TYPE, + }; + + use crate::table_based::TableBasedManager; + + async fn build_catalog_manager(analytic: AnalyticTableEngine) -> TableBasedManager { + // Create table engine proxy + let memory = MemoryTableEngine; + + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) + } + + async fn 
build_default_schema_with_catalog(catalog_manager: &TableBasedManager) -> SchemaRef { + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + async fn build_default_schema(analytic: AnalyticTableEngine) -> SchemaRef { + let catalog_manager = build_catalog_manager(analytic).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + fn build_create_table_req(table_name: &str, schema: SchemaRef) -> CreateTableRequest { + CreateTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_id: schema.alloc_table_id(table_name).unwrap(), + table_name: table_name.to_string(), + table_schema: common_types::tests::build_schema(), + partition_info: None, + engine: ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + } + } + + #[tokio::test] + async fn test_catalog_by_name_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + let schema = catalog + 
.as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + + let schema_name2 = "test"; + let schema = catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name2); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + let catalog_name2 = "test"; + let catalog = catalog_manager.catalog_by_name(catalog_name2); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_none()); + } + + #[tokio::test] + async fn test_maybe_create_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + + let schema_name = "test"; + let catalog_ref = catalog.as_ref().unwrap().as_ref().unwrap(); + let mut schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + catalog_ref.create_schema(schema_name).await.unwrap(); + schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + } + + #[tokio::test] + async fn test_alloc_table_id() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let schema = build_default_schema(test_ctx.engine()).await; + let table_id = schema.alloc_table_id("test").unwrap(); + let expected_id = 2u64 << 40 | 1u64; + assert_eq!(table_id.as_u64(), expected_id); + } + + #[tokio::test] + async fn test_create_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = 
build_catalog_manager(test_ctx.engine()).await; + let schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let request = build_create_table_req(table_name, schema.clone()); + + let opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + schema + .create_table(request.clone(), opts.clone()) + .await + .unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + // create again + schema.create_table(request.clone(), opts).await.unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + let opts2 = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: false, + }; + assert!(schema.create_table(request.clone(), opts2).await.is_err()); + } + + #[tokio::test] + async fn test_drop_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let engine_name = "test_engine"; + let drop_table_request = DropTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_name: table_name.to_string(), + engine: engine_name.to_string(), + }; + let drop_table_opts = DropOptions { + table_engine: catalog_manager.get_engine_proxy(), + }; + + assert!(!schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + + let create_table_request = build_create_table_req(table_name, schema.clone()); + let create_table_opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + // create table + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + 
assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create table again + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create two tables + { + let table_name2 = "test2"; + let create_table_request2 = build_create_table_req(table_name2, schema.clone()); + schema + .create_table(create_table_request2.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name2).unwrap().is_some()); + + schema + .create_table(create_table_request, create_table_opts) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request, drop_table_opts) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + } +} diff --git a/cluster/Cargo.toml b/cluster/Cargo.toml new file mode 100644 index 0000000000..d75d30a86d --- /dev/null +++ b/cluster/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "cluster" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = 
{ path = "../common_util" } +log = "0.4" +meta_client_v2 = { path = "../meta_client_v2" } +rust-fsm = "0.6.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } \ No newline at end of file diff --git a/cluster/src/config.rs b/cluster/src/config.rs new file mode 100644 index 0000000000..2afb0bee57 --- /dev/null +++ b/cluster/src/config.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use meta_client_v2::MetaClientConfig; +use serde_derive::Deserialize; + +#[derive(Default, Clone, Deserialize, Debug)] +pub struct ClusterConfig { + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. + pub port: u16, + pub zone: String, + pub idc: String, + pub binary_version: String, + pub cmd_channel_buffer_size: usize, + + pub meta_client_config: MetaClientConfig, +} diff --git a/cluster/src/lib.rs b/cluster/src/lib.rs new file mode 100644 index 0000000000..9fe5916dc9 --- /dev/null +++ b/cluster/src/lib.rs @@ -0,0 +1,263 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use catalog::manager::Manager; +use common_util::{define_result, runtime::Runtime}; +use log::{error, info}; +use meta_client_v2::{ + build_meta_client, ActionCmd, AllocSchemaIdRequest, AllocTableIdRequest, DropTableRequest, + GetTablesRequest, MetaClient, NodeMetaInfo, SchemaId, ShardId, ShardInfo, TableId, +}; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Receiver, RwLock}, + time, +}; + +use crate::{config::ClusterConfig, table_manager::TableManager}; + +pub mod config; +mod table_manager; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build meta client failed, err:{}.", source))] + BuildMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + StartMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + MetaClientFailure { + source: Box, + }, + + #[snafu(display( + "Shard not found in current node, shard_id:{}.\nBacktrace:\n{}", + shard_id, + backtrace + ))] + ShardNotFound { + shard_id: ShardId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[async_trait] +pub trait Cluster { + async fn alloc_schema_id(&self, _schema_name: String) -> Result; + + async fn alloc_table_id(&self, _schema_name: String, _table_name: String) -> Result; + + async fn drop_table(&self, _schema_name: String, _table_name: String) -> Result<()>; +} + +pub struct ClusterImpl { + inner: Arc>, + runtime: Arc, +} + +impl ClusterImpl { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + Ok(Self { + inner: Arc::new(ClusterImplInner::new( + config, + catalog_manager, + runtime.clone(), + )?), + runtime, + }) + } + + pub async fn start(&self) -> Result<()> { + let inner = self.inner.clone(); + inner + .meta_client + .start() + .await + .map_err(|e| Box::new(e) as _) + .context(StartMetaClient)?; + 
self.runtime.spawn(async move { + inner.start_heartbeat().await; + }); + + Ok(()) + } +} + +#[async_trait] +impl Cluster for ClusterImpl { + async fn alloc_schema_id(&self, schema_name: String) -> Result { + self.inner.alloc_schema_id(schema_name).await + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + self.inner.alloc_table_id(schema_name, table_name).await + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + self.inner.drop_table(schema_name, table_name).await + } +} + +struct ClusterImplInner { + meta_client: Arc, + catalog_manager: M, + table_manager: TableManager, + action_cmd_receiver: RwLock>, + + config: ClusterConfig, +} + +impl ClusterImplInner { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + let (sender, receiver) = tokio::sync::mpsc::channel(config.cmd_channel_buffer_size); + let node_meta_info = NodeMetaInfo { + node: config.node.clone(), + zone: config.zone.clone(), + idc: config.idc.clone(), + binary_version: config.binary_version.clone(), + }; + Ok(Self { + meta_client: build_meta_client( + config.meta_client_config.clone(), + node_meta_info, + runtime, + Some(sender), + ) + .map_err(|e| Box::new(e) as _) + .context(BuildMetaClient)?, + catalog_manager, + table_manager: TableManager::new(), + action_cmd_receiver: RwLock::new(receiver), + config: config, + }) + } + + // heartbeat + async fn start_heartbeat(&self) { + let mut interval = time::interval(self.heartbeat_interval()); + + loop { + let shards_info = self.get_shards_info(); + info!("Node heartbeat to meta, shards info:{:?}", shards_info); + let resp = self.meta_client.send_heartbeat(shards_info).await; + match resp { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!("Node heartbeat to meta failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + async fn start_node_action_cmd(&self) { + let action_cmd_receiver = &mut 
*self.action_cmd_receiver.write().await; + // todo: handle error + while let Some(action_cmd) = action_cmd_receiver.recv().await { + info!( + "Node action cmd from meta received, action_cmd:{:?}", + action_cmd + ); + match action_cmd { + ActionCmd::OpenCmd(open_cmd) => { + let ret = self + .meta_client + .get_tables(GetTablesRequest { + shard_ids: open_cmd.shard_ids, + }) + .await; + match ret { + Err(ref e) => error!("Get shard tables failed, ret:{:?}, err:{}", ret, e), + Ok(v) => { + self.table_manager.update_table_info(v.tables_map); + // todo: self.catalog_manager.open tables + } + } + } + // todo: other action cmd + _ => todo!(), + } + } + info!("Node action cmd receiver exit"); + } + + fn get_shards_info(&self) -> Vec { + self.table_manager.get_shards_info() + } + + // Register node every 2/3 lease + fn heartbeat_interval(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() * 2 / 3) + } + + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() / 2) + } + + async fn alloc_schema_id(&self, schema_name: String) -> Result { + if let Some(v) = self.table_manager.get_schema_id(&schema_name) { + Ok(v) + } else { + Ok(self + .meta_client + .alloc_schema_id(AllocSchemaIdRequest { + name: schema_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)? 
+ .id) + } + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + if let Some(v) = self.table_manager.get_table_id(&schema_name, &table_name) { + Ok(v) + } else { + let resp = self + .meta_client + .alloc_table_id(AllocTableIdRequest { + schema_name, + name: table_name, + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.add_table( + resp.shard_id, + resp.schema_name, + resp.name, + resp.schema_id, + resp.id, + )?; + Ok(resp.id) + } + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + let _resp = self + .meta_client + .drop_table(DropTableRequest { + schema_name: schema_name.clone(), + name: table_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.drop_table(schema_name, table_name); + Ok(()) + } +} diff --git a/cluster/src/table_manager.rs b/cluster/src/table_manager.rs new file mode 100644 index 0000000000..738df85db6 --- /dev/null +++ b/cluster/src/table_manager.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + collections::{BTreeMap, HashMap}, + sync::RwLock, +}; + +use meta_client_v2::{SchemaId, ShardId, ShardInfo, ShardTables, TableId, TableInfo}; + +use super::Result; +use crate::ShardNotFound; + +struct SchemaInfo { + name: String, + id: SchemaId, +} + +pub struct TableManager { + inner: RwLock, +} + +impl TableManager { + pub fn new() -> Self { + Self { + inner: RwLock::new(TableManagerInner { + shards_info: Vec::new(), + schemas_info: HashMap::new(), + tables: BTreeMap::new(), + }), + } + } + + pub fn get_shards_info(&self) -> Vec { + self.inner.read().unwrap().get_shards_info() + } + + pub fn add_table( + &self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + self.inner.write().unwrap().add_table( + shard_id, + schema_name, + table_name, + schema_id, + table_id, + ) + } + + pub fn drop_table(&self, schema_name: String, table_name: String) { + self.inner + .write() + .unwrap() + .drop_table(schema_name, table_name) + } + + pub fn update_table_info(&self, shard_table: HashMap) { + self.inner.write().unwrap().update_table_info(shard_table) + } + + pub fn get_schema_id(&self, schema_name: &str) -> Option { + self.inner.read().unwrap().get_schema_id(schema_name) + } + + pub fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.inner + .read() + .unwrap() + .get_table_id(schema_name, table_name) + } +} + +struct TableManagerInner { + shards_info: Vec, + schemas_info: HashMap, + // schema_name -> table_name -> (shard_info, table_info) + tables: BTreeMap>, +} + +impl TableManagerInner { + fn get_shards_info(&self) -> Vec { + self.shards_info.clone() + } + + fn update_table_info(&mut self, shard_table: HashMap) { + for (shard_id, shard_tables) in shard_table { + let shard_info = ShardInfo { + shard_id, + role: shard_tables.role, + }; + for table in shard_tables.tables { + self.schemas_info + .entry(table.schema_name.clone()) + .or_insert(SchemaInfo { 
+ name: table.schema_name.clone(), + id: table.schema_id, + }); + self.tables + .entry(table.schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert(table.name.clone(), (shard_info.clone(), table)); + } + } + } + + fn add_table( + &mut self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + let mut shard_info = None; + for shard in &self.shards_info { + if shard.shard_id == shard_id { + shard_info = Some(shard.clone()); + break; + } + } + match shard_info { + None => ShardNotFound { shard_id }.fail(), + Some(v) => { + self.tables + .entry(schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert( + table_name.clone(), + ( + v, + TableInfo { + id: table_id, + name: table_name, + schema_id, + schema_name, + }, + ), + ); + Ok(()) + } + } + } + + fn drop_table(&mut self, schema_name: String, table_name: String) { + self.tables + .get_mut(&schema_name) + .map(|v| v.remove(&table_name)); + } + + fn get_schema_id(&self, schema_name: &str) -> Option { + self.schemas_info.get(schema_name).map(|v| v.id) + } + + fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.tables + .get(schema_name) + .and_then(|schema| schema.get(table_name).map(|v| v.1.id)) + } +} diff --git a/cluster/src/util.rs b/cluster/src/util.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/common_types/Cargo.toml b/common_types/Cargo.toml new file mode 100644 index 0000000000..1bb477e3f3 --- /dev/null +++ b/common_types/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "common_types" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +byteorder = "1.2" +bytes = { path = "../components/bytes" } +chrono = "0.4" +murmur3 = "0.4.1" +paste = "1.0" +proto = { 
path = "../proto" } +snafu = { version ="0.6.10", features = ["backtraces"]} +# TODO(yingwen): Make sqlparser support a feature +sqlparser = "0.13.0" +serde = "1.0.81" +serde_derive = "1.0.81" diff --git a/common_types/src/bytes.rs b/common_types/src/bytes.rs new file mode 100644 index 0000000000..5a545d7b14 --- /dev/null +++ b/common_types/src/bytes.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes type. + +pub use bytes::*; diff --git a/common_types/src/column.rs b/common_types/src/column.rs new file mode 100644 index 0000000000..44908687bd --- /dev/null +++ b/common_types/src/column.rs @@ -0,0 +1,868 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Column +use std::sync::Arc; + +use arrow_deps::arrow::array::{ + Array, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, + Float32Array as FloatArray, Float32Builder as FloatBuilder, Float64Array as DoubleArray, + Float64Builder as DoubleBuilder, Int16Array, Int16Builder, Int32Array, Int32Builder, + Int64Array, Int64Builder, Int8Array, Int8Builder, NullArray, StringArray, StringBuilder, + TimestampMillisecondArray, TimestampMillisecondBuilder, UInt16Array, UInt16Builder, + UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, +}; +use paste::paste; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + bytes::Bytes, + datum::{Datum, DatumKind, DatumView}, + string::StringBytes, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid array type, datum_kind:{:?}, data_type:{:?}.\nBacktrace:\n{}", + datum_kind, + data_type, + backtrace + ))] + InvalidArrayType { + datum_kind: DatumKind, + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to append value, err:{}.\nBacktrace:\n{}", source, backtrace))] + Append { + source: arrow_deps::arrow::error::ArrowError, + 
backtrace: Backtrace, + }, + + #[snafu(display( + "Data type conflict, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + ConflictType { + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to convert arrow data type, data_type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedArray { + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct NullColumn(NullArray); + +impl NullColumn { + fn new_null(rows: usize) -> Self { + Self(NullArray::new(rows)) + } + + /// Only the first datum of NullColumn is considered not duplicated. + #[inline] + pub fn dedup(&self, selected: &mut [bool]) { + if !self.0.is_empty() { + selected[0] = true; + } + } +} + +// TODO(yingwen): Builder for columns. + +macro_rules! define_numeric_column { + ($($Kind: ident), *) => { + $(paste! { + #[derive(Debug)] + pub struct [<$Kind Column>]([<$Kind Array>]); + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> Datum { + let value = array.value(index); + Datum::$Kind(value) + } + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> DatumView { + let value = array.value(index); + DatumView::$Kind(value) + } + })* + } +} + +define_numeric_column!( + Float, Double, UInt64, UInt32, UInt16, UInt8, Int64, Int32, Int16, Int8, Boolean +); + +#[derive(Debug)] +pub struct TimestampColumn(TimestampMillisecondArray); + +#[derive(Debug)] +pub struct VarbinaryColumn(BinaryArray); + +#[derive(Debug)] +pub struct StringColumn(StringArray); + +#[inline] +fn get_null_datum_view(_array: &NullArray, _index: usize) -> DatumView { + DatumView::Null +} + +#[inline] +fn get_timestamp_datum_view(array: &TimestampMillisecondArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::Timestamp(Timestamp::new(value)) +} + +#[inline] +fn get_varbinary_datum_view(array: &BinaryArray, index: 
usize) -> DatumView { + let value = array.value(index); + DatumView::Varbinary(value) +} + +#[inline] +fn get_string_datum_view(array: &StringArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::String(value) +} + +#[inline] +fn get_null_datum(_array: &NullArray, _index: usize) -> Datum { + Datum::Null +} + +#[inline] +fn get_timestamp_datum(array: &TimestampMillisecondArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Timestamp(Timestamp::new(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_varbinary_datum(array: &BinaryArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Varbinary(Bytes::copy_from_slice(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_string_datum(array: &StringArray, index: usize) -> Datum { + let value = array.value(index); + Datum::String(StringBytes::copy_from_str(value)) +} + +macro_rules! impl_column { + ($Column: ident, $get_datum: expr, $get_datum_view: expr) => { + impl $Column { + /// Get datum by index. + pub fn datum_opt(&self, index: usize) -> Option { + // Do bound check. + if index >= self.0.len() { + return None; + } + + Some(self.datum(index)) + } + + pub fn datum_view(&self, index: usize) -> DatumView { + // If this datum is null. + if self.0.is_null(index) { + return DatumView::Null; + } + + $get_datum_view(&self.0, index) + } + + pub fn datum(&self, index: usize) -> Datum { + // If this datum is null. + if self.0.is_null(index) { + return Datum::Null; + } + + $get_datum(&self.0, index) + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + } + }; +} + +macro_rules! impl_dedup { + ($Column: ident) => { + impl $Column { + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to + /// true. + /// + /// The first datum is marked to true. 
+ /// + /// The size of selected must equal to the size of this column and + /// initialized to false. + #[allow(clippy::float_cmp)] + pub fn dedup(&self, selected: &mut [bool]) { + if self.0.is_empty() { + return; + } + + selected[0] = true; + for i in 1..self.0.len() { + let current = self.0.value(i); + let prev = self.0.value(i - 1); + + if current != prev { + selected[i] = true; + } + } + } + } + }; +} + +macro_rules! impl_new_null { + ($Column: ident, $Builder: ident) => { + impl $Column { + /// Create a column that all values are null. + fn new_null(num_rows: usize) -> Result { + let mut builder = $Builder::new(num_rows); + for _ in 0..num_rows { + builder.append_null().context(Append)?; + } + let array = builder.finish(); + + Ok(Self(array)) + } + } + }; +} + +macro_rules! impl_from_array_and_slice { + ($Column: ident, $ArrayType: ident) => { + impl From<$ArrayType> for $Column { + fn from(array: $ArrayType) -> Self { + Self(array) + } + } + + impl From<&$ArrayType> for $Column { + fn from(array_ref: &$ArrayType) -> Self { + // We need to clone the [arrow_deps::arrow::array::ArrayData], which clones + // the underlying vector of [arrow_deps::arrow::buffer::Buffer] and Bitmap (also + // holds a Buffer), thus require some allocation. However, the Buffer is + // managed by Arc, so cloning the buffer is not too expensive. + let array_data = array_ref.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + + impl $Column { + fn to_arrow_array(&self) -> $ArrayType { + // Clone the array data. + let array_data = self.0.data().clone(); + $ArrayType::from(array_data) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. + fn slice(&self, offset: usize, length: usize) -> Self { + let array_slice = self.0.slice(offset, length); + // Clone the slice data. 
+ let array_data = array_slice.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + }; +} + +macro_rules! impl_iter { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + } + }; +} + +macro_rules! impl_iter_map { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter().map(|v| v.map($Value::from)) + } + } + }; +} + +impl_column!(NullColumn, get_null_datum, get_null_datum_view); +impl_column!( + TimestampColumn, + get_timestamp_datum, + get_timestamp_datum_view +); +impl_column!( + VarbinaryColumn, + get_varbinary_datum, + get_varbinary_datum_view +); +impl_column!(StringColumn, get_string_datum, get_string_datum_view); + +impl_new_null!(TimestampColumn, TimestampMillisecondBuilder); +impl_new_null!(VarbinaryColumn, BinaryBuilder); +impl_new_null!(StringColumn, StringBuilder); + +impl_from_array_and_slice!(NullColumn, NullArray); +impl_from_array_and_slice!(TimestampColumn, TimestampMillisecondArray); +impl_from_array_and_slice!(VarbinaryColumn, BinaryArray); +impl_from_array_and_slice!(StringColumn, StringArray); + +impl_iter_map!(TimestampColumn, Timestamp); + +impl_dedup!(TimestampColumn); +impl_dedup!(VarbinaryColumn); +impl_dedup!(StringColumn); + +macro_rules! impl_numeric_column { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_column!([<$Kind Column>], [], []); + impl_from_array_and_slice!([<$Kind Column>], [<$Kind Array>]); + impl_new_null!([<$Kind Column>], [<$Kind Builder>]); + impl_iter!([<$Kind Column>], $type); + impl_dedup!([<$Kind Column>]); + } + )* + } +} + +impl_numeric_column!( + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +macro_rules! 
impl_numeric_value { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Get value at index. + pub fn value(&self, index: usize) -> Option<$Value> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } + } + }; +} + +macro_rules! batch_impl_numeric_value { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_numeric_value!([<$Kind Column>], $type); + } + )* + } +} + +batch_impl_numeric_value!( + (Timestamp, i64), + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +impl VarbinaryColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&[u8]> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +impl StringColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&str> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +macro_rules! impl_column_block { + ($($Kind: ident), *) => { + impl ColumnBlock { + pub fn datum_kind(&self) -> DatumKind { + match self { + $(ColumnBlock::$Kind(_) => DatumKind::$Kind,)* + } + } + + pub fn datum_opt(&self, index: usize) -> Option { + match self { + $(ColumnBlock::$Kind(col) => col.datum_opt(index),)* + } + } + + /// Panic if index is out fo bound. + pub fn datum_view(&self, index: usize) -> DatumView { + match self { + $(ColumnBlock::$Kind(col) => col.datum_view(index),)* + } + } + + /// Panic if index is out fo bound. 
+ pub fn datum(&self, index: usize) -> Datum { + match self { + $(ColumnBlock::$Kind(col) => col.datum(index),)* + } + } + + pub fn num_rows(&self) -> usize { + match self { + $(ColumnBlock::$Kind(col) => col.num_rows(),)* + } + } + + pub fn to_arrow_array_ref(&self) -> ArrayRef { + match self { + $(ColumnBlock::$Kind(col) => Arc::new(col.to_arrow_array()),)* + } + } + + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to true. + /// + /// The first datum is not marked to true. + pub fn dedup(&self, selected: &mut [bool]) { + match self { + $(ColumnBlock::$Kind(col) => col.dedup(selected),)* + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + match self { + $(ColumnBlock::$Kind(col) => ColumnBlock::$Kind(col.slice(offset, length)),)* + } + } + } + + $(paste! { + impl From<[<$Kind Column>]> for ColumnBlock { + fn from(column: [<$Kind Column>]) -> Self { + Self::$Kind(column) + } + } + })* + }; +} + +// TODO(yingwen): We can add a unsafe function that don't do bound check. + +macro_rules! define_column_block { + ($($Kind: ident), *) => { + paste! 
{ + #[derive(Debug)] + pub enum ColumnBlock { + Null(NullColumn), + $( + $Kind([<$Kind Column>]), + )* + } + + impl ColumnBlock { + pub fn try_from_arrow_array_ref(datum_kind: &DatumKind, array: &ArrayRef) -> Result { + let column = match datum_kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), + $( + DatumKind::$Kind => { + let column = cast_array(datum_kind, array)?; + ColumnBlock::$Kind([<$Kind Column>]::from(column)) + } + )* + }; + Ok(column) + } + + pub fn new_null_with_type(kind: &DatumKind, rows: usize) -> Result { + let block = match kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(rows)), + $( + DatumKind::$Kind => ColumnBlock::$Kind([<$Kind Column>]::new_null(rows)?), + )* + }; + + Ok(block) + } + } + } + } +} + +// Define column blocks, Null is defined explicitly in macro. +define_column_block!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +impl ColumnBlock { + pub fn try_cast_arrow_array_ref(array: &ArrayRef) -> Result { + let datum_kind = + DatumKind::from_data_type(array.data_type()).with_context(|| UnsupportedArray { + data_type: array.data_type().clone(), + })?; + + Self::try_from_arrow_array_ref(&datum_kind, array) + } + + pub fn new_null(rows: usize) -> Self { + Self::Null(NullColumn::new_null(rows)) + } + + pub fn as_timestamp(&self) -> Option<&TimestampColumn> { + match self { + ColumnBlock::Timestamp(c) => Some(c), + _ => None, + } + } +} + +impl_column_block!( + Null, Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +fn cast_array<'a, T: 'static>(datum_kind: &DatumKind, array: &'a ArrayRef) -> Result<&'a T> { + array + .as_any() + .downcast_ref::() + .with_context(|| InvalidArrayType { + datum_kind: *datum_kind, + data_type: array.data_type().clone(), + }) +} + +macro_rules! 
append_datum { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_datum_into { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v.into()).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_block { + ($Kind: ident, $builder: ident, $BlockType: ident, $block: ident, $start: ident, $len: ident) => { + match $block { + $BlockType::Null(v) => { + let end = std::cmp::min($start + $len, v.num_rows()); + for _ in $start..end { + $builder.append_null().context(Append)?; + } + Ok(()) + } + $BlockType::$Kind(v) => { + // There is no convenient api to copy a range of data from array to builder, so + // we still need to clone value one by one using a for loop. + let end = std::cmp::min($start + $len, v.num_rows()); + for i in $start..end { + let value_opt = v.value(i); + match value_opt { + Some(value) => { + $builder.append_value(value).context(Append)?; + } + None => { + $builder.append_null().context(Append)?; + } + } + } + Ok(()) + } + _ => ConflictType { + expect: DatumKind::$Kind, + given: $block.datum_kind(), + } + .fail(), + } + }; +} + +macro_rules! define_column_block_builder { + ($(($Kind: ident, $Builder: ident)), *) => { + paste! 
{ + #[derive(Debug)] + pub enum ColumnBlockBuilder { + Null { rows: usize }, + Timestamp(TimestampMillisecondBuilder), + $( + $Kind($Builder), + )* + } + + impl ColumnBlockBuilder { + /// Create by data type with initial capacity + pub fn with_capacity(data_type: &DatumKind, capacity: usize) -> Self { + match data_type { + DatumKind::Null => Self::Null { rows: 0 }, + DatumKind::Timestamp => Self::Timestamp(TimestampMillisecondBuilder::new(capacity)), + $( + DatumKind::$Kind => Self::$Kind($Builder::new(capacity)), + )* + } + } + + /// Append the datum into the builder, the datum should have same the data + /// type of builder + pub fn append(&mut self, datum: Datum) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + Datum::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, Datum, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, Datum, datum), + )* + } + } + + /// Append the [DatumView] into the builder, the datum view should have same the data + /// type of builder + pub fn append_view<'a>(&mut self, datum: DatumView<'a>) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + DatumView::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, DatumView, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, DatumView, datum), + )* + } + } + + /// Append rows in [start..start + len) from `block` to the builder. + /// + /// Returns rows actually appended. 
+ pub fn append_block_range(&mut self, block: &ColumnBlock, start: usize, len: usize) -> Result<()> { + match self { + Self::Null { rows } => { + if start + len >= block.num_rows() { + *rows += block.num_rows() - start; + } else { + *rows += len; + } + Ok(()) + }, + Self::Timestamp(builder) => append_block!(Timestamp, builder, ColumnBlock, block, start, len), + $( + Self::$Kind(builder) => append_block!($Kind, builder, ColumnBlock, block, start, len), + )* + } + } + + pub fn len(&self) -> usize { + match &self { + Self::Null { rows } => *rows, + Self::Timestamp(builder) => builder.len(), + $( + Self::$Kind(builder) => builder.len(), + )* + } + } + + // Build and reset the builder. + pub fn build(&mut self) -> ColumnBlock { + match self { + Self::Null { rows } => { + let block = ColumnBlock::new_null(*rows); + *rows = 0; + block + } + Self::Timestamp(builder) => TimestampColumn::from(builder.finish()).into(), + $( + Self::$Kind(builder) => [<$Kind Column>]::from(builder.finish()).into(), + )* + } + } + } + } + } +} + +// Define column block builders, Null and Timestamp are defined explicitly in +// macro. +define_column_block_builder!( + (Double, DoubleBuilder), + (Float, FloatBuilder), + (Varbinary, BinaryBuilder), + (String, StringBuilder), + (UInt64, UInt64Builder), + (UInt32, UInt32Builder), + (UInt16, UInt16Builder), + (UInt8, UInt8Builder), + (Int64, Int64Builder), + (Int32, Int32Builder), + (Int16, Int16Builder), + (Int8, Int8Builder), + (Boolean, BooleanBuilder) +); + +impl ColumnBlockBuilder { + /// Create by data type + pub fn new(data_type: &DatumKind) -> Self { + Self::with_capacity(data_type, 0) + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Clear the builder by calling `build()` and drop the built result. 
+ pub fn clear(&mut self) { + let _ = self.build(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::{build_rows, build_schema}; + + #[test] + fn test_column_block_builder() { + let schema = build_schema(); + let rows = build_rows(); + // DatumKind::Varbinary + let column = schema.column(0); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append + builder.append(rows[0][0].clone()).unwrap(); + let ret = builder.append(rows[0][1].clone()); + assert!(ret.is_err()); + + // append_view + builder.append_view(rows[1][0].as_view()).unwrap(); + let ret = builder.append_view(rows[0][1].as_view()); + assert!(ret.is_err()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append_block_range + builder.append_block_range(&column_block, 0, 1).unwrap(); + builder.append_block_range(&column_block, 1, 1).unwrap(); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + assert_eq!( + column_block.datum(0), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key")) + ); + assert_eq!( + column_block.datum(1), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key1")) + ); + } +} diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs new file mode 100644 index 0000000000..eecf4303eb --- /dev/null +++ b/common_types/src/column_schema.rs @@ -0,0 +1,477 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Schema of column + +use std::{collections::BTreeMap, convert::TryFrom, str::FromStr}; + +use arrow_deps::arrow::datatypes::{DataType, Field}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::datum::DatumKind; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported arrow data type, type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedDataType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid tag type:{}.\nBacktrace:\n{}", data_type, backtrace))] + InvalidTagType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta data is missing, field name:{}.\nBacktrace:\n{}", + field_name, + backtrace + ))] + ArrowFieldMetaDataMissing { + field_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta key is not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowFieldMetaKeyNotFound { + key: ArrowFieldMetaKey, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta value is invalid, key:{:?}, raw_value:{}, err:{}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowFieldMetaValue { + key: ArrowFieldMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// Error of compatibility check +#[derive(Debug, Snafu)] +pub enum CompatError { + #[snafu(display( + "Incompatible data type of column, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + name, + expect, + given, + backtrace, + ))] + IncompatDataType { + name: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Column is not nullable, name:{}.\nBacktrace:\n{}", name, backtrace))] + NotNullable { name: String, backtrace: Backtrace }, +} + +/// Id of column +pub type ColumnId = u32; + +/// A ColumnId used to indicate that the column id is uninitialized +pub 
const COLUMN_ID_UNINIT: ColumnId = 0; + +/// Read operation of a column +#[derive(Debug)] +pub enum ReadOp { + /// Use the column exactly + Exact, + /// Fill the column by null + FillNull, +} + +/// Meta data of the arrow field. +#[derive(Clone, Debug, Default)] +struct ArrowFieldMeta { + id: u32, + is_tag: bool, + comment: String, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowFieldMetaKey { + Id, + IsTag, + Comment, +} + +impl ArrowFieldMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowFieldMetaKey::Id => "field::id", + ArrowFieldMetaKey::IsTag => "field::is_tag", + ArrowFieldMetaKey::Comment => "field::comment", + } + } +} + +impl ToString for ArrowFieldMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema of column +#[derive(Debug, Clone, PartialEq)] +pub struct ColumnSchema { + /// Id of column + pub id: ColumnId, + /// Column name + pub name: String, + /// Data type of the column + pub data_type: DatumKind, + /// Is nullable + pub is_nullable: bool, + /// Is tag, tag is just a hint for a column, there is no restriction that a + /// tag column must be a part of primary key + pub is_tag: bool, + /// Comment of the column + pub comment: String, +} + +impl ColumnSchema { + /// Check whether a type is valid tag type. 
+ pub fn is_valid_tag_type(typ: DatumKind) -> bool { + match typ { + DatumKind::Null => false, + DatumKind::Timestamp => true, + DatumKind::Double => false, + DatumKind::Float => false, + DatumKind::Varbinary => true, + DatumKind::String => true, + DatumKind::UInt64 => true, + DatumKind::UInt32 => true, + DatumKind::UInt16 => true, + DatumKind::UInt8 => true, + DatumKind::Int64 => true, + DatumKind::Int32 => true, + DatumKind::Int16 => true, + DatumKind::Int8 => true, + DatumKind::Boolean => true, + } + } + + /// Convert `self` to [proto::common::ColumnSchema] + /// + /// The `is_key` is needed because it is maintained by + /// [crate::schema::Schema] + pub fn to_pb(&self) -> common_pb::ColumnSchema { + let mut column_schema = common_pb::ColumnSchema::new(); + column_schema.set_name(self.name.clone()); + column_schema.set_data_type(self.data_type.into()); + column_schema.set_is_nullable(self.is_nullable); + column_schema.set_id(self.id); + column_schema.set_is_tag(self.is_tag); + column_schema.set_comment(self.comment.clone()); + + column_schema + } + + /// Convert `self` to [arrow_deps::arrow::datatypes::Field] + pub fn to_arrow_field(&self) -> Field { + From::from(self) + } + + /// Returns Ok if column with `writer_schema` can write to column with the + /// same schema as `self`. 
+ pub fn compatible_for_write( + &self, + writer_schema: &ColumnSchema, + ) -> std::result::Result<(), CompatError> { + ensure!( + self.data_type == writer_schema.data_type, + IncompatDataType { + name: &self.name, + expect: writer_schema.data_type, + given: self.data_type, + } + ); + + // This column is not nullable but writer is nullable + ensure!( + self.is_nullable || !writer_schema.is_nullable, + NotNullable { name: &self.name } + ); + + Ok(()) + } + + /// Returns `Ok` if the source schema can read by this schema, now we won't + /// validate data type of column + pub fn compatible_for_read( + &self, + source_schema: &ColumnSchema, + ) -> std::result::Result { + if self.is_nullable { + // Column is nullable + if self.id == source_schema.id { + // Same column + Ok(ReadOp::Exact) + } else { + // Not the same column, maybe dropped, fill by null. + Ok(ReadOp::FillNull) + } + } else { + // Column is not null. We consider the old column was dropped if they have + // different column id and also try to fill by null, so we + // also check column id. + ensure!( + self.id == source_schema.id && !source_schema.is_nullable, + NotNullable { + name: &source_schema.name, + } + ); + + Ok(ReadOp::Exact) + } + } +} + +impl From for ColumnSchema { + fn from(column_schema: common_pb::ColumnSchema) -> Self { + Self { + id: column_schema.id, + name: column_schema.name, + data_type: DatumKind::from(column_schema.data_type), + is_nullable: column_schema.is_nullable, + is_tag: column_schema.is_tag, + comment: column_schema.comment, + } + } +} + +impl TryFrom<&Field> for ColumnSchema { + type Error = Error; + + fn try_from(field: &Field) -> Result { + let meta_data = field.metadata().as_ref(); + let ArrowFieldMeta { + id, + is_tag, + comment, + } = if let Some(meta_data) = meta_data { + decode_arrow_field_meta_data(meta_data)? 
+ } else { + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the field meta lost duration plan + // execution.) + ArrowFieldMeta::default() + }; + Ok(Self { + id, + name: field.name().clone(), + data_type: DatumKind::from_data_type(field.data_type()).context( + UnsupportedDataType { + data_type: field.data_type().clone(), + }, + )?, + is_nullable: field.is_nullable(), + is_tag, + comment, + }) + } +} + +impl From<&ColumnSchema> for Field { + fn from(col_schema: &ColumnSchema) -> Self { + let metadata = encode_arrow_field_meta_data(col_schema); + let mut field = Field::new( + &col_schema.name, + col_schema.data_type.into(), + col_schema.is_nullable, + ); + field.set_metadata(Some(metadata)); + + field + } +} + +fn parse_arrow_field_meta_value( + meta: &BTreeMap, + key: ArrowFieldMetaKey, +) -> Result +where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, +{ + let raw_value = meta + .get(key.as_str()) + .context(ArrowFieldMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowFieldMetaValue { key, raw_value }) +} + +fn decode_arrow_field_meta_data(meta: &BTreeMap) -> Result { + Ok(ArrowFieldMeta { + id: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Id)?, + is_tag: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::IsTag)?, + comment: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Comment)?, + }) +} + +fn encode_arrow_field_meta_data(col_schema: &ColumnSchema) -> BTreeMap { + let mut meta = BTreeMap::new(); + + meta.insert(ArrowFieldMetaKey::Id.to_string(), col_schema.id.to_string()); + meta.insert( + ArrowFieldMetaKey::IsTag.to_string(), + col_schema.is_tag.to_string(), + ); + meta.insert( + ArrowFieldMetaKey::Comment.to_string(), + col_schema.comment.clone(), + ); + + meta +} 
+ +/// ColumnSchema builder +#[must_use] +pub struct Builder { + id: ColumnId, + name: String, + data_type: DatumKind, + is_nullable: bool, + is_tag: bool, + comment: String, +} + +impl Builder { + /// Create a new builder + pub fn new(name: String, data_type: DatumKind) -> Self { + Self { + id: COLUMN_ID_UNINIT, + name, + data_type, + is_nullable: false, + is_tag: false, + comment: String::new(), + } + } + + pub fn id(mut self, id: ColumnId) -> Self { + self.id = id; + self + } + + /// Set this column is nullable, default is true (not nullable). + pub fn is_nullable(mut self, is_nullable: bool) -> Self { + self.is_nullable = is_nullable; + self + } + + /// Set this column is tag, default is false (not a tag). + pub fn is_tag(mut self, is_tag: bool) -> Self { + self.is_tag = is_tag; + self + } + + pub fn comment(mut self, comment: String) -> Self { + self.comment = comment; + self + } + + pub fn validate(&self) -> Result<()> { + if self.is_tag { + ensure!( + ColumnSchema::is_valid_tag_type(self.data_type), + InvalidTagType { + data_type: self.data_type + } + ); + } + + Ok(()) + } + + pub fn build(self) -> Result { + self.validate()?; + + Ok(ColumnSchema { + id: self.id, + name: self.name, + data_type: self.data_type, + is_nullable: self.is_nullable, + is_tag: self.is_tag, + comment: self.comment, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Create a column schema for test, each field is filled with non-default + /// value + fn new_test_column_schema() -> ColumnSchema { + Builder::new("test_column_schema".to_string(), DatumKind::Boolean) + .id(18) + .is_nullable(true) + .is_tag(true) + .comment("Comment of this column".to_string()) + .build() + .expect("should succeed to build column schema") + } + + #[test] + fn test_builder() { + let lhs = new_test_column_schema(); + let rhs = ColumnSchema { + id: 18, + name: "test_column_schema".to_string(), + data_type: DatumKind::Boolean, + is_nullable: true, + is_tag: true, + comment: "Comment of this 
column".to_string(), + }; + + assert_eq!(&lhs, &rhs); + } + + #[test] + fn test_pb_convert() { + let column_schema = new_test_column_schema(); + let pb_schema = column_schema.to_pb(); + // Check pb specific fields + assert!(pb_schema.is_tag); + + let schema_from_pb = ColumnSchema::from(pb_schema); + assert_eq!(&schema_from_pb, &column_schema); + } + + #[test] + fn test_valid_tag_type() { + let invalid_tag_types = vec![DatumKind::Null, DatumKind::Float, DatumKind::Double]; + + for v in &DatumKind::VALUES { + assert_eq!( + ColumnSchema::is_valid_tag_type(*v), + !invalid_tag_types.contains(v) + ); + } + } +} diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs new file mode 100644 index 0000000000..4ae6a8124b --- /dev/null +++ b/common_types/src/datum.rs @@ -0,0 +1,887 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum holds different kind of data + +use std::{convert::TryFrom, fmt, str}; + +use arrow_deps::{ + arrow::datatypes::{DataType, TimeUnit}, + datafusion::scalar::ScalarValue, +}; +use chrono::{Local, TimeZone}; +use proto::common::DataType as DataTypePb; +use serde::ser::{Serialize, Serializer}; +use snafu::{Backtrace, ResultExt, Snafu}; +use sqlparser::ast::{DataType as SqlDataType, Value}; + +use crate::{bytes::Bytes, hash::hash64, string::StringBytes, time::Timestamp}; +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported SQL data type, type:{}.\nBacktrace:\n{}", + sql_type, + backtrace + ))] + UnsupportedDataType { + sql_type: SqlDataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid double or float, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidDouble { + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid insert value, kind:{}, value:{:?}.\nBacktrace:\n{}", + kind, + value, + backtrace + ))] + InvalidValueType { + kind: DatumKind, + value: Value, + backtrace: Backtrace, + }, + #[snafu(display("Invalid timestamp, 
err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidTimestamp { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid integer, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid datum byte, byte:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidDatumByte { value: u8, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +// FIXME(yingwen): How to handle timezone? + +/// Data type of datum +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DatumKind { + Null = 0, + Timestamp, + Double, + Float, + Varbinary, + String, + UInt64, + UInt32, + UInt16, + UInt8, + Int64, + Int32, + Int16, + Int8, + // DatumKind::Boolean as usize = 14 + Boolean, +} + +impl DatumKind { + pub const VALUES: [Self; 15] = [ + Self::Null, + Self::Timestamp, + Self::Double, + Self::Float, + Self::Varbinary, + Self::String, + Self::UInt64, + Self::UInt32, + Self::UInt16, + Self::UInt8, + Self::Int64, + Self::Int32, + Self::Int16, + Self::Int8, + Self::Boolean, + ]; + + /// Return true if this is DatumKind::Timestamp + pub fn is_timestamp(&self) -> bool { + matches!(self, DatumKind::Timestamp) + } + + pub fn is_f64_castable(&self) -> bool { + matches!( + self, + Self::Double + | Self::Float + | Self::UInt64 + | Self::UInt32 + | Self::UInt16 + | Self::UInt8 + | Self::Int64 + | Self::Int32 + | Self::Int16 + | Self::Int8 + ) + } + + /// Can column of this datum kind used as key column + pub fn is_key_kind(&self) -> bool { + matches!( + self, + DatumKind::Timestamp + | DatumKind::Varbinary + | DatumKind::String + | DatumKind::UInt64 + | DatumKind::UInt32 + | DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int64 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 + | DatumKind::Boolean + ) + } + + pub fn unsign_kind(&self) -> Option { + match self { + Self::Int64 | Self::UInt64 => Some(Self::UInt64), + Self::Int32 | 
Self::UInt32 => Some(Self::UInt32), + Self::Int16 | Self::UInt16 => Some(Self::UInt16), + Self::Int8 | Self::UInt8 => Some(Self::UInt8), + _ => None, + } + } + + /// Create DatumKind from [arrow_deps::arrow::datatypes::DataType], if the + /// type is not supported, returns None + pub fn from_data_type(data_type: &DataType) -> Option { + match data_type { + DataType::Null => Some(Self::Null), + DataType::Timestamp(TimeUnit::Millisecond, None) => Some(Self::Timestamp), + DataType::Float64 => Some(Self::Double), + DataType::Float32 => Some(Self::Float), + DataType::Binary => Some(Self::Varbinary), + DataType::Utf8 => Some(Self::String), + DataType::UInt64 => Some(Self::UInt64), + DataType::UInt32 => Some(Self::UInt32), + DataType::UInt16 => Some(Self::UInt16), + DataType::UInt8 => Some(Self::UInt8), + DataType::Int64 => Some(Self::Int64), + DataType::Int32 => Some(Self::Int32), + DataType::Int16 => Some(Self::Int16), + DataType::Int8 => Some(Self::Int8), + DataType::Boolean => Some(Self::Boolean), + DataType::Float16 + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::FixedSizeBinary(_) + | DataType::Struct(_) + | DataType::Union(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Interval(_) + | DataType::Duration(_) + | DataType::Dictionary(_, _) + | DataType::Decimal(_, _) + | DataType::Map(_, _) => None, + } + } + + /// Get name of this kind. 
+ fn as_str(&self) -> &str { + match self { + DatumKind::Null => "null", + DatumKind::Timestamp => "timestamp", + DatumKind::Double => "double", + DatumKind::Float => "float", + DatumKind::Varbinary => "varbinary", + DatumKind::String => "string", + DatumKind::UInt64 => "uint64", + DatumKind::UInt32 => "uint32", + DatumKind::UInt16 => "uint16", + DatumKind::UInt8 => "uint8", + DatumKind::Int64 => "bigint", + DatumKind::Int32 => "int", + DatumKind::Int16 => "smallint", + DatumKind::Int8 => "tinyint", + DatumKind::Boolean => "boolean", + } + } + + /// Convert into a byte. + #[inline] + pub fn into_u8(self) -> u8 { + self as u8 + } +} + +impl From for DataType { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => DataType::Null, + DatumKind::Timestamp => DataType::Timestamp(TimeUnit::Millisecond, None), + DatumKind::Double => DataType::Float64, + DatumKind::Float => DataType::Float32, + DatumKind::Varbinary => DataType::Binary, + DatumKind::String => DataType::Utf8, + DatumKind::UInt64 => DataType::UInt64, + DatumKind::UInt32 => DataType::UInt32, + DatumKind::UInt16 => DataType::UInt16, + DatumKind::UInt8 => DataType::UInt8, + DatumKind::Int64 => DataType::Int64, + DatumKind::Int32 => DataType::Int32, + DatumKind::Int16 => DataType::Int16, + DatumKind::Int8 => DataType::Int8, + DatumKind::Boolean => DataType::Boolean, + } + } +} + +impl TryFrom<&SqlDataType> for DatumKind { + type Error = Error; + + fn try_from(sql_type: &SqlDataType) -> Result { + match sql_type { + // TODO(yingwen): Consider timezone + SqlDataType::Timestamp => Ok(Self::Timestamp), + SqlDataType::Real | SqlDataType::Float(_) => Ok(Self::Float), + SqlDataType::Double => Ok(Self::Double), + SqlDataType::Boolean => Ok(Self::Boolean), + SqlDataType::BigInt(_) => Ok(Self::Int64), + SqlDataType::Int(_) => Ok(Self::Int32), + SqlDataType::SmallInt(_) => Ok(Self::Int16), + SqlDataType::String => Ok(Self::String), + SqlDataType::Custom(objects) if objects.0.len() == 1 => { + match 
objects.0[0].value.as_str() { + "UINT64" | "uint64" => Ok(Self::UInt64), + "UINT32" | "uint32" => Ok(Self::UInt32), + "UINT16" | "uint16" => Ok(Self::UInt16), + "UINT8" | "uint8" => Ok(Self::UInt8), + "INT64" | "int64" => Ok(Self::Int64), + "INT32" | "int32" => Ok(Self::Int32), + "INT16" | "int16" => Ok(Self::Int16), + "TINYINT" | "INT8" | "tinyint" | "int8" => Ok(Self::Int8), + "VARBINARY" | "varbinary" => Ok(Self::Varbinary), + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } + + // Unlike datafusion, Decimal is not supported now + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } +} + +impl TryFrom for DatumKind { + type Error = Error; + + fn try_from(v: u8) -> Result { + match v { + v if DatumKind::Null.into_u8() == v => Ok(DatumKind::Null), + v if DatumKind::Timestamp.into_u8() == v => Ok(DatumKind::Timestamp), + v if DatumKind::Double.into_u8() == v => Ok(DatumKind::Double), + v if DatumKind::Float.into_u8() == v => Ok(DatumKind::Float), + v if DatumKind::Varbinary.into_u8() == v => Ok(DatumKind::Varbinary), + v if DatumKind::String.into_u8() == v => Ok(DatumKind::String), + v if DatumKind::UInt64.into_u8() == v => Ok(DatumKind::UInt64), + v if DatumKind::UInt32.into_u8() == v => Ok(DatumKind::UInt32), + v if DatumKind::UInt16.into_u8() == v => Ok(DatumKind::UInt16), + v if DatumKind::UInt8.into_u8() == v => Ok(DatumKind::UInt8), + v if DatumKind::Int64.into_u8() == v => Ok(DatumKind::Int64), + v if DatumKind::Int32.into_u8() == v => Ok(DatumKind::Int32), + v if DatumKind::Int16.into_u8() == v => Ok(DatumKind::Int16), + v if DatumKind::Int8.into_u8() == v => Ok(DatumKind::Int8), + v if DatumKind::Boolean.into_u8() == v => Ok(DatumKind::Boolean), + _ => InvalidDatumByte { value: v }.fail(), + } + } +} + +impl From for DataTypePb { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => Self::NULL, + DatumKind::Timestamp => Self::TIMESTAMP, + DatumKind::Double => Self::DOUBLE, + 
DatumKind::Float => Self::FLOAT, + DatumKind::Varbinary => Self::VARBINARY, + DatumKind::String => Self::STRING, + DatumKind::UInt64 => Self::UINT64, + DatumKind::UInt32 => Self::UINT32, + DatumKind::UInt16 => Self::UINT16, + DatumKind::UInt8 => Self::UINT8, + DatumKind::Int64 => Self::INT64, + DatumKind::Int32 => Self::INT32, + DatumKind::Int16 => Self::INT16, + DatumKind::Int8 => Self::INT8, + DatumKind::Boolean => Self::BOOL, + } + } +} + +impl From for DatumKind { + fn from(data_type: DataTypePb) -> Self { + match data_type { + DataTypePb::NULL => DatumKind::Null, + DataTypePb::TIMESTAMP => DatumKind::Timestamp, + DataTypePb::DOUBLE => DatumKind::Double, + DataTypePb::FLOAT => DatumKind::Float, + DataTypePb::VARBINARY => DatumKind::Varbinary, + DataTypePb::STRING => DatumKind::String, + DataTypePb::UINT64 => DatumKind::UInt64, + DataTypePb::UINT32 => DatumKind::UInt32, + DataTypePb::UINT16 => DatumKind::UInt16, + DataTypePb::UINT8 => DatumKind::UInt8, + DataTypePb::INT64 => DatumKind::Int64, + DataTypePb::INT32 => DatumKind::Int32, + DataTypePb::INT16 => DatumKind::Int16, + DataTypePb::INT8 => DatumKind::Int8, + DataTypePb::BOOL => DatumKind::Boolean, + } + } +} + +impl fmt::Display for DatumKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +// FIXME(yingwen): Validate the length of string and varbinary. 
+/// A data box holds different kind of data +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum Datum { + Null, + /// Millisecond precision + /// + /// Map to arrow::datatypes::DataType::Timestamp(TimeUnit::Millisecond, + /// None) + Timestamp(Timestamp), + /// Map to arrow::datatypes::DataType::Float64 + Double(f64), + /// Map to arrow::datatypes::DataType::Float32 + Float(f32), + /// Map to arrow::datatypes::DateType::Binary + /// + /// No more than 2G (size of i32) + Varbinary(Bytes), + /// Map to arrow::datatypes::DataType::String + /// + /// No more than 2G (size of i32) + String(StringBytes), + /// Map to arrow::datatypes::DataType::UInt64 + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl Datum { + /// Creates an empty datum by given datum kind + pub fn empty(kind: &DatumKind) -> Self { + match kind { + DatumKind::Null => Self::Null, + DatumKind::Timestamp => Self::Timestamp(Timestamp::new(0)), + DatumKind::Double => Self::Double(0.0), + DatumKind::Float => Self::Float(0.0), + DatumKind::Varbinary => Self::Varbinary(Bytes::new()), + DatumKind::String => Self::String(StringBytes::new()), + DatumKind::UInt64 => Self::UInt64(0), + DatumKind::UInt32 => Self::UInt32(0), + DatumKind::UInt16 => Self::UInt16(0), + DatumKind::UInt8 => Self::UInt8(0), + DatumKind::Int64 => Self::Int64(0), + DatumKind::Int32 => Self::Int32(0), + DatumKind::Int16 => Self::Int16(0), + DatumKind::Int8 => Self::Int8(0), + DatumKind::Boolean => Self::Boolean(false), + } + } + + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + Datum::Null => DatumKind::Null, + Datum::Timestamp(_) => DatumKind::Timestamp, + Datum::Double(_) => DatumKind::Double, + Datum::Float(_) => DatumKind::Float, + Datum::Varbinary(_) => DatumKind::Varbinary, + Datum::String(_) => DatumKind::String, + Datum::UInt64(_) => DatumKind::UInt64, + Datum::UInt32(_) => DatumKind::UInt32, + 
Datum::UInt16(_) => DatumKind::UInt16, + Datum::UInt8(_) => DatumKind::UInt8, + Datum::Int64(_) => DatumKind::Int64, + Datum::Int32(_) => DatumKind::Int32, + Datum::Int16(_) => DatumKind::Int16, + Datum::Int8(_) => DatumKind::Int8, + Datum::Boolean(_) => DatumKind::Boolean, + } + } + + // TODO: handle error + pub fn convert_to_uint64(&self) -> u64 { + match self { + Datum::Null => 0, + Datum::Timestamp(v) => v.as_i64() as u64, + Datum::Double(v) => *v as u64, + Datum::Float(v) => *v as u64, + Datum::Varbinary(v) => hash64(v), + Datum::String(v) => hash64(v.as_bytes()), + Datum::UInt64(v) => *v, + Datum::UInt32(v) => *v as u64, + Datum::UInt16(v) => *v as u64, + Datum::UInt8(v) => *v as u64, + Datum::Int64(v) => *v as u64, + Datum::Int32(v) => *v as u64, + Datum::Int16(v) => *v as u64, + Datum::Int8(v) => *v as u64, + Datum::Boolean(v) => *v as u64, + } + } + + pub fn is_null(&self) -> bool { + matches!(self, Datum::Null) + } + + /// Cast datum to timestamp. + pub fn as_timestamp(&self) -> Option { + match self { + Datum::Timestamp(v) => Some(*v), + _ => None, + } + } + + /// Cast datum to &str. + pub fn as_str(&self) -> Option<&str> { + match self { + Datum::String(v) => Some(v), + _ => None, + } + } + + /// Cast datum to uint64. + pub fn as_u64(&self) -> Option { + match self { + Datum::UInt64(v) => Some(*v), + Datum::UInt32(v) => Some(*v as u64), + Datum::UInt16(v) => Some(*v as u64), + Datum::UInt8(v) => Some(*v as u64), + Datum::Int64(v) => Some(*v as u64), + Datum::Int32(v) => Some(*v as u64), + Datum::Int16(v) => Some(*v as u64), + Datum::Int8(v) => Some(*v as u64), + Datum::Boolean(v) => Some(*v as u64), + _ => None, + } + } + + /// Cast datum to Bytes. 
+ pub fn as_varbinary(&self) -> Option<&Bytes> { + match self { + Datum::Varbinary(v) => Some(v), + _ => None, + } + } + + pub fn as_f32(&self) -> Option { + match self { + Datum::Float(v) => Some(*v), + _ => None, + } + } + + pub fn as_f64(&self) -> Option { + match self { + Datum::Double(v) => Some(*v), + Datum::Float(v) => Some(*v as f64), + Datum::UInt64(v) => Some(*v as f64), + Datum::UInt32(v) => Some(*v as f64), + Datum::UInt16(v) => Some(*v as f64), + Datum::UInt8(v) => Some(*v as f64), + Datum::Int64(v) => Some(*v as f64), + Datum::Int32(v) => Some(*v as f64), + Datum::Int16(v) => Some(*v as f64), + Datum::Int8(v) => Some(*v as f64), + Datum::Boolean(_) + | Datum::Null + | Datum::Timestamp(_) + | Datum::Varbinary(_) + | Datum::String(_) => None, + } + } + + pub fn display_string(&self) -> String { + match self { + Datum::Null => "null".to_string(), + Datum::Timestamp(v) => Local.timestamp_millis(v.as_i64()).to_rfc3339(), + Datum::Double(v) => v.to_string(), + Datum::Float(v) => v.to_string(), + Datum::Varbinary(v) => format!("{:?}", v), + Datum::String(v) => v.to_string(), + Datum::UInt64(v) => v.to_string(), + Datum::UInt32(v) => v.to_string(), + Datum::UInt16(v) => v.to_string(), + Datum::UInt8(v) => v.to_string(), + Datum::Int64(v) => v.to_string(), + Datum::Int32(v) => v.to_string(), + Datum::Int16(v) => v.to_string(), + Datum::Int8(v) => v.to_string(), + Datum::Boolean(v) => v.to_string(), + } + } + + pub fn try_from_sql_value(kind: &DatumKind, value: Value) -> Result { + match (kind, value) { + (DatumKind::Null, Value::Null) => Ok(Datum::Null), + (DatumKind::Timestamp, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidTimestamp)?; + Ok(Datum::Timestamp(Timestamp::new(n))) + } + (DatumKind::Double, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Double(n)) + } + (DatumKind::Float, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Float(n)) + } + // 
TODO(yingwen): Support hex string. + (DatumKind::Varbinary, Value::SingleQuotedString(s)) => { + Ok(Datum::Varbinary(Bytes::from(s))) + } + (DatumKind::String, Value::SingleQuotedString(s)) => { + Ok(Datum::String(StringBytes::from(s))) + } + (DatumKind::UInt64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt64(n)) + } + (DatumKind::UInt32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt32(n)) + } + (DatumKind::UInt16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt16(n)) + } + (DatumKind::UInt8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt8(n)) + } + (DatumKind::Int64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int64(n)) + } + (DatumKind::Int32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int32(n)) + } + (DatumKind::Int16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int16(n)) + } + (DatumKind::Int8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int8(n)) + } + (DatumKind::Boolean, Value::Boolean(b)) => Ok(Datum::Boolean(b)), + (_, value) => InvalidValueType { kind: *kind, value }.fail(), + } + } + + pub fn as_scalar_value(&self) -> Option { + match self { + Datum::Null => None, + Datum::Timestamp(v) => { + Some(ScalarValue::TimestampMillisecond(Some((*v).as_i64()), None)) + } + Datum::Double(v) => Some(ScalarValue::Float64(Some(*v))), + Datum::Float(v) => Some(ScalarValue::Float32(Some(*v))), + Datum::Varbinary(v) => Some(ScalarValue::Binary(Some(v.to_vec()))), + Datum::String(v) => Some(ScalarValue::Utf8(Some(v.to_string()))), + Datum::UInt64(v) => Some(ScalarValue::UInt64(Some(*v))), + Datum::UInt32(v) => Some(ScalarValue::UInt32(Some(*v))), + Datum::UInt16(v) => Some(ScalarValue::UInt16(Some(*v))), + Datum::UInt8(v) => 
Some(ScalarValue::UInt8(Some(*v))), + Datum::Int64(v) => Some(ScalarValue::Int64(Some(*v))), + Datum::Int32(v) => Some(ScalarValue::Int32(Some(*v))), + Datum::Int16(v) => Some(ScalarValue::Int16(Some(*v))), + Datum::Int8(v) => Some(ScalarValue::Int8(Some(*v))), + Datum::Boolean(v) => Some(ScalarValue::Boolean(Some(*v))), + } + } + + #[cfg(test)] + pub fn as_view(&self) -> DatumView { + match self { + Datum::Null => DatumView::Null, + Datum::Timestamp(v) => DatumView::Timestamp(*v), + Datum::Double(v) => DatumView::Double(*v), + Datum::Float(v) => DatumView::Float(*v), + Datum::Varbinary(v) => DatumView::Varbinary(v), + Datum::String(v) => DatumView::String(v), + Datum::UInt64(v) => DatumView::UInt64(*v), + Datum::UInt32(v) => DatumView::UInt32(*v), + Datum::UInt16(v) => DatumView::UInt16(*v), + Datum::UInt8(v) => DatumView::UInt8(*v), + Datum::Int64(v) => DatumView::Int64(*v), + Datum::Int32(v) => DatumView::Int32(*v), + Datum::Int16(v) => DatumView::Int16(*v), + Datum::Int8(v) => DatumView::Int8(*v), + Datum::Boolean(v) => DatumView::Boolean(*v), + } + } +} + +macro_rules! 
impl_from { + ($Kind: ident, $FromType: ident) => { + impl From<$FromType> for Datum { + fn from(value: $FromType) -> Self { + Self::$Kind(value) + } + } + + impl From> for Datum { + fn from(value_opt: Option<$FromType>) -> Self { + match value_opt { + Some(value) => Self::$Kind(value), + None => Self::Null, + } + } + } + }; +} + +impl_from!(Timestamp, Timestamp); +impl_from!(Double, f64); +impl_from!(Float, f32); +impl_from!(Varbinary, Bytes); +impl_from!(String, StringBytes); +impl_from!(UInt64, u64); +impl_from!(UInt32, u32); +impl_from!(UInt16, u16); +impl_from!(UInt8, u8); +impl_from!(Int64, i64); +impl_from!(Int32, i32); +impl_from!(Int16, i16); +impl_from!(Int8, i8); +impl_from!(Boolean, bool); + +impl From<&str> for Datum { + fn from(value: &str) -> Datum { + Datum::String(StringBytes::copy_from_str(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&str>) -> Datum { + match value_opt { + Some(value) => Datum::String(StringBytes::copy_from_str(value)), + None => Datum::Null, + } + } +} + +impl From<&[u8]> for Datum { + fn from(value: &[u8]) -> Datum { + Datum::Varbinary(Bytes::copy_from_slice(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&[u8]>) -> Datum { + match value_opt { + Some(value) => Datum::Varbinary(Bytes::copy_from_slice(value)), + None => Datum::Null, + } + } +} + +/// impl serde serialize for Datum +impl Serialize for Datum { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + match self { + Datum::Null => serializer.serialize_none(), + Datum::Timestamp(v) => serializer.serialize_i64(v.as_i64()), + Datum::Double(v) => serializer.serialize_f64(*v), + Datum::Float(v) => serializer.serialize_f32(*v), + Datum::Varbinary(v) => serializer.serialize_bytes(v), + Datum::String(v) => serializer.serialize_str(v), + Datum::UInt64(v) => serializer.serialize_u64(*v), + Datum::UInt32(v) => serializer.serialize_u32(*v), + Datum::UInt16(v) => serializer.serialize_u16(*v), + 
Datum::UInt8(v) => serializer.serialize_u8(*v), + Datum::Int64(v) => serializer.serialize_i64(*v), + Datum::Int32(v) => serializer.serialize_i32(*v), + Datum::Int16(v) => serializer.serialize_i16(*v), + Datum::Int8(v) => serializer.serialize_i8(*v), + Datum::Boolean(v) => serializer.serialize_bool(*v), + } + } +} + +/// A view to a datum. +/// +/// Holds copy of integer like datum and reference of string like datum. +#[derive(Debug, PartialEq, PartialOrd)] +pub enum DatumView<'a> { + Null, + Timestamp(Timestamp), + Double(f64), + Float(f32), + Varbinary(&'a [u8]), + String(&'a str), + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl<'a> DatumView<'a> { + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + DatumView::Null => DatumKind::Null, + DatumView::Timestamp(_) => DatumKind::Timestamp, + DatumView::Double(_) => DatumKind::Double, + DatumView::Float(_) => DatumKind::Float, + DatumView::Varbinary(_) => DatumKind::Varbinary, + DatumView::String(_) => DatumKind::String, + DatumView::UInt64(_) => DatumKind::UInt64, + DatumView::UInt32(_) => DatumKind::UInt32, + DatumView::UInt16(_) => DatumKind::UInt16, + DatumView::UInt8(_) => DatumKind::UInt8, + DatumView::Int64(_) => DatumKind::Int64, + DatumView::Int32(_) => DatumKind::Int32, + DatumView::Int16(_) => DatumKind::Int16, + DatumView::Int8(_) => DatumKind::Int8, + DatumView::Boolean(_) => DatumKind::Boolean, + } + } + + pub fn from_scalar_value(val: &'a ScalarValue) -> Option { + match val { + ScalarValue::Boolean(v) => v.map(DatumView::Boolean), + ScalarValue::Float32(v) => v.map(DatumView::Float), + ScalarValue::Float64(v) => v.map(DatumView::Double), + ScalarValue::Int8(v) => v.map(DatumView::Int8), + ScalarValue::Int16(v) => v.map(DatumView::Int16), + ScalarValue::Int32(v) => v.map(DatumView::Int32), + ScalarValue::Int64(v) => v.map(DatumView::Int64), + ScalarValue::UInt8(v) => 
v.map(DatumView::UInt8), + ScalarValue::UInt16(v) => v.map(DatumView::UInt16), + ScalarValue::UInt32(v) => v.map(DatumView::UInt32), + ScalarValue::UInt64(v) => v.map(DatumView::UInt64), + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + v.as_ref().map(|v| DatumView::String(v.as_str())) + } + ScalarValue::Binary(v) | ScalarValue::LargeBinary(v) => { + v.as_ref().map(|v| DatumView::Varbinary(v.as_slice())) + } + ScalarValue::TimestampMillisecond(v, _) => { + v.map(|v| DatumView::Timestamp(Timestamp::new(v))) + } + ScalarValue::List(_, _) + | ScalarValue::Date32(_) + | ScalarValue::Date64(_) + | ScalarValue::TimestampSecond(_, _) + | ScalarValue::TimestampMicrosecond(_, _) + | ScalarValue::TimestampNanosecond(_, _) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Decimal128(_, _, _) => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_key_kind() { + assert!(!DatumKind::Null.is_key_kind()); + assert!(DatumKind::Timestamp.is_key_kind()); + assert!(!DatumKind::Double.is_key_kind()); + assert!(!DatumKind::Float.is_key_kind()); + assert!(DatumKind::Varbinary.is_key_kind()); + assert!(DatumKind::String.is_key_kind()); + assert!(DatumKind::UInt64.is_key_kind()); + assert!(DatumKind::UInt32.is_key_kind()); + assert!(DatumKind::UInt16.is_key_kind()); + assert!(DatumKind::UInt8.is_key_kind()); + assert!(DatumKind::Int64.is_key_kind()); + assert!(DatumKind::Int32.is_key_kind()); + assert!(DatumKind::Int16.is_key_kind()); + assert!(DatumKind::Int8.is_key_kind()); + assert!(DatumKind::Boolean.is_key_kind()); + } + + #[test] + fn test_unsign_kind() { + assert_eq!(DatumKind::UInt64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::Int64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::UInt32.unsign_kind(), Some(DatumKind::UInt32)); + assert_eq!(DatumKind::Int32.unsign_kind(), Some(DatumKind::UInt32)); + 
assert_eq!(DatumKind::UInt16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::Int16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::UInt8.unsign_kind(), Some(DatumKind::UInt8)); + assert_eq!(DatumKind::Int8.unsign_kind(), Some(DatumKind::UInt8)); + + assert!(DatumKind::Null.unsign_kind().is_none()); + assert!(DatumKind::Timestamp.unsign_kind().is_none()); + assert!(DatumKind::String.unsign_kind().is_none()); + assert!(DatumKind::Boolean.unsign_kind().is_none()); + assert!(DatumKind::Varbinary.unsign_kind().is_none()); + assert!(DatumKind::Double.unsign_kind().is_none()); + assert!(DatumKind::Float.unsign_kind().is_none()); + } + + #[test] + fn test_into_u8() { + assert_eq!(0, DatumKind::Null.into_u8()); + assert_eq!(1, DatumKind::Timestamp.into_u8()); + assert_eq!(2, DatumKind::Double.into_u8()); + assert_eq!(3, DatumKind::Float.into_u8()); + assert_eq!(4, DatumKind::Varbinary.into_u8()); + assert_eq!(5, DatumKind::String.into_u8()); + assert_eq!(6, DatumKind::UInt64.into_u8()); + assert_eq!(7, DatumKind::UInt32.into_u8()); + assert_eq!(8, DatumKind::UInt16.into_u8()); + assert_eq!(9, DatumKind::UInt8.into_u8()); + assert_eq!(10, DatumKind::Int64.into_u8()); + assert_eq!(11, DatumKind::Int32.into_u8()); + assert_eq!(12, DatumKind::Int16.into_u8()); + assert_eq!(13, DatumKind::Int8.into_u8()); + assert_eq!(14, DatumKind::Boolean.into_u8()); + } +} diff --git a/common_types/src/hash.rs b/common_types/src/hash.rs new file mode 100644 index 0000000000..9edc8c69cb --- /dev/null +++ b/common_types/src/hash.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// custom hash mod +use byteorder::{ByteOrder, LittleEndian}; +use murmur3::murmur3_x64_128; + +pub fn hash64(mut bytes: &[u8]) -> u64 { + let mut out = [0; 16]; + murmur3_x64_128(&mut bytes, 0, &mut out); + // in most cases we run on little endian target + LittleEndian::read_u64(&out[0..8]) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn empty_hash_test() { + let res1 = hash64(&[]); + let res2 = hash64(&[]); + assert_eq!(res1, res2); + } + + #[test] + fn hash_test() { + let test_bytes_1 = b"cse_engine_hash_mod_test_bytes1".to_vec(); + let test_bytes_2 = b"cse_engine_hash_mod_test_bytes2".to_vec(); + { + // hash64 testing + let res1 = hash64(&test_bytes_1); + let res1_1 = hash64(&test_bytes_1); + assert_eq!(res1, res1_1); + + let res2 = hash64(&test_bytes_2); + assert_ne!(res1, res2); + } + } +} diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs new file mode 100644 index 0000000000..3da29b0a52 --- /dev/null +++ b/common_types/src/lib.rs @@ -0,0 +1,24 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains common types + +pub mod bytes; +pub mod column; +pub mod column_schema; +pub mod datum; +pub mod hash; +pub mod projected_schema; +pub mod record_batch; +pub mod request_id; +pub mod row; +pub mod schema; +pub mod string; +pub mod time; + +/// Sequence number +pub type SequenceNumber = u64; +pub const MAX_SEQUENCE_NUMBER: u64 = u64::MAX; +pub const MIN_SEQUENCE_NUMBER: u64 = 0; + +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/common_types/src/projected_schema.rs b/common_types/src/projected_schema.rs new file mode 100644 index 0000000000..8fa17f2848 --- /dev/null +++ b/common_types/src/projected_schema.rs @@ -0,0 +1,292 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Projected schema + +use std::{fmt, sync::Arc}; + +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + column_schema::{ColumnSchema, ReadOp}, + datum::Datum, + row::Row, + schema::{ArrowSchemaRef, RecordSchema, RecordSchemaWithKey, Schema}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid projection index, index:{}.\nBacktrace:\n{}", + index, + backtrace + ))] + InvalidProjectionIndex { index: usize, backtrace: Backtrace }, + + #[snafu(display("Incompatible column schema for read, err:{}", source))] + IncompatReadColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Failed to build projected schema, err:{}", source))] + BuildProjectedSchema { source: crate::schema::Error }, + + #[snafu(display( + "Missing not null column for read, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + MissingReadColumn { name: String, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RowProjector { + schema_with_key: RecordSchemaWithKey, + source_schema: Schema, + /// The Vec stores the column index in source, and `None` means this column + /// is not in source but required by reader, and need to filled by null. + /// The length of Vec is the same as the number of columns reader intended + /// to read. + source_projection: Vec>, +} + +impl RowProjector { + /// The projected indexes of existed columns in the source schema. + pub fn existed_source_projection(&self) -> Vec { + self.source_projection + .iter() + .filter_map(|index| *index) + .collect() + } + + /// The projected indexes of all columns(existed and not exist) in the + /// source schema. + pub fn source_projection(&self) -> &[Option] { + &self.source_projection + } + + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + /// Project the row. + /// + /// REQUIRE: The schema of row is the same as source schema. 
+ pub fn project_row(&self, row: &Row, mut datums_buffer: Vec) -> Row { + assert_eq!(self.source_schema.num_columns(), row.num_columns()); + + datums_buffer.reserve(self.schema_with_key.num_columns()); + + for p in &self.source_projection { + let datum = match p { + Some(index_in_source) => row[*index_in_source].clone(), + None => Datum::Null, + }; + + datums_buffer.push(datum); + } + + Row::from_datums(datums_buffer) + } +} + +#[derive(Clone)] +pub struct ProjectedSchema(Arc); + +impl fmt::Debug for ProjectedSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ProjectedSchema") + .field("original_schema", &self.0.original_schema) + .field("projection", &self.0.projection) + .finish() + } +} + +impl ProjectedSchema { + pub fn no_projection(schema: Schema) -> Self { + let inner = ProjectedSchemaInner::no_projection(schema); + Self(Arc::new(inner)) + } + + pub fn new(schema: Schema, projection: Option>) -> Result { + let inner = ProjectedSchemaInner::new(schema, projection)?; + Ok(Self(Arc::new(inner))) + } + + pub fn is_all_projection(&self) -> bool { + self.0.is_all_projection() + } + + /// Returns the [RowProjector] to project the rows with source schema to + /// rows with [RecordSchemaWithKey]. + /// + /// REQUIRE: The key columns are the same as this schema. + #[inline] + pub fn try_project_with_key(&self, source_schema: &Schema) -> Result { + self.0.try_project_with_key(source_schema) + } + + // Returns the record schema after projection with key. + pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + self.0.schema_with_key.clone() + } + + pub(crate) fn as_record_schema_with_key(&self) -> &RecordSchemaWithKey { + &self.0.schema_with_key + } + + // Returns the record schema after projection. + pub fn to_record_schema(&self) -> RecordSchema { + self.0.record_schema.clone() + } + + /// Returns the arrow schema after projection. 
+ pub fn to_projected_arrow_schema(&self) -> ArrowSchemaRef { + self.0.record_schema.to_arrow_schema_ref() + } +} + +/// Schema with projection informations +struct ProjectedSchemaInner { + /// The schema before projection that the reader intended to read, may + /// differ from current schema of the table. + original_schema: Schema, + /// Index of the projected columns in `self.schema`, `None` if + /// all columns are needed. + projection: Option>, + + /// The record schema from `self.schema` with key columns after projection. + schema_with_key: RecordSchemaWithKey, + /// The record schema from `self.schema` after projection. + record_schema: RecordSchema, +} + +impl ProjectedSchemaInner { + fn no_projection(schema: Schema) -> Self { + let schema_with_key = schema.to_record_schema_with_key(); + let record_schema = schema.to_record_schema(); + + Self { + original_schema: schema, + projection: None, + schema_with_key, + record_schema, + } + } + + fn new(schema: Schema, projection: Option>) -> Result { + if let Some(p) = &projection { + // Projection is provided, validate the projection is valid. This is necessary + // to avoid panic when creating RecordSchema and + // RecordSchemaWithKey. + if let Some(max_idx) = p.iter().max() { + ensure!( + *max_idx < schema.num_columns(), + InvalidProjectionIndex { index: *max_idx } + ); + } + + let schema_with_key = schema.project_record_schema_with_key(p); + let record_schema = schema.project_record_schema(p); + + Ok(Self { + original_schema: schema, + projection, + schema_with_key, + record_schema, + }) + } else { + Ok(Self::no_projection(schema)) + } + } + + /// Selecting all the columns is the all projection. + fn is_all_projection(&self) -> bool { + self.projection.is_none() + } + + // TODO(yingwen): We can fill missing not null column with default value instead + // of returning error. 
+ fn try_project_with_key(&self, source_schema: &Schema) -> Result { + debug_assert_eq!( + self.schema_with_key.key_columns(), + source_schema.key_columns() + ); + // We consider the two schema is equal if they have same version. + if self.original_schema.version() == source_schema.version() { + debug_assert_eq!(self.original_schema, *source_schema); + } + + let mut source_projection = Vec::with_capacity(self.schema_with_key.num_columns()); + // For each column in `schema_with_key` + for column_schema in self.schema_with_key.columns() { + self.try_project_column(column_schema, source_schema, &mut source_projection)?; + } + + Ok(RowProjector { + schema_with_key: self.schema_with_key.clone(), + source_schema: source_schema.clone(), + source_projection, + }) + } + + fn try_project_column( + &self, + column: &ColumnSchema, + source_schema: &Schema, + source_projection: &mut Vec>, + ) -> Result<()> { + match source_schema.index_of(&column.name) { + Some(source_idx) => { + // Column is in source + if self.original_schema.version() == source_schema.version() { + // Same version, just use that column in source + source_projection.push(Some(source_idx)); + } else { + // Different version, need to check column schema + let source_column = source_schema.column(source_idx); + // TODO(yingwen): Data type is not checked here because we do not support alter + // data type now. + match column + .compatible_for_read(source_column) + .context(IncompatReadColumn)? 
+ { + ReadOp::Exact => { + source_projection.push(Some(source_idx)); + } + ReadOp::FillNull => { + source_projection.push(None); + } + } + } + } + None => { + // Column is not in source + ensure!(column.is_nullable, MissingReadColumn { name: &column.name }); + // Column is nullable, fill this column by null + source_projection.push(None); + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{projected_schema::ProjectedSchema, tests::build_schema}; + + #[test] + fn test_projected_schema() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + assert_eq!( + projected_schema.0.schema_with_key.num_columns(), + schema.num_columns() - 1 + ); + assert!(!projected_schema.is_all_projection()); + } +} diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs new file mode 100644 index 0000000000..1b7ca99d98 --- /dev/null +++ b/common_types/src/record_batch.rs @@ -0,0 +1,695 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Record batch + +use std::{cmp, convert::TryFrom, mem}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef as ArrowSchemaRef, record_batch::RecordBatch as ArrowRecordBatch, + }, + util, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::DatumKind, + projected_schema::{ProjectedSchema, RowProjector}, + row::{ + contiguous::{ContiguousRow, ProjectedContiguousRow}, + Row, RowViewOnBatch, + }, + schema::{RecordSchema, RecordSchemaWithKey}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid schema len to build RecordBatch.\nBacktrace:\n{}", backtrace))] + SchemaLen { backtrace: Backtrace }, + + #[snafu(display("Failed to create column block, err:{}", source))] + CreateColumnBlock { source: crate::column::Error }, + + #[snafu(display( + "Failed to create arrow record batch, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + CreateArrow { + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to iterate datum, err:{}", source))] + IterateDatum { source: crate::row::Error }, + + #[snafu(display("Failed to append datum, err:{}", source))] + AppendDatum { source: crate::column::Error }, + + #[snafu(display( + "Column not in schema with key, column_name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNotInSchemaWithKey { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to convert arrow schema, err:{}", source))] + ConvertArrowSchema { source: crate::schema::Error }, + + #[snafu(display("Mismatch record schema to build RecordBatch, column_name:{}, schema_type:{:?}, column_type:{:?}.\nBacktrace:\n{}", column_name, schema_type, column_type, backtrace))] + MismatchRecordSchema { + column_name: String, + schema_type: DatumKind, + column_type: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Projection is out of the index, source_projection:{:?}, 
arrow_schema:{}.\nBacktrace:\n{}", + source_projection, + arrow_schema, + backtrace + ))] + OutOfIndexProjection { + source_projection: Vec>, + arrow_schema: ArrowSchemaRef, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to reverse record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + ReverseRecordBatchData { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to select record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + SelectRecordBatchData { + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RecordBatchData { + arrow_record_batch: ArrowRecordBatch, + column_blocks: Vec, +} + +impl RecordBatchData { + fn new(arrow_schema: ArrowSchemaRef, column_blocks: Vec) -> Result { + let arrays = column_blocks + .iter() + .map(|column| column.to_arrow_array_ref()) + .collect(); + + let arrow_record_batch = + ArrowRecordBatch::try_new(arrow_schema, arrays).context(CreateArrow)?; + + Ok(RecordBatchData { + arrow_record_batch, + column_blocks, + }) + } + + fn num_rows(&self) -> usize { + self.column_blocks + .first() + .map(|column| column.num_rows()) + .unwrap_or(0) + } + + fn take_column_block(&mut self, index: usize) -> ColumnBlock { + let num_rows = self.num_rows(); + mem::replace( + &mut self.column_blocks[index], + ColumnBlock::new_null(num_rows), + ) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. 
+ fn slice(&self, offset: usize, length: usize) -> Self { + let column_blocks = self + .column_blocks + .iter() + .map(|col| col.slice(offset, length)) + .collect(); + + Self { + arrow_record_batch: self.arrow_record_batch.slice(offset, length), + column_blocks, + } + } +} + +fn build_column_blocks_from_arrow_record_batch( + arrow_record_batch: &ArrowRecordBatch, + record_schema: &RecordSchema, +) -> Result> { + let mut column_blocks = Vec::with_capacity(arrow_record_batch.num_columns()); + for (column_schema, array) in record_schema + .columns() + .iter() + .zip(arrow_record_batch.columns()) + { + let column = ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + column_blocks.push(column); + } + + Ok(column_blocks) +} + +impl TryFrom for RecordBatchData { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + Ok(Self { + arrow_record_batch, + column_blocks, + }) + } +} + +// TODO(yingwen): The schema in RecordBatch should be much simple because it may +// lack some information. +#[derive(Debug)] +pub struct RecordBatch { + schema: RecordSchema, + data: RecordBatchData, +} + +impl RecordBatch { + pub fn new_empty(schema: RecordSchema) -> Self { + let arrow_schema = schema.to_arrow_schema_ref(); + let arrow_record_batch = ArrowRecordBatch::new_empty(arrow_schema); + + Self { + schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks: Vec::new(), + }, + } + } + + pub fn new(schema: RecordSchema, column_blocks: Vec) -> Result { + ensure!(schema.num_columns() == column_blocks.len(), SchemaLen); + + // Validate schema and column_blocks. 
+ for (column_schema, column_block) in schema.columns().iter().zip(column_blocks.iter()) { + ensure!( + column_schema.data_type == column_block.datum_kind(), + MismatchRecordSchema { + column_name: &column_schema.name, + schema_type: column_schema.data_type, + column_type: column_block.datum_kind(), + } + ); + } + + let arrow_schema = schema.to_arrow_schema_ref(); + let data = RecordBatchData::new(arrow_schema, column_blocks)?; + + Ok(Self { schema, data }) + } + + pub fn schema(&self) -> &RecordSchema { + &self.schema + } + + // REQUIRE: index is valid + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + pub fn num_columns(&self) -> usize { + self.schema.num_columns() + } + + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn into_arrow_record_batch(self) -> ArrowRecordBatch { + self.data.arrow_record_batch + } +} + +impl TryFrom for RecordBatch { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + + Ok(Self { + schema: record_schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks, + }, + }) + } +} + +#[derive(Debug)] +pub struct RecordBatchWithKey { + schema_with_key: RecordSchemaWithKey, + data: RecordBatchData, +} + +impl RecordBatchWithKey { + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn num_columns(&self) -> usize { + self.data.arrow_record_batch.num_columns() + } + + pub fn columns(&self) -> &[ColumnBlock] { + &self.data.column_blocks + } + + pub fn clone_row_at(&self, index: usize) -> Row { + let datums = self + .data + .column_blocks + .iter() + .map(|column_block| column_block.datum(index)) + .collect(); + + Row::from_datums(datums) + } + + /// Project the [RecordBatchWithKey] into a [RecordBatch] 
according to + /// [ProjectedSchema]. + /// + /// REQUIRE: The schema_with_key of the [RecordBatchWithKey] is the same as + /// the schema_with_key of [ProjectedSchema]. + pub fn try_project(mut self, projected_schema: &ProjectedSchema) -> Result { + debug_assert_eq!( + &self.schema_with_key, + projected_schema.as_record_schema_with_key() + ); + + // Get the schema after projection. + let record_schema = projected_schema.to_record_schema(); + let mut column_blocks = Vec::with_capacity(record_schema.num_columns()); + + for column_schema in record_schema.columns() { + let column_index = self.schema_with_key.index_of(&column_schema.name).context( + ColumnNotInSchemaWithKey { + name: &column_schema.name, + }, + )?; + + // Take the column block out. + let column_block = self.data.take_column_block(column_index); + column_blocks.push(column_block); + } + + let data = RecordBatchData::new(record_schema.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatch { + schema: record_schema, + data, + }) + } + + pub fn into_record_batch(self) -> RecordBatch { + RecordBatch { + schema: self.schema_with_key.into_record_schema(), + data: self.data, + } + } + + #[inline] + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + #[inline] + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + /// Reverse the rows in the data. + /// + /// The data retains intact if failed. + pub fn reverse_data(&mut self) -> Result<()> { + let reversed_record_batch = util::reverse_record_batch(&self.data.arrow_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + self.data = RecordBatchData::try_from(reversed_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + Ok(()) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. 
+ /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + schema_with_key: self.schema_with_key.clone(), + data: self.data.slice(offset, length), + } + } + + /// Select the rows according to the `selected_rows`. + /// + /// The data retains intact if failed. + pub fn select_data(&mut self, selected_rows: &[bool]) -> Result<()> { + assert_eq!(self.num_rows(), selected_rows.len()); + + let selected_record_batch = + util::select_record_batch(&self.data.arrow_record_batch, selected_rows) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + let selected_data = RecordBatchData::try_from(selected_record_batch) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + + self.data = selected_data; + + Ok(()) + } +} + +pub struct RecordBatchWithKeyBuilder { + schema_with_key: RecordSchemaWithKey, + builders: Vec, +} + +impl RecordBatchWithKeyBuilder { + pub fn new(schema_with_key: RecordSchemaWithKey) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| ColumnBlockBuilder::with_capacity(&column_schema.data_type, 0)) + .collect(); + Self { + schema_with_key, + builders, + } + } + + pub fn with_capacity(schema_with_key: RecordSchemaWithKey, capacity: usize) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| { + ColumnBlockBuilder::with_capacity(&column_schema.data_type, capacity) + }) + .collect(); + Self { + schema_with_key, + builders, + } + } + + /// Append row into builder. + /// + /// REQUIRE: The row and the builder must have the same schema. + pub fn append_row(&mut self, row: Row) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row) { + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append projected contiguous row into builder. 
+ /// + /// REQUIRE: + /// - The schema of `row` is the same as the source schema of the + /// `projector`. + /// - The projected schema (with key) is the same as the schema of the + /// builder. + pub fn append_projected_contiguous_row( + &mut self, + row: &ProjectedContiguousRow, + ) -> Result<()> { + assert_eq!(row.num_datum_views(), self.builders.len()); + + for (index, builder) in self.builders.iter_mut().enumerate() { + let datum_view = row.datum_view_at(index); + builder.append_view(datum_view).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append the row from the [RowView] to the builder. + /// + /// REQUIRE: The `row_view` and the builder must have the same schema. + pub fn append_row_view(&mut self, row_view: &RowViewOnBatch) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row_view.iter_columns()) { + let datum = datum.context(IterateDatum)?; + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append `len` from `start` (inclusive) to this builder. + /// + /// REQUIRE: + /// - The `record_batch` and the builder must have the same schema. + pub fn append_batch_range( + &mut self, + record_batch: &RecordBatchWithKey, + start: usize, + len: usize, + ) -> Result { + let num_rows = record_batch.num_rows(); + if start >= num_rows { + return Ok(0); + } + + let added = cmp::min(num_rows - start, len); + + for (builder, column) in self.builders.iter_mut().zip(record_batch.columns().iter()) { + builder + .append_block_range(column, start, added) + .context(AppendDatum)?; + } + + Ok(added) + } + + /// The number of the appended rows. + pub fn len(&self) -> usize { + self.builders + .first() + .map(|builder| builder.len()) + .unwrap_or(0) + } + + /// Returns true if the builder is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Reset the builders for reuse. 
+ pub fn clear(&mut self) { + for builder in &mut self.builders { + builder.clear(); + } + } + + /// Build [RecordBatchWithKey] and reset the builder. + pub fn build(&mut self) -> Result { + let column_blocks: Vec<_> = self + .builders + .iter_mut() + .map(|builder| builder.build()) + .collect(); + let arrow_schema = self.schema_with_key.to_arrow_schema_ref(); + + Ok(RecordBatchWithKey { + schema_with_key: self.schema_with_key.clone(), + data: RecordBatchData::new(arrow_schema, column_blocks)?, + }) + } +} + +#[derive(Debug)] +pub struct ArrowRecordBatchProjector { + row_projector: RowProjector, +} + +impl From for ArrowRecordBatchProjector { + fn from(row_projector: RowProjector) -> Self { + Self { row_projector } + } +} + +impl ArrowRecordBatchProjector { + /// Project the [arrow::RecordBatch] to [RecordBatchWithKey] and these + /// things is to be done: + /// - Insert the null column if the projected column does not appear in the + /// source schema. + /// - Convert the [arrow::RecordBatch] to [RecordBatchWithKey]. + /// + /// REQUIRE: Schema of the `arrow_record_batch` is the same as the + /// projection of existing column in the source schema. 
+ pub fn project_to_record_batch_with_key( + &self, + arrow_record_batch: ArrowRecordBatch, + ) -> Result { + let schema_with_key = self.row_projector.schema_with_key().clone(); + let source_projection = self.row_projector.source_projection(); + let mut column_blocks = Vec::with_capacity(schema_with_key.num_columns()); + + let num_rows = arrow_record_batch.num_rows(); + // ensure next_arrow_column_idx < num_columns + let mut next_arrow_column_idx = 0; + let num_columns = arrow_record_batch.num_columns(); + + for (source_idx, column_schema) in source_projection.iter().zip(schema_with_key.columns()) { + match source_idx { + Some(_) => { + ensure!( + next_arrow_column_idx < num_columns, + OutOfIndexProjection { + source_projection, + arrow_schema: arrow_record_batch.schema() + } + ); + + let array = arrow_record_batch.column(next_arrow_column_idx); + next_arrow_column_idx += 1; + + let column_block = + ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + + column_blocks.push(column_block); + } + None => { + // Need to push row with specific type. 
+ let null_block = + ColumnBlock::new_null_with_type(&column_schema.data_type, num_rows) + .context(CreateColumnBlock)?; + column_blocks.push(null_block); + } + } + } + + let data = RecordBatchData::new(schema_with_key.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatchWithKey { + schema_with_key, + data, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::RowViewOnBatch, + tests::{ + build_projected_schema, build_record_batch_with_key_by_rows, build_rows, + check_record_batch_with_key_with_rows, + }, + }; + + fn build_record_batch_with_key() -> RecordBatchWithKey { + let rows = build_rows(); + build_record_batch_with_key_by_rows(rows) + } + + fn check_record_batch_with_key( + record_batch_with_key: RecordBatchWithKey, + row_num: usize, + column_num: usize, + ) -> bool { + let rows = build_rows(); + check_record_batch_with_key_with_rows(&record_batch_with_key, row_num, column_num, rows) + } + + #[test] + fn test_append_projected_contiguous_row() { + let record_batch_with_key = build_record_batch_with_key(); + assert_eq!(record_batch_with_key.num_rows(), 5); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 5, 3); + } + + #[test] + fn test_append_row_view() { + let projected_schema = build_projected_schema(); + + let record_batch_with_key = build_record_batch_with_key(); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + projected_schema.to_record_schema_with_key(), + 2, + ); + let view = RowViewOnBatch { + record_batch: &record_batch_with_key, + row_idx: 1, + }; + builder.append_row_view(&view).unwrap(); + let record_batch_with_key = builder.build().unwrap(); + assert_eq!(record_batch_with_key.num_rows(), 1); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 1, 3); + } + + #[test] + fn test_append_batch_range() { + let projected_schema = 
// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

//! Request id.

use std::{
    fmt,
    sync::atomic::{AtomicU64, Ordering},
};

/// Process-wide unique identifier attached to a request.
///
/// Ids are handed out sequentially starting from 1; the counter is shared by
/// the whole process.
#[derive(Debug, Clone, Copy)]
pub struct RequestId(u64);

impl RequestId {
    /// Acquire next request id.
    pub fn next_id() -> Self {
        // Monotonic process-wide counter; relaxed ordering is enough because
        // only uniqueness matters, not ordering with other memory operations.
        static NEXT_ID: AtomicU64 = AtomicU64::new(1);

        Self(NEXT_ID.fetch_add(1, Ordering::Relaxed))
    }
}

impl fmt::Display for RequestId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render exactly like the underlying integer.
        self.0.fmt(f)
    }
}
+ +use std::{ + convert::{TryFrom, TryInto}, + fmt, mem, + ops::{Deref, DerefMut}, + str, +}; + +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + datum::{Datum, DatumKind, DatumView}, + projected_schema::RowProjector, + row::Row, + schema::{IndexInWriterSchema, Schema}, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "String is too long to encode into row (max is {}), len:{}.\nBacktrace:\n{}", + MAX_STRING_LEN, + len, + backtrace + ))] + StringTooLong { len: usize, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Size to store the offset of string buffer. +type OffsetSize = usize; + +/// Max allowed string length of datum to store in a contiguous row (16 MB). +const MAX_STRING_LEN: usize = 1024 * 1024 * 16; + +/// Row encoded in a contiguous buffer. +pub trait ContiguousRow { + /// Returns the number of datums. + fn num_datum_views(&self) -> usize; + + /// Returns [DatumView] of column in given index, and returns null if the + /// datum kind is unknown. + /// + /// Panic if index or buffer is out of bound. + fn datum_view_at(&self, index: usize) -> DatumView; +} + +pub struct ContiguousRowReader<'a, T> { + inner: &'a T, + byte_offsets: &'a [usize], + string_buffer_offset: usize, +} + +impl<'a, T> ContiguousRowReader<'a, T> { + pub fn with_schema(inner: &'a T, schema: &'a Schema) -> Self { + Self { + inner, + byte_offsets: schema.byte_offsets(), + string_buffer_offset: schema.string_buffer_offset(), + } + } +} + +impl<'a, T: Deref> ContiguousRow for ContiguousRowReader<'a, T> { + fn num_datum_views(&self) -> usize { + self.byte_offsets.len() + } + + fn datum_view_at(&self, index: usize) -> DatumView<'a> { + let offset = self.byte_offsets[index]; + let buf = &self.inner[offset..]; + + // Get datum kind, if the datum kind is unknown, returns null. 
+ let datum_kind = match DatumKind::try_from(buf[0]) { + Ok(v) => v, + Err(_) => return DatumView::Null, + }; + + // Advance 1 byte to skip the header byte. + let datum_buf = &buf[1..]; + // If no string column in this schema, the string buffer offset should + // equal to the buffer len, and string buf is an empty slice. + let string_buf = &self.inner[self.string_buffer_offset..]; + + must_read_view(&datum_kind, datum_buf, string_buf) + } +} + +/// Contiguous row with projection information. +/// +/// The caller must ensure the source schema of projector is the same as the +/// schema of source row. +pub struct ProjectedContiguousRow<'a, T> { + source_row: T, + projector: &'a RowProjector, +} + +impl<'a, T: ContiguousRow> ProjectedContiguousRow<'a, T> { + pub fn new(source_row: T, projector: &'a RowProjector) -> Self { + Self { + source_row, + projector, + } + } +} + +impl<'a, T: ContiguousRow> ContiguousRow for ProjectedContiguousRow<'a, T> { + fn num_datum_views(&self) -> usize { + self.projector.source_projection().len() + } + + fn datum_view_at(&self, index: usize) -> DatumView { + let p = self.projector.source_projection()[index]; + + match p { + Some(index_in_source) => self.source_row.datum_view_at(index_in_source), + None => DatumView::Null, + } + } +} + +impl<'a, T: ContiguousRow> fmt::Debug for ProjectedContiguousRow<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut list = f.debug_list(); + for i in 0..self.num_datum_views() { + let view = self.datum_view_at(i); + list.entry(&view); + } + list.finish() + } +} + +/// In memory buffer to hold data of a contiguous row. +pub trait RowBuffer: DerefMut { + /// Clear and resize the buffer size to `new_len` with given `value`. + fn reset(&mut self, new_len: usize, value: u8); + + /// Append slice into the buffer, resize the buffer automatically. + fn append_slice(&mut self, src: &[u8]); +} + +/// A writer to build a contiguous row. 
+pub struct ContiguousRowWriter<'a, T> { + inner: &'a mut T, + /// The schema the row group need to be encoded into, the schema + /// of the row need to be write compatible for the table schema. + table_schema: &'a Schema, + /// The index mapping from table schema to column in the + /// schema of row group. + index_in_writer: &'a IndexInWriterSchema, +} + +// TODO(yingwen): Try to replace usage of row by contiguous row. +impl<'a, T: RowBuffer + 'a> ContiguousRowWriter<'a, T> { + pub fn new( + inner: &'a mut T, + table_schema: &'a Schema, + index_in_writer: &'a IndexInWriterSchema, + ) -> Self { + Self { + inner, + table_schema, + index_in_writer, + } + } + + fn write_datum( + inner: &mut T, + datum: &Datum, + byte_offset: usize, + next_string_offset: &mut usize, + ) -> Result<()> { + let datum_offset = byte_offset + 1; + + match datum { + // Already filled by null, nothing to do. + Datum::Null => (), + Datum::Timestamp(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Timestamp.into_u8()); + let value_buf = v.as_i64().to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Double(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Double.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Float(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Float.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Varbinary(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Varbinary.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. 
+ *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v); + } + Datum::String(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::String.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. + *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v.as_bytes()); + } + Datum::UInt64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt16(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v]); + } + Datum::Int64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int16(v) => { + Self::write_byte_to_offset(inner, 
byte_offset, DatumKind::Int16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + Datum::Boolean(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Boolean.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + } + + Ok(()) + } + + /// Write a row to the buffer, the buffer will be reset first. + pub fn write_row(&mut self, row: &Row) -> Result<()> { + let datum_buffer_len = self.table_schema.string_buffer_offset(); + // Reset the buffer and fill the buffer by null, now new slice will be + // appended to the string buffer. + self.inner + .reset(datum_buffer_len, DatumKind::Null.into_u8()); + + assert_eq!(row.num_columns(), self.table_schema.num_columns()); + + // Offset to next string in string buffer. + let mut next_string_offset: OffsetSize = 0; + for index_in_table in 0..self.table_schema.num_columns() { + if let Some(writer_index) = self.index_in_writer.column_index_in_writer(index_in_table) + { + let datum = &row[writer_index]; + let byte_offset = self.table_schema.byte_offset(index_in_table); + + // Write datum bytes to the buffer. + Self::write_datum(self.inner, datum, byte_offset, &mut next_string_offset)?; + } + // Column not in row is already filled by null. + } + + Ok(()) + } + + #[inline] + fn write_byte_to_offset(inner: &mut T, offset: usize, value: u8) { + inner[offset] = value; + } + + #[inline] + fn write_slice_to_offset(inner: &mut T, offset: usize, value_buf: &[u8]) { + let dst = &mut inner[offset..offset + value_buf.len()]; + dst.copy_from_slice(value_buf); + } +} + +/// The byte size to encode the datum of this kind in memory. +/// +/// Returns the (datum size + 1) for header. For integer types, the datum +/// size is the memory size of the interger type. 
For string types, the +/// datum size is the memory size to hold the offset. +pub(crate) fn byte_size_of_datum(kind: &DatumKind) -> usize { + let datum_size = match kind { + DatumKind::Null => 1, + DatumKind::Timestamp => mem::size_of::(), + DatumKind::Double => mem::size_of::(), + DatumKind::Float => mem::size_of::(), + // The size of offset. + DatumKind::Varbinary | DatumKind::String => mem::size_of::(), + DatumKind::UInt64 => mem::size_of::(), + DatumKind::UInt32 => mem::size_of::(), + DatumKind::UInt16 => mem::size_of::(), + DatumKind::UInt8 => mem::size_of::(), + DatumKind::Int64 => mem::size_of::(), + DatumKind::Int32 => mem::size_of::(), + DatumKind::Int16 => mem::size_of::(), + DatumKind::Int8 => mem::size_of::(), + DatumKind::Boolean => mem::size_of::(), + }; + + datum_size + 1 +} + +/// Read datum view from given datum buf, and may reference the string in +/// `string_buf`. +/// +/// Panic if out of bound. +/// +/// ## Safety +/// The string in buffer must be valid utf8. +fn must_read_view<'a>( + datum_kind: &DatumKind, + datum_buf: &'a [u8], + string_buf: &'a [u8], +) -> DatumView<'a> { + match datum_kind { + DatumKind::Null => DatumView::Null, + DatumKind::Timestamp => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let ts = Timestamp::new(i64::from_ne_bytes(value_buf)); + DatumView::Timestamp(ts) + } + DatumKind::Double => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f64::from_ne_bytes(value_buf); + DatumView::Double(v) + } + DatumKind::Float => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f32::from_ne_bytes(value_buf); + DatumView::Float(v) + } + DatumKind::Varbinary => { + let bytes = must_read_bytes(datum_buf, string_buf); + DatumView::Varbinary(bytes) + } + DatumKind::String => { + let bytes = must_read_bytes(datum_buf, string_buf); + let v = unsafe { str::from_utf8_unchecked(bytes) }; + DatumView::String(v) + } + DatumKind::UInt64 => { + let value_buf 
= datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u64::from_ne_bytes(value_buf); + DatumView::UInt64(v) + } + DatumKind::UInt32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u32::from_ne_bytes(value_buf); + DatumView::UInt32(v) + } + DatumKind::UInt16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u16::from_ne_bytes(value_buf); + DatumView::UInt16(v) + } + DatumKind::UInt8 => DatumView::UInt8(datum_buf[0]), + DatumKind::Int64 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i64::from_ne_bytes(value_buf); + DatumView::Int64(v) + } + DatumKind::Int32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i32::from_ne_bytes(value_buf); + DatumView::Int32(v) + } + DatumKind::Int16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i16::from_ne_bytes(value_buf); + DatumView::Int16(v) + } + DatumKind::Int8 => DatumView::Int8(datum_buf[0] as i8), + DatumKind::Boolean => DatumView::Boolean(datum_buf[0] != 0), + } +} + +fn must_read_bytes<'a>(datum_buf: &'a [u8], string_buf: &'a [u8]) -> &'a [u8] { + // Read offset of string in string buf. + let value_buf = datum_buf[..mem::size_of::()] + .try_into() + .unwrap(); + let offset = OffsetSize::from_ne_bytes(value_buf); + let string_buf = &string_buf[offset..]; + + // Read len of the string. + let len_buf = string_buf[..mem::size_of::()].try_into().unwrap(); + let string_len = u32::from_ne_bytes(len_buf) as usize; + let string_buf = &string_buf[mem::size_of::()..]; + + // Read string. 
+ &string_buf[..string_len] +} + +impl RowBuffer for Vec { + fn reset(&mut self, new_len: usize, value: u8) { + self.clear(); + + self.resize(new_len, value); + } + + fn append_slice(&mut self, src: &[u8]) { + self.extend_from_slice(src); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + projected_schema::ProjectedSchema, + tests::{build_rows, build_schema}, + }; + + fn check_contiguous_row(row: &Row, reader: impl ContiguousRow, projection: Option>) { + let range = if let Some(projection) = projection { + projection + } else { + (0..reader.num_datum_views()).collect() + }; + for i in range { + let datum = &row[i]; + let view = reader.datum_view_at(i); + + assert_eq!(datum.as_view(), view); + } + } + + #[test] + fn test_contiguous_read_write() { + let schema = build_schema(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let reader = ContiguousRowReader::with_schema(&buf, &schema); + check_contiguous_row(&row, reader, None); + } + } + + #[test] + fn test_project_contiguous_read_write() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = + ProjectedSchema::new(schema.clone(), Some(projection.clone())).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, 
&row_projected_schema); + check_contiguous_row(&row, projected_row, Some(projection.clone())); + } + } +} diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs new file mode 100644 index 0000000000..600052cfcc --- /dev/null +++ b/common_types/src/row/mod.rs @@ -0,0 +1,590 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Row type + +use std::{ + cmp, + ops::{Index, IndexMut}, +}; + +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +use crate::{ + column_schema::ColumnSchema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKey, + schema::{RecordSchemaWithKey, Schema}, + time::Timestamp, +}; + +pub mod contiguous; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Column out of bound, len:{}, given:{}.\nBacktrace:\n{}", + len, + given, + backtrace + ))] + ColumnOutOfBound { + len: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid column num of row, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidColumnNum { + expect: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Column cannot be null, name:{}.\nBacktrace:\n{}", column, backtrace))] + NullColumn { + column: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Column type mismatch, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + column, + expect, + given, + backtrace + ))] + TypeMismatch { + column: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Missing columns to build row.\nBacktrace:\n{}", backtrace))] + MissingColumns { backtrace: Backtrace }, + + #[snafu(display("Convert column failed, column:{}, err:{}", column, source))] + ConvertColumn { + column: String, + source: crate::datum::Error, + }, + + #[snafu(display("Column in the schema is not found, column_name:{}", column,))] + ColumnNameNotFound { column: String }, + + #[snafu(display( + "Column in the schema is not found, 
column_name:{}.\nBacktrace:\n{}", + column, + backtrace + ))] + ColumnNotFoundInSchema { + column: String, + backtrace: Backtrace, + }, +} + +// Do not depend on common_util crates +pub type Result = std::result::Result; + +// TODO(yingwen): +// - Memory pooling (or Arena) and statistics +// - Custom Debug format +// - Add a type RowWithSchema so we can ensure the row always matches the schema +// - Maybe add a type RowOperation like kudu + +/// Row contains multiple columns, each column is represented by a datum +/// The internal representation of row is not specific +#[derive(Debug, Clone, PartialEq)] +pub struct Row { + cols: Vec, +} + +impl Row { + /// Convert vec of Datum into Row + pub fn from_datums(cols: Vec) -> Self { + Self { cols } + } + + /// Returns the column num + pub fn num_columns(&self) -> usize { + self.cols.len() + } + + /// Iterate all datums + pub fn iter(&self) -> IterDatum { + IterDatum { + iter: self.cols.iter(), + } + } + + /// Get the timestamp column + pub fn timestamp(&self, schema: &Schema) -> Option { + let timestamp_index = schema.timestamp_index(); + + self.cols[timestamp_index].as_timestamp() + } +} + +#[derive(Debug)] +pub struct IterDatum<'a> { + iter: std::slice::Iter<'a, Datum>, +} + +impl<'a> Iterator for IterDatum<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl Index for Row { + type Output = Datum; + + fn index(&self, index: usize) -> &Self::Output { + &self.cols[index] + } +} + +impl IndexMut for Row { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + &mut self.cols[index] + } +} + +impl<'a> IntoIterator for &'a Row { + type IntoIter = std::slice::Iter<'a, Datum>; + type Item = &'a Datum; + + fn into_iter(self) -> Self::IntoIter { + self.cols.iter() + } +} + +impl IntoIterator for Row { + type IntoIter = std::vec::IntoIter; + type Item = Datum; + + fn into_iter(self) -> Self::IntoIter { + self.cols.into_iter() + } +} + +/// Check whether the schema of the 
row equals to given `schema` +pub fn check_row_schema(row: &Row, schema: &Schema) -> Result<()> { + ensure!( + schema.num_columns() == row.num_columns(), + InvalidColumnNum { + expect: schema.num_columns(), + given: row.num_columns(), + } + ); + + for (index, datum) in row.iter().enumerate() { + let column = schema.column(index); + check_datum_type(datum, column)?; + } + + Ok(()) +} + +// TODO(yingwen): For multiple rows that share the same schema, no need to store +// Datum for each row element, we can store the whole row as a binary and +// provide more efficent way to convert rows into columns +/// RowGroup +/// +/// The min/max timestamp of an empty RowGroup is 0. +/// +/// Rows in the RowGroup have the same schema. The internal representation of +/// rows is not specific. +#[derive(Debug)] +pub struct RowGroup { + /// Schema of the row group, all rows in the row group should have same + /// schema + schema: Schema, + /// Rows in the row group + rows: Vec, + // TODO(yingwen): Maybe remove min/max timestamp + /// Min timestamp of all the rows + min_timestamp: Timestamp, + /// Max timestamp of all the rows + max_timestamp: Timestamp, +} + +impl RowGroup { + /// Returns true if the row group is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Returns number of rows in the row group + #[inline] + pub fn num_rows(&self) -> usize { + self.rows.len() + } + + /// Returns the idx-th row in the row group + #[inline] + pub fn get_row(&self, idx: usize) -> Option<&Row> { + self.rows.get(idx) + } + + /// Returns the idx-th mutable row in the row group + #[inline] + pub fn get_row_mut(&mut self, idx: usize) -> Option<&mut Row> { + self.rows.get_mut(idx) + } + + /// Iter all datum of the column + /// + /// Will panic if col_index is out of bound + pub fn iter_column(&self, col_index: usize) -> IterCol { + IterCol { + rows: &self.rows, + row_index: 0, + col_index, + } + } + + /// The schema of the row group + #[inline] + pub fn 
schema(&self) -> &Schema { + &self.schema + } + + /// Iter the row group by rows + // TODO(yingwen): Add a iter_with_schema + pub fn iter(&self) -> IterRow { + IterRow { + iter: self.rows.iter(), + } + } + + /// Get the min timestamp of rows + #[inline] + pub fn min_timestamp(&self) -> Timestamp { + self.min_timestamp + } + + /// Get the max timestamp of rows + #[inline] + pub fn max_timestmap(&self) -> Timestamp { + self.max_timestamp + } +} + +impl<'a> IntoIterator for &'a RowGroup { + type IntoIter = std::slice::Iter<'a, Row>; + type Item = &'a Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.iter() + } +} + +impl IntoIterator for RowGroup { + type IntoIter = std::vec::IntoIter; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.into_iter() + } +} + +#[derive(Debug)] +pub struct IterRow<'a> { + iter: std::slice::Iter<'a, Row>, +} + +impl<'a> Iterator for IterRow<'a> { + type Item = &'a Row; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +#[derive(Debug)] +pub struct IterCol<'a> { + rows: &'a Vec, + row_index: usize, + col_index: usize, +} + +impl<'a> Iterator for IterCol<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + if self.rows.is_empty() { + return None; + } + + if self.row_index >= self.rows.len() { + return None; + } + + let row = &self.rows[self.row_index]; + self.row_index += 1; + + Some(&row[self.col_index]) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.rows.len() - self.row_index; + (remaining, Some(remaining)) + } +} + +/// RowGroup builder +#[derive(Debug)] +pub struct RowGroupBuilder { + schema: Schema, + rows: Vec, + min_timestamp: Option, + max_timestmap: Timestamp, +} + +impl RowGroupBuilder { + /// Create a new builder + pub fn new(schema: Schema) -> Self { + Self::with_capacity(schema, 0) + } + + /// Create a new builder with given capacity + pub fn with_capacity(schema: Schema, capacity: usize) -> Self { + Self { + schema, + rows: 
Vec::with_capacity(capacity), + min_timestamp: None, + max_timestmap: Timestamp::new(0), + } + } + + /// Create a new builder with schema and rows + /// + /// Return error if the `rows` do not matched the `schema` + pub fn with_rows(schema: Schema, rows: Vec) -> Result { + let mut row_group = Self::new(schema); + + // Check schema and update min/max timestamp + for row in &rows { + check_row_schema(row, &row_group.schema)?; + row_group.update_timestamps(row); + } + + row_group.rows = rows; + + Ok(row_group) + } + + /// Add a schema checked row + /// + /// REQUIRE: Caller should ensure the schema of row must equal to the schema + /// of this builder + pub fn push_checked_row(&mut self, row: Row) { + self.update_timestamps(&row); + + self.rows.push(row); + } + + /// Acquire builder to build next row of the row group + pub fn row_builder(&mut self) -> RowBuilder { + RowBuilder { + // schema: &self.schema, + cols: Vec::with_capacity(self.schema.num_columns()), + // rows: &mut self.rows, + group_builder: self, + } + } + + /// Build the row group + pub fn build(self) -> RowGroup { + RowGroup { + schema: self.schema, + rows: self.rows, + min_timestamp: self.min_timestamp.unwrap_or_else(|| Timestamp::new(0)), + max_timestamp: self.max_timestmap, + } + } + + /// Update min/max timestamp of the row group + fn update_timestamps(&mut self, row: &Row) { + // check_row_schema() ensures this datum is a timestamp, so we just unwrap here + let row_timestamp = row.timestamp(&self.schema).unwrap(); + + self.min_timestamp = match self.min_timestamp { + Some(min_timestamp) => Some(cmp::min(min_timestamp, row_timestamp)), + None => Some(row_timestamp), + }; + self.max_timestmap = cmp::max(self.max_timestmap, row_timestamp); + } +} + +/// Check whether the datum kind matches the column schema +pub fn check_datum_type(datum: &Datum, column_schema: &ColumnSchema) -> Result<()> { + // Check null datum + if let Datum::Null = datum { + ensure!( + column_schema.is_nullable, + NullColumn { + 
column: &column_schema.name, + } + ); + } else { + ensure!( + datum.kind() == column_schema.data_type, + TypeMismatch { + column: &column_schema.name, + expect: column_schema.data_type, + given: datum.kind(), + } + ); + } + + Ok(()) +} + +// TODO(yingwen): This builder is used to build RowGroup, need to provide a +// builder to build one row +/// Row builder for the row group +#[derive(Debug)] +pub struct RowBuilder<'a> { + group_builder: &'a mut RowGroupBuilder, + cols: Vec, +} + +impl<'a> RowBuilder<'a> { + /// Append a datum into the row + pub fn append_datum(mut self, datum: Datum) -> Result { + self.check_datum(&datum)?; + + self.cols.push(datum); + + Ok(self) + } + + /// Check whether the datum is valid + fn check_datum(&self, datum: &Datum) -> Result<()> { + let index = self.cols.len(); + let schema = &self.group_builder.schema; + ensure!( + index < schema.num_columns(), + ColumnOutOfBound { + len: schema.num_columns(), + given: index, + } + ); + + let column = schema.column(index); + check_datum_type(datum, column) + } + + /// Finish building this row and append this row into the row group + pub fn finish(self) -> Result<()> { + ensure!( + self.cols.len() == self.group_builder.schema.num_columns(), + MissingColumns + ); + + self.group_builder.push_checked_row(Row { cols: self.cols }); + Ok(()) + } +} + +pub trait RowView { + fn try_get_column_by_name(&self, column_name: &str) -> Result>; + + fn column_by_idx(&self, column_idx: usize) -> Datum; +} + +// TODO(yingwen): Add a method to get row view on RecordBatchWithKey. +/// A row view on the [RecordBatchWithKey]. +/// +/// `row_idx < record_batch.num_rows()` is ensured. 
+#[derive(Debug)] +pub struct RowViewOnBatch<'a> { + pub record_batch: &'a RecordBatchWithKey, + pub row_idx: usize, +} + +impl<'a> RowViewOnBatch<'a> { + pub fn iter_columns(&self) -> RowViewOnBatchColumnIter { + RowViewOnBatchColumnIter { + next_column_idx: 0, + row_idx: self.row_idx, + record_batch: self.record_batch, + } + } +} + +pub struct RowViewOnBatchColumnIter<'a> { + next_column_idx: usize, + row_idx: usize, + record_batch: &'a RecordBatchWithKey, +} + +impl<'a> RowView for RowViewOnBatch<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let column_idx = self + .record_batch + .schema_with_key() + .index_of(column_name) + .context(ColumnNameNotFound { + column: column_name, + })?; + Ok(Some(self.column_by_idx(column_idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + let column = self.record_batch.column(column_idx); + column.datum(self.row_idx) + } +} + +impl<'a> Iterator for RowViewOnBatchColumnIter<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.next_column_idx >= self.record_batch.num_columns() { + return None; + } + + let curr_column_idx = self.next_column_idx; + let column = self.record_batch.column(curr_column_idx); + let datum = column.datum_opt(self.row_idx).map(Ok); + + self.next_column_idx += 1; + + datum + } +} + +#[derive(Debug, Clone)] +pub struct RowWithMeta<'a> { + pub row: &'a Row, + pub schema: &'a RecordSchemaWithKey, +} + +impl<'a> RowView for RowWithMeta<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let idx = self + .schema + .index_of(column_name) + .context(ColumnNotFoundInSchema { + column: column_name, + })?; + Ok(Some(self.column_by_idx(idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + self.row.cols[column_idx].clone() + } +} diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs new file mode 100644 index 0000000000..4172886057 --- /dev/null +++ b/common_types/src/schema.rs 
@@ -0,0 +1,1554 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Schema of table + +use std::{ + cmp::{self, Ordering}, + collections::{HashMap, HashSet}, + convert::TryFrom, + fmt, + str::FromStr, + sync::Arc, +}; + +// Just re-use arrow's types +// TODO(yingwen): No need to support all schema that arrow supports, we can +// use a new type pattern to wrap Schema/SchemaRef and not allow to use +// the data type we not supported +pub use arrow_deps::arrow::datatypes::{ + DataType, Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, +}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column_schema::{self, ColumnId, ColumnSchema}, + datum::DatumKind, + row::{contiguous, RowView}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Projection too long, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + ProjectionTooLong { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid projection index, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + InvalidProjectionIndex { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Projection must have timestamp column.\nBacktrace:\n{}", backtrace))] + ProjectionMissTimestamp { backtrace: Backtrace }, + + #[snafu(display( + "Column name already exists, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNameExists { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Column id already exists, name:{}, id:{}.\nBacktrace:\n{}", + name, + id, + backtrace + ))] + ColumnIdExists { + name: String, + id: ColumnId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported key column type, name:{}, type:{:?}.\nBacktrace:\n{}", + name, + kind, + backtrace + ))] + KeyColumnType { + name: String, + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Timestamp key column 
already exists, timestamp_column:{}, given:{}.\nBacktrace:\n{}", + timestamp_column, + given_column, + backtrace + ))] + TimestampKeyExists { + timestamp_column: String, + given_column: String, + backtrace: Backtrace, + }, + + #[snafu(display("Timestamp key not exists.\nBacktrace:\n{}", backtrace))] + MissingTimestampKey { backtrace: Backtrace }, + + #[snafu(display( + "Key column cannot be nullable, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + NullKeyColumn { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow field, field_name:{}, arrow_schema:{:?}, err:{}", + field_name, + arrow_schema, + source + ))] + InvalidArrowField { + field_name: String, + arrow_schema: ArrowSchemaRef, + source: crate::column_schema::Error, + }, + + #[snafu(display( + "Invalid schema to generate tsid primary key.\nBacktrace:\n{}", + backtrace + ))] + InvalidTsidSchema { backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow schema key, key:{:?}, raw_value:{}, err:{:?}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowSchemaMetaValue { + key: ArrowSchemaMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow schema meta key not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowSchemaMetaKeyNotFound { + key: ArrowSchemaMetaKey, + backtrace: Backtrace, + }, +} + +// TODO(boyan) make these constants configurable +pub const TSID_COLUMN: &str = "tsid"; +pub const TIMESTAMP_COLUMN: &str = "timestamp"; + +pub type Result = std::result::Result; + +const DEFAULT_SCHEMA_VERSION: Version = 1; + +#[derive(Debug, Snafu)] +pub enum CompatError { + #[snafu(display("Incompatible column schema for write, err:{}", source))] + IncompatWriteColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Missing column, name:{}", name))] + MissingWriteColumn { name: String }, + + #[snafu(display("Columns to write not found in table, names:{:?}", names))] + 
WriteMoreColumn { names: Vec }, +} + +/// Meta data of the arrow schema +struct ArrowSchemaMeta { + num_key_columns: usize, + timestamp_index: usize, + enable_tsid_primary_key: bool, + version: u32, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowSchemaMetaKey { + NumKeyColumns, + TimestampIndex, + EnableTsidPrimaryKey, + Version, +} + +impl ArrowSchemaMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowSchemaMetaKey::NumKeyColumns => "schema:num_key_columns", + ArrowSchemaMetaKey::TimestampIndex => "schema::timestamp_index", + ArrowSchemaMetaKey::EnableTsidPrimaryKey => "schema::enable_tsid_primary_key", + ArrowSchemaMetaKey::Version => "schema::version", + } + } +} + +impl ToString for ArrowSchemaMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema version +pub type Version = u32; + +/// Mapping column index in table schema to column index in writer schema +#[derive(Default)] +pub struct IndexInWriterSchema(Vec>); + +impl IndexInWriterSchema { + /// Create a index mapping for same schema with `num_columns` columns. + pub fn for_same_schema(num_columns: usize) -> Self { + let indexes = (0..num_columns).into_iter().map(Some).collect(); + Self(indexes) + } + + /// Returns the column index in writer schema of the column with index + /// `index_in_table` in the table schema where the writer prepared to + /// write to. + /// + /// If the column is not in writer schema, returns None, which means that + /// this column should be filled by null. + /// + /// Panic if the index_in_table is out of bound + pub fn column_index_in_writer(&self, index_in_table: usize) -> Option { + self.0[index_in_table] + } +} + +// TODO(yingwen): No need to compare all elements in ColumnSchemas, Schema, +// RecordSchema, custom PartialEq for them. 
+ +/// Data of column schemas +#[derive(PartialEq)] +pub(crate) struct ColumnSchemas { + /// Column schemas + columns: Vec, + /// Column name to index of that column schema in `columns`, the index is + /// guaranteed to be valid + name_to_index: HashMap, + /// Byte offsets of each column in contiguous row. + byte_offsets: Vec, + /// String buffer offset in contiguous row. + string_buffer_offset: usize, +} + +impl ColumnSchemas { + fn new(columns: Vec) -> Self { + let name_to_index = columns + .iter() + .enumerate() + .map(|(idx, c)| (c.name.to_string(), idx)) + .collect(); + + let mut current_offset = 0; + let mut byte_offsets = Vec::with_capacity(columns.len()); + for column_schema in &columns { + byte_offsets.push(current_offset); + current_offset += contiguous::byte_size_of_datum(&column_schema.data_type); + } + + Self { + columns, + name_to_index, + byte_offsets, + string_buffer_offset: current_offset, + } + } +} + +impl ColumnSchemas { + pub fn num_columns(&self) -> usize { + self.columns().len() + } + + pub fn columns(&self) -> &[ColumnSchema] { + &self.columns + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + &self.columns[i] + } + + pub fn index_of(&self, name: &str) -> Option { + self.name_to_index.get(name).copied() + } +} + +impl fmt::Debug for ColumnSchemas { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ColumnSchemas") + // name_to_index is ignored. + .field("columns", &self.columns) + .finish() + } +} + +/// Schema of [crate::record_batch::RecordBatch] +/// +/// Should be cheap to clone. +/// +/// Note: Only `name`, `data_type`, `is_nullable` is valid after converting from +/// arrow's schema, the additional fields like `id`/`is_tag`/`comment` is always +/// unset. Now we only convert arrow's schema into our record before we output +/// the final query result, where the additional fields is never used. 
+#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchema { + arrow_schema: ArrowSchemaRef, + column_schemas: Arc, +} + +impl RecordSchema { + fn from_column_schemas(column_schemas: ColumnSchemas) -> Self { + // Convert to arrow fields. + let fields = column_schemas + .columns + .iter() + .map(|col| col.to_arrow_field()) + .collect(); + // Build arrow schema. + let arrow_schema = Arc::new(ArrowSchema::new(fields)); + + Self { + arrow_schema, + column_schemas: Arc::new(column_schemas), + } + } + + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } +} + +impl TryFrom for RecordSchema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + columns.push(column_schema); + } + + let column_schemas = ColumnSchemas::new(columns); + + Ok(Self::from_column_schemas(column_schemas)) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchemaWithKey { + record_schema: RecordSchema, + num_key_columns: usize, +} + +impl RecordSchemaWithKey { + pub fn num_columns(&self) -> usize { + self.record_schema.num_columns() + } + + pub fn compare_row(&self, lhs: &LR, rhs: &RR) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + pub fn index_of(&self, name: &str) -> Option { + self.record_schema.index_of(name) + } + + pub fn columns(&self) -> &[ColumnSchema] { + 
self.record_schema.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + pub(crate) fn into_record_schema(self) -> RecordSchema { + self.record_schema + } + + pub(crate) fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.record_schema.to_arrow_schema_ref() + } + + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } +} + +/// Compare the two rows. +/// +/// REQUIRES: the two rows must have the same number of key columns as +/// `num_key_columns`. +pub fn compare_row( + num_key_columns: usize, + lhs: &LR, + rhs: &RR, +) -> Ordering { + for column_idx in 0..num_key_columns { + // caller should ensure the row view is valid. + // TODO(xikai): unwrap may not a good way to handle the error. + let left_datum = lhs.column_by_idx(column_idx); + let right_datum = rhs.column_by_idx(column_idx); + // the two datums must be of the same kind type. + match left_datum.partial_cmp(&right_datum).unwrap() { + Ordering::Equal => continue, + v @ Ordering::Less | v @ Ordering::Greater => return v, + } + } + + Ordering::Equal +} + +// TODO(yingwen): Maybe rename to TableSchema. 
+/// Schema of a table +/// +/// - Should be immutable +/// - Each schema must have a timestamp column +/// - Should be immutable and cheap to clone, though passing by reference is +/// preferred +/// - The prefix of columns makes up the primary key (similar to kudu's schema) +/// - The Schema should built by builder +#[derive(Clone, PartialEq)] +pub struct Schema { + /// The underlying arrow schema, data type of fields must be supported by + /// datum + arrow_schema: ArrowSchemaRef, + /// The number of primary key columns + num_key_columns: usize, + /// Index of timestamp key column + // TODO(yingwen): Maybe we can remove the restriction that timestamp column must exists in + // schema (mainly for projected schema) + timestamp_index: usize, + /// Index of tsid key column and None denotes the `enable_tsid_primary_key` + /// is not set. + tsid_index: Option, + /// Control whether to generate tsid as primary key + enable_tsid_primary_key: bool, + /// Column schemas, only holds arc pointer so the Schema can be cloned + /// without much overhead. + column_schemas: Arc, + /// Version of the schema, schemas with same version should be identical. + version: Version, +} + +impl fmt::Debug for Schema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Schema") + // arrow_schema is ignored. 
+ .field("num_key_columns", &self.num_key_columns) + .field("timestamp_index", &self.timestamp_index) + .field("tsid_index", &self.tsid_index) + .field("enable_tsid_primary_key", &self.enable_tsid_primary_key) + .field("column_schemas", &self.column_schemas) + .field("version", &self.version) + .finish() + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + Builder::build_from_arrow_schema(arrow_schema) + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(record_schema: RecordSchema) -> Result { + Builder::build_from_arrow_schema(record_schema.to_arrow_schema_ref()) + } +} + +impl Schema { + /// Returns an immutable reference of the vector of [ColumnSchema]. + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + /// Returns an immutable reference of the normal column vector. + pub fn normal_columns(&self) -> &[ColumnSchema] { + &self.columns()[self.num_key_columns..] + } + + /// Returns index of the tsid column. + pub fn index_of_tsid(&self) -> Option { + self.tsid_index + } + + /// Returns tsid column index and immutable reference of tsid column + pub fn tsid_column(&self) -> Option<&ColumnSchema> { + if let Some(idx) = self.index_of_tsid() { + Some(&self.column_schemas.columns[idx]) + } else { + None + } + } + + /// Returns total number of columns + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected by + /// name. 
+ pub fn column_with_name(&self, name: &str) -> Option<&ColumnSchema> { + let index = self.column_schemas.name_to_index.get(name)?; + Some(&self.column_schemas.columns[*index]) + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected + /// using an offset within the internal vector. + /// + /// Panic if i is out of bound + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + /// Return the ref to [arrow_deps::arrow::datatypes::SchemaRef] + pub fn as_arrow_schema_ref(&self) -> &ArrowSchemaRef { + &self.arrow_schema + } + + /// Return the cloned [arrow_deps::arrow::datatypes::SchemaRef] + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } + + /// Into [arrow_deps::arrow::datatypes::SchemaRef] + pub fn into_arrow_schema_ref(self) -> ArrowSchemaRef { + self.arrow_schema + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + /// Returns the number of columns in primary key + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } + + /// Get the name of the timestamp column + #[inline] + pub fn timestamp_name(&self) -> &str { + &self.column(self.timestamp_index()).name + } + + /// Get the index of the timestamp column + #[inline] + pub fn timestamp_index(&self) -> usize { + self.timestamp_index + } + + /// Get the version of this schema + #[inline] + pub fn version(&self) -> Version { + self.version + } + + /// Compare the two rows. + /// + /// REQUIRES: the two rows must have the key columns defined by the schema. + pub fn compare_row(&self, lhs: &R, rhs: &R) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + /// Returns `Ok` if rows with `writer_schema` can write to table with the + /// same schema as `self`. 
+ pub fn compatible_for_write( + &self, + writer_schema: &Schema, + index_in_writer: &mut IndexInWriterSchema, + ) -> std::result::Result<(), CompatError> { + index_in_writer.0.reserve(self.num_columns()); + + let mut num_col_in_writer = 0; + for column in self.columns() { + // Find column in schema of writer. + match writer_schema.index_of(&column.name) { + Some(writer_index) => { + let writer_column = writer_schema.column(writer_index); + + // Column is found in writer + num_col_in_writer += 1; + + // Column with same name, but not compatible + column + .compatible_for_write(writer_column) + .context(IncompatWriteColumn)?; + + // Column is compatible, push index mapping + index_in_writer.0.push(Some(writer_index)); + } + None => { + // Column is not found in writer, then the column should be nullable. + ensure!( + column.is_nullable, + MissingWriteColumn { name: &column.name } + ); + + // Column is nullable, push index mapping + index_in_writer.0.push(None); + } + } + } + // All columns of this schema have been checked + + // If the writer have columns not in this schema, then we consider it + // incompatible + ensure!( + num_col_in_writer == writer_schema.num_columns(), + WriteMoreColumn { + names: writer_schema + .columns() + .iter() + .filter_map(|c| if self.column_with_name(&c.name).is_none() { + Some(c.name.clone()) + } else { + None + }) + .collect::>(), + } + ); + + Ok(()) + } + + pub fn to_record_schema(&self) -> RecordSchema { + RecordSchema { + arrow_schema: self.arrow_schema.clone(), + column_schemas: self.column_schemas.clone(), + } + } + + pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + RecordSchemaWithKey { + record_schema: self.to_record_schema(), + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. 
+ pub(crate) fn project_record_schema_with_key( + &self, + projection: &[usize], + ) -> RecordSchemaWithKey { + let mut columns = Vec::with_capacity(self.num_key_columns); + // Keep all key columns in order. + for key_column in self.key_columns() { + columns.push(key_column.clone()); + } + + // Collect normal columns needed by the projection. + for p in projection { + if *p >= self.num_key_columns { + // A normal column + let normal_column = &self.columns()[*p]; + columns.push(normal_column.clone()); + } + } + + let record_schema = RecordSchema::from_column_schemas(ColumnSchemas::new(columns)); + + RecordSchemaWithKey { + record_schema, + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. + pub(crate) fn project_record_schema(&self, projection: &[usize]) -> RecordSchema { + let mut columns = Vec::with_capacity(projection.len()); + + // Collect all columns needed by the projection. + for p in projection { + let column_schema = &self.columns()[*p]; + // Insert the index in projected schema of the column + columns.push(column_schema.clone()); + } + + RecordSchema::from_column_schemas(ColumnSchemas::new(columns)) + } + + /// Returns byte offsets in contiguous row. + #[inline] + pub fn byte_offsets(&self) -> &[usize] { + &self.column_schemas.byte_offsets + } + + /// Returns byte offset in contiguous row of given column. + /// + /// Panic if out of bound. + #[inline] + pub fn byte_offset(&self, index: usize) -> usize { + self.column_schemas.byte_offsets[index] + } + + /// Returns string buffer offset in contiguous row. 
+ #[inline] + pub fn string_buffer_offset(&self) -> usize { + self.column_schemas.string_buffer_offset + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(schema: common_pb::TableSchema) -> Result { + let mut builder = Builder::with_capacity(schema.columns.len()) + .version(schema.version) + .enable_tsid_primary_key(schema.enable_tsid_primary_key); + + for (i, column_schema_pb) in schema.columns.into_iter().enumerate() { + let column = ColumnSchema::from(column_schema_pb); + + if i < schema.num_key_columns as usize { + builder = builder.add_key_column(column)?; + } else { + builder = builder.add_normal_column(column)?; + } + } + + builder.build() + } +} + +impl From for common_pb::TableSchema { + fn from(schema: Schema) -> Self { + let mut table_schema = common_pb::TableSchema::new(); + + for column in schema.columns() { + // Convert schema of each column + let column_schema = column.to_pb(); + table_schema.columns.push(column_schema); + } + + table_schema.num_key_columns = schema.num_key_columns as u32; + table_schema.timestamp_index = schema.timestamp_index as u32; + table_schema.enable_tsid_primary_key = schema.enable_tsid_primary_key; + table_schema.version = schema.version; + + table_schema + } +} + +/// Schema builder +#[must_use] +pub struct Builder { + columns: Vec, + /// The number of primary key columns + num_key_columns: usize, + /// Timestamp column index + timestamp_index: Option, + column_names: HashSet, + column_ids: HashSet, + /// Version of the schema + version: Version, + /// Auto increment the column id if the id of the input ColumnSchema is + /// [crate::column_schema::COLUMN_ID_UNINIT]. 
+ auto_increment_column_id: bool, + max_column_id: ColumnId, + enable_tsid_primary_key: bool, +} + +impl Default for Builder { + fn default() -> Self { + Self::new() + } +} + +impl Builder { + /// Create a new builder + pub fn new() -> Self { + Self::with_capacity(0) + } + + /// Create a builder with given capacity + pub fn with_capacity(capacity: usize) -> Self { + Self { + columns: Vec::with_capacity(capacity), + num_key_columns: 0, + timestamp_index: None, + column_names: HashSet::with_capacity(capacity), + column_ids: HashSet::with_capacity(capacity), + version: DEFAULT_SCHEMA_VERSION, + auto_increment_column_id: false, + max_column_id: column_schema::COLUMN_ID_UNINIT, + enable_tsid_primary_key: false, + } + } + + /// Add a key column + pub fn add_key_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, true)?; + + ensure!(!column.is_nullable, NullKeyColumn { name: column.name }); + + // FIXME(xikai): it seems not reasonable to decide the timestamp column in this + // way. + let is_timestamp = DatumKind::Timestamp == column.data_type; + if is_timestamp { + ensure!( + self.timestamp_index.is_none(), + TimestampKeyExists { + timestamp_column: &self.columns[self.timestamp_index.unwrap()].name, + given_column: column.name, + } + ); + self.timestamp_index = Some(self.num_key_columns); + } + + self.insert_new_key_column(column); + + Ok(self) + } + + /// Add a normal (non key) column + pub fn add_normal_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, false)?; + + self.insert_new_normal_column(column); + + Ok(self) + } + + /// Set version of the schema + pub fn version(mut self, version: Version) -> Self { + self.version = version; + self + } + + /// When auto increment is true, assign the column schema an auto + /// incremented id if its id is [crate::column_schema::COLUMN_ID_UNINIT]. 
+ /// + /// Default is false + pub fn auto_increment_column_id(mut self, auto_increment: bool) -> Self { + self.auto_increment_column_id = auto_increment; + self + } + + /// Enable tsid as primary key. + pub fn enable_tsid_primary_key(mut self, enable_tsid_primary_key: bool) -> Self { + self.enable_tsid_primary_key = enable_tsid_primary_key; + self + } + + fn may_alloc_column_id(&mut self, column: &mut ColumnSchema) { + // Assign this column an id + if self.auto_increment_column_id && column.id == column_schema::COLUMN_ID_UNINIT { + column.id = self.max_column_id + 1; + } + + self.max_column_id = cmp::max(self.max_column_id, column.id); + } + + // TODO(yingwen): Do we need to support null data type? + fn validate_column(&self, column: &ColumnSchema, is_key: bool) -> Result<()> { + ensure!( + !self.column_names.contains(&column.name), + ColumnNameExists { name: &column.name } + ); + + // Check datum kind if this is a key column + if is_key { + ensure!( + column.data_type.is_key_kind(), + KeyColumnType { + name: &column.name, + kind: column.data_type, + } + ); + } + + ensure!( + !self.column_ids.contains(&column.id), + ColumnIdExists { + name: &column.name, + id: column.id, + } + ); + + Ok(()) + } + + fn insert_new_key_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.insert(self.num_key_columns, column); + self.num_key_columns += 1; + } + + fn insert_new_normal_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.push(column); + } + + fn build_from_arrow_schema(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + 
columns.push(column_schema); + } + + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the schema meta lost duration plan + // execution.) + let ArrowSchemaMeta { + num_key_columns, + timestamp_index, + enable_tsid_primary_key, + version, + } = Self::parse_arrow_schema_meta_or_default(arrow_schema.metadata())?; + let tsid_index = Self::find_tsid_index(enable_tsid_primary_key, &columns)?; + + let column_schemas = Arc::new(ColumnSchemas::new(columns)); + + Ok(Schema { + arrow_schema, + num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key, + column_schemas, + version, + }) + } + + fn parse_arrow_schema_meta_value( + meta: &HashMap, + key: ArrowSchemaMetaKey, + ) -> Result + where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, + { + let raw_value = meta + .get(key.as_str()) + .context(ArrowSchemaMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowSchemaMetaValue { key, raw_value }) + } + + /// Parse the necessary meta information from the arrow schema's meta data. + fn parse_arrow_schema_meta_or_default( + meta: &HashMap, + ) -> Result { + match Self::parse_arrow_schema_meta(meta) { + Ok(v) => Ok(v), + Err(Error::ArrowSchemaMetaKeyNotFound { .. }) => Ok(ArrowSchemaMeta { + num_key_columns: 0, + timestamp_index: 0, + enable_tsid_primary_key: false, + version: 0, + }), + Err(e) => Err(e), + } + } + + /// Parse the necessary meta information from the arrow schema's meta data. 
+ fn parse_arrow_schema_meta(meta: &HashMap) -> Result { + Ok(ArrowSchemaMeta { + num_key_columns: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::NumKeyColumns, + )?, + timestamp_index: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::TimestampIndex, + )?, + enable_tsid_primary_key: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::EnableTsidPrimaryKey, + )?, + version: Self::parse_arrow_schema_meta_value(meta, ArrowSchemaMetaKey::Version)?, + }) + } + + /// Build arrow schema meta data. + /// + /// Requires: the timestamp index is not None. + fn build_arrow_schema_meta(&self) -> HashMap { + let mut meta = HashMap::with_capacity(4); + meta.insert( + ArrowSchemaMetaKey::NumKeyColumns.to_string(), + self.num_key_columns.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::TimestampIndex.to_string(), + self.timestamp_index.unwrap().to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::Version.to_string(), + self.version.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::EnableTsidPrimaryKey.to_string(), + self.enable_tsid_primary_key.to_string(), + ); + + meta + } + + fn find_tsid_index( + enable_tsid_primary_key: bool, + columns: &[ColumnSchema], + ) -> Result> { + if !enable_tsid_primary_key { + return Ok(None); + } + + let idx = columns + .iter() + .enumerate() + .find_map(|(idx, col_schema)| { + if col_schema.name == TSID_COLUMN { + Some(idx) + } else { + None + } + }) + .context(InvalidTsidSchema)?; + + Ok(Some(idx)) + } + + /// Build the schema + pub fn build(self) -> Result { + let timestamp_index = self.timestamp_index.context(MissingTimestampKey)?; + // Timestamp key column is exists, so key columns should not be zero + assert!(self.num_key_columns > 0); + if self.enable_tsid_primary_key { + ensure!(self.num_key_columns == 2, InvalidTsidSchema); + } + + let tsid_index = Self::find_tsid_index(self.enable_tsid_primary_key, &self.columns)?; + + let fields = self.columns.iter().map(|c| 
c.to_arrow_field()).collect(); + let meta = self.build_arrow_schema_meta(); + + Ok(Schema { + arrow_schema: Arc::new(ArrowSchema::new_with_metadata(fields, meta)), + num_key_columns: self.num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key: self.enable_tsid_primary_key, + column_schemas: Arc::new(ColumnSchemas::new(self.columns)), + version: self.version, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + bytes::Bytes, + datum::Datum, + row::{Row, RowWithMeta}, + time::Timestamp, + }; + + #[test] + fn test_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Length related test + assert_eq!(4, schema.columns().len()); + assert_eq!(4, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(1, schema.timestamp_index()); + + // Test key columns + assert_eq!(2, schema.key_columns().len()); + assert_eq!("key1", &schema.key_columns()[0].name); + assert_eq!("timestamp", &schema.key_columns()[1].name); + + // Test normal columns + assert_eq!(2, schema.normal_columns().len()); + assert_eq!("field1", &schema.normal_columns()[0].name); + assert_eq!("field2", &schema.normal_columns()[1].name); + + // Test column_with_name() + let field1 = schema.column_with_name("field1").unwrap(); + 
assert_eq!(3, field1.id); + assert_eq!("field1", field1.name); + assert!(schema.column_with_name("not exists").is_none()); + + // Test column() + assert_eq!(field1, schema.column(2)); + + // Test arrow schema + let arrow_schema = schema.as_arrow_schema_ref(); + let key1 = arrow_schema.field(0); + assert_eq!("key1", key1.name()); + let field2 = arrow_schema.field(3); + assert_eq!("field2", field2.name()); + + // Test index_of() + assert_eq!(1, schema.index_of("timestamp").unwrap()); + assert!(schema.index_of("not exists").is_none()); + + // Test pb convert + let schema_pb = common_pb::TableSchema::from(schema.clone()); + let schema_from_pb = Schema::try_from(schema_pb).unwrap(); + assert_eq!(schema, schema_from_pb); + } + + #[test] + fn test_build_unordered() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + assert_eq!(2, columns[0].id); + assert_eq!("key1", columns[0].name); + assert_eq!(3, columns[1].id); + assert_eq!("key2", columns[1].name); + assert_eq!(1, columns[2].id); + assert_eq!("field1", columns[2].name); + assert_eq!(4, columns[3].id); + assert_eq!("field2", columns[3].name); + } + + #[test] + fn test_name_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + 
column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .is_err()); + } + + #[test] + fn test_id_exists() { + let builder = Builder::new() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_key_column_type() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_timestamp_key_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_mulitple_timestamp() { + Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + } + + #[test] + fn 
test_missing_timestamp_key() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder.build().is_err()); + } + + #[test] + fn test_null_key() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(1) + .is_nullable(true) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_max_column_id() { + let builder = Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(2) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .id(5) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + + let schema = builder + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + // Check key1 + assert_eq!("key1", &columns[0].name); + assert_eq!(2, columns[0].id); + // Check key2 + assert_eq!("key2", &columns[1].name); + assert_eq!(6, columns[1].id); + // Check field1 + assert_eq!("field1", &columns[2].name); + assert_eq!(5, columns[2].id); + // Check field2 + assert_eq!("field2", &columns[3].name); + assert_eq!(7, columns[3].id); + } + + fn 
assert_row_compare(ordering: Ordering, schema: &Schema, row1: &Row, row2: &Row) { + let schema_with_key = schema.to_record_schema_with_key(); + let lhs = RowWithMeta { + row: row1, + schema: &schema_with_key, + }; + let rhs = RowWithMeta { + row: row2, + schema: &schema_with_key, + }; + assert_eq!(ordering, schema.compare_row(&lhs, &rhs)); + } + + #[test] + fn test_compare_row() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Test equal + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(12.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(15.5), + ]); + + assert_row_compare(Ordering::Equal, &schema, &row1, &row2); + } + + // Test first key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test second key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1002)), + Datum::Double(17.5), + ]); + let row2 
= Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test first key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key7")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + + // Test second key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1007)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + } + + #[test] + fn test_build_from_arrow_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("value".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .expect("should succeed to build schema"); + + let arrow_schema = schema.clone().into_arrow_schema_ref(); + let new_schema = Builder::build_from_arrow_schema(arrow_schema) + .expect("should succeed to build new schema"); + + assert_eq!(schema, new_schema); + } +} 
diff --git a/common_types/src/string.rs b/common_types/src/string.rs new file mode 100644 index 0000000000..be41c82702 --- /dev/null +++ b/common_types/src/string.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes that can safely cast to str/string. + +use std::{convert::TryFrom, fmt, ops, str}; + +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::bytes::Bytes; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Bytes are not valid utf8, err:{}.\nBacktrace:\n{}", source, backtrace))] + FromBytes { + source: std::str::Utf8Error, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// String using [crate::bytes::Bytes] as storage so it can be cast into `Bytes` +/// and clone like `Bytes`. +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub struct StringBytes(Bytes); + +impl StringBytes { + pub fn new() -> StringBytes { + StringBytes(Bytes::new()) + } + + pub const fn from_static(src: &'static str) -> StringBytes { + StringBytes(Bytes::from_static(src.as_bytes())) + } + + pub fn copy_from_str(src: &str) -> StringBytes { + StringBytes(Bytes::copy_from_slice(src.as_bytes())) + } + + /// Create a [StringBytes] from a valid utf bytes. + /// + /// # Safety + /// The caller must ensure `bytes` is valid utf string. 
+ pub unsafe fn from_bytes_unchecked(bytes: Bytes) -> StringBytes { + StringBytes(bytes) + } + + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + #[inline] + pub fn as_str(&self) -> &str { + unsafe { str::from_utf8_unchecked(self.as_bytes()) } + } +} + +impl Default for StringBytes { + fn default() -> Self { + Self::new() + } +} + +impl ops::Deref for StringBytes { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + self.as_str() + } +} + +impl AsRef for StringBytes { + #[inline] + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for StringBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl TryFrom for StringBytes { + type Error = Error; + + fn try_from(bytes: Bytes) -> Result { + str::from_utf8(&bytes).context(FromBytes)?; + + Ok(StringBytes(bytes)) + } +} + +impl From for StringBytes { + fn from(src: String) -> Self { + Self(Bytes::from(src)) + } +} + +impl From<&str> for StringBytes { + fn from(src: &str) -> Self { + Self::copy_from_str(src) + } +} diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs new file mode 100644 index 0000000000..e20313ce1c --- /dev/null +++ b/common_types/src/tests.rs @@ -0,0 +1,139 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use bytes::Bytes; + +use crate::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema, + schema::{IndexInWriterSchema, Schema}, + string::StringBytes, + time::Timestamp, +}; + +fn base_schema_builder() -> schema::Builder { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +/// Build a schema for testing: +/// (key1(varbinary), key2(timestamp), field1(double), field2(string)) +pub fn build_schema() -> Schema { + base_schema_builder().build().unwrap() +} + +pub fn build_projected_schema() -> ProjectedSchema { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + ProjectedSchema::new(schema, Some(projection)).unwrap() +} + +pub fn build_row(key1: &[u8], key2: i64, field1: f64, field2: &str) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) +} + +pub fn build_row_opt(key1: &[u8], key2: i64, field1: Option, field2: Option<&str>) -> Row { + 
let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + field1.map(Datum::Double).unwrap_or(Datum::Null), + field2 + .map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), + ]; + + Row::from_datums(datums) +} + +pub fn build_rows() -> Vec { + vec![ + build_row(b"binary key", 1000000, 10.0, "string value"), + build_row(b"binary key1", 1000001, 11.0, "string value 1"), + build_row_opt(b"binary key2", 1000002, None, Some("string value 2")), + build_row_opt(b"binary key3", 1000003, Some(13.0), None), + build_row_opt(b"binary key4", 1000004, None, None), + ] +} + +pub fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub fn check_record_batch_with_key_with_rows( + record_batch_with_key: &RecordBatchWithKey, + row_num: usize, + column_num: usize, + rows: Vec, +) -> bool { + for (i, row) in rows.iter().enumerate().take(row_num) { + for j in 0..column_num { + let datum = &row[j]; + let datum2 = record_batch_with_key.column(j).datum(i); + + if *datum 
!= datum2 { + return false; + } + } + } + true +} diff --git a/common_types/src/time.rs b/common_types/src/time.rs new file mode 100644 index 0000000000..27ff8802c0 --- /dev/null +++ b/common_types/src/time.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time types + +// TODO(yingwen): Support timezone + +use std::{ + convert::{TryFrom, TryInto}, + time::{self, Duration, SystemTime}, +}; + +use proto::common::TimeRange as TimeRangePb; +use snafu::{Backtrace, OptionExt, Snafu}; + +/// Error of time module. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid time range, start:{}, end:{}", start, end))] + InvalidTimeRange { + start: i64, + end: i64, + backtrace: Backtrace, + }, +} + +/// Unix timestamp type in millis +// Use i64 so we can store timestamp before 1970-01-01 +#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd, Hash)] +pub struct Timestamp(i64); + +impl Timestamp { + pub const MAX: Timestamp = Timestamp(i64::MAX); + pub const MIN: Timestamp = Timestamp(i64::MIN); + pub const ZERO: Timestamp = Timestamp(0); + + pub const fn new(ts: i64) -> Self { + Self(ts) + } + + /// Return current (non-negative) unix timestamp in millis. + pub fn now() -> Self { + SystemTime::now() + .duration_since(time::UNIX_EPOCH) + .map(|duration| { + duration + .as_millis() + .try_into() + .map(Timestamp) + .unwrap_or(Timestamp::MAX) + }) + .unwrap_or(Timestamp::ZERO) + } + + /// Returns the earliest expired timestamp. + #[inline] + pub fn expire_time(ttl: Duration) -> Timestamp { + Timestamp::now().sub_duration_or_min(ttl) + } + + #[inline] + pub fn as_i64(&self) -> i64 { + self.0 + } + + /// Truncate the value of this timestamp by given duration, return that + /// value and keeps current timestamp unchanged. + /// + /// This function won't do overflow check. 
+ #[must_use] + pub fn truncate_by(&self, duration: Duration) -> Self { + let duration_millis = duration.as_millis() as i64; + Timestamp::new(self.0 / duration_millis * duration_millis) + } + + /// Floor the timestamp by the `duration_ms` (in millisecond) and return a + /// new Timestamp instance or None if overflow occurred. + /// + /// The `duration_ms` must be positive + #[inline] + fn checked_floor_by_i64(&self, duration_ms: i64) -> Option { + assert!(duration_ms > 0); + let normalized_ts = if self.0 >= 0 { + // self / duration_ms * duration_ms + self.0 + } else { + // (self - (duration_ms - 1)) / duration_ms * duration_ms + self.0.checked_sub(duration_ms - 1)? + }; + + normalized_ts + .checked_div(duration_ms) + .and_then(|v| v.checked_mul(duration_ms)) + .map(Timestamp) + } + + /// Returns the result of this `timestamp + offset_ms`, or None if overflow + /// occurred. + /// + /// The `offset_ms` is in millis resolution + pub fn checked_add_i64(&self, offset_ms: i64) -> Option { + self.0.checked_add(offset_ms).map(Timestamp) + } + + pub fn checked_add(&self, other: Self) -> Option { + self.0.checked_add(other.0).map(Timestamp) + } + + pub fn checked_sub(&self, other: Self) -> Option { + self.0.checked_sub(other.0).map(Timestamp) + } + + /// Returns the result of this `timestamp` - `duration`, or None if overflow + /// occurred. + pub fn checked_sub_duration(&self, duration: Duration) -> Option { + let duration_millis = duration.as_millis().try_into().ok()?; + self.0.checked_sub(duration_millis).map(Timestamp) + } + + /// Return true if the time is expired + pub fn is_expired(&self, expired_time: Timestamp) -> bool { + *self < expired_time + } + + /// Returns the result of this `timestamp` - `duration`, or MIN if overflow + /// occurred. 
+ #[must_use] + pub fn sub_duration_or_min(&self, duration: Duration) -> Timestamp { + self.checked_sub_duration(duration) + .unwrap_or(Timestamp::MIN) + } +} + +impl From for i64 { + fn from(timestamp: Timestamp) -> Self { + timestamp.0 + } +} + +impl From for Timestamp { + fn from(ts: i64) -> Self { + Self::new(ts) + } +} + +impl From<&i64> for Timestamp { + fn from(ts: &i64) -> Self { + Self::new(*ts) + } +} + +/// Unix timestamp range in millis +/// +/// The start time is inclusive and the end time is exclusive: [start, end). +/// The range is empty if start equals end. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub struct TimeRange { + /// The start timestamp (inclusive) + inclusive_start: Timestamp, + /// The end timestamp (exclusive) + exclusive_end: Timestamp, +} + +impl TimeRange { + /// Create a new time range, returns None if the start/end is invalid + pub fn new(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Option { + if inclusive_start <= exclusive_end { + Some(Self { + inclusive_start, + exclusive_end, + }) + } else { + None + } + } + + /// Create a new time range, panic if the start/end is invalid. + pub fn new_unchecked(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Self { + Self::new(inclusive_start, exclusive_end).unwrap() + } + + #[cfg(any(test, feature = "test"))] + pub fn new_unchecked_for_test(inclusive_start: i64, exclusive_end: i64) -> Self { + Self::new( + Timestamp::new(inclusive_start), + Timestamp::new(exclusive_end), + ) + .unwrap() + } + + /// Create a time range only including the single timestamp. + pub fn from_timestamp(t: Timestamp) -> Self { + // FIXME(xikai): now the time range can not express the `exclusive_end` as + // infinite. 
+ let end = t.checked_add_i64(1).unwrap_or(t); + Self::new(t, end).unwrap() + } + + /// Create a new time range of [0, max) + pub fn min_to_max() -> Self { + Self { + inclusive_start: Timestamp::MIN, + exclusive_end: Timestamp::MAX, + } + } + + /// Create a empty time range. + pub fn empty() -> Self { + Self { + inclusive_start: Timestamp::ZERO, + exclusive_end: Timestamp::ZERO, + } + } + + /// The inclusive start timestamp + #[inline] + pub fn inclusive_start(&self) -> Timestamp { + self.inclusive_start + } + + /// The exclusive end timestamp + #[inline] + pub fn exclusive_end(&self) -> Timestamp { + self.exclusive_end + } + + /// Return the reference to the exclusive end timestamp. + #[inline] + pub fn exclusive_end_ref(&self) -> &Timestamp { + &self.exclusive_end + } + + /// Returns true if the time range contains the given `ts` + #[inline] + pub fn contains(&self, ts: Timestamp) -> bool { + self.inclusive_start <= ts && ts < self.exclusive_end + } + + /// Returns a time bucket with fixed bucket size that the timestamp belongs + /// to. Returns None if overflow occurred, the bucket_duration is greater + /// than [i64::MAX] or not positive. + pub fn bucket_of(timestamp: Timestamp, bucket_duration: Duration) -> Option { + let bucket_duration_ms: i64 = bucket_duration.as_millis().try_into().ok()?; + if bucket_duration_ms <= 0 { + return None; + } + + let inclusive_start = timestamp.checked_floor_by_i64(bucket_duration_ms)?; + // end = start + bucket_duration + let exclusive_end = inclusive_start.checked_add_i64(bucket_duration_ms)?; + + Some(Self { + inclusive_start, + exclusive_end, + }) + } + + /// Returns true if this time range intersect with `other` + pub fn intersect_with(&self, other: TimeRange) -> bool { + !self.not_intersecting(other) + } + + /// Return true if the time range is expired (`exclusive_end_time` < + /// `expire_time`). 
+ pub fn is_expired(&self, expire_time: Option) -> bool { + expire_time.is_some() && self.exclusive_end() <= expire_time.unwrap() + } + + #[inline] + fn not_intersecting(&self, other: TimeRange) -> bool { + other.exclusive_end <= self.inclusive_start || other.inclusive_start >= self.exclusive_end + } + + pub fn intersected_range(&self, other: TimeRange) -> Option { + TimeRange::new( + self.inclusive_start.max(other.inclusive_start), + self.exclusive_end.min(other.exclusive_end), + ) + } +} + +impl From for TimeRangePb { + fn from(src: TimeRange) -> Self { + let mut target = TimeRangePb::default(); + target.set_start(src.inclusive_start.as_i64()); + target.set_end(src.exclusive_end.as_i64()); + target + } +} + +impl TryFrom for TimeRange { + type Error = Error; + + fn try_from(src: TimeRangePb) -> Result { + Self::new(Timestamp::new(src.start), Timestamp::new(src.end)).context(InvalidTimeRange { + start: src.start, + end: src.end, + }) + } +} + +#[cfg(test)] +mod test { + use std::time::Duration; + + use crate::time::{TimeRange, Timestamp}; + + #[test] + fn test_timestamp() { + // 1637723901000: 2021-11-24 11:18:21 + let timestamp = Timestamp::new(1637723901000); + // 1d + let ttl = Duration::from_secs(24 * 3600); + assert_eq!( + timestamp.sub_duration_or_min(ttl), + Timestamp::new(1637637501000) + ); + assert_eq!(timestamp.truncate_by(ttl), Timestamp::new(1637712000000)); + assert_eq!( + timestamp.checked_floor_by_i64(2000), + Some(Timestamp::new(1637723900000)) + ); + assert_eq!( + timestamp.checked_add_i64(2000), + Some(Timestamp::new(1637723903000)) + ); + assert_eq!( + timestamp.checked_sub_duration(ttl), + Some(Timestamp::new(1637637501000)) + ); + } + + #[test] + fn test_time_range() { + // [100,200) + let time_range = TimeRange::new_unchecked_for_test(100, 200); + assert!(time_range.contains(Timestamp::new(150))); + assert!(time_range.contains(Timestamp::new(100))); + assert!(!time_range.contains(Timestamp::new(200))); + + 
assert!(!time_range.is_expired(Some(Timestamp::new(50)))); + assert!(time_range.is_expired(Some(Timestamp::new(200)))); + + assert_eq!( + TimeRange::bucket_of(Timestamp::new(100), Duration::from_millis(2)), + Some(TimeRange::new_unchecked_for_test(100, 102)) + ); + + let time_range2 = TimeRange::new_unchecked_for_test(200, 300); + assert!(!time_range.intersect_with(time_range2)); + let time_range3 = TimeRange::new_unchecked_for_test(50, 200); + assert!(time_range.intersect_with(time_range3)); + + assert!(time_range.not_intersecting(time_range2)); + assert!(!time_range.not_intersecting(time_range3)); + } + + #[test] + fn test_bucket_of_negative_timestamp() { + let ts = Timestamp::new(-126316800000); + let range = TimeRange::bucket_of(ts, Duration::from_millis(25920000000)).unwrap(); + assert!(range.contains(ts), "range:{:?}", range); + } +} diff --git a/common_util/Cargo.toml b/common_util/Cargo.toml new file mode 100644 index 0000000000..884b13236b --- /dev/null +++ b/common_util/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "common_util" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["env_logger"] + +[dependencies] +# In alphabetical order +backtrace = "0.3.9" +common_types = { path = "../common_types", features = ["test"] } +chrono = "0.4" +crossbeam-utils = "0.8" +env_logger = { version = "0.6", optional = true } +lazy_static = "1.4.0" +libc = "0.2" +log = "0.4" +logger = { path = "../components/logger"} +snafu = { version ="0.6.10", features = ["backtraces"]} +serde = {version = "1.0.81", features = ["derive"]} +serde_derive = "1.0.81" +pin-project-lite = "0.2" +prometheus = "0.12" +proto = { path = "../proto" } +time = "0.1" +tokio = { version = "1.15", features = ["full"] } +toml = "0.5" + +[dev-dependencies] +env_logger = "0.6" +gag = "1.0" +nix = "0.19" +slog = "2.7" +tempfile = "3.1.0" +tokio-test = "0.4.2" + 
+[dev-dependencies.slog-global] +version = "0.1" +git = "https://github.com/breezewish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/common_util/src/alloc_tracker.rs b/common_util/src/alloc_tracker.rs new file mode 100644 index 0000000000..7e0979cb0f --- /dev/null +++ b/common_util/src/alloc_tracker.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alloc tracker + +use std::sync::atomic::{AtomicUsize, Ordering}; + +/// Collect memory usage from tracker, useful for extending the tracker +pub trait Collector { + /// Called when `bytes` bytes memory is allocated and tracked by the tracker + fn on_allocate(&self, bytes: usize); + + /// Called when `bytes` bytes memory is freed and tracked by the tracker + fn on_free(&self, bytes: usize); +} + +/// A tracker to track memory in used +// TODO(yingwen): Impl a thread local or local tracker that are not thread safe, +// and collect statistics into the thread safe one for better performance +pub struct Tracker { + collector: T, + bytes_allocated: AtomicUsize, +} + +impl Tracker { + pub fn new(collector: T) -> Self { + Self { + collector, + bytes_allocated: AtomicUsize::new(0), + } + } + + /// Increase consumption of this tracker by bytes + pub fn consume(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + self.collector.on_allocate(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + pub fn release(&self, bytes: usize) { + self.bytes_allocated.fetch_sub(bytes, Ordering::Relaxed); + self.collector.on_free(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +impl Drop for Tracker { + fn drop(&mut self) { + let bytes = *self.bytes_allocated.get_mut(); + self.collector.on_free(bytes); + } +} + +/// The noop 
collector does nothing on alloc and free +struct NoopCollector; + +impl Collector for NoopCollector { + fn on_allocate(&self, _bytes: usize) {} + + fn on_free(&self, _bytes: usize) {} +} + +/// A simple tracker hides the collector api +pub struct SimpleTracker(Tracker); + +impl Default for SimpleTracker { + fn default() -> Self { + Self(Tracker::new(NoopCollector)) + } +} + +impl SimpleTracker { + /// Increase consumption of this tracker by bytes + #[inline] + pub fn consume(&self, bytes: usize) { + self.0.consume(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + #[inline] + pub fn release(&self, bytes: usize) { + self.0.release(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.0.bytes_allocated() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_tracker() { + let tracker = SimpleTracker::default(); + tracker.consume(256); + assert_eq!(256, tracker.bytes_allocated()); + + tracker.release(100); + assert_eq!(156, tracker.bytes_allocated()); + } + + #[test] + fn test_collector() { + use std::sync::atomic::AtomicBool; + + struct MockCollector { + allocated: AtomicBool, + freed: AtomicBool, + } + + impl MockCollector { + fn new() -> Self { + Self { + allocated: AtomicBool::new(false), + freed: AtomicBool::new(false), + } + } + } + + impl Drop for MockCollector { + fn drop(&mut self) { + assert!(*self.allocated.get_mut()); + assert!(*self.freed.get_mut()); + } + } + + impl Collector for MockCollector { + fn on_allocate(&self, bytes: usize) { + assert_eq!(800, bytes); + self.allocated.store(true, Ordering::Relaxed); + } + + fn on_free(&self, bytes: usize) { + if self.freed.load(Ordering::Relaxed) { + assert_eq!(440, bytes); + } else { + assert_eq!(360, bytes); + } + self.freed.store(true, Ordering::Relaxed); + } + } + + let tracker = Tracker::new(MockCollector::new()); + 
tracker.consume(800); + tracker.release(360); + } +} diff --git a/common_util/src/codec/compact/bytes.rs b/common_util/src/codec/compact/bytes.rs new file mode 100644 index 0000000000..aeeff7739d --- /dev/null +++ b/common_util/src/codec/compact/bytes.rs @@ -0,0 +1,130 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use std::convert::TryFrom; + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + compact::{ + DecodeEmptyValue, DecodeValue, DecodeVarint, EncodeValue, EncodeVarint, Error, + MemCompactDecoder, MemCompactEncoder, Result, TryIntoUsize, + }, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder<[u8]> for MemCompactEncoder { + type Error = Error; + + // EncodeCompactBytes joins bytes with its length into a byte slice. It is more + // efficient in both space and time compare to EncodeBytes. Note that the + // encoded result is not memcomparable. + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + varint::encode_varint(buf, value.len() as i64).context(EncodeVarint)?; + buf.write_slice(value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + consts::MAX_VARINT_BYTES + value.len() + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + let v = usize::try_from(varint::decode_varint(buf).context(DecodeVarint)?) 
+ .context(TryIntoUsize)?; + ensure!(buf.remaining_slice().len() >= v, DecodeEmptyValue); + value + .write_slice(&buf.remaining_slice()[..v]) + .context(DecodeValue)?; + buf.must_advance(v); + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 10, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 13, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 21, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/compact/datum.rs b/common_util/src/codec/compact/datum.rs new file mode 100644 index 0000000000..0d80088e06 --- /dev/null +++ b/common_util/src/codec/compact/datum.rs @@ -0,0 +1,264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Datum compact codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::Datum, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{EncodeKey, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, DecodeTo, Encoder, +}; + +// For float points, we use same encoding as mem comparable encoder +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Double(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Float(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v) + } + // For string, just encode/decode like bytes. 
+ Datum::String(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Double(v) => self.estimate_encoded_size(v), + Datum::Float(v) => self.estimate_encoded_size(v), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => self.estimate_encoded_size(&(i64::from(*v))), + 
Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + } + } +} + +macro_rules! decode_var_u64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_var_u64_into_bool { + ($self: ident, $v: ident, $actual: ident, $buf: ident) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_var_i64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::VARINT_FLAG, $actual)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. + fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + let actual = match self.maybe_read_null(buf)? 
{ + Some(v) => v, + None => { + *value = Datum::Null; + return Ok(()); + } + }; + + match value { + Datum::Null => { + Self::ensure_flag(consts::NULL_FLAG, actual)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Double(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Float(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Varbinary(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(consts::UVARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_var_u64_into!(self, v, actual, buf, u32), + Datum::UInt16(v) => decode_var_u64_into!(self, v, actual, buf, u16), + Datum::UInt8(v) => decode_var_u64_into!(self, v, actual, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_var_i64_into!(self, v, actual, buf, i32), + Datum::Int16(v) => decode_var_i64_into!(self, v, actual, buf, i16), + Datum::Int8(v) => decode_var_i64_into!(self, v, actual, buf, i8), + Datum::Boolean(v) => decode_var_u64_into_bool!(self, v, actual, buf), + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_types::bytes::Bytes; + + use super::*; + + // TODO(yingwen): Test nullable. 
+ #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 10), + (Datum::Double(10.5), 8), + (Datum::Float(1.99), 4), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 21), + (Datum::String(StringBytes::from_static("hello world")), 21), + (Datum::UInt64(12345), 10), + (Datum::UInt32(1000), 10), + (Datum::UInt16(65000), 10), + (Datum::UInt8(150), 10), + (Datum::Int64(-100209), 10), + (Datum::Int32(-10020), 10), + (Datum::Int16(32500), 10), + (Datum::Int8(-120), 10), + (Datum::Boolean(true), 10), + (Datum::Boolean(false), 10), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Double(0.0), + Datum::Float(0.0), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + Datum::Boolean(false), + ]; + let encoder = MemCompactEncoder; + let decoder = MemCompactDecoder; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + encoder.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, encoder.estimate_encoded_size(&x.0)); + decoder + .decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } +} diff --git a/common_util/src/codec/compact/float.rs b/common_util/src/codec/compact/float.rs new file mode 100644 index 0000000000..867ff3282b --- /dev/null +++ b/common_util/src/codec/compact/float.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::mem; + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeValue, EncodeValue, Error, MemCompactDecoder, MemCompactEncoder, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f64) -> Result<()> { + buf.write_f64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f64) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f64) -> Result<()> { + *value = buf.read_f64().context(DecodeValue)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f32) -> Result<()> { + buf.write_f32(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f32) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f32) -> Result<()> { + *value = buf.read_f32().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestF64 { + data: f64, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_f64_codec() { + let data = vec![ + TestF64 { + data: 162132470.5, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MIN, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MAX, + estimate_encoded_size: 8, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = 0.0; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert!((d - x.data).abs() < f64::EPSILON); + } + } +} diff --git 
a/common_util/src/codec/compact/mod.rs b/common_util/src/codec/compact/mod.rs new file mode 100644 index 0000000000..1327e05929 --- /dev/null +++ b/common_util/src/codec/compact/mod.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Mem compact format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go +mod bytes; +mod datum; +mod float; +mod number; + +use common_types::bytes::MemBuf; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::codec::consts; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode varint, err:{}", source))] + DecodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Try into usize error:{}.\nBacktrace:\n{}", source, backtrace))] + TryIntoUsize { + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, + + 
#[snafu(display("Datum cannot be null.\nBacktrace:\n{}", backtrace))] + NullDatum { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Mem compact encoder +pub struct MemCompactEncoder; + +/// Mem compact decoder +pub struct MemCompactDecoder; + +impl MemCompactDecoder { + /// Returns None if we need to return null datum, otherwise return the flag. + fn maybe_read_null(&self, buf: &mut B) -> Result> { + let actual = buf.read_u8().context(DecodeKey)?; + // If actual flag is null, need to check whether this datum is nullable. + if actual == consts::NULL_FLAG { + // The decoder need to return null datum. + return Ok(None); + } + + Ok(Some(actual)) + } + + #[inline] + fn ensure_flag(expect: u8, actual: u8) -> Result<()> { + // Actual flag is not null. + ensure!(expect == actual, InvalidKeyFlag { expect, actual }); + Ok(()) + } +} diff --git a/common_util/src/codec/compact/number.rs b/common_util/src/codec/compact/number.rs new file mode 100644 index 0000000000..56aa76504f --- /dev/null +++ b/common_util/src/codec/compact/number.rs @@ -0,0 +1,160 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeVarint, EncodeVarint, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + varint::encode_varint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + consts::MAX_VARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = varint::decode_varint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + varint::encode_uvarint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + consts::MAX_UVARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = varint::decode_uvarint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + #[test] + fn test_compact_i64_codec() { + let data = vec![ + TestI64 { + data: 1621324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705000, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1521324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705123, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN + 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: 0, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MAX, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 47) - 1, + 
estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 47, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 23) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 23, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 33) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 33, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 55) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 55, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1, + estimate_encoded_size: 10, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/consts.rs b/common_util/src/codec/consts.rs new file mode 100644 index 0000000000..843985eec6 --- /dev/null +++ b/common_util/src/codec/consts.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common constants used in codec + +// First byte in the encoded value which specifies the encoding type. +// TODO(yingwen): Replace flags by datum kind. (Incompatible with old format). 
+pub const NULL_FLAG: u8 = 0; +pub const BYTES_FLAG: u8 = 1; +pub const COMPACT_BYTES_FLAG: u8 = 2; +pub const INT_FLAG: u8 = 3; +pub const UINT_FLAG: u8 = 4; +pub const FLOAT_FLAG: u8 = 5; +pub const VARINT_FLAG: u8 = 8; +pub const UVARINT_FLAG: u8 = 9; + +/// Max bytes varint can use +pub const MAX_VARINT_BYTES: usize = 10; +/// Max bytes uvarint can be use +pub const MAX_UVARINT_BYTES: usize = 10; +/// Sign mask for u64/i64 conversion +pub const SIGN_MASK: u64 = 0x8000000000000000; diff --git a/common_util/src/codec/memcomparable/bytes.rs b/common_util/src/codec/memcomparable/bytes.rs new file mode 100644 index 0000000000..878ad9c051 --- /dev/null +++ b/common_util/src/codec/memcomparable/bytes.rs @@ -0,0 +1,279 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + memcomparable::{ + DecodeValueGroup, DecodeValueMarker, DecodeValuePadding, EncodeValue, Error, MemComparable, + Result, + }, + DecodeTo, Encoder, +}; + +const ENC_GROUP_SIZE: usize = 8; +const ENC_MARKER: u8 = 0xFF; +const ENC_PAD: u8 = 0x0; +const PADS: [u8; ENC_GROUP_SIZE] = [0; ENC_GROUP_SIZE]; + +impl Encoder<[u8]> for MemComparable { + type Error = Error; + + // encode Bytes guarantees the encoded value is in ascending order for + // comparison, encoding with the following rule: + // [group1][marker1]...[groupN][markerN] + // group is 8 bytes slice which is padding with 0. 
+ // marker is `0xFF - padding 0 count` + // For example: + // + // ``` + // [] -> [0, 0, 0, 0, 0, 0, 0, 0, 247] + // [1, 2, 3] -> [1, 2, 3, 0, 0, 0, 0, 0, 250] + // [1, 2, 3, 0] -> [1, 2, 3, 0, 0, 0, 0, 0, 251] + // [1, 2, 3, 4, 5, 6, 7, 8] -> [1, 2, 3, 4, 5, 6, 7, 8, 255, 0, 0, 0, 0, 0, 0, 0, 0, 247] + // ``` + // + // Refer: https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format#memcomparable-format + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + let value_len = value.len(); + for idx in (0..=value_len).step_by(ENC_GROUP_SIZE) { + let remain = value_len - idx; + let mut pad_count = 0; + if remain >= ENC_GROUP_SIZE { + buf.write_slice(&value[idx..idx + ENC_GROUP_SIZE]) + .context(EncodeValue)?; + } else { + pad_count = ENC_GROUP_SIZE - remain; + buf.write_slice(&value[idx..]).context(EncodeValue)?; + buf.write_slice(&PADS[..pad_count]).context(EncodeValue)?; + } + let marker = ENC_MARKER - pad_count as u8; + buf.write_u8(marker).context(EncodeValue)?; + } + Ok(()) + } + + // Allocate more space to avoid unnecessary slice growing. + // Assume that the byte slice size is about `(len(data) / encGroupSize + 1) * + // (encGroupSize + 1)` bytes, that is `(len(data) / 8 + 1) * 9` in our + // implement. + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + (value.len() / ENC_GROUP_SIZE + 1) * (ENC_GROUP_SIZE + 1) + } +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + // decode Bytes which is encoded by encode Bytes before, + // returns the leftover bytes and decoded value if no error. 
+ fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + loop { + let b = buf.remaining_slice(); + ensure!(b.len() > ENC_GROUP_SIZE, DecodeValueGroup); + + let group_bytes = &b[..ENC_GROUP_SIZE + 1]; + let group = &group_bytes[..ENC_GROUP_SIZE]; + let marker = group_bytes[ENC_GROUP_SIZE]; + let pad_count = usize::from(ENC_MARKER - marker); + ensure!( + pad_count <= ENC_GROUP_SIZE, + DecodeValueMarker { group_bytes } + ); + + let real_group_size = ENC_GROUP_SIZE - pad_count; + value + .write_slice(&group[..real_group_size]) + .context(EncodeValue)?; + + if pad_count != 0 { + // Check validity of padding bytes. + for v in &group[real_group_size..] { + ensure!(*v == ENC_PAD, DecodeValuePadding { group_bytes }); + } + buf.must_advance(ENC_GROUP_SIZE + 1); + + break; + } + buf.must_advance(ENC_GROUP_SIZE + 1); + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 18, + }, + ]; + + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, 
c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TbBytes { + arg1: Bytes, + arg2: Bytes, + ret: Ordering, + } + + #[test] + fn test_bytes_order() { + let data = vec![ + TbBytes { + arg1: Bytes::new(), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Equal, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0xFF, 0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(b"b"), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x01]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x01]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x03, 0x03, 0x04]), + arg2: Bytes::from_static(&[0x01, 0x03, 0x03, 0x05]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]), + arg2: Bytes::from_static(&[0x01, 0x02, 
0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/datum.rs b/common_util/src/codec/memcomparable/datum.rs new file mode 100644 index 0000000000..3af3d5f474 --- /dev/null +++ b/common_util/src/codec/memcomparable/datum.rs @@ -0,0 +1,290 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum comparable codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::{Datum, DatumKind}, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{EncodeKey, Error, MemComparable, Result, UnsupportedKind}, + DecodeTo, Encoder, +}; + +// TODO(yingwen): Consider collate for string. +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + // For string, we just use same encoding method as bytes now. 
+ Datum::String(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Double(_) => UnsupportedKind { + kind: DatumKind::Double, + } + .fail(), + Datum::Float(_) => UnsupportedKind { + kind: DatumKind::Float, + } + .fail(), + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => 
self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + // Unsupported kind, but we return 1 + Datum::Double(_) | Datum::Float(_) => 1, + } + } +} + +macro_rules! decode_u64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_u64_into_bool { + ($self: ident, $v: ident, $buf: ident) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_i64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::INT_FLAG)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemComparable { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. + fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + match value { + Datum::Null => { + Self::ensure_flag(buf, consts::NULL_FLAG)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Varbinary(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. 
The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(buf, consts::UINT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_u64_into!(self, v, buf, u32), + Datum::UInt16(v) => decode_u64_into!(self, v, buf, u16), + Datum::UInt8(v) => decode_u64_into!(self, v, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_i64_into!(self, v, buf, i32), + Datum::Int16(v) => decode_i64_into!(self, v, buf, i16), + Datum::Int8(v) => decode_i64_into!(self, v, buf, i8), + Datum::Boolean(v) => decode_u64_into_bool!(self, v, buf), + Datum::Double(_) => { + return UnsupportedKind { + kind: DatumKind::Double, + } + .fail(); + } + Datum::Float(_) => { + return UnsupportedKind { + kind: DatumKind::Float, + } + .fail(); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use core::cmp::Ordering; + + use common_types::bytes::Bytes; + + use super::*; + + #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 9), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 18), + (Datum::String(StringBytes::from_static("hello world")), 18), + (Datum::UInt64(100209), 9), + (Datum::UInt32(10020), 9), + (Datum::UInt16(65000), 9), + (Datum::UInt8(150), 9), + (Datum::Int64(-100209), 9), + (Datum::Int32(-10020), 9), + (Datum::Int16(32500), 9), + (Datum::Int8(-120), 9), + (Datum::Boolean(true), 9), + (Datum::Boolean(false), 9), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + 
Datum::Boolean(false), + ]; + let c = MemComparable; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + c.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, c.estimate_encoded_size(&x.0)); + c.decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } + + #[test] + fn test_datum_order() { + let data = vec![ + // (arg1, arg2, cmp order of arg1 and arg2) + (Datum::Null, Datum::Null, Ordering::Equal), + ( + Datum::Timestamp(Timestamp::new(12345)), + Datum::Timestamp(Timestamp::new(123456)), + Ordering::Less, + ), + ( + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + ])), + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + ])), + Ordering::Less, + ), + ( + Datum::String(StringBytes::from_static("abce123")), + Datum::String(StringBytes::from_static("abce1234")), + Ordering::Less, + ), + (Datum::UInt64(888), Datum::UInt64(889), Ordering::Less), + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.0).unwrap(); + c.encode(&mut buf2, &x.1).unwrap(); + assert_eq!(x.2, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/mod.rs b/common_util/src/codec/memcomparable/mod.rs new file mode 100644 index 0000000000..1321fffdab --- /dev/null +++ b/common_util/src/codec/memcomparable/mod.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Mem comparable format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go + +mod bytes; +mod datum; +mod number; + +use common_types::{ + bytes::{BytesMut, MemBuf}, + datum::DatumKind, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported datum kind to compare in mem, kind :{}.\nBacktrace:\n{}", + kind, + backtrace + ))] + UnsupportedKind { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value group.\nBacktrace:\n{}", backtrace))] + DecodeValueGroup { backtrace: Backtrace }, + + #[snafu(display( + "Invalid marker byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValueMarker { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid padding byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValuePadding { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, +} + +define_result!(Error); + +/// Mem comparable codec +pub struct MemComparable; + +impl MemComparable { + fn 
ensure_flag(buf: &mut B, flag: u8) -> Result<()> { + let actual = buf.read_u8().context(DecodeKey)?; + ensure!( + flag == actual, + InvalidKeyFlag { + expect: flag, + actual + } + ); + Ok(()) + } +} diff --git a/common_util/src/codec/memcomparable/number.rs b/common_util/src/codec/memcomparable/number.rs new file mode 100644 index 0000000000..70cb36b03d --- /dev/null +++ b/common_util/src/codec/memcomparable/number.rs @@ -0,0 +1,333 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{DecodeValue, EncodeValue, Error, MemComparable, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + buf.write_u64(encode_int_to_cmp_uint(*value)) + .context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = decode_cmp_uint_to_int(buf.read_u64().context(DecodeValue)?); + Ok(()) + } +} + +// encode_int_to_cmp_uint make int v to comparable uint type +fn encode_int_to_cmp_uint(v: i64) -> u64 { + (v as u64) ^ consts::SIGN_MASK +} + +// decode_cmp_uint_to_int decodes the u that encoded by encode_int_to_cmp_uint +fn decode_cmp_uint_to_int(u: u64) -> i64 { + (u ^ consts::SIGN_MASK) as i64 +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + buf.write_u64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = 
buf.read_u64().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + + impl TestI64 { + fn new(data: i64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_i64_codec() { + let data = vec![ + TestI64::new(1621324705), + TestI64::new(1621324705000), + TestI64::new(1521324705), + TestI64::new(1621324705123), + TestI64::new(i64::MIN), + TestI64::new(i64::MIN + 1), + TestI64::new(0), + TestI64::new(i64::MAX), + TestI64::new((1 << 47) - 1), + TestI64::new(-1 << 47), + TestI64::new((1 << 23) - 1), + TestI64::new(-1 << 23), + TestI64::new((1 << 33) - 1), + TestI64::new(-1 << 33), + TestI64::new((1 << 55) - 1), + TestI64::new(-1 << 55), + TestI64::new(1), + TestI64::new(-1), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TestU64 { + data: u64, + estimate_encoded_size: usize, + } + + impl TestU64 { + fn new(data: u64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_u64_codec() { + let data = vec![ + TestU64::new(0), + TestU64::new(u64::from(u8::MAX)), + TestU64::new(u64::from(u16::MAX)), + TestU64::new(u64::from(u32::MAX)), + TestU64::new(u64::MAX), + TestU64::new((1 << 24) - 1), + TestU64::new((1 << 48) - 1), + TestU64::new((1 << 56) - 1), + TestU64::new(1), + TestU64::new(i8::MAX as u64), + TestU64::new(i16::MAX as u64), + TestU64::new(i32::MAX as u64), + TestU64::new(i64::MAX as u64), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + 
let mut buf = &buf[..]; + for x in &data { + let mut d = 0; + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TblI64 { + arg1: i64, + arg2: i64, + ret: Ordering, + } + + #[test] + fn test_i64_order() { + let data = vec![ + TblI64 { + arg1: -1, + arg2: 1, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MAX, + arg2: i64::MIN, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i64::MAX, + arg2: i32::MAX as i64, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i32::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MIN, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i8::MIN as i64, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: i16::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 1, + arg2: -1, + ret: Ordering::Greater, + }, + TblI64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblI64 { + arg1: -1, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblI64 { + arg1: i16::MAX as i64, + arg2: i16::MAX as i64, + ret: Ordering::Equal, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } + + struct TblU64 { + arg1: u64, + arg2: u64, + ret: Ordering, + } + + #[test] + fn test_u64_order() { + let data = vec![ + TblU64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblU64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: 1, + ret: Ordering::Less, + }, + TblU64 { + arg1: i8::MAX as u64, + arg2: i16::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u32::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u8::MAX as 
u64, + arg2: i8::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u16::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u64::MAX as u64, + arg2: i64::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: i64::MAX as u64, + arg2: u32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u64::MAX, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: u64::MAX, + ret: Ordering::Less, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/mod.rs b/common_util/src/codec/mod.rs new file mode 100644 index 0000000000..0a9825f355 --- /dev/null +++ b/common_util/src/codec/mod.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Data encoding + +// TODO(yingwen): Buf use generic type to avoid cost of vtable call per +// encode/decode + +pub mod compact; +mod consts; +pub mod memcomparable; +pub mod row; +mod varint; + +use common_types::bytes::{MemBuf, MemBufMut}; + +// encoder/decoder +/// Data encode abstraction +pub trait Encoder { + type Error; + + /// Encode value into buf + fn encode(&self, buf: &mut B, value: &T) -> Result<(), Self::Error>; + + /// Estimate the value size after encoded + fn estimate_encoded_size(&self, value: &T) -> usize; +} + +/// Data decode to target +pub trait DecodeTo { + type Error; + + /// Decode from `buf` to `value` + fn decode_to(&self, buf: &mut B, value: &mut T) -> Result<(), Self::Error>; +} + +/// Data decode abstraction +pub trait Decoder { + type Error; + + /// Decode `value` from `buf` + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/common_util/src/codec/row.rs b/common_util/src/codec/row.rs new file mode 100644 index 0000000000..54c1b8ccbe --- /dev/null +++ 
b/common_util/src/codec/row.rs @@ -0,0 +1,234 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Row encoding utils +//! +//! Notice: The encoding method is used both in wal and memtable. Be careful for +//! data compatibility + +use std::convert::TryFrom; + +use common_types::{ + bytes::{BufMut, ByteVec, BytesMut, MemBuf, MemBufMut}, + datum::Datum, + row::{Row, RowGroup}, + schema::{IndexInWriterSchema, Schema}, +}; +use snafu::{ResultExt, Snafu}; + +use crate::codec::{ + compact::{MemCompactDecoder, MemCompactEncoder}, + DecodeTo, Decoder, Encoder, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode row datum, err:{}", source))] + EncodeRowDatum { + source: crate::codec::compact::Error, + }, + + #[snafu(display("Failed to decode row datum, err:{}", source))] + DecodeRowDatum { + source: crate::codec::compact::Error, + }, +} + +define_result!(Error); + +/// Compact row encoder for wal. +struct WalRowEncoder<'a> { + /// Schema of table + table_schema: &'a Schema, + /// Index of table column in writer + index_in_writer: &'a IndexInWriterSchema, +} + +impl<'a> Encoder for WalRowEncoder<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemCompactEncoder; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column in writer + encoder + .encode(buf, &value[writer_index]) + .context(EncodeRowDatum)?; + } + None => { + // Column not in writer + encoder.encode(buf, &Datum::Null).context(EncodeRowDatum)?; + } + } + } + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemCompactEncoder; + let mut total_len = 0; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column 
in writer + total_len += encoder.estimate_encoded_size(&value[writer_index]); + } + None => { + // Column not in writer + total_len += encoder.estimate_encoded_size(&Datum::Null); + } + } + } + + total_len + } +} + +/// Compact row decoder for wal, supports projection. +#[derive(Debug)] +pub struct WalRowDecoder<'a> { + /// Schema of row to decode + schema: &'a Schema, +} + +impl<'a> WalRowDecoder<'a> { + /// Create a decoder with given `schema`, the caller should ensure the + /// schema matches the row to be decoded. + pub fn new(schema: &'a Schema) -> Self { + Self { schema } + } +} + +impl<'a> Decoder for WalRowDecoder<'a> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let num_columns = self.schema.num_columns(); + let mut datums = Vec::with_capacity(num_columns); + + for idx in 0..num_columns { + let column_schema = &self.schema.column(idx); + let datum_kind = &column_schema.data_type; + let decoder = MemCompactDecoder; + + // Decode each column + let mut datum = Datum::empty(datum_kind); + decoder.decode_to(buf, &mut datum).context(DecodeRowDatum)?; + + datums.push(datum); + } + + Ok(Row::from_datums(datums)) + } +} + +/// Encode the row group in the format that can write to wal. +/// +/// Arguments +/// - row_group: The rows to be encoded and wrote to. +/// - table_schema: The schema the row group need to be encoded into, the schema +/// of the row group need to be write compatible for the table schema. +/// - index_in_writer: The index mapping from table schema to column in the +/// schema of row group. +/// - encoded_rows: The Vec to store bytes of each encoded row. 
+pub fn encode_row_group_for_wal( + row_group: &RowGroup, + table_schema: &Schema, + index_in_writer: &IndexInWriterSchema, + encoded_rows: &mut Vec, +) -> Result<()> { + let row_encoder = WalRowEncoder { + table_schema, + index_in_writer, + }; + + // Use estimated size of first row to avoid compute all + let row_estimated_size = match row_group.get_row(0) { + Some(first_row) => row_encoder.estimate_encoded_size(first_row), + // The row group is empty + None => return Ok(()), + }; + + encoded_rows.reserve(row_group.num_rows()); + + // Each row is constructed in writer schema, we need to encode it in + // `table_schema` + for row in row_group { + let mut buf = Vec::with_capacity(row_estimated_size); + row_encoder.encode(&mut buf, row)?; + + encoded_rows.push(buf); + } + + Ok(()) +} + +/// Return the next prefix key +/// +/// Assume there are keys like: +/// +/// ```text +/// rowkey1 +/// rowkey1_column1 +/// rowkey1_column2 +/// rowKey2 +/// ``` +/// +/// If we seek 'rowkey1' Next, we will get 'rowkey1_column1'. +/// If we seek 'rowkey1' PrefixNext, we will get 'rowkey2'. 
+/// +/// Ported from +/// +/// REQUIRE: The key should be memory comparable +// TODO(yingwen): Maybe add scratch param +// TODO(yingwen): Move to another mod +pub fn key_prefix_next(key: &[u8]) -> BytesMut { + let mut buf = BytesMut::from(key); + // isize should be enough to represent the key len + let mut idx = isize::try_from(key.len() - 1).unwrap(); + while idx >= 0 { + let i = idx as usize; + buf[i] += 1; + if buf[i] != 0 { + break; + } + + idx -= 1; + } + if idx == -1 { + buf.copy_from_slice(key); + buf.put_u8(0); + } + + buf +} +#[cfg(test)] +mod test { + use common_types::schema::IndexInWriterSchema; + + use crate::codec::{ + row::{WalRowDecoder, WalRowEncoder}, + Decoder, Encoder, + }; + + #[test] + fn test_wal_encode_decode() { + let schema = common_types::tests::build_schema(); + let rows = common_types::tests::build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let wal_encoder = WalRowEncoder { + table_schema: &schema, + index_in_writer: &index_in_writer, + }; + let wal_decoder = WalRowDecoder::new(&schema); + for row in rows { + let mut buf = Vec::new(); + wal_encoder.encode(&mut buf, &row).unwrap(); + let row_decoded = wal_decoder.decode(&mut buf.as_slice()).unwrap(); + assert_eq!(row_decoded, row); + } + } +} diff --git a/common_util/src/codec/varint.rs b/common_util/src/codec/varint.rs new file mode 100644 index 0000000000..eb5616b692 --- /dev/null +++ b/common_util/src/codec/varint.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Varint for codec whose test is covered by compact/number.rs +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Value larger than 64 bits (overflow).\nBacktrace:\n{}", backtrace))] + UvarintOverflow { backtrace: Backtrace }, +} + +define_result!(Error); + +// from https://golang.org/src/encoding/binary/varint.go?s=2506:2545#L68 +// PutVarint encodes an int64 into buf and returns the number of bytes written. +// If the buffer is too small, PutVarint will panic. +// +// ```go +// func PutVarint(buf []byte, x int64) int { +// ux := uint64(x) << 1 +// if x < 0 { +// ux = ^ux +// } +// return PutUvarint(buf, ux) +// } +// ``` +pub fn encode_varint(buf: &mut B, value: i64) -> Result<()> { + let mut x = (value as u64) << 1; + if value < 0 { + x = !x; + } + encode_uvarint(buf, x) +} + +// +// from https://golang.org/src/encoding/binary/varint.go?s=1611:1652#L31 +// +// ```go +// func PutUvarint(buf []byte, x uint64) int { +// i := 0 +// for x >= 0x80 { +// buf[i] = byte(x) | 0x80 +// x >>= 7 +// i++ +// } +// buf[i] = byte(x) +// return i + 1 +// } +// ``` +pub fn encode_uvarint(buf: &mut B, mut x: u64) -> Result<()> { + while x >= 0x80 { + buf.write_u8(x as u8 | 0x80).context(EncodeVarint)?; + x >>= 7; + } + buf.write_u8(x as u8).context(EncodeVarint)?; + Ok(()) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2955:2991#L84 +// Varint decodes an int64 from buf and returns that value and the +// number of bytes read (> 0). 
If an error occurred, the value is 0 +// and the number of bytes n is <= 0 with the following meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Varint(buf []byte) (int64, int) { +// ux, n := Uvarint(buf) // ok to continue in presence of error +// x := int64(ux >> 1) +// if ux&1 != 0 { +// x = ^x +// } +// return x, n +// } +// ``` +pub fn decode_varint(buf: &mut B) -> Result { + let ux = decode_uvarint(buf)?; + let mut x = (ux >> 1) as i64; + if ux & 1 != 0 { + x = !x; + } + Ok(x) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2070:2108#L50 +// Uvarint decodes a uint64 from buf and returns that value and the +// number of bytes read (> 0). If an error occurred, the value is 0 +// and the number of bytes n is <= 0 meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Uvarint(buf []byte) (uint64, int) { +// var x uint64 +// var s uint +// for i, b := range buf { +// if b < 0x80 { +// if i > 9 || i == 9 && b > 1 { +// return 0, -(i + 1) // overflow +// } +// return x | uint64(b)<(buf: &mut B) -> Result { + let mut x: u64 = 0; + let mut s: usize = 0; + let len = buf.remaining_slice().len(); + for i in 0..len { + let b = buf.read_u8().context(DecodeValue)?; + if b < 0x80 { + if i > 9 || i == 9 && b > 1 { + return UvarintOverflow.fail(); // overflow + } + return Ok(x | u64::from(b) << s); + } + x |= u64::from(b & 0x7f) << s; + s += 7; + } + DecodeEmptyValue.fail() +} + +#[cfg(test)] +mod tests { + use common_types::bytes::BytesMut; + + use super::*; + + #[test] + fn test_encode_decode_varint() { + let nums: Vec<(i64, usize)> = vec![ + (i64::MIN, 10), + (-1000000000000000, 8), + (-100000000000, 6), + (-1000000000, 5), + (-100000, 3), + (-65535, 3), + (-1000, 2), + (-125, 2), + (-32, 1), + (0, 1), + (64, 2), + (125, 2), + (1000, 2), + (65535, 3), + (10000, 3), + 
(1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (i64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_varint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_varint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } + + #[test] + fn test_encode_decode_uvarint() { + let nums: Vec<(u64, usize)> = vec![ + (0, 1), + (64, 1), + (125, 1), + (1000, 2), + (65535, 3), + (10000, 2), + (1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (u64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_uvarint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_uvarint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } +} diff --git a/common_util/src/config.rs b/common_util/src/config.rs new file mode 100644 index 0000000000..ac7232767f --- /dev/null +++ b/common_util/src/config.rs @@ -0,0 +1,711 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +//! Configure utils + +//This module is forked from tikv and remove unnessary code. 
+//https://github.com/tikv/tikv/blob/HEAD/src/util/config.rs +use std::{ + fmt::{self, Write}, + ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign}, + str::{self, FromStr}, + time::Duration, +}; + +use proto::analytic_common; +use serde::{ + de::{self, Unexpected, Visitor}, + Deserialize, Deserializer, Serialize, Serializer, +}; + +const UNIT: u64 = 1; + +const BINARY_DATA_MAGNITUDE: u64 = 1024; +pub const B: u64 = UNIT; +pub const KIB: u64 = UNIT * BINARY_DATA_MAGNITUDE; +pub const MIB: u64 = KIB * BINARY_DATA_MAGNITUDE; +pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE; +pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE; +pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE; + +const TIME_MAGNITUDE_1: u64 = 1000; +const TIME_MAGNITUDE_2: u64 = 60; +const TIME_MAGNITUDE_3: u64 = 24; +const MS: u64 = UNIT; +const SECOND: u64 = MS * TIME_MAGNITUDE_1; +const MINUTE: u64 = SECOND * TIME_MAGNITUDE_2; +const HOUR: u64 = MINUTE * TIME_MAGNITUDE_2; +const DAY: u64 = HOUR * TIME_MAGNITUDE_3; + +/// Convert Duration to milliseconds. +/// +/// Panic if overflow. Mainly used by `ReadableDuration`. +#[inline] +fn duration_to_ms(d: Duration) -> u64 { + let nanos = u64::from(d.subsec_nanos()); + // Most of case, we can't have so large Duration, so here just panic if overflow + // now. 
+ d.as_secs() * 1_000 + (nanos / 1_000_000) +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TimeUnit { + Nanoseconds, + Microseconds, + Milliseconds, + Seconds, + Minutes, + Hours, + Days, +} + +impl From for analytic_common::TimeUnit { + fn from(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Nanoseconds => analytic_common::TimeUnit::NANOSECONDS, + TimeUnit::Microseconds => analytic_common::TimeUnit::MICROSECONDS, + TimeUnit::Milliseconds => analytic_common::TimeUnit::MILLISECONDS, + TimeUnit::Seconds => analytic_common::TimeUnit::SECONDS, + TimeUnit::Minutes => analytic_common::TimeUnit::MINUTES, + TimeUnit::Hours => analytic_common::TimeUnit::HOURS, + TimeUnit::Days => analytic_common::TimeUnit::DAYS, + } + } +} + +impl From for TimeUnit { + fn from(unit: analytic_common::TimeUnit) -> Self { + match unit { + analytic_common::TimeUnit::NANOSECONDS => TimeUnit::Nanoseconds, + analytic_common::TimeUnit::MICROSECONDS => TimeUnit::Microseconds, + analytic_common::TimeUnit::MILLISECONDS => TimeUnit::Milliseconds, + analytic_common::TimeUnit::SECONDS => TimeUnit::Seconds, + analytic_common::TimeUnit::MINUTES => TimeUnit::Minutes, + analytic_common::TimeUnit::HOURS => TimeUnit::Hours, + analytic_common::TimeUnit::DAYS => TimeUnit::Days, + } + } +} + +impl FromStr for TimeUnit { + type Err = String; + + fn from_str(tu_str: &str) -> Result { + let tu_str = tu_str.trim(); + if !tu_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", tu_str)); + } + + match tu_str.to_lowercase().as_str() { + "nanoseconds" => Ok(TimeUnit::Nanoseconds), + "microseconds" => Ok(TimeUnit::Microseconds), + "milliseconds" => Ok(TimeUnit::Milliseconds), + "seconds" => Ok(TimeUnit::Seconds), + "minutes" => Ok(TimeUnit::Minutes), + "hours" => Ok(TimeUnit::Hours), + "days" => Ok(TimeUnit::Days), + _ => Err(format!("unexpect TimeUnit: {}", tu_str)), + } + } +} + +impl fmt::Display for TimeUnit { + fn 
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + TimeUnit::Nanoseconds => "nanoseconds", + TimeUnit::Microseconds => "microseconds", + TimeUnit::Milliseconds => "milliseconds", + TimeUnit::Seconds => "seconds", + TimeUnit::Minutes => "minutes", + TimeUnit::Hours => "hours", + TimeUnit::Days => "days", + }; + write!(f, "{}", s) + } +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd)] +pub struct ReadableSize(pub u64); + +impl ReadableSize { + pub const fn kb(count: u64) -> ReadableSize { + ReadableSize(count * KIB) + } + + pub const fn mb(count: u64) -> ReadableSize { + ReadableSize(count * MIB) + } + + pub const fn gb(count: u64) -> ReadableSize { + ReadableSize(count * GIB) + } + + pub const fn as_mb(self) -> u64 { + self.0 / MIB + } + + pub const fn as_bytes(self) -> u64 { + self.0 + } +} + +impl Div for ReadableSize { + type Output = ReadableSize; + + fn div(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 / rhs) + } +} + +impl Div for ReadableSize { + type Output = u64; + + fn div(self, rhs: ReadableSize) -> u64 { + self.0 / rhs.0 + } +} + +impl Mul for ReadableSize { + type Output = ReadableSize; + + fn mul(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 * rhs) + } +} + +impl Serialize for ReadableSize { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let size = self.0; + let mut buffer = String::new(); + if size == 0 { + write!(buffer, "{}KiB", size).unwrap(); + } else if size % PIB == 0 { + write!(buffer, "{}PiB", size / PIB).unwrap(); + } else if size % TIB == 0 { + write!(buffer, "{}TiB", size / TIB).unwrap(); + } else if size % GIB as u64 == 0 { + write!(buffer, "{}GiB", size / GIB).unwrap(); + } else if size % MIB as u64 == 0 { + write!(buffer, "{}MiB", size / MIB).unwrap(); + } else if size % KIB as u64 == 0 { + write!(buffer, "{}KiB", size / KIB).unwrap(); + } else { + return serializer.serialize_u64(size); + } + serializer.serialize_str(&buffer) + } +} + +impl FromStr 
for ReadableSize { + type Err = String; + + // This method parses value in binary unit. + fn from_str(s: &str) -> Result { + let size_str = s.trim(); + if size_str.is_empty() { + return Err(format!("{:?} is not a valid size.", s)); + } + + if !size_str.is_ascii() { + return Err(format!("ASCII string is expected, but got {:?}", s)); + } + + // size: digits and '.' as decimal separator + let size_len = size_str + .to_string() + .chars() + .take_while(|c| char::is_ascii_digit(c) || ['.', 'e', 'E', '-', '+'].contains(c)) + .count(); + + // unit: alphabetic characters + let (size, unit) = size_str.split_at(size_len); + + let unit = match unit.trim() { + "K" | "KB" | "KiB" => KIB, + "M" | "MB" | "MiB" => MIB, + "G" | "GB" | "GiB" => GIB, + "T" | "TB" | "TiB" => TIB, + "P" | "PB" | "PiB" => PIB, + "B" | "" => UNIT, + _ => { + return Err(format!( + "only B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, and PiB are supported: {:?}", + s + )); + } + }; + + match size.parse::() { + Ok(n) => Ok(ReadableSize((n * unit as f64) as u64)), + Err(_) => Err(format!("invalid size string: {:?}", s)), + } + } +} + +impl<'de> Deserialize<'de> for ReadableSize { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct SizeVisitor; + + impl<'de> Visitor<'de> for SizeVisitor { + type Value = ReadableSize; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid size") + } + + fn visit_i64(self, size: i64) -> Result + where + E: de::Error, + { + if size >= 0 { + self.visit_u64(size as u64) + } else { + Err(E::invalid_value(Unexpected::Signed(size), &self)) + } + } + + fn visit_u64(self, size: u64) -> Result + where + E: de::Error, + { + Ok(ReadableSize(size)) + } + + fn visit_str(self, size_str: &str) -> Result + where + E: de::Error, + { + size_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_any(SizeVisitor) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd)] +pub struct 
ReadableDuration(pub Duration); + +impl Add for ReadableDuration { + type Output = ReadableDuration; + + fn add(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for ReadableDuration { + fn add_assign(&mut self, rhs: ReadableDuration) { + *self = *self + rhs; + } +} + +impl Sub for ReadableDuration { + type Output = ReadableDuration; + + fn sub(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for ReadableDuration { + fn sub_assign(&mut self, rhs: ReadableDuration) { + *self = *self - rhs; + } +} + +impl Mul for ReadableDuration { + type Output = ReadableDuration; + + fn mul(self, rhs: u32) -> Self::Output { + Self(self.0 * rhs) + } +} + +impl MulAssign for ReadableDuration { + fn mul_assign(&mut self, rhs: u32) { + *self = *self * rhs; + } +} + +impl Div for ReadableDuration { + type Output = ReadableDuration; + + fn div(self, rhs: u32) -> ReadableDuration { + Self(self.0 / rhs) + } +} + +impl DivAssign for ReadableDuration { + fn div_assign(&mut self, rhs: u32) { + *self = *self / rhs; + } +} + +impl From for Duration { + fn from(readable: ReadableDuration) -> Duration { + readable.0 + } +} + +// yingwen: Support From. 
+impl From for ReadableDuration { + fn from(t: Duration) -> ReadableDuration { + ReadableDuration(t) + } +} + +impl FromStr for ReadableDuration { + type Err = String; + + fn from_str(dur_str: &str) -> Result { + let dur_str = dur_str.trim(); + if !dur_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", dur_str)); + } + let err_msg = "valid duration, only d, h, m, s, ms are supported.".to_owned(); + let mut left = dur_str.as_bytes(); + let mut last_unit = DAY + 1; + let mut dur = 0f64; + while let Some(idx) = left.iter().position(|c| b"dhms".contains(c)) { + let (first, second) = left.split_at(idx); + let unit = if second.starts_with(b"ms") { + left = &left[idx + 2..]; + MS + } else { + let u = match second[0] { + b'd' => DAY, + b'h' => HOUR, + b'm' => MINUTE, + b's' => SECOND, + _ => return Err(err_msg), + }; + left = &left[idx + 1..]; + u + }; + if unit >= last_unit { + return Err("d, h, m, s, ms should occur in given order.".to_owned()); + } + // do we need to check 12h360m? 
+ let number_str = unsafe { str::from_utf8_unchecked(first) }; + dur += match number_str.trim().parse::() { + Ok(n) => n * unit as f64, + Err(_) => return Err(err_msg), + }; + last_unit = unit; + } + if !left.is_empty() { + return Err(err_msg); + } + if dur.is_sign_negative() { + return Err("duration should be positive.".to_owned()); + } + let secs = dur as u64 / SECOND as u64; + let millis = (dur as u64 % SECOND as u64) as u32 * 1_000_000; + Ok(ReadableDuration(Duration::new(secs, millis))) + } +} + +impl ReadableDuration { + pub const fn secs(secs: u64) -> ReadableDuration { + ReadableDuration(Duration::from_secs(secs)) + } + + pub const fn millis(millis: u64) -> ReadableDuration { + ReadableDuration(Duration::from_millis(millis)) + } + + pub const fn minutes(minutes: u64) -> ReadableDuration { + ReadableDuration::secs(minutes * 60) + } + + pub const fn hours(hours: u64) -> ReadableDuration { + ReadableDuration::minutes(hours * 60) + } + + pub const fn days(days: u64) -> ReadableDuration { + ReadableDuration::hours(days * 24) + } + + pub fn as_secs(&self) -> u64 { + self.0.as_secs() + } + + pub fn as_millis(&self) -> u64 { + duration_to_ms(self.0) + } + + pub fn is_zero(&self) -> bool { + self.0.as_nanos() == 0 + } +} + +impl fmt::Display for ReadableDuration { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut dur = duration_to_ms(self.0); + let mut written = false; + if dur >= DAY { + written = true; + write!(f, "{}d", dur / DAY)?; + dur %= DAY; + } + if dur >= HOUR { + written = true; + write!(f, "{}h", dur / HOUR)?; + dur %= HOUR; + } + if dur >= MINUTE { + written = true; + write!(f, "{}m", dur / MINUTE)?; + dur %= MINUTE; + } + if dur >= SECOND { + written = true; + write!(f, "{}s", dur / SECOND)?; + dur %= SECOND; + } + if dur > 0 { + written = true; + write!(f, "{}ms", dur)?; + } + if !written { + write!(f, "0s")?; + } + Ok(()) + } +} + +impl Serialize for ReadableDuration { + fn serialize(&self, serializer: S) -> Result + 
where + S: Serializer, + { + let mut buffer = String::new(); + write!(buffer, "{}", self).unwrap(); + serializer.serialize_str(&buffer) + } +} + +impl<'de> Deserialize<'de> for ReadableDuration { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct DurVisitor; + + impl<'de> Visitor<'de> for DurVisitor { + type Value = ReadableDuration; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid duration") + } + + fn visit_str(self, dur_str: &str) -> Result + where + E: de::Error, + { + dur_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_str(DurVisitor) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_readable_size() { + let s = ReadableSize::kb(2); + assert_eq!(s.0, 2048); + assert_eq!(s.as_mb(), 0); + let s = ReadableSize::mb(2); + assert_eq!(s.0, 2 * 1024 * 1024); + assert_eq!(s.as_mb(), 2); + let s = ReadableSize::gb(2); + assert_eq!(s.0, 2 * 1024 * 1024 * 1024); + assert_eq!(s.as_mb(), 2048); + + assert_eq!((ReadableSize::mb(2) / 2).0, MIB); + assert_eq!((ReadableSize::mb(1) / 2).0, 512 * KIB); + assert_eq!(ReadableSize::mb(2) / ReadableSize::kb(1), 2048); + } + + #[test] + fn test_parse_readable_size() { + #[derive(Serialize, Deserialize)] + struct SizeHolder { + s: ReadableSize, + } + + let legal_cases = vec![ + (0, "0KiB"), + (2 * KIB, "2KiB"), + (4 * MIB, "4MiB"), + (5 * GIB, "5GiB"), + (7 * TIB, "7TiB"), + (11 * PIB, "11PiB"), + ]; + for (size, exp) in legal_cases { + let c = SizeHolder { + s: ReadableSize(size), + }; + let res_str = toml::to_string(&c).unwrap(); + let exp_str = format!("s = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_size: SizeHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_size.s.0, size); + } + + let c = SizeHolder { + s: ReadableSize(512), + }; + let res_str = toml::to_string(&c).unwrap(); + assert_eq!(res_str, "s = 512\n"); + let res_size: SizeHolder = 
toml::from_str(&res_str).unwrap(); + assert_eq!(res_size.s.0, c.s.0); + + let decode_cases = vec![ + (" 0.5 PB", PIB / 2), + ("0.5 TB", TIB / 2), + ("0.5GB ", GIB / 2), + ("0.5MB", MIB / 2), + ("0.5KB", KIB / 2), + ("0.5P", PIB / 2), + ("0.5T", TIB / 2), + ("0.5G", GIB / 2), + ("0.5M", MIB / 2), + ("0.5K", KIB / 2), + ("23", 23), + ("1", 1), + ("1024B", KIB), + // units with binary prefixes + (" 0.5 PiB", PIB / 2), + ("1PiB", PIB), + ("0.5 TiB", TIB / 2), + ("2 TiB", TIB * 2), + ("0.5GiB ", GIB / 2), + ("787GiB ", GIB * 787), + ("0.5MiB", MIB / 2), + ("3MiB", MIB * 3), + ("0.5KiB", KIB / 2), + ("1 KiB", KIB), + // scientific notation + ("0.5e6 B", B * 500000), + ("0.5E6 B", B * 500000), + ("1e6B", B * 1000000), + ("8E6B", B * 8000000), + ("8e7", B * 80000000), + ("1e-1MB", MIB / 10), + ("1e+1MB", MIB * 10), + ("0e+10MB", 0), + ]; + for (src, exp) in decode_cases { + let src = format!("s = {:?}", src); + let res: SizeHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.s.0, exp); + } + + let illegal_cases = vec![ + "0.5kb", "0.5kB", "0.5Kb", "0.5k", "0.5g", "b", "gb", "1b", "B", "1K24B", " 5_KB", + "4B7", "5M_", + ]; + for src in illegal_cases { + let src_str = format!("s = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + } + + #[test] + fn test_duration_construction() { + let mut dur = ReadableDuration::secs(1); + assert_eq!(dur.0, Duration::new(1, 0)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1000); + dur = ReadableDuration::millis(1001); + assert_eq!(dur.0, Duration::new(1, 1_000_000)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1001); + dur = ReadableDuration::minutes(2); + assert_eq!(dur.0, Duration::new(2 * 60, 0)); + assert_eq!(dur.as_secs(), 120); + assert_eq!(dur.as_millis(), 120000); + dur = ReadableDuration::hours(2); + assert_eq!(dur.0, Duration::new(2 * 3600, 0)); + assert_eq!(dur.as_secs(), 7200); + assert_eq!(dur.as_millis(), 7200000); + } + + #[test] + fn 
test_parse_readable_duration() { + #[derive(Serialize, Deserialize)] + struct DurHolder { + d: ReadableDuration, + } + + let legal_cases = vec![ + (0, 0, "0s"), + (0, 1, "1ms"), + (2, 0, "2s"), + (24 * 3600, 0, "1d"), + (2 * 24 * 3600, 10, "2d10ms"), + (4 * 60, 0, "4m"), + (5 * 3600, 0, "5h"), + (3600 + 2 * 60, 0, "1h2m"), + (5 * 24 * 3600 + 3600 + 2 * 60, 0, "5d1h2m"), + (3600 + 2, 5, "1h2s5ms"), + (3 * 24 * 3600 + 7 * 3600 + 2, 5, "3d7h2s5ms"), + ]; + for (secs, ms, exp) in legal_cases { + let d = DurHolder { + d: ReadableDuration(Duration::new(secs, ms * 1_000_000)), + }; + let res_str = toml::to_string(&d).unwrap(); + let exp_str = format!("d = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_dur: DurHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_dur.d.0, d.d.0); + } + + let decode_cases = vec![(" 0.5 h2m ", 3600 / 2 + 2 * 60, 0)]; + for (src, secs, ms) in decode_cases { + let src = format!("d = {:?}", src); + let res: DurHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.d.0, Duration::new(secs, ms * 1_000_000)); + } + + let illegal_cases = vec!["1H", "1M", "1S", "1MS", "1h1h", "h"]; + for src in illegal_cases { + let src_str = format!("d = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + assert!(toml::from_str::("d = 23").is_err()); + } + + #[test] + fn test_parse_timeunit() { + let s = "milliseconds"; + assert_eq!(TimeUnit::Milliseconds, s.parse::().unwrap()); + let s = "seconds"; + assert_eq!(TimeUnit::Seconds, s.parse::().unwrap()); + let s = "minutes"; + assert_eq!(TimeUnit::Minutes, s.parse::().unwrap()); + let s = "hours"; + assert_eq!(TimeUnit::Hours, s.parse::().unwrap()); + let s = "days"; + assert_eq!(TimeUnit::Days, s.parse::().unwrap()); + let s = "microseconds"; + assert_eq!(TimeUnit::Microseconds, s.parse::().unwrap()); + } +} diff --git a/common_util/src/lib.rs b/common_util/src/lib.rs new file mode 100644 index 0000000000..f7c2c11e31 --- /dev/null +++ b/common_util/src/lib.rs @@ -0,0 
+1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common utils shared by the whole project + +// We need to define mod with macro_use before other mod so that other +// mods in this crate can use the macros +#[macro_use] +pub mod macros; + +// TODO(yingwen): Move some mod into components as a crate +pub mod alloc_tracker; +pub mod codec; +pub mod config; +pub mod metric; +pub mod panic; +pub mod runtime; +pub mod time; +pub mod toml; + +#[cfg(any(test, feature = "test"))] +pub mod tests { + use std::sync::Once; + + static INIT_LOG: Once = Once::new(); + + pub fn init_log_for_test() { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + } +} diff --git a/common_util/src/macros.rs b/common_util/src/macros.rs new file mode 100644 index 0000000000..5ac5b6f1c8 --- /dev/null +++ b/common_util/src/macros.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains all needed macros + +/// Define result for given Error type +#[macro_export] +macro_rules! define_result { + ($t:ty) => { + pub type Result = std::result::Result; + }; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_define_result() { + define_result!(i32); + + fn return_i32_error() -> Result<()> { + Err(18) + } + + assert_eq!(Err(18), return_i32_error()); + } +} diff --git a/common_util/src/metric.rs b/common_util/src/metric.rs new file mode 100644 index 0000000000..3219a3c757 --- /dev/null +++ b/common_util/src/metric.rs @@ -0,0 +1,267 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +/// Copied from https://github.com/sunng87/metriki/blob/master/metriki-core/src/metrics/meter.rs +/// But supports 1 hour and 2 hour rate. 
+use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant, SystemTime}; + +use crossbeam_utils::atomic::AtomicCell; +#[cfg(feature = "ser")] +use serde::ser::SerializeMap; +#[cfg(feature = "ser")] +use serde::{Serialize, Serializer}; + +use crate::time; + +/// Meters are used to calculate rate of an event. +#[derive(Debug)] +pub struct Meter { + moving_avarages: ExponentiallyWeightedMovingAverages, + count: AtomicU64, + start_time: SystemTime, +} + +impl Default for Meter { + fn default() -> Self { + Self::new() + } +} + +impl Meter { + pub fn new() -> Meter { + Meter { + moving_avarages: ExponentiallyWeightedMovingAverages::new(), + count: AtomicU64::from(0), + start_time: SystemTime::now(), + } + } + + pub fn mark(&self) { + self.mark_n(1) + } + + pub fn mark_n(&self, n: u64) { + self.count.fetch_add(n, Ordering::Relaxed); + self.moving_avarages.tick_if_needed(); + self.moving_avarages.update(n); + } + + pub fn h1_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h1_rate() + } + + pub fn h2_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h2_rate() + } + + pub fn m15_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.m15_rate() + } + + pub fn count(&self) -> u64 { + self.count.load(Ordering::Relaxed) + } + + pub fn mean_rate(&self) -> f64 { + let count = self.count(); + if count > 0 { + if let Ok(elapsed) = SystemTime::now() + .duration_since(self.start_time) + .map(|d| d.as_secs() as f64) + { + count as f64 / elapsed + } else { + 0f64 + } + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverage { + alpha: f64, + interval_nanos: u64, + + uncounted: AtomicCell, + rate: AtomicCell>, +} + +impl ExponentiallyWeightedMovingAverage { + fn new(alpha: f64, interval_secs: u64) -> ExponentiallyWeightedMovingAverage { + ExponentiallyWeightedMovingAverage { + alpha, + interval_nanos: 
time::secs_to_nanos(interval_secs), + + uncounted: AtomicCell::new(0), + rate: AtomicCell::new(None), + } + } + + fn update(&self, n: u64) { + self.uncounted.fetch_add(n); + } + + fn tick(&self) { + let count = self.uncounted.swap(0); + let instant_rate = count as f64 / self.interval_nanos as f64; + + if let Some(prev_rate) = self.rate.load() { + let new_rate = prev_rate + (self.alpha * (instant_rate - prev_rate)); + self.rate.store(Some(new_rate)); + } else { + self.rate.store(Some(instant_rate)); + } + } + + fn get_rate(&self) -> f64 { + if let Some(rate) = self.rate.load() { + rate * time::secs_to_nanos(1) as f64 + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage, + h2: ExponentiallyWeightedMovingAverage, + m15: ExponentiallyWeightedMovingAverage, + + last_tick: AtomicCell, +} + +#[inline] +fn alpha(interval_secs: u64, minutes: u64) -> f64 { + 1.0 - (-(interval_secs as f64) / 60.0 / minutes as f64).exp() +} + +const DEFAULT_INTERVAL_SECS: u64 = 5; +const DEFAULT_INTERVAL_MILLIS: u64 = DEFAULT_INTERVAL_SECS * 1000; + +impl ExponentiallyWeightedMovingAverages { + fn new() -> ExponentiallyWeightedMovingAverages { + ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 60), + DEFAULT_INTERVAL_SECS, + ), + + h2: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 120), + DEFAULT_INTERVAL_SECS, + ), + + m15: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 15), + DEFAULT_INTERVAL_SECS, + ), + + last_tick: AtomicCell::new(Instant::now()), + } + } + + fn update(&self, n: u64) { + self.h1.update(n); + self.h2.update(n); + self.m15.update(n); + } + + fn tick_if_needed(&self) { + let previous_tick = self.last_tick.load(); + let current_tick = Instant::now(); + + let tick_age = (current_tick - previous_tick).as_millis() as u64; + + if tick_age > DEFAULT_INTERVAL_MILLIS { + let 
latest_tick = + current_tick - Duration::from_millis(tick_age % DEFAULT_INTERVAL_MILLIS); + if self + .last_tick + .compare_exchange(previous_tick, latest_tick) + .is_ok() + { + let required_ticks = tick_age / DEFAULT_INTERVAL_MILLIS; + for _ in 0..required_ticks { + self.h1.tick(); + self.h2.tick(); + self.m15.tick(); + } + } + } + } + + fn h1_rate(&self) -> f64 { + self.h1.get_rate() + } + + fn h2_rate(&self) -> f64 { + self.h2.get_rate() + } + + fn m15_rate(&self) -> f64 { + self.m15.get_rate() + } +} + +#[cfg(feature = "ser")] +impl Serialize for Meter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut map = serializer.serialize_map(Some(4))?; + + map.serialize_entry("count", &self.count())?; + map.serialize_entry("h1_rate", &self.h1_rate())?; + map.serialize_entry("h2_rate", &self.h2_rate())?; + map.serialize_entry("m15_rate", &self.m15_rate())?; + + map.end() + } +} + +#[cfg(test)] +mod tests { + use std::{thread, time}; + + use super::*; + + macro_rules! assert_float_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + let diff = (left_val - right_val).abs(); + + if diff > f64::EPSILON { + panic!( + "assertion failed: `(left == right)`\n left: `{:?}`,\n right: `{:?}`", + &*left_val, &*right_val + ) + } + } + } + }}; + } + + #[test] + fn test_meter() { + let m = Meter::new(); + + for _ in 0..10 { + m.mark(); + } + + thread::sleep(time::Duration::from_millis(DEFAULT_INTERVAL_MILLIS + 10)); + + assert_eq!(10, m.count()); + assert_float_eq!(2.0, m.m15_rate()); + assert_float_eq!(2.0, m.h1_rate()); + assert_float_eq!(2.0, m.h2_rate()); + } +} diff --git a/common_util/src/panic.rs b/common_util/src/panic.rs new file mode 100644 index 0000000000..5b0a9f5713 --- /dev/null +++ b/common_util/src/panic.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::thread; + +use log::error; + +/// fork from https://github.com/tikv/tikv/blob/83d173a2c0058246631f0e71de74238ccff670fd/components/tikv_util/src/lib.rs#L429 +/// Exit the whole process when panic. +pub fn set_panic_hook(panic_abort: bool) { + use std::{panic, process}; + + // HACK! New a backtrace ahead for caching necessary elf sections of this + // tikv-server, in case it can not open more files during panicking + // which leads to no stack info (0x5648bdfe4ff2 - ). + // + // Crate backtrace caches debug info in a static variable `STATE`, + // and the `STATE` lives forever once it has been created. + // See more: https://github.com/alexcrichton/backtrace-rs/blob/\ + // 597ad44b131132f17ed76bf94ac489274dd16c7f/\ + // src/symbolize/libbacktrace.rs#L126-L159 + // Caching is slow, spawn it in another thread to speed up. + thread::Builder::new() + .name("backtrace-loader".to_owned()) + .spawn(::backtrace::Backtrace::new) + .unwrap(); + + panic::set_hook(Box::new(move |info: &panic::PanicInfo<'_>| { + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = thread::current(); + let name = thread.name().unwrap_or(""); + let loc = info + .location() + .map(|l| format!("{}:{}", l.file(), l.line())); + let bt = backtrace::Backtrace::new(); + error!( + "thread '{}' panicked '{}' at {:?}\n{:?}", + name, + msg, + loc.unwrap_or_else(|| "".to_owned()), + bt + ); + + // There might be remaining logs in the async logger. + // To collect remaining logs and also collect future logs, replace the old one + // with a terminal logger. + // When the old global async logger is replaced, the old async guard will be + // taken and dropped. In the drop() the async guard, it waits for the + // finish of the remaining logs in the async logger. 
+ if let Some(level) = ::log::max_level().to_level() { + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + logger::convert_log_level_to_slog_level(level), + false, // Use sync logger to avoid an unnecessary log thread. + 0, + false, // It is initialized already. + ); + } + + if panic_abort { + process::abort(); + } else { + unsafe { + // Calling process::exit would trigger global static to destroy, like C++ + // static variables of RocksDB, which may cause other threads encounter + // pure virtual method call. So calling libc::_exit() instead to skip the + // cleanup process. + libc::_exit(1); + } + } + })) +} + +#[cfg(test)] +mod tests { + use std::{io::Read, time::Duration}; + + use nix::{ + sys::wait::{wait, WaitStatus}, + unistd::{fork, ForkResult}, + }; + use slog::{self, Drain, Level, OwnedKVList, Record}; + + use crate::panic::set_panic_hook; + + /// Create a child process and wait to get its exit code. + fn run_and_wait_child_process(child: impl Fn()) -> Result { + match unsafe { fork() } { + Ok(ForkResult::Parent { .. 
}) => match wait().unwrap() { + WaitStatus::Exited(_, status) => Ok(status), + v => Err(format!("{:?}", v)), + }, + Ok(ForkResult::Child) => { + child(); + std::process::exit(0); + } + Err(e) => Err(format!("Fork failed: {}", e)), + } + } + + #[test] + fn test_panic_hook() { + use gag::BufferRedirect; + + struct DelayDrain(D); + + impl Drain for DelayDrain + where + D: Drain, + ::Err: std::fmt::Display, + { + type Err = ::Err; + type Ok = ::Ok; + + fn log( + &self, + record: &Record<'_>, + values: &OwnedKVList, + ) -> Result { + std::thread::sleep(Duration::from_millis(100)); + self.0.log(record, values) + } + } + + let mut stdout = BufferRedirect::stdout().unwrap(); + let status = run_and_wait_child_process(|| { + set_panic_hook(false); + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + Level::Debug, + true, // use async drainer + 0, + true, // init std log + ); + + let _ = std::thread::spawn(|| { + // let the global logger is held by the other thread, so the + // drop() of the async drain is not called in time. + let _guard = slog_global::borrow_global(); + std::thread::sleep(Duration::from_secs(1)); + }); + panic!("test"); + }) + .unwrap(); + + assert_eq!(status, 1); + let mut panic = String::new(); + stdout.read_to_string(&mut panic).unwrap(); + assert!(!panic.is_empty()); + } +} diff --git a/common_util/src/runtime/metrics.rs b/common_util/src/runtime/metrics.rs new file mode 100644 index 0000000000..4f82494093 --- /dev/null +++ b/common_util/src/runtime/metrics.rs @@ -0,0 +1,57 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec}; + +lazy_static! 
{ + // Gauges: + static ref RUNTIME_THREAD_ALIVE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_alive_gauge", + "alive thread number for runtime", + &["name"] + ) + .unwrap(); + static ref RUNTIME_THREAD_IDLE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_idle_gauge", + "idle thread number for runtime", + &["name"] + ) + .unwrap(); +} + +/// Runtime metrics. +#[derive(Debug)] +pub struct Metrics { + // Gauges: + pub thread_alive_gauge: IntGauge, + pub thread_idle_gauge: IntGauge, +} + +impl Metrics { + pub fn new(name: &str) -> Self { + Self { + thread_alive_gauge: RUNTIME_THREAD_ALIVE_GAUGE.with_label_values(&[name]), + thread_idle_gauge: RUNTIME_THREAD_IDLE_GAUGE.with_label_values(&[name]), + } + } + + #[inline] + pub fn on_thread_start(&self) { + self.thread_alive_gauge.inc(); + } + + #[inline] + pub fn on_thread_stop(&self) { + self.thread_alive_gauge.dec(); + } + + #[inline] + pub fn on_thread_park(&self) { + self.thread_idle_gauge.inc(); + } + + #[inline] + pub fn on_thread_unpark(&self) { + self.thread_idle_gauge.dec(); + } +} diff --git a/common_util/src/runtime/mod.rs b/common_util/src/runtime/mod.rs new file mode 100644 index 0000000000..70494d6b6f --- /dev/null +++ b/common_util/src/runtime/mod.rs @@ -0,0 +1,277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
A multi-threaded runtime that supports running Futures +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project_lite::pin_project; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::{ + runtime::{Builder as RuntimeBuilder, Runtime as TokioRuntime}, + task::{JoinError, JoinHandle as TokioJoinHandle}, +}; +mod metrics; +use metrics::Metrics; + +// TODO(yingwen): Use opaque error type +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "Runtime Failed to build runtime, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRuntime { + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Runtime Failed to join task, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + JoinTask { + source: JoinError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// A runtime to run future tasks +#[derive(Debug)] +pub struct Runtime { + rt: TokioRuntime, + metrics: Arc, +} + +impl Runtime { + /// Spawn a future and execute it in this thread pool + /// + /// Similar to tokio::runtime::Runtime::spawn() + pub fn spawn(&self, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn(future), + } + } + + /// Run the provided function on an executor dedicated to blocking + /// operations. + pub fn spawn_blocking(&self, func: F) -> JoinHandle + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn_blocking(func), + } + } + + /// Run a future to complete, this is the runtime's entry point + pub fn block_on(&self, future: F) -> F::Output { + self.rt.block_on(future) + } + + /// Returns the runtime stats + pub fn stats(&self) -> RuntimeStats { + RuntimeStats { + alive_thread_num: self.metrics.thread_alive_gauge.get(), + idle_thread_num: self.metrics.thread_idle_gauge.get(), + } + } +} + +pin_project! 
{ + #[derive(Debug)] + pub struct JoinHandle { + #[pin] + inner: TokioJoinHandle, + } +} + +impl Future for JoinHandle { + type Output = Result; + + fn poll(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll { + let this = self.project(); + this.inner.poll(ctx).map_err(|source| Error::JoinTask { + source, + backtrace: Backtrace::generate(), + }) + } +} + +/// Runtime statistics +pub struct RuntimeStats { + pub alive_thread_num: i64, + pub idle_thread_num: i64, +} + +pub struct Builder { + thread_name: String, + builder: RuntimeBuilder, +} + +impl Default for Builder { + fn default() -> Self { + Self { + thread_name: "cse-runtime-worker".to_string(), + builder: RuntimeBuilder::new_multi_thread(), + } + } +} + +fn with_metrics(metrics: &Arc, f: F) -> impl Fn() +where + F: Fn(&Arc) + 'static, +{ + let m = metrics.clone(); + move || { + f(&m); + } +} + +impl Builder { + /// Sets the number of worker threads the Runtime will use. + /// + /// This can be any number above 0 + pub fn worker_threads(&mut self, val: usize) -> &mut Self { + self.builder.worker_threads(val); + self + } + + /// Sets name of threads spawned by the Runtime thread pool + pub fn thread_name(&mut self, val: impl Into) -> &mut Self { + self.thread_name = val.into(); + self + } + + /// Enable all feature of the underlying runtime + pub fn enable_all(&mut self) -> &mut Self { + self.builder.enable_all(); + self + } + + pub fn build(&mut self) -> Result { + let metrics = Arc::new(Metrics::new(&self.thread_name)); + + let rt = self + .builder + .thread_name(self.thread_name.clone()) + .on_thread_start(with_metrics(&metrics, |m| { + m.on_thread_start(); + })) + .on_thread_stop(with_metrics(&metrics, |m| { + m.on_thread_stop(); + })) + .on_thread_park(with_metrics(&metrics, |m| { + m.on_thread_park(); + })) + .on_thread_unpark(with_metrics(&metrics, |m| { + m.on_thread_unpark(); + })) + .build() + .context(BuildRuntime)?; + + Ok(Runtime { rt, metrics }) + } +} + +#[cfg(test)] +mod tests { + use 
std::{sync::Arc, thread, time::Duration}; + + use tokio::sync::oneshot; + use tokio_test::assert_ok; + + use super::*; + + fn rt() -> Arc { + let rt = Builder::default() + .worker_threads(2) + .thread_name("test_spawn_join") + .enable_all() + .build(); + assert!(rt.is_ok()); + Arc::new(rt.unwrap()) + } + + #[test] + fn test_stats() { + let rt = Builder::default() + .worker_threads(5) + .thread_name("test_stats") + .enable_all() + .build(); + assert!(rt.is_ok()); + let rt = Arc::new(rt.unwrap()); + // wait threads created + thread::sleep(Duration::from_millis(50)); + + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(5, s.idle_thread_num); + + rt.spawn(async { + thread::sleep(Duration::from_millis(50)); + }); + + thread::sleep(Duration::from_millis(10)); + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(4, s.idle_thread_num); + } + + #[test] + fn block_on_async() { + let rt = rt(); + + let out = rt.block_on(async { + let (tx, rx) = oneshot::channel(); + + thread::spawn(move || { + thread::sleep(Duration::from_millis(50)); + tx.send("ZOMG").unwrap(); + }); + + assert_ok!(rx.await) + }); + + assert_eq!(out, "ZOMG"); + } + + #[test] + fn spawn_from_blocking() { + let rt = rt(); + let rt1 = rt.clone(); + let out = rt.block_on(async move { + let rt2 = rt1.clone(); + let inner = assert_ok!( + rt1.spawn_blocking(move || { rt2.spawn(async move { "hello" }) }) + .await + ); + + assert_ok!(inner.await) + }); + + assert_eq!(out, "hello") + } + + #[test] + fn test_spawn_join() { + let rt = rt(); + let handle = rt.spawn(async { 1 + 1 }); + + assert_eq!(2, rt.block_on(handle).unwrap()); + } +} diff --git a/common_util/src/time.rs b/common_util/src/time.rs new file mode 100644 index 0000000000..1a44f98402 --- /dev/null +++ b/common_util/src/time.rs @@ -0,0 +1,68 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time utilities + +// TODO(yingwen): Move to common_types ? 
+ +use std::{ + convert::TryInto, + time::{Duration, Instant}, +}; + +pub trait DurationExt { + /// Convert into u64. + /// + /// Returns u64::MAX if overflow + fn as_millis_u64(&self) -> u64; +} + +impl DurationExt for Duration { + #[inline] + fn as_millis_u64(&self) -> u64 { + match self.as_millis().try_into() { + Ok(v) => v, + Err(_) => u64::MAX, + } + } +} + +pub trait InstantExt { + fn saturating_elapsed(&self) -> Duration; +} + +impl InstantExt for Instant { + fn saturating_elapsed(&self) -> Duration { + Instant::now().saturating_duration_since(*self) + } +} + +#[inline] +pub fn secs_to_nanos(s: u64) -> u64 { + s * 1000000000 +} + +#[cfg(test)] +mod tests { + use std::thread; + + use super::*; + #[test] + fn test_as_mills_u64() { + let d = Duration::from_millis(100); + assert_eq!(100, d.as_millis_u64()); + + let d = Duration::from_secs(100); + assert_eq!(100000, d.as_millis_u64()); + } + + #[test] + fn test_saturating_elapsed() { + let ins = Instant::now(); + let one_hundred_mills = Duration::from_millis(100); + let error = 10; + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 100 < error); + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 200 < 2 * error); + } +} diff --git a/common_util/src/toml.rs b/common_util/src/toml.rs new file mode 100644 index 0000000000..58332dc4c2 --- /dev/null +++ b/common_util/src/toml.rs @@ -0,0 +1,104 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Toml config utilities. 
+ +use std::{fs::File, io::Read}; + +use serde::de; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to open file, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + OpenFile { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ReadToml { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ParseToml { + path: String, + source: toml::de::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Read toml file from given `path` to `toml_buf`, then parsed it to `T` and +/// return. +pub fn parse_toml_from_path<'a, T>(path: &str, toml_buf: &'a mut String) -> Result +where + T: de::Deserialize<'a>, +{ + let mut file = File::open(path).context(OpenFile { path })?; + file.read_to_string(toml_buf).context(ReadToml { path })?; + + toml::from_str(toml_buf).context(ParseToml { path }) +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use serde_derive::Deserialize; + use tempfile::tempdir; + + use super::*; + + #[test] + fn test_parse_toml_from_path() { + let dir = tempdir().unwrap(); + let file_path = dir.path().join("test.toml"); + let path = file_path.to_str().unwrap(); + + let mut f = File::create(path).expect("Failed to create test config file"); + f.write_all(b"host=\"localhost\"\nport=1081") + .expect("Failed to write test config"); + + f.sync_all().expect("Failed to sync test config"); + + #[derive(Clone, Debug, Deserialize)] + struct TestConfig { + host: String, + port: u16, + } + let mut config = TestConfig { + host: "".to_string(), + port: 0, + }; + + assert_eq!("", config.host); + assert_eq!(0, config.port); + + let mut toml_str = String::new(); + + config = parse_toml_from_path(path, 
&mut toml_str).unwrap(); + + assert_eq!("localhost", config.host); + assert_eq!(1081, config.port); + } +} diff --git a/components/arena/Cargo.toml b/components/arena/Cargo.toml new file mode 100644 index 0000000000..ec70993c17 --- /dev/null +++ b/components/arena/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "arena" +version = "0.1.0" +authors = ["Ruihang Xia "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +parking_lot = "0.11.1" \ No newline at end of file diff --git a/components/arena/src/arena_trait.rs b/components/arena/src/arena_trait.rs new file mode 100644 index 0000000000..a8808fa38b --- /dev/null +++ b/components/arena/src/arena_trait.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{alloc::Layout, ptr::NonNull, sync::Arc}; + +/// Memory Arena trait. +/// +/// The trait itself provides and enforces no guarantee about alignment. It's +/// implementation's responsibility to cover. +/// +/// All memory-relavent methods (`alloc()` etc.) are not "unsafe". Compare with +/// "deallocate" which is not included in this trait, allocating is more safer +/// and not likely to run into UB. However in fact, playing with raw pointer is +/// always dangerous and needs to be careful for both who implements and uses +/// this trait. +pub trait Arena { + type Stats; + + // required methods + + /// Try to allocate required memory described by layout. Return a pointer of + /// allocated space in success, while `None` if failed. + fn try_alloc(&self, layout: Layout) -> Option>; + + /// Get arena's statistics. + fn stats(&self) -> Self::Stats; + + // provided methods + + /// Allocate required memory. Panic if failed. + fn alloc(&self, layout: Layout) -> NonNull { + self.try_alloc(layout).unwrap() + } +} + +/// Basic statistics of arena. Offers [bytes_allocated] +/// and [bytes_used]. 
+#[derive(Debug, Clone, Copy)] +pub struct BasicStats { + pub(crate) bytes_allocated: usize, + pub(crate) bytes_used: usize, +} + +impl BasicStats { + /// Total bytes allocated from system. + #[inline] + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated + } + + /// Total bytes allocated to user. + #[inline] + pub fn bytes_used(&self) -> usize { + self.bytes_used + } +} + +/// Collect memory usage from Arean +pub trait Collector { + /// Called when `bytes` bytes memory is allocated in arena. + fn on_alloc(&self, bytes: usize); + + /// Called when `bytes` bytes memory is used in arena. + fn on_used(&self, bytes: usize); + + /// Called when `allocated` bytes memory is released, and `used` bytes in + /// it. + fn on_free(&self, used: usize, allocated: usize); +} + +pub type CollectorRef = Arc; diff --git a/components/arena/src/fixed_size.rs b/components/arena/src/fixed_size.rs new file mode 100644 index 0000000000..f7305e6144 --- /dev/null +++ b/components/arena/src/fixed_size.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use crate::arena_trait::{Arena, BasicStats}; + +const DEFAULT_ALIGN: usize = 8; + +#[derive(Clone)] +pub struct FixedSizeArena { + core: Arc, +} + +impl FixedSizeArena { + pub fn with_capacity(cap: usize) -> Self { + Self { + core: Arc::new(Core::with_capacity(cap)), + } + } +} + +struct Core { + len: AtomicUsize, + cap: usize, + ptr: NonNull, +} + +impl Core { + /// # Safety + /// - alloc + /// See [std::alloc::alloc]. + /// - new_unchecked + /// `ptr` is allocated from allocator. 
+ fn with_capacity(cap: usize) -> Self { + let layout = Layout::from_size_align(cap as usize, DEFAULT_ALIGN).unwrap(); + let ptr = unsafe { alloc(layout) }; + + Self { + len: AtomicUsize::new(0), + cap, + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } + + /// # Safety + /// `self.ptr` is allocated from allocator + fn try_alloc(&self, layout: Layout) -> Option> { + let layout = layout.pad_to_align(); + let size = layout.size(); + + let offset = self.len.fetch_add(size, Ordering::SeqCst) as usize; + if offset + size > self.cap { + self.len.fetch_sub(size, Ordering::SeqCst); + return None; + } + + unsafe { Some(NonNull::new_unchecked(self.ptr.as_ptr().add(size))) } + } +} + +impl Drop for Core { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { + dealloc( + self.ptr.as_ptr(), + Layout::from_size_align_unchecked(self.cap, DEFAULT_ALIGN), + ) + } + } +} + +impl Arena for FixedSizeArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + self.core.try_alloc(layout) + } + + fn stats(&self) -> Self::Stats { + Self::Stats { + bytes_used: self.core.cap, + bytes_allocated: self.core.len.load(Ordering::SeqCst) as usize, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn capacity_overflow() { + let arena = FixedSizeArena::with_capacity(1024); + let layout = unsafe { Layout::from_size_align_unchecked(768, DEFAULT_ALIGN) }; + let _ = arena.alloc(layout); + + assert_eq!(None, arena.try_alloc(layout)); + } +} diff --git a/components/arena/src/lib.rs b/components/arena/src/lib.rs new file mode 100644 index 0000000000..963dd47933 --- /dev/null +++ b/components/arena/src/lib.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! `Arena` Trait and implementations. 
+ +mod arena_trait; +mod fixed_size; +mod mono_inc; + +pub use arena_trait::{Arena, BasicStats, Collector, CollectorRef}; +pub use fixed_size::FixedSizeArena; +pub use mono_inc::{MonoIncArena, NoopCollector}; diff --git a/components/arena/src/mono_inc.rs b/components/arena/src/mono_inc.rs new file mode 100644 index 0000000000..0adc7253de --- /dev/null +++ b/components/arena/src/mono_inc.rs @@ -0,0 +1,347 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::Arc, +}; + +use parking_lot::Mutex; + +use crate::arena_trait::{Arena, BasicStats, Collector, CollectorRef}; + +/// The noop collector does nothing on alloc and free +pub struct NoopCollector; + +impl Collector for NoopCollector { + fn on_alloc(&self, _bytes: usize) {} + + fn on_used(&self, _bytes: usize) {} + + fn on_free(&self, _used: usize, _allocated: usize) {} +} + +const DEFAULT_ALIGN: usize = 8; + +/// A thread-safe arena. All allocated memory is aligned to 8. Organizes its +/// allocated memory as blocks. 
+#[derive(Clone)] +pub struct MonoIncArena { + core: Arc>, +} + +impl MonoIncArena { + pub fn new(regular_block_size: usize) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new( + regular_block_size, + Arc::new(NoopCollector {}), + ))), + } + } + + pub fn with_collector(regular_block_size: usize, collector: CollectorRef) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new(regular_block_size, collector))), + } + } +} + +impl Arena for MonoIncArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + Some(self.core.lock().alloc(layout)) + } + + fn stats(&self) -> Self::Stats { + self.core.lock().stats + } + + fn alloc(&self, layout: Layout) -> NonNull { + self.core.lock().alloc(layout) + } +} + +struct ArenaCore { + collector: CollectorRef, + regular_layout: Layout, + regular_blocks: Vec, + special_blocks: Vec, + stats: BasicStats, +} + +impl ArenaCore { + /// # Safety + /// Required property is tested in debug assertions. + fn new(regular_block_size: usize, collector: CollectorRef) -> Self { + debug_assert_ne!(DEFAULT_ALIGN, 0); + debug_assert_eq!(DEFAULT_ALIGN & (DEFAULT_ALIGN - 1), 0); + // TODO(yingwen): Avoid panic. + let regular_layout = Layout::from_size_align(regular_block_size, DEFAULT_ALIGN).unwrap(); + let regular_blocks = vec![Block::new(regular_layout)]; + let special_blocks = vec![]; + let bytes = regular_layout.size(); + collector.on_alloc(bytes); + + Self { + collector, + regular_layout, + regular_blocks, + special_blocks, + stats: BasicStats { + bytes_allocated: bytes, + bytes_used: 0, + }, + } + } + + /// Input layout will be aligned. 
+ fn alloc(&mut self, layout: Layout) -> NonNull { + let layout = layout + .align_to(self.regular_layout.align()) + .unwrap() + .pad_to_align(); + let bytes = layout.size(); + // TODO(Ruihang): determine threshold + if layout.size() > self.regular_layout.size() { + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + Self::add_new_block( + layout, + &mut self.special_blocks, + &mut self.stats, + &self.collector, + ); + let block = self.special_blocks.last().unwrap(); + return block.data; + } + + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + if let Some(ptr) = self.try_alloc(layout) { + ptr + } else { + Self::add_new_block( + self.regular_layout, + &mut self.regular_blocks, + &mut self.stats, + &self.collector, + ); + self.try_alloc(layout).unwrap() + } + } + + /// # Safety + /// `regular_blocks` vector is guaranteed to contains at least one element. + fn try_alloc(&mut self, layout: Layout) -> Option> { + self.regular_blocks.last_mut().unwrap().alloc(layout) + } + + fn add_new_block( + layout: Layout, + container: &mut Vec, + stats: &mut BasicStats, + collector: &CollectorRef, + ) { + let new_block = Block::new(layout); + container.push(new_block); + // Update allocated stats once a new block has been allocated from the system. + stats.bytes_allocated += layout.size(); + collector.on_alloc(layout.size()); + } +} + +impl Drop for ArenaCore { + fn drop(&mut self) { + self.collector + .on_free(self.stats.bytes_used, self.stats.bytes_allocated); + } +} + +struct Block { + data: NonNull, + len: usize, + layout: Layout, +} + +impl Block { + /// Create a new block. Return the pointer of this new block. + /// + /// # Safety + /// See [std::alloc::alloc]. The allocated memory will be deallocated in + /// drop(). + fn new(layout: Layout) -> Block { + let data = unsafe { alloc(layout) }; + + Self { + data: NonNull::new(data).unwrap(), + len: 0, + layout, + } + } + + /// # Safety + /// ## ptr:add() + /// The added offset is checked before. 
+ /// ## NonNull::new_unchecked() + /// `ptr` is added from a NonNull. + fn alloc(&mut self, layout: Layout) -> Option> { + let size = layout.size(); + + if self.len + size <= self.layout.size() { + let ptr = unsafe { self.data.as_ptr().add(self.len) }; + self.len += size; + unsafe { Some(NonNull::new_unchecked(ptr)) } + } else { + None + } + } +} + +impl Drop for Block { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { dealloc(self.data.as_ptr(), self.layout) } + } +} + +unsafe impl Send for Block {} +unsafe impl Sync for Block {} + +#[cfg(test)] +mod test { + use std::{ + mem, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + }; + + use super::*; + + /// # Safety: + /// Caller should check the input buf has enough space. + fn consume_buf_as_u64_slice(buf: NonNull, n: usize) { + unsafe { + let mut buf = buf.as_ptr() as *mut u64; + for i in 0..n { + *buf = i as u64; + buf = buf.add(1); + } + } + } + + #[test] + fn test_stats() { + let arena = MonoIncArena::new(1024 * 1024); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + struct MockCollector { + allocated: AtomicUsize, + used: AtomicUsize, + } + + impl Collector for MockCollector { + fn on_alloc(&self, bytes: usize) { + self.allocated.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_used(&self, bytes: usize) { + self.used.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_free(&self, _used: usize, _allocated: usize) {} + } + + #[test] + fn test_collector() { + let collector = Arc::new(MockCollector { + allocated: AtomicUsize::new(0), + used: AtomicUsize::new(0), + }); + + let arena = MonoIncArena::with_collector(1024 * 1024, collector.clone()); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + 
arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, collector.allocated.load(Ordering::Relaxed)); + assert_eq!(1600, collector.used.load(Ordering::Relaxed)); + } + + #[test] + fn alloc_small_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 10); + } + + assert_eq!(2560, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + #[test] + fn alloc_huge_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 20]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 20); + } + + assert_eq!(3328, arena.stats().bytes_allocated()); + assert_eq!(3200, arena.stats().bytes_used()); + } + + #[test] + fn alloc_various_slice() { + let arena = MonoIncArena::new(1024); + const SIZES: [usize; 12] = [10, 200, 30, 1024, 512, 77, 89, 1, 3, 29, 16, 480]; + let total_used: usize = SIZES.iter().map(|v| v * 8).sum(); + + for size in &SIZES { + let layout_slice = Layout::from_size_align(mem::size_of::() * *size, 8).unwrap(); + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, *size); + } + + assert_eq!(20800, arena.stats().bytes_allocated()); + assert_eq!(total_used, arena.stats().bytes_used()); + } + + #[test] + fn unaligned_alloc_request() { + let arena = MonoIncArena::new(1024); + + let regular_req_layout = Layout::from_size_align(mem::size_of::(), 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(regular_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + + // 2003 is a prime number and 2004 % 8 != 0 + let special_req_layout = Layout::from_size_align(2003, 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(special_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + } +} diff --git a/components/bytes/Cargo.toml 
b/components/bytes/Cargo.toml new file mode 100644 index 0000000000..0fecefbe8e --- /dev/null +++ b/components/bytes/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "bytes" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = "1.0", package = "bytes" } +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/bytes/src/lib.rs b/components/bytes/src/lib.rs new file mode 100644 index 0000000000..015aabce0c --- /dev/null +++ b/components/bytes/src/lib.rs @@ -0,0 +1,368 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Provides utilities for byte arrays +//! +//! Use Bytes instead of Vec. Currently just re-export bytes crate + +use std::{ + fmt, + io::{self, Read, Write}, +}; + +use snafu::{ensure, Backtrace, GenerateBacktrace, Snafu}; +// Should not use bytes crate outside of this mod so we can replace the actual +// implementations if needed +pub use upstream::{Buf, BufMut, Bytes, BytesMut}; + +/// Error of MemBuf/MemBufMut +/// +/// We do not use `std::io::Error` because it is too large +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to fill whole buffer.\nBacktrace:\n{}", backtrace))] + UnexpectedEof { backtrace: Backtrace }, + + #[snafu(display("Failed to write whole buffer.\nBacktrace:\n{}", backtrace))] + WouldOverflow { backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Now is just an alias to `Vec`, prefer to use this alias instead of +/// `Vec` +pub type ByteVec = Vec; + +/// Read bytes from a buffer. +/// +/// Unlike `bytes::Buf`, the underlying storage is in contiguous memory +pub trait MemBuf: fmt::Debug { + /// Return the remaining byte slice + fn remaining_slice(&self) -> &[u8]; + + /// Advance the internal cursor of the buffer, panic if overflow + fn must_advance(&mut self, cnt: usize); + + /// Read bytes from self into dst. 
+ /// + /// The cursor is advanced by the number of bytes copied. + /// + /// Returns error if self does not have enough remaining bytes to fill dst. + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()>; + + /// Gets an unsigned 8 bit integer from self and advance current position + /// + /// Returns error if the capacity is not enough + fn read_u8(&mut self) -> Result { + let mut buf = [0; 1]; + self.read_to_slice(&mut buf)?; + Ok(buf[0]) + } + + /// Gets an unsighed 32 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(u32::from_be_bytes(buf)) + } + + /// Gets an unsighed 64 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(u64::from_be_bytes(buf)) + } + + fn read_f64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(f64::from_be_bytes(buf)) + } + + fn read_f32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(f32::from_be_bytes(buf)) + } +} + +/// Write bytes to a buffer +/// +/// Unlike `bytes::BufMut`, write operations may fail +pub trait MemBufMut: fmt::Debug { + /// Write bytes into self from src, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_slice(&mut self, src: &[u8]) -> Result<()>; + + /// Write an unsigned 8 bit integer to self, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u8(&mut self, n: u8) -> Result<()> { + let src = [n]; + self.write_slice(&src) + } + + /// Writes an unsigned 32 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not 
enough + fn write_u32(&mut self, n: u32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an unsigned 64 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u64(&mut self, n: u64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 64 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f64(&mut self, n: f64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 32 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f32(&mut self, n: f32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } +} + +macro_rules! impl_mem_buf { + () => { + #[inline] + fn remaining_slice(&self) -> &[u8] { + &self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + self.advance(cnt); + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + ensure!(self.remaining() >= dst.len(), UnexpectedEof); + self.copy_to_slice(dst); + Ok(()) + } + }; +} + +impl MemBuf for Bytes { + impl_mem_buf!(); +} + +impl MemBuf for BytesMut { + impl_mem_buf!(); +} + +impl MemBufMut for BytesMut { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + ensure!(self.remaining_mut() >= src.len(), WouldOverflow); + self.put_slice(src); + Ok(()) + } +} + +impl MemBuf for &[u8] { + #[inline] + fn remaining_slice(&self) -> &[u8] { + self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + *self = &self[cnt..]; + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + // slice::read_exact() only throws UnexpectedEof error, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#264-281 + self.read_exact(dst).map_err(|_| Error::UnexpectedEof { + backtrace: Backtrace::generate(), + }) + } +} + 
+impl MemBufMut for &mut [u8] { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + // slice::write_all() actually wont fail, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#344-350 + self.write_all(src).map_err(|_| Error::WouldOverflow { + backtrace: Backtrace::generate(), + }) + } +} + +impl MemBufMut for Vec { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + self.extend_from_slice(src); + Ok(()) + } +} + +/// A `MemBufMut` adapter which implements [std::io::Write] for the inner value +#[derive(Debug)] +pub struct Writer<'a, B> { + buf: &'a mut B, +} + +impl<'a, B: MemBufMut> Writer<'a, B> { + /// Create a new Writer from a mut ref to buf + pub fn new(buf: &'a mut B) -> Self { + Self { buf } + } +} + +impl<'a, B: MemBufMut> Write for Writer<'a, B> { + fn write(&mut self, src: &[u8]) -> io::Result { + self.buf.write_slice(src).map_err(|e| match &e { + Error::UnexpectedEof { .. } => io::Error::new(io::ErrorKind::UnexpectedEof, e), + Error::WouldOverflow { .. } => io::Error::new(io::ErrorKind::WriteZero, e), + })?; + Ok(src.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bytes_mut_mem_buf() { + let hello = b"hello"; + let mut buffer = BytesMut::new(); + buffer.write_u8(8).unwrap(); + buffer.write_u64(u64::MAX - 5).unwrap(); + buffer.write_slice(hello).unwrap(); + + assert_eq!(&buffer, buffer.remaining_slice()); + assert_eq!(8, buffer.read_u8().unwrap()); + assert_eq!(u64::MAX - 5, buffer.read_u64().unwrap()); + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(hello, &dst); + + assert!(buffer.remaining_slice().is_empty()); + } + + #[test] + fn test_bytes_mut_empty() { + let mut buffer = BytesMut::new(); + assert!(buffer.remaining_slice().is_empty()); + assert!(matches!(buffer.read_u8(), Err(Error::UnexpectedEof { .. }))); + assert!(matches!( + buffer.read_u64(), + Err(Error::UnexpectedEof { .. 
}) + )); + } + + #[test] + fn test_bytes_mem_buf() { + let mut buffer = Bytes::from_static(b"hello world"); + assert_eq!(b"hello world", buffer.remaining_slice()); + + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello", &dst); + + assert_eq!(b" world", buffer.remaining_slice()); + buffer.must_advance(1); + assert_eq!(b"world", buffer.remaining_slice()); + + let mut dst = [0; 50]; + assert!(matches!( + buffer.read_to_slice(&mut dst), + Err(Error::UnexpectedEof { .. }) + )); + } + + #[test] + fn test_slice_mem_buf() { + let hello = b"hello world"; + let mut buf = &hello[..]; + + assert_eq!(hello, buf.remaining_slice()); + let mut dst = [0; 6]; + buf.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello ", &dst); + assert_eq!(b"world", buf.remaining_slice()); + + buf.must_advance(1); + assert_eq!(b"orld", buf.remaining_slice()); + } + + #[test] + fn test_slice_mem_buf_mut() { + let mut dst = [b'x'; 11]; + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"abcde").unwrap(); + assert_eq!(b"abcdexxxxxx", &dst); + } + + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"hello").unwrap(); + buf.write_slice(b" world").unwrap(); + assert_eq!(b"hello world", &dst); + } + + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + assert!(matches!( + buf.write_slice(b"a long long long slice"), + Err(Error::WouldOverflow { .. 
}) + )); + } + + #[test] + fn test_vec_mem_buf_mut() { + let mut buf = Vec::new(); + buf.write_slice(b"hello").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_write() { + let mut buf = Vec::new(); + let mut writer = Writer::new(&mut buf); + writer.write_all(b"he").unwrap(); + writer.write_all(b"llo").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_overflow() { + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + let mut writer = Writer::new(&mut buf); + assert_eq!( + io::ErrorKind::WriteZero, + writer.write_all(b"0123456789").err().unwrap().kind() + ); + } +} diff --git a/components/logger/Cargo.toml b/components/logger/Cargo.toml new file mode 100644 index 0000000000..9fdc938340 --- /dev/null +++ b/components/logger/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "logger" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4" +grpcio = { path = "../../grpcio" } +log = "0.4" +slog = "2.7" +slog-async = "2.6" +slog-term = "2.8" +slog_derive = "0.2" + +[dependencies.slog-global] +version = "0.1" +git = "https://github.com/breeswish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/components/logger/src/lib.rs b/components/logger/src/lib.rs new file mode 100644 index 0000000000..f0317ab586 --- /dev/null +++ b/components/logger/src/lib.rs @@ -0,0 +1,422 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt, + fs::{File, OpenOptions}, + io, + str::FromStr, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, +}; + +use log::{info, SetLoggerError}; +pub use slog::Level; +use slog::{slog_o, Drain, Key, OwnedKVList, Record, KV}; +use slog_async::{Async, OverflowStrategy}; +use slog_term::{Decorator, PlainDecorator, RecordDecorator, TermDecorator}; + +const ASYNC_CHAN_SIZE: usize = 102400; +// This format is required for xflush monitor +const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S%.3f"; + +// Thanks to tikv +// https://github.com/tikv/tikv/blob/eaeb39a2c85684de08c48cf4b9426b3faf4defe6/components/tikv_util/src/logger/mod.rs + +pub fn convert_slog_level_to_log_level(lv: Level) -> log::Level { + match lv { + Level::Critical | Level::Error => log::Level::Error, + Level::Warning => log::Level::Warn, + Level::Debug => log::Level::Debug, + Level::Trace => log::Level::Trace, + Level::Info => log::Level::Info, + } +} + +pub fn convert_log_level_to_slog_level(lv: log::Level) -> Level { + match lv { + log::Level::Error => Level::Error, + log::Level::Warn => Level::Warning, + log::Level::Debug => Level::Debug, + log::Level::Trace => Level::Trace, + log::Level::Info => Level::Info, + } +} + +// The `to_string()` function of `slog::Level` produces values like `erro` and +// `trce` instead of the full words. This produces the full word. 
+fn get_string_by_level(lv: Level) -> &'static str { + match lv { + Level::Critical => "critical", + Level::Error => "error", + Level::Warning => "warn", + Level::Debug => "debug", + Level::Trace => "trace", + Level::Info => "info", + } +} + +pub fn term_drainer() -> CeresFormat { + let decorator = TermDecorator::new().stdout().build(); + CeresFormat::new(decorator) +} + +pub fn file_drainer(path: &Option) -> Option>> { + match path { + Some(path) => { + let file = OpenOptions::new() + .create(true) + .append(true) + .open(path) + .unwrap(); + let decorator = PlainDecorator::new(file); + Some(CeresFormat::new(decorator)) + } + None => None, + } +} + +// dispacher +pub struct LogDispatcher { + normal: N, +} + +impl LogDispatcher { + pub fn new(normal: N) -> Self { + Self { normal } + } +} + +impl Drain for LogDispatcher +where + N: Drain, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.normal.log(record, values) + } +} + +pub fn init_log( + drain: D, + level: Level, + use_async: bool, + async_log_channel_len: i32, + init_stdlog: bool, +) -> Result +where + D: Drain + Send + 'static, + ::Err: std::fmt::Display, +{ + let runtime_level = RuntimeLevel::new(level); + // TODO(yingwen): Consider print the error instead of just ignoring it? 
+ let root_logger = if use_async { + let drain = if async_log_channel_len <= 0 { + Async::new(drain.ignore_res()) + .chan_size(ASYNC_CHAN_SIZE) + .overflow_strategy(OverflowStrategy::Block) + .build() + } else { + Async::new(drain.ignore_res()) + .chan_size(async_log_channel_len as usize) + .build() + }; + let drain = RuntimeLevelFilter::new(drain, runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + } else { + let drain = RuntimeLevelFilter::new(Mutex::new(drain), runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + }; + + slog_global::set_global(root_logger); + if init_stdlog { + slog_global::redirect_std_log(Some(level))?; + grpcio::redirect_log(); + } + + Ok(runtime_level) +} + +// e.g. +// ```text +// 2020-01-20 13:00:14.998 INFO [src/engine/rocksdb/rocks_kv.rs:394] RocksKV::open_with_op start, name:autogen +// ``` +pub struct CeresFormat +where + D: Decorator, +{ + decorator: D, +} + +impl CeresFormat +where + D: Decorator, +{ + fn new(decorator: D) -> Self { + Self { decorator } + } +} + +impl Drain for CeresFormat +where + D: Decorator, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.decorator.with_record(record, values, |decorator| { + write_log_header(decorator, record)?; + write_log_msg(decorator, record)?; + write_log_fields(decorator, record, values)?; + + decorator.start_whitespace()?; + writeln!(decorator)?; + + decorator.flush()?; + + Ok(()) + }) + } +} + +#[derive(Clone)] +pub struct RuntimeLevel { + level: Arc, + default_level: Level, +} + +impl RuntimeLevel { + fn new(default_level: Level) -> Self { + Self { + level: Arc::new(AtomicUsize::new(default_level.as_usize())), + default_level, + } + } + + #[inline] + pub fn current_level(&self) -> Level { + Level::from_usize(self.level.load(Ordering::Relaxed)).unwrap_or(self.default_level) + } + + pub fn set_level(&self, level: Level) { + self.level.store(level.as_usize(), 
Ordering::Relaxed); + // Log level of std log is not changed unless we call `log::set_max_level` + log::set_max_level(convert_slog_level_to_log_level(level).to_level_filter()); + + info!( + "RuntimeLevel::set_level log level changed to {}", + get_string_by_level(level) + ); + } + + #[inline] + pub fn reset(&self) { + self.set_level(self.default_level); + } + + #[inline] + pub fn default_level(&self) -> Level { + self.default_level + } + + #[inline] + pub fn current_level_str(&self) -> &str { + get_string_by_level(self.current_level()) + } + + pub fn set_level_by_str(&self, level_str: &str) -> Result<(), String> { + Level::from_str(level_str) + .map_err(|_| format!("Invalid level {}", level_str)) + .and_then(|level| match level { + Level::Trace | Level::Debug | Level::Info => Ok(level), + _ => Err("Only allow to change log level to ".to_owned()), + }) + .map(|level| self.set_level(level)) + } +} + +struct RuntimeLevelFilter { + drain: D, + runtime_level: RuntimeLevel, +} + +impl RuntimeLevelFilter { + fn new(drain: D, runtime_level: RuntimeLevel) -> Self { + Self { + drain, + runtime_level, + } + } +} + +impl Drain for RuntimeLevelFilter +where + D: Drain, +{ + type Err = D::Err; + type Ok = Option; + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + let current_level = self.runtime_level.current_level(); + + if record.level().is_at_least(current_level) { + Ok(Some(self.drain.log(record, values)?)) + } else { + Ok(None) + } + } +} + +fn write_log_header(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_timestamp()?; + write!( + decorator, + "{}", + chrono::Local::now().format(TIMESTAMP_FORMAT) + )?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_level()?; + write!(decorator, "{}", record.level().as_short_str())?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + // Writes source file info. 
+ decorator.start_msg()?; // There is no `start_file()` or `start_line()`. + write!(decorator, "[{}:{}]", record.file(), record.line())?; + + Ok(()) +} + +fn write_log_msg(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_msg()?; + write!(decorator, "{}", record.msg())?; + + Ok(()) +} + +fn write_log_fields( + decorator: &mut dyn RecordDecorator, + record: &Record<'_>, + values: &OwnedKVList, +) -> io::Result<()> { + let mut serializer = Serializer::new(decorator); + + record.kv().serialize(record, &mut serializer)?; + + values.serialize(record, &mut serializer)?; + + serializer.finish()?; + + Ok(()) +} + +struct Serializer<'a> { + decorator: &'a mut dyn RecordDecorator, +} + +impl<'a> Serializer<'a> { + fn new(decorator: &'a mut dyn RecordDecorator) -> Self { + Serializer { decorator } + } + + fn write_whitespace(&mut self) -> io::Result<()> { + self.decorator.start_whitespace()?; + write!(self.decorator, " ")?; + Ok(()) + } + + fn finish(self) -> io::Result<()> { + Ok(()) + } +} + +impl<'a> Drop for Serializer<'a> { + fn drop(&mut self) {} +} + +impl<'a> slog::Serializer for Serializer<'a> { + fn emit_none(&mut self, key: Key) -> slog::Result { + self.emit_arguments(key, &format_args!("None")) + } + + fn emit_arguments(&mut self, key: Key, val: &fmt::Arguments<'_>) -> slog::Result { + self.write_whitespace()?; + + // Write key + write!(self.decorator, "[")?; + self.decorator.start_key()?; + write!(self.decorator, "{}", key)?; + + // Write separator + self.decorator.start_separator()?; + write!(self.decorator, ":")?; + + // Write value + self.decorator.start_value()?; + write!(self.decorator, "{}", val)?; + self.decorator.reset()?; + write!(self.decorator, "]")?; + + Ok(()) + } +} + +pub fn init_test_logger() { + // level + let level = Level::Info; + + // drain + let term_drain = term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async 
and init stdlog + let _ = init_log(drain, level, false, 12400, true); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_runtime_level() { + let runtime_level = RuntimeLevel::new(Level::Info); + + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.set_level(Level::Debug); + assert_eq!(runtime_level.current_level(), Level::Debug); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.reset(); + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.current_level_str(), "info"); + + runtime_level.set_level_by_str("trace").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Trace); + runtime_level.set_level_by_str("debug").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Debug); + runtime_level.set_level_by_str("info").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Info); + + assert!(runtime_level.set_level_by_str("warn").is_err()); + assert_eq!(runtime_level.current_level(), Level::Info); + assert!(runtime_level.set_level_by_str("warning").is_err()); + assert!(runtime_level.set_level_by_str("critical").is_err()); + assert!(runtime_level.set_level_by_str("error").is_err()); + assert!(runtime_level.set_level_by_str("no such level").is_err()); + + assert_eq!(runtime_level.current_level(), Level::Info); + } +} diff --git a/components/object_store/Cargo.toml b/components/object_store/Cargo.toml new file mode 100644 index 0000000000..787f330dcd --- /dev/null +++ b/components/object_store/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "object_store" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +[dependencies] # In alphabetical order +async-trait = "0.1.42" +bytes = "1.0" +common_util = { path = "../../common_util" } +futures = "0.3" +itertools = "0.10" +percent-encoding = "2.1" +snafu = { version = "0.6.10", features = ["futures", "backtraces"] } +tokio = { version = 
"1.0", features = ["macros", "fs"] } +# Filesystem integration +tokio-util = { version = "0.6.3", features = [ "io","compat" ] } +walkdir = "2.3.2" + +[dev-dependencies] +tempfile = "3.1.0" diff --git a/components/object_store/src/disk.rs b/components/object_store/src/disk.rs new file mode 100644 index 0000000000..14cdbb9cc0 --- /dev/null +++ b/components/object_store/src/disk.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This module contains the IOx implementation for using local disk as the +//! object store. +use std::{collections::BTreeSet, convert::TryFrom, io, path::PathBuf}; + +use async_trait::async_trait; +use futures::{ + stream, + stream::{BoxStream, StreamExt}, + AsyncRead, +}; +use snafu::{Backtrace, GenerateBacktrace, OptionExt, ResultExt, Snafu}; +use tokio::fs; +use tokio_util::compat::{Compat, FuturesAsyncReadCompatExt}; +use walkdir::WalkDir; + +use crate::{path::file::FilePath, ListResult, ObjectMeta, ObjectStore}; + +/// A specialized `Result` for filesystem object store-related errors +pub type Result = std::result::Result; + +/// A specialized `Error` for filesystem object store-related errors +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expected streamed data to have length {}, got {}.\nBacktrace:\n{}", + expected, + actual, + backtrace + ))] + DataDoesNotMatchLength { + expected: usize, + actual: usize, + backtrace: Backtrace, + }, + + #[snafu(display("File size for {} did not fit in a usize: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + FileSizeOverflowedUsize { + path: PathBuf, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to walk dir: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToWalkDir { + source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to access metadata for {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToAccessMetadata { + path: PathBuf, 
+ source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to copy data to file: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToCopyDataToFile { + source: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create dir {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToCreateDir { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create file {}: {}.\nBacktrace:\n{}", path.display(), err, backtrace))] + UnableToCreateFile { + path: PathBuf, + err: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to delete file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToDeleteFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to open file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToOpenFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to read data from file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToReadBytes { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unable to stream data from the request into memory: {}.\nBacktrace:\n{}", + source, + backtrace + ))] + UnableToStreamDataIntoMemory { + source: std::io::Error, + backtrace: Backtrace, + }, +} + +/// Local filesystem storage suitable for testing or for opting out of using a +/// cloud storage provider. 
+#[derive(Debug)] +pub struct File { + root: FilePath, +} + +#[async_trait] +impl ObjectStore for File { + type Error = Error; + type Path = FilePath; + type Reader = Compat; + + fn new_path(&self) -> Self::Path { + FilePath::default() + } + + async fn put( + &self, + location: &Self::Path, + bytes: R, + _length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin, + { + let path = self.path(location); + + let mut file = match fs::File::create(&path).await { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFile { path: &path, err })?; + fs::create_dir_all(&parent) + .await + .context(UnableToCreateDir { path: parent })?; + + match fs::File::create(&path).await { + Ok(f) => f, + Err(err) => return UnableToCreateFile { path, err }.fail(), + } + } + Err(err) => return UnableToCreateFile { path, err }.fail(), + }; + + tokio::io::copy(&mut bytes.compat(), &mut file) + .await + .context(UnableToCopyDataToFile)?; + + Ok(()) + } + + async fn get(&self, location: &Self::Path) -> Result { + let path = self.path(location); + let file = fs::File::open(&path) + .await + .context(UnableToOpenFile { path: &path })?; + Ok(file.into_std().await) + } + + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error> { + let path = self.path(location); + fs::remove_file(&path) + .await + .context(UnableToDeleteFile { path })?; + Ok(()) + } + + async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error> { + let root_path = self.root.to_raw(); + let walkdir = WalkDir::new(&root_path) + // Don't include the root directory itself + .min_depth(1); + + let s = + walkdir.into_iter().filter_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|file| { + let 
relative_path = file.path().strip_prefix(&root_path).expect( + "Must start with root path because this came from walking the root", + ); + FilePath::raw(relative_path, false) + }) + .filter(|name| prefix.map_or(true, |p| name.prefix_matches(p))) + .map(|name| Ok(vec![name])), + } + }); + + Ok(stream::iter(s).boxed()) + } + + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error> { + // Always treat prefix as relative because the list operations don't know + // anything about where on disk the root of this object store is; they + // only care about what's within this object store's directory. See + // documentation for `push_path`: it deliberately does *not* behave as + // `PathBuf::push` does: there is no way to replace the root. So even if + // `prefix` isn't relative, we treat it as such here. + let mut resolved_prefix = self.root.clone(); + resolved_prefix.push_path(prefix); + + // It is valid to specify a prefix with directories `[foo, bar]` and filename + // `baz`, in which case we want to treat it like a glob for + // `foo/bar/baz*` and there may not actually be a file or directory + // named `foo/bar/baz`. We want to look at all the entries in + // `foo/bar/`, so remove the file name. + let mut search_path = resolved_prefix.clone(); + search_path.unset_file_name(); + + let walkdir = WalkDir::new(&search_path.to_raw()) + .min_depth(1) + .max_depth(1); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + let root_path = self.root.to_raw(); + for entry_res in walkdir.into_iter().map(convert_walkdir_result) { + if let Some(entry) = entry_res? 
{ + let entry_location = FilePath::raw(entry.path(), false); + + if entry_location.prefix_matches(&resolved_prefix) { + let metadata = entry + .metadata() + .context(UnableToAccessMetadata { path: entry.path() })?; + + if metadata.is_dir() { + let parts = entry_location + .parts_after_prefix(&resolved_prefix) + .expect("must have prefix because of the if prefix_matches condition"); + + let mut relative_location = prefix.to_owned(); + relative_location.push_part_as_dir(&parts[0]); + common_prefixes.insert(relative_location); + } else { + let path = entry + .path() + .strip_prefix(&root_path) + .expect("must have prefix because of the if prefix_matches condition"); + let location = FilePath::raw(path, false); + + let last_modified = metadata + .modified() + .expect("Modified file time should be supported on this platform"); + let size = usize::try_from(metadata.len()) + .context(FileSizeOverflowedUsize { path: entry.path() })?; + + objects.push(ObjectMeta { + location, + last_modified, + size, + }); + } + } + } + } + + Ok(ListResult { + next_token: None, + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } +} + +impl File { + /// Create new filesystem storage. + pub fn new(root: impl Into) -> Self { + Self { + root: FilePath::raw(root, true), + } + } + + /// Return full path of the given location + pub fn path(&self, location: &FilePath) -> PathBuf { + let mut path = self.root.clone(); + path.push_path(location); + path.to_raw() + } +} + +/// Convert walkdir results and converts not-found errors into `None`. 
+fn convert_walkdir_result( + res: std::result::Result, +) -> Result> { + match res { + Ok(entry) => Ok(Some(entry)), + Err(walkdir_err) => match walkdir_err.io_error() { + Some(io_err) => match io_err.kind() { + io::ErrorKind::NotFound => Ok(None), + _ => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + None => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use tempfile::TempDir; + + use super::*; + use crate::{ + path::ObjectStorePath, + tests::{list_with_delimiter, put_get_delete_list}, + ObjectStore, + }; + + #[tokio::test] + async fn file_test() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + put_get_delete_list(&file).await.unwrap(); + list_with_delimiter(&file).await.unwrap(); + } + + #[tokio::test] + async fn creates_dir_if_not_present() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + let mut location = file.new_path(); + location.push_all_dirs(&["nested", "file", "test_file"]); + + file.put(&location, Box::new(data.as_ref()), Some(data.len())) + .await + .unwrap(); + + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } + + #[tokio::test] + async fn unknown_length() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + + let mut location = file.new_path(); + location.set_file_name("some_file"); + file.put(&location, Box::new(data.as_ref()), None) + .await + .unwrap(); + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } +} diff --git 
a/components/object_store/src/lib.rs b/components/object_store/src/lib.rs new file mode 100644 index 0000000000..326a68459c --- /dev/null +++ b/components/object_store/src/lib.rs @@ -0,0 +1,329 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! # object_store +//! +//! This crate provides APIs for interacting with object storage services. It +//! currently supports PUT, GET, DELETE, and list for in-memory and +//! local file storage. +//! +//! Future compatibility will include Aliyun OSS. +//! +//! Fork from https://github.com/influxdata/influxdb_iox/tree/main/object_store + +use std::time::SystemTime; + +use async_trait::async_trait; +use futures::{stream::BoxStream, AsyncRead}; +use path::ObjectStorePath; + +pub mod disk; +pub mod path; + +/// Universal API to multiple object store services. +// TODO(xikai): ObjectStore -> FileStore +#[async_trait] +pub trait ObjectStore: std::fmt::Debug + Send + Sync + 'static { + /// The type of the locations used in interacting with this object store. + type Path: ObjectStorePath; + + /// The error returned from fallible methods + type Error: std::error::Error + Send + Sync + 'static; + + type Reader: AsyncRead + Send + Unpin; + + /// Return a new location path appropriate for this object storage + fn new_path(&self) -> Self::Path; + + /// Save the provided bytes to the specified location. + async fn put( + &self, + location: &Self::Path, + bytes: R, + length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin; + + /// Return the bytes that are stored at the specified location. + async fn get(&self, location: &Self::Path) -> Result; + + /// Delete the object at the specified location. + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error>; + + /// List all the objects with the given prefix. 
+ async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error>; + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error>; +} + +/// Result of a list call that includes objects, prefixes (directories) and a +/// token for the next set of results. Individual result sets may be limited to +/// 1,00 objects based on the underlying object storage's limitations. +#[derive(Debug)] +pub struct ListResult { + /// Token passed to the API for the next page of list results. + pub next_token: Option, + /// Prefixes that are common (like directories) + pub common_prefixes: Vec

{ + provider: P, +} + +impl

Frontend

{ + pub fn new(provider: P) -> Self { + Self { provider } + } + + /// Parse the sql and returns the statements + pub fn parse_sql(&self, _ctx: &mut Context, sql: &str) -> Result { + Parser::parse_sql(sql).context(InvalidSql { sql }) + } + + /// Parse the request and returns the Expr + pub fn parse_promql( + &self, + _ctx: &mut Context, + mut req: PrometheusQueryRequest, + ) -> Result { + req.take_expr().try_into().context(InvalidPromRequest) + } +} + +impl Frontend

{ + /// Create logical plan for the statement + pub fn statement_to_plan(&self, ctx: &mut Context, stmt: Statement) -> Result { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.statement_to_plan(stmt).context(CreatePlan) + } + + pub fn promql_expr_to_plan( + &self, + ctx: &mut Context, + expr: Expr, + ) -> Result<(Plan, Arc)> { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.promql_expr_to_plan(expr).context(CreatePlan) + } +} diff --git a/sql/src/lib.rs b/sql/src/lib.rs new file mode 100644 index 0000000000..fe2f41e287 --- /dev/null +++ b/sql/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL frontend +//! +//! Parse sql into logical plan that can be handled by interpreters + +#[macro_use] +extern crate common_util; + +pub mod ast; +pub mod container; +pub mod frontend; +pub mod parser; +pub mod plan; +pub mod planner; +pub mod promql; +pub mod provider; +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/sql/src/parser.rs b/sql/src/parser.rs new file mode 100644 index 0000000000..dca4d82ba2 --- /dev/null +++ b/sql/src/parser.rs @@ -0,0 +1,814 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL parser +//! +//! Some codes are copied from datafusion: + +use log::debug; +use paste::paste; +use sqlparser::{ + ast::{ColumnDef, ColumnOption, ColumnOptionDef, Ident, TableConstraint}, + dialect::{keywords::Keyword, Dialect, MySqlDialect}, + parser::{IsOptional::Mandatory, Parser as SqlParser, ParserError}, + tokenizer::{Token, Tokenizer}, +}; +use table_engine::ANALYTIC_ENGINE_TYPE; + +use crate::ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, ShowCreateObject, Statement, +}; + +define_result!(ParserError); + +// Use `Parser::expected` instead, if possible +macro_rules! 
parser_err { + ($MSG:expr) => { + Err(ParserError::ParserError($MSG.to_string())) + }; +} + +const TS_KEY: &str = "__ts_key"; +const TAG: &str = "TAG"; +const COMMENT: &str = "COMMENT"; +const UNSIGN: &str = "UNSIGN"; +const MODIFY: &str = "MODIFY"; +const SETTING: &str = "SETTING"; + +macro_rules! is_custom_column { + ($name: ident) => { + paste! { + #[inline] + pub fn [](opt: &ColumnOption) -> bool { + match opt { + ColumnOption::DialectSpecific(tokens) => { + if let [Token::Word(word)] = &tokens[..] { + return word.value == $name; + } + } + _ => return false, + } + return false; + } + + } + }; +} + +is_custom_column!(TAG); +is_custom_column!(UNSIGN); + +/// Get the comment from the [`ColumnOption`] if it is a comment option. +pub fn get_column_comment(opt: &ColumnOption) -> Option { + if let ColumnOption::DialectSpecific(tokens) = opt { + if let [Token::Word(keyword), Token::SingleQuotedString(comment)] = &tokens[..] { + if keyword.value == COMMENT { + return Some(comment.clone()); + } + } + } + + None +} + +/// Returns true when is a TIMESTAMP KEY table constraint +pub fn is_timestamp_key_constraint(constrait: &TableConstraint) -> bool { + if let TableConstraint::Unique { + name: Some(Ident { + value, + quote_style: None, + }), + columns: _, + is_primary: false, + } = constrait + { + return value == TS_KEY; + } + false +} + +/// SQL Parser with ceresdb dialect support +pub struct Parser<'a> { + parser: SqlParser<'a>, +} + +impl<'a> Parser<'a> { + // Parse the specified tokens with dialect + fn new_with_dialect(sql: &str, dialect: &'a dyn Dialect) -> Result { + let mut tokenizer = Tokenizer::new(dialect, sql); + let tokens = tokenizer.tokenize()?; + + Ok(Parser { + parser: SqlParser::new(tokens, dialect), + }) + } + + /// Parse a SQL statement and produce a set of statements + pub fn parse_sql(sql: &str) -> Result> { + // Use MySqlDialect, so we can support "`" and chinese characters. 
+ let dialect = &MySqlDialect {}; + let mut parser = Parser::new_with_dialect(sql, dialect)?; + let mut stmts = Vec::new(); + let mut expecting_statement_delimiter = false; + loop { + // ignore empty statements (between successive statement delimiters) + while parser.parser.consume_token(&Token::SemiColon) { + expecting_statement_delimiter = false; + } + + if parser.parser.peek_token() == Token::EOF { + break; + } + if expecting_statement_delimiter { + return parser.expected("end of statement", parser.parser.peek_token()); + } + + let statement = parser.parse_statement()?; + stmts.push(statement); + expecting_statement_delimiter = true; + } + + debug!("Parser parsed sql, sql:{}, stmts:{:#?}", sql, stmts); + + Ok(stmts) + } + + // Report unexpected token + fn expected(&self, expected: &str, found: Token) -> Result { + parser_err!(format!("Expected {}, found: {}", expected, found)) + } + + // Parse a new expression + fn parse_statement(&mut self) -> Result { + match self.parser.peek_token() { + Token::Word(w) => { + match w.keyword { + Keyword::CREATE => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_create() + } + Keyword::DROP => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_drop() + } + Keyword::DESCRIBE | Keyword::DESC => { + self.parser.next_token(); + self.parse_describe() + } + Keyword::ALTER => { + self.parser.next_token(); + self.parse_alter() + } + Keyword::SHOW => { + self.parser.next_token(); + self.parse_show() + } + Keyword::EXISTS => { + self.parser.next_token(); + self.parse_exists() + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + + pub fn parse_alter(&mut self) -> Result { + let nth1_token = self.parser.peek_token(); + let nth2_token = 
self.parser.peek_nth_token(2); + let nth3_token = self.parser.peek_nth_token(3); + if let (Token::Word(nth1_word), Token::Word(nth2_word), Token::Word(nth3_word)) = + (nth1_token, nth2_token, nth3_token) + { + // example: ALTER TABLE test_ttl modify SETTING ttl='8d' + if let (Keyword::TABLE, MODIFY, SETTING) = ( + nth1_word.keyword, + nth2_word.value.to_uppercase().as_str(), + nth3_word.value.to_uppercase().as_str(), + ) { + return self.parse_alter_modify_setting(); + } + // examples: + // ALTER TABLE test_table ADD COLUMN col_17 STRING TAG + // ALTER TABLE test_table ADD COLUMN (col_18 STRING TAG, col_19 UNIT64) + if let (Keyword::TABLE, Keyword::ADD, Keyword::COLUMN) = + (nth1_word.keyword, nth2_word.keyword, nth3_word.keyword) + { + return self.parse_alter_add_column(); + } + } + Ok(Statement::Standard(Box::new(self.parser.parse_alter()?))) + } + + pub fn parse_show(&mut self) -> Result { + if self + .parser + .parse_one_of_keywords(&[Keyword::CREATE]) + .is_some() + { + Ok(self.parse_show_create()?) + } else { + self.expected("create", self.parser.peek_token()) + } + } + + fn parse_show_create(&mut self) -> Result { + let obj_type = match self.parser.expect_one_of_keywords(&[Keyword::TABLE])? 
{ + Keyword::TABLE => Ok(ShowCreateObject::Table), + keyword => Err(ParserError::ParserError(format!( + "Unable to map keyword to ShowCreateObject: {:?}", + keyword + ))), + }?; + + let obj_name = self.parser.parse_object_name()?; + + Ok(Statement::ShowCreate(ShowCreate { obj_type, obj_name })) + } + + fn parse_alter_add_column(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + self.parser + .expect_keywords(&[Keyword::ADD, Keyword::COLUMN])?; + let (mut columns, _) = self.parse_columns()?; + if columns.is_empty() { + let column_def = self.parse_column_def()?; + columns.push(column_def); + } + Ok(Statement::AlterAddColumn(AlterAddColumn { + table_name, + columns, + })) + } + + fn parse_alter_modify_setting(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + if self.consume_token(MODIFY) && self.consume_token(SETTING) { + let options = self + .parser + .parse_comma_separated(SqlParser::parse_sql_option)?; + Ok(Statement::AlterModifySetting(AlterModifySetting { + table_name, + options, + })) + } else { + unreachable!() + } + } + + pub fn parse_describe(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Describe(DescribeTable { table_name })) + } + + // Parse a SQL CREATE statement + pub fn parse_create(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_not_exists = + self.parser + .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let (columns, constraints) = self.parse_columns()?; + let engine = self.parse_table_engine()?; + let options = self.parser.parse_options(Keyword::WITH)?; + + Ok(Statement::Create(CreateTable { + if_not_exists, + name: table_name, + columns, + engine, + constraints, + options, + })) + } + + pub fn 
parse_drop(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_exists = self.parser.parse_keywords(&[Keyword::IF, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let engine = self.parse_table_engine()?; + + Ok(Statement::Drop(DropTable { + name: table_name, + if_exists, + engine, + })) + } + + pub fn parse_exists(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Exists(ExistsTable { table_name })) + } + + // Copy from sqlparser + fn parse_columns(&mut self) -> Result<(Vec, Vec)> { + let mut columns = vec![]; + let mut constraints = vec![]; + if !self.parser.consume_token(&Token::LParen) || self.parser.consume_token(&Token::RParen) { + return Ok((columns, constraints)); + } + + loop { + if let Some(constraint) = self.parse_optional_table_constraint()? { + constraints.push(constraint); + } else if let Token::Word(_) = self.parser.peek_token() { + columns.push(self.parse_column_def()?); + } else { + return self.expected( + "column name or constraint definition", + self.parser.peek_token(), + ); + } + let comma = self.parser.consume_token(&Token::Comma); + if self.parser.consume_token(&Token::RParen) { + // allow a trailing comma, even though it's not in standard + break; + } else if !comma { + return self.expected( + "',' or ')' after column definition", + self.parser.peek_token(), + ); + } + } + + Ok((columns, constraints)) + } + + /// Parses the set of valid formats + fn parse_table_engine(&mut self) -> Result { + // TODO make ENGINE as a keyword + if !self.consume_token("ENGINE") { + return Ok(ANALYTIC_ENGINE_TYPE.to_string()); + } + + self.parser.expect_token(&Token::Eq)?; + + match self.parser.next_token() { + Token::Word(w) => Ok(w.value), + unexpected => self.expected("Engine is missing", unexpected), + } + } + + // Copy from sqlparser + fn parse_column_def(&mut self) -> Result { + let name = 
self.parser.parse_identifier()?; + let data_type = self.parser.parse_data_type()?; + let collation = if self.parser.parse_keyword(Keyword::COLLATE) { + Some(self.parser.parse_object_name()?) + } else { + None + }; + let mut options = vec![]; + loop { + if self.parser.parse_keyword(Keyword::CONSTRAINT) { + let name = Some(self.parser.parse_identifier()?); + if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name, option }); + } else { + return self.expected( + "constraint details after CONSTRAINT ", + self.parser.peek_token(), + ); + } + } else if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name: None, option }); + } else { + break; + }; + } + Ok(ColumnDef { + name, + data_type, + collation, + options, + }) + } + + // Copy from sqlparser by boyan + fn parse_optional_table_constraint(&mut self) -> Result> { + let name = if self.parser.parse_keyword(Keyword::CONSTRAINT) { + Some(self.parser.parse_identifier()?) + } else { + None + }; + match self.parser.next_token() { + Token::Word(w) if w.keyword == Keyword::PRIMARY => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + Ok(Some(TableConstraint::Unique { + name, + columns, + is_primary: true, + })) + } + Token::Word(w) if w.keyword == Keyword::TIMESTAMP => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + // TODO(boyan), TableConstraint doesn't support dialect right now + // we use unique constraint as TIMESTAMP KEY constraint. 
+ Ok(Some(TableConstraint::Unique { + name: Some(Ident { + value: TS_KEY.to_owned(), + quote_style: None, + }), + columns, + is_primary: false, + })) + } + unexpected => { + if name.is_some() { + self.expected("PRIMARY, TIMESTAMP", unexpected) + } else { + self.parser.prev_token(); + Ok(None) + } + } + } + } + + // Copy from sqlparser by boyan + fn parse_optional_column_option(&mut self) -> Result> { + if self.parser.parse_keywords(&[Keyword::NOT, Keyword::NULL]) { + Ok(Some(ColumnOption::NotNull)) + } else if self.parser.parse_keyword(Keyword::NULL) { + Ok(Some(ColumnOption::Null)) + } else if self.parser.parse_keyword(Keyword::DEFAULT) { + Ok(Some(ColumnOption::Default(self.parser.parse_expr()?))) + } else if self + .parser + .parse_keywords(&[Keyword::PRIMARY, Keyword::KEY]) + { + Ok(Some(ColumnOption::Unique { is_primary: true })) + } else if self.consume_token(TAG) { + // Support TAG for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(TAG), + ]))) + } else if self.consume_token(UNSIGN) { + // Support unsign for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(UNSIGN), + ]))) + } else if self.consume_token(COMMENT) { + let comment = self.parser.parse_literal_string()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(COMMENT), + Token::SingleQuotedString(comment), + ]))) + } else { + Ok(None) + } + } + + fn consume_token(&mut self, expected: &str) -> bool { + if self.parser.peek_token().to_string().to_uppercase() == *expected.to_uppercase() { + self.parser.next_token(); + true + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::{DataType, Ident, ObjectName, Value}; + + use super::*; + + fn expect_parse_ok(sql: &str, expected: Statement) -> Result<()> { + let statements = Parser::parse_sql(sql)?; + assert_eq!( + statements.len(), + 1, + "Expected to parse exactly one statement" + ); + assert_eq!(statements[0], expected); + Ok(()) + } + + /// Parses sql and 
asserts that the expected error message was found + fn expect_parse_error(sql: &str, expected_error: &str) { + match Parser::parse_sql(sql) { + Ok(statements) => { + panic!( + "Expected parse error for '{}', but was successful: {:?}", + sql, statements + ); + } + Err(e) => { + let error_message = e.to_string(); + assert!( + error_message.contains(expected_error), + "Expected error '{}' not found in actual error '{}'", + expected_error, + error_message + ); + } + } + } + + fn make_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![], + } + } + + fn make_tag_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![ColumnOptionDef { + name: None, + option: ColumnOption::DialectSpecific(vec![Token::make_keyword(TAG)]), + }], + } + } + + fn make_object_name(name: impl Into) -> ObjectName { + ObjectName(vec![Ident::new(name)]) + } + + #[test] + fn create_table() { + // positive case + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let expected = Statement::Create(CreateTable { + if_not_exists: true, + name: make_object_name("t"), + columns: vec![make_column_def("c1", DataType::Double)], + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + constraints: vec![], + options: vec![], + }); + expect_parse_ok(sql, expected).unwrap(); + + // positive case, multiple columns + let sql = "CREATE TABLE mytbl(c1 timestamp, c2 double, c3 string,) ENGINE = XX"; + let expected = Statement::Create(CreateTable { + if_not_exists: false, + name: make_object_name("mytbl"), + columns: vec![ + make_column_def("c1", DataType::Timestamp), + make_column_def("c2", DataType::Double), + make_column_def("c3", DataType::String), + ], + engine: "XX".to_string(), + constraints: vec![], + options: vec![], + }); + 
expect_parse_ok(sql, expected).unwrap(); + + // Error cases: Invalid sql + let sql = "CREATE TABLE t(c1 timestamp) AS"; + expect_parse_error( + sql, + "sql parser error: Expected end of statement, found: AS", + ); + } + + #[test] + fn test_unsign_tag_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float, c3 bigint unsign)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c1" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_tag_column(&opt.option)); + } else if c.name.value == "c2" { + assert_eq!(0, c.options.len()); + } else if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_unsign_column(&opt.option)); + } else { + panic!("failed"); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_comment_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string, c2 float, c3 bigint comment 'id')"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + let comment = get_column_comment(&opt.option).unwrap(); + assert_eq!("id", comment); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_timestamp_key_constraint() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 TIMESTAMP, TIMESTAMP key(c1))"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let constraints = &v.constraints; + assert_eq!(1, constraints.len()); + assert!(is_timestamp_key_constraint(&constraints[0])); + } + _ => panic!("failed"), + } + } + + #[test] 
+ fn create_table_engine() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + assert_eq!(v.engine, table_engine::ANALYTIC_ENGINE_TYPE.to_string()) + } + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) ENGINE = XX"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX".to_string()), + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) engine = XX2"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX2".to_string()), + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_option() { + let sql = "ALTER TABLE test_ttl modify SETTING arena_block_size='1k';"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::AlterModifySetting(v) => { + assert_eq!(v.table_name.to_string(), "test_ttl".to_string()); + assert_eq!(v.options.len(), 1); + assert_eq!(v.options[0].name.value, "arena_block_size".to_string()); + assert_eq!( + v.options[0].value, + Value::SingleQuotedString("1k".to_string()) + ); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 DOUBLE"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: 
vec![make_column_def("c1", DataType::Double)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_alter_table_tag_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING tag)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_tag_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 string tag"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![make_tag_column_def("c1", DataType::String)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_drop_table() { + let sql = "drop table test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(!if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + + let sql = "drop table if exists test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_exists_table() { + { + let sql = "EXISTS TABLE xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "EXISTS xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap() + } 
+ } +} diff --git a/sql/src/plan.rs b/sql/src/plan.rs new file mode 100644 index 0000000000..25c9fe9874 --- /dev/null +++ b/sql/src/plan.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Logical plans such as select/insert/update/delete + +use std::{ + collections::{BTreeMap, HashMap}, + fmt, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::logical_plan::LogicalPlan as DataFusionLogicalPlan; +use common_types::{column_schema::ColumnSchema, row::RowGroup, schema::Schema}; +use common_util::define_result; +use snafu::Snafu; +use table_engine::table::TableRef; + +use crate::{ast::ShowCreateObject, container::TableContainer}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unsupported alter table operation."))] + UnsupportedOperation, + + #[snafu(display("Unsupported column data type, err:{}.", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Unsupported column option:{}.", name))] + UnsupportedColumnOption { name: String }, + + #[snafu(display("Alter primary key is not allowed."))] + AlterPrimaryKey, +} + +define_result!(Error); + +// TODO(yingwen): Custom Debug format +/// Logical plan to be processed by interpreters +#[derive(Debug)] +pub enum Plan { + /// A SQL SELECT plan or other plans related to query + Query(QueryPlan), + // TODO(yingwen): Other sql command + Insert(InsertPlan), + /// Create table plan + Create(CreateTablePlan), + /// Drop table plan + Drop(DropTablePlan), + /// Describe table plan + Describe(DescribeTablePlan), + /// Alter table plan + AlterTable(AlterTablePlan), + /// Show create plan + ShowCreate(ShowCreatePlan), + /// Exists table + Exists(ExistsTablePlan), +} + +pub struct QueryPlan { + pub df_plan: DataFusionLogicalPlan, + // Contains the TableProviders so we can register the them to ExecutionContext later. 
+ // Use TableProviderAdapter here so we can get the underlying TableRef and also be + // able to cast to Arc + pub tables: Arc, +} + +impl Debug for QueryPlan { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QueryPlan") + .field("df_plan", &self.df_plan) + .finish() + } +} + +pub struct CreateTablePlan { + /// Engine + pub engine: String, + /// Create table if not exists + pub if_not_exists: bool, + /// Table name + pub table: String, + /// Table schema + pub table_schema: Schema, + /// Table options + pub options: HashMap, +} + +impl Debug for CreateTablePlan { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("CreateTablePlan") + .field("engine", &self.engine) + .field("if_not_exists", &self.if_not_exists) + .field("table", &self.table) + .field("table_schema", &self.table_schema) + .field( + "options", + &self + .options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect::>(), + ) + .finish() + } +} + +#[derive(Debug)] +pub struct DropTablePlan { + /// Engine + pub engine: String, + /// If exists + pub if_exists: bool, + /// Table name + pub table: String, +} + +/// Insert logical plan +#[derive(Debug)] +pub struct InsertPlan { + /// The table to insert + pub table: TableRef, + /// RowGroup to insert + pub rows: RowGroup, +} + +#[derive(Debug)] +pub struct DescribeTablePlan { + /// The table to describe + pub table: TableRef, +} + +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add a new column, the column id will be ignored. + AddColumn(Vec), + ModifySetting(HashMap), +} + +#[derive(Debug)] +pub struct AlterTablePlan { + /// The table to alter. + pub table: TableRef, + // TODO(yingwen): Maybe use smallvec. + pub operations: AlterTableOperation, +} + +#[derive(Debug)] +pub struct ShowCreatePlan { + /// The table to show. 
+ pub table: TableRef, + /// The type to show + pub obj_type: ShowCreateObject, +} + +#[derive(Debug)] +pub struct ExistsTablePlan { + pub exists: bool, +} diff --git a/sql/src/planner.rs b/sql/src/planner.rs new file mode 100644 index 0000000000..5bc467c5c5 --- /dev/null +++ b/sql/src/planner.rs @@ -0,0 +1,1277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Planner converts a SQL AST into logical plans + +use std::{ + collections::{BTreeMap, HashMap}, + convert::TryFrom, + mem, + sync::Arc, +}; + +use arrow_deps::datafusion::{error::DataFusionError, sql::planner::SqlToRel}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema, TSID_COLUMN}, +}; +use log::debug; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sqlparser::ast::{ + ColumnDef, ColumnOption, Expr, ObjectName, Query, SetExpr, SqlOption, + Statement as SqlStatement, TableConstraint, Value, Values, +}; +use table_engine::table::TableRef; + +use crate::{ + ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, Statement, + }, + container::TableReference, + parser, + plan::{ + AlterTableOperation, AlterTablePlan, CreateTablePlan, DescribeTablePlan, DropTablePlan, + ExistsTablePlan, InsertPlan, Plan, QueryPlan, ShowCreatePlan, + }, + promql::{ColumnNames, Expr as PromExpr}, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +// We do not carry backtrace in sql error because it is mainly used in server +// handler and the error is usually caused by invalid/unsupported sql, which +// should be easy to find out the reason. 
+#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("DataFusion Failed to plan, err:{}", source))] + DataFusionPlan { source: DataFusionError }, + + // Statement is too large and complicate to carry in Error, so we + // only return error here, so the caller should attach sql to its + // error context + #[snafu(display("Unsupported SQL statement"))] + UnsupportedStatement, + + #[snafu(display("Create table name is empty"))] + CreateTableNameEmpty, + + #[snafu(display("Table must contain timestamp constraint"))] + RequireTimestamp, + + #[snafu(display( + "Table must contain only one timestamp key and it's data type must be TIMESTAMP" + ))] + InvalidTimetampKey, + + #[snafu(display("Invalid unsign type: {}.\nBacktrace:\n{}", kind, backtrace))] + InvalidUnsignType { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Primary key not found, column name:{}", name))] + PrimaryKeyNotFound { name: String }, + + #[snafu(display("Tag column not found, name:{}", name))] + TagColumnNotFound { name: String }, + + #[snafu(display("Timestamp column not found, name:{}", name))] + TimestampColumnNotFound { name: String }, + + #[snafu(display("{} is a reserved column name", name))] + ColumnNameReserved { name: String }, + + #[snafu(display("Invalid create table name, err:{}", source))] + InvalidCreateTableName { source: DataFusionError }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Unsupported SQL data type, err:{}", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Invalid column schema, column_name:{}, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + #[snafu(display("Invalid table name, err:{}", source))] + InvalidTableName { source: DataFusionError }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + 
#[snafu(display("Column is not null, table:{}, column:{}", table, column))] + InsertMissingColumn { table: String, column: String }, + + #[snafu(display("Column is reserved, table:{}, column:{}", table, column))] + InsertReservedColumn { table: String, column: String }, + + #[snafu(display("Unknown insert column, name:{}", name))] + UnknownInsertColumn { name: String }, + + #[snafu(display("Insert values not enough, len:{}, index:{}", len, index))] + InsertValuesNotEnough { len: usize, index: usize }, + + #[snafu(display("Invalid insert stmt, contains duplicate columns"))] + InsertDuplicateColumns, + + #[snafu(display("Invalid insert stmt, source should be a set"))] + InsertSourceBodyNotSet, + + #[snafu(display("Invalid insert stmt, source expr is not value"))] + InsertExprNotValue, + + #[snafu(display("Insert Failed to convert value, err:{}", source))] + InsertConvertValue { source: common_types::datum::Error }, + + #[snafu(display("Failed to build row, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display("MetaProvider Failed to find table, err:{}", source))] + MetaProviderFindTable { source: crate::provider::Error }, + + #[snafu(display("Failed to find meta during planning, err:{}", source))] + FindMeta { source: crate::provider::Error }, + + #[snafu(display("Invalid alter table operation, err:{}", source))] + InvalidAlterTableOperation { source: crate::plan::Error }, + + #[snafu(display("Unsupported sql option, value:{}", value))] + UnsupportedOption { value: String }, + + #[snafu(display("Failed to build plan from promql, error:{}", source))] + BuildPromPlanError { source: crate::promql::Error }, +} + +define_result!(Error); + +/// Planner produces logical plans from SQL AST +// TODO(yingwen): Rewrite Planner instead of using datafusion's planner +pub struct Planner<'a, P: MetaProvider> { + provider: &'a P, + request_id: RequestId, + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> Planner<'a, P> { + /// Create a new 
logical planner + pub fn new(provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + Self { + provider, + request_id, + read_parallelism, + } + } + + /// Create a logical plan from Statement + /// + /// Takes the ownership of statement because some statements like INSERT + /// statements contains lots of data + pub fn statement_to_plan(&self, statement: Statement) -> Result { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. + let planner = PlannerDelegate::new(adapter); + + match statement { + Statement::Standard(s) => planner.sql_statement_to_plan(*s), + Statement::Create(s) => planner.create_table_to_plan(s), + Statement::Drop(s) => planner.drop_table_to_plan(s), + Statement::Describe(s) => planner.describe_table_to_plan(s), + Statement::AlterModifySetting(s) => planner.alter_modify_setting_to_plan(s), + Statement::AlterAddColumn(s) => planner.alter_add_column_to_plan(s), + Statement::ShowCreate(s) => planner.show_create_to_plan(s), + Statement::Exists(s) => planner.exists_table_to_plan(s), + } + } + + pub fn promql_expr_to_plan(&self, expr: PromExpr) -> Result<(Plan, Arc)> { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. 
+ let planner = PlannerDelegate::new(adapter); + + expr.to_plan(planner.meta_provider, self.read_parallelism) + .context(BuildPromPlanError) + } +} + +/// A planner wraps the datafusion's logical planner, and delegate sql like +/// select/explain to datafusion's planner. +struct PlannerDelegate<'a, P: MetaProvider> { + meta_provider: ContextProviderAdapter<'a, P>, +} + +impl<'a, P: MetaProvider> PlannerDelegate<'a, P> { + fn new(meta_provider: ContextProviderAdapter<'a, P>) -> Self { + Self { meta_provider } + } + + fn sql_statement_to_plan(self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + // Query statement use datafusion planner + SqlStatement::Explain { .. } | SqlStatement::Query(_) => { + self.sql_statement_to_datafusion_plan(sql_stmt) + } + SqlStatement::Insert { .. } => self.insert_to_plan(sql_stmt), + _ => UnsupportedStatement.fail(), + } + } + + fn sql_statement_to_datafusion_plan(self, sql_stmt: SqlStatement) -> Result { + let df_planner = SqlToRel::new(&self.meta_provider); + + let df_plan = df_planner + .sql_statement_to_plan(&sql_stmt) + .context(DataFusionPlan)?; + + debug!("Sql statement to datafusion plan, df_plan:\n{:#?}", df_plan); + + // Get all tables needed in the plan + let tables = self.meta_provider.try_into_container().context(FindMeta)?; + + Ok(Plan::Query(QueryPlan { + df_plan, + tables: Arc::new(tables), + })) + } + + fn create_table_to_plan(&self, stmt: CreateTable) -> Result { + ensure!(!stmt.name.0.is_empty(), CreateTableNameEmpty); + + debug!("Create table to plan, stmt:{:?}", stmt); + + // TODO(yingwen): Maybe support create table on other schema? 
+ let table_ref = TableReference::try_from(&stmt.name).context(InvalidCreateTableName)?; + + // Now we only takes the table name and ignore the schema and catalog name + let table = table_ref.table().to_string(); + + let mut schema_builder = + schema::Builder::with_capacity(stmt.columns.len()).auto_increment_column_id(true); + let mut name_column_map = BTreeMap::new(); + + // Build all column schemas. + for col in &stmt.columns { + name_column_map.insert(col.name.value.as_str(), parse_column(col)?); + } + + // Tsid column is a reserved column. + ensure!( + !name_column_map.contains_key(TSID_COLUMN), + ColumnNameReserved { + name: TSID_COLUMN.to_string(), + } + ); + + // Find timestamp key and primary key contraint + let mut primary_key_constraint_idx = None; + let mut timestamp_name = None; + for (idx, constraint) in stmt.constraints.iter().enumerate() { + if let TableConstraint::Unique { + columns, + is_primary, + .. + } = constraint + { + if *is_primary { + primary_key_constraint_idx = Some(idx); + } else if parser::is_timestamp_key_constraint(constraint) { + // Only one timestamp key constraint + ensure!(timestamp_name.is_none(), InvalidTimetampKey); + // Only one column in constraint + ensure!(columns.len() == 1, InvalidTimetampKey); + + let name = &columns[0].value; + let timestamp_column = name_column_map + .get(name as &str) + .context(TimestampColumnNotFound { name })?; + // Ensure type is timestamp + ensure!( + timestamp_column.data_type == DatumKind::Timestamp, + InvalidTimetampKey + ); + + timestamp_name = Some(name.clone()); + } + } + } + + // Timestamp column must be provided. + let timestamp_name = timestamp_name.context(RequireTimestamp)?; + + // Build primary key, the builder will check timestamp column is in primary key. + if let Some(idx) = primary_key_constraint_idx { + // If primary key is already provided, use that primary key. + if let TableConstraint::Unique { columns, .. 
} = &stmt.constraints[idx] { + for col in columns { + let key_column = name_column_map.remove(&*col.value).with_context(|| { + PrimaryKeyNotFound { + name: col.value.clone(), + } + })?; + // The schema builder will checks there is only one timestamp column in primary + // key. + schema_builder = schema_builder + .add_key_column(key_column) + .context(BuildTableSchema)?; + } + } + } else { + // If primary key is not set, Use (timestamp, tsid) as primary key. + let timestamp_column = name_column_map.remove(timestamp_name.as_str()).context( + TimestampColumnNotFound { + name: ×tamp_name, + }, + )?; + let column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column) + .context(BuildTableSchema)? + .add_key_column(column_schema) + .context(BuildTableSchema)?; + } + + // The key columns have been consumed. 
+ for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .context(BuildTableSchema)?; + } + + let table_schema = schema_builder.build().context(BuildTableSchema)?; + + let options = parse_options(stmt.options)?; + + let plan = CreateTablePlan { + engine: stmt.engine, + if_not_exists: stmt.if_not_exists, + table, + table_schema, + options, + }; + + debug!("Create table to plan, plan:{:?}", plan); + + Ok(Plan::Create(plan)) + } + + fn drop_table_to_plan(&self, stmt: DropTable) -> Result { + let table = if stmt.if_exists { + stmt.name.to_string() + } else { + self.find_table(stmt.name)?.name().to_string() + }; + + Ok(Plan::Drop(DropTablePlan { + engine: stmt.engine, + if_exists: stmt.if_exists, + table, + })) + } + + fn describe_table_to_plan(&self, stmt: DescribeTable) -> Result { + let table = self.find_table(stmt.table_name)?; + + Ok(Plan::Describe(DescribeTablePlan { table })) + } + + // REQUIRE: SqlStatement must be INSERT stmt + fn insert_to_plan(&self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + SqlStatement::Insert { + table_name, + columns, + source, + .. + } => { + let table = self.find_table(table_name)?; + + let schema = table.schema(); + // Column name and its index in insert stmt: {column name} => index + let column_names_idx: HashMap<_, _> = columns + .iter() + .enumerate() + .map(|(idx, ident)| (&ident.value, idx)) + .collect(); + ensure!( + column_names_idx.len() == columns.len(), + InsertDuplicateColumns + ); + + validate_insert_stmt(table.name(), &schema, &column_names_idx)?; + + // Index in insert values stmt of each column in table schema + let mut column_index_in_insert = Vec::with_capacity(schema.num_columns()); + + // Check all not null columns are provided in stmt, also init + // `column_index_in_insert` + for (idx, column) in schema.columns().iter().enumerate() { + if let Some(tsid_idx) = schema.index_of_tsid() { + if idx == tsid_idx { + // This is a tsid column. 
+ column_index_in_insert.push(InsertMode::Auto); + continue; + } + } + match column_names_idx.get(&column.name) { + Some(idx_in_insert) => { + // This column in schema is also in insert stmt + column_index_in_insert.push(InsertMode::Direct(*idx_in_insert)); + } + None => { + // This column in schema is not in insert stmt + if column.is_nullable { + column_index_in_insert.push(InsertMode::Null); + } else { + // Column is not null and input does not contains that column + return InsertMissingColumn { + table: table.name(), + column: &column.name, + } + .fail(); + } + } + } + } + + let rows = build_row_group(schema, source, column_index_in_insert)?; + + Ok(Plan::Insert(InsertPlan { table, rows })) + } + // We already known this stmt is a INSERT stmt + _ => unreachable!(), + } + } + + fn alter_modify_setting_to_plan(&self, stmt: AlterModifySetting) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::ModifySetting(parse_options(stmt.options)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn alter_add_column_to_plan(&self, stmt: AlterAddColumn) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::AddColumn(parse_columns(stmt.columns)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn exists_table_to_plan(&self, stmt: ExistsTable) -> Result { + let table = self.find_table(stmt.table_name); + match table { + Ok(_) => Ok(Plan::Exists(ExistsTablePlan { exists: true })), + Err(_) => Ok(Plan::Exists(ExistsTablePlan { exists: false })), + } + } + + fn show_create_to_plan(&self, show_create: ShowCreate) -> Result { + let table = self.find_table(show_create.obj_name)?; + let plan = ShowCreatePlan { + table, + obj_type: show_create.obj_type, + }; + Ok(Plan::ShowCreate(plan)) + } + + fn find_table(&self, table_name: ObjectName) -> Result { + let table_ref = TableReference::try_from(&table_name).context(InvalidTableName)?; + 
+ self.meta_provider + .table(table_ref) + .context(MetaProviderFindTable)? + .with_context(|| TableNotFound { + name: table_name.to_string(), + }) + } +} + +#[derive(Debug)] +enum InsertMode { + // Insert the value in expr with given index directly. + Direct(usize), + // No value provided, insert a null. + Null, + // Auto generated column, just temporary fill by default value, the real value will + // be filled by interpreter. + Auto, +} + +/// Build RowGroup +fn build_row_group( + schema: Schema, + source: Box, + column_index_in_insert: Vec, +) -> Result { + // Build row group by schema + match source.body { + SetExpr::Values(Values(values)) => { + let mut row_group_builder = + RowGroupBuilder::with_capacity(schema.clone(), values.len()); + for mut exprs in values { + // Try to build row + let mut row_builder = row_group_builder.row_builder(); + + // For each column in schema, append datum into row builder + for (index_opt, column_schema) in + column_index_in_insert.iter().zip(schema.columns()) + { + match index_opt { + InsertMode::Direct(index) => { + let exprs_len = exprs.len(); + let expr = exprs.get_mut(*index).context(InsertValuesNotEnough { + len: exprs_len, + index: *index, + })?; + + match expr { + Expr::Value(value) => { + let datum = Datum::try_from_sql_value( + &column_schema.data_type, + mem::replace(value, Value::Null), + ) + .context(InsertConvertValue)?; + row_builder = + row_builder.append_datum(datum).context(BuildRow)?; + } + _ => { + InsertExprNotValue.fail()?; + } + } + } + InsertMode::Null => { + // This is a null column + row_builder = + row_builder.append_datum(Datum::Null).context(BuildRow)?; + } + InsertMode::Auto => { + // This is an auto generated column, fill by default value. 
+ let kind = &column_schema.data_type; + row_builder = row_builder + .append_datum(Datum::empty(kind)) + .context(BuildRow)?; + } + } + } + + // Finish this row and append into row group + row_builder.finish().context(BuildRow)?; + } + + // Build the whole row group + Ok(row_group_builder.build()) + } + _ => InsertSourceBodyNotSet.fail(), + } +} + +#[inline] +fn is_tsid_column(name: &str) -> bool { + name == TSID_COLUMN +} + +fn validate_insert_stmt( + table_name: &str, + schema: &Schema, + column_name_idx: &HashMap<&String, usize>, +) -> Result<()> { + for name in column_name_idx.keys() { + if is_tsid_column(name.as_str()) { + return Err(Error::InsertReservedColumn { + table: table_name.to_string(), + column: name.to_string(), + }); + } + schema.column_with_name(name).context(UnknownInsertColumn { + name: name.to_string(), + })?; + } + + Ok(()) +} + +fn parse_options(options: Vec) -> Result> { + let mut parsed_options = HashMap::with_capacity(options.len()); + + for option in options { + let key = option.name.value; + if let Some(value) = parse_for_option(option.value)? { + parsed_options.insert(key, value); + }; + } + + Ok(parsed_options) +} + +/// Parse value for sql option. +pub fn parse_for_option(value: Value) -> Result> { + let value_opt = match value { + Value::Number(n, _long) => Some(n), + Value::SingleQuotedString(v) | Value::DoubleQuotedString(v) => Some(v), + Value::NationalStringLiteral(v) | Value::HexStringLiteral(v) => { + return UnsupportedOption { value: v }.fail(); + } + Value::Boolean(v) => Some(v.to_string()), + Value::Interval { value, .. } => { + return UnsupportedOption { value }.fail(); + } + // Ignore this option if value is null. + Value::Null => None, + }; + + Ok(value_opt) +} + +fn parse_columns(cols: Vec) -> Result> { + let mut parsed_columns = Vec::with_capacity(cols.len()); + + // Build all column schemas. 
+ for col in &cols { + parsed_columns.push(parse_column(col)?); + } + + Ok(parsed_columns) +} + +fn parse_column(col: &ColumnDef) -> Result { + let mut data_type = DatumKind::try_from(&col.data_type).context(UnsupportedDataType)?; + + // Process column options + let mut is_nullable = true; // A column is nullable by default. + let mut is_tag = false; + let mut is_unsign = false; + let mut comment = String::new(); + for option_def in &col.options { + if matches!(option_def.option, ColumnOption::NotNull) { + is_nullable = false; + } else if parser::is_tag_column(&option_def.option) { + is_tag = true; + } else if parser::is_unsign_column(&option_def.option) { + is_unsign = true; + } else if let Some(v) = parser::get_column_comment(&option_def.option) { + comment = v; + } + } + + if is_unsign { + data_type = data_type + .unsign_kind() + .context(InvalidUnsignType { kind: data_type })?; + } + + let builder = column_schema::Builder::new(col.name.value.clone(), data_type) + .is_nullable(is_nullable) + .is_tag(is_tag) + .comment(comment); + + builder.build().context(InvalidColumnSchema { + column_name: &col.name.value, + }) +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::Value; + + use super::*; + use crate::{ + parser::Parser, + planner::{parse_for_option, Planner}, + tests::MockMetaProvider, + }; + + fn quick_test(sql: &str, expected: &str) -> Result<()> { + let mock = MockMetaProvider::default(); + let planner = build_planner(&mock); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + let plan = planner.statement_to_plan(statements.remove(0))?; + assert_eq!(format!("{:#?}", plan), expected); + Ok(()) + } + + fn build_planner(provider: &MockMetaProvider) -> Planner { + Planner::new(provider, RequestId::next_id(), 1) + } + + #[test] + pub fn test_parse_for_option() { + let test_string = "aa".to_string(); + // input is_err expected + let test_cases = vec![ + ( + Value::Number("1000".to_string(), false), + false, + 
Some("1000".to_string()), + ), + ( + Value::SingleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::DoubleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::NationalStringLiteral(test_string.clone()), + true, + None, + ), + (Value::HexStringLiteral(test_string.clone()), true, None), + (Value::Boolean(true), false, Some("true".to_string())), + ( + Value::Interval { + value: test_string, + leading_field: None, + leading_precision: None, + last_field: None, + fractional_seconds_precision: None, + }, + true, + None, + ), + (Value::Null, false, None), + ]; + + for (input, is_err, expected) in test_cases { + let ret = parse_for_option(input); + assert_eq!(ret.is_err(), is_err); + if !is_err { + assert_eq!(ret.unwrap(), expected); + } + } + } + + #[test] + fn test_create_statement_to_plan() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + quick_test( + sql, + r#"Create( + CreateTablePlan { + engine: "Analytic", + if_not_exists: true, + table: "t", + table_schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "c1", + data_type: String, + is_nullable: false, + is_tag: true, + comment: "", + }, + ColumnSchema { + id: 2, + name: "ts", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "c3", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + options: { + "arena_block_size": "1KB", + "ttl": "70d", + "update_mode": "overwrite", + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_query_statement_to_plan() { + let sql = "select * from test_tablex;"; + 
assert!(quick_test(sql, "").is_err()); + + let sql = "select * from test_table;"; + quick_test(sql, "Query( + QueryPlan { + df_plan: Projection: #test_table.key1, #test_table.key2, #test_table.field1, #test_table.field2 + TableScan: test_table projection=None, + }, +)").unwrap(); + } + + #[test] + fn test_insert_statement_to_plan() { + let sql = "INSERT INTO test_tablex(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + assert!(quick_test(sql, "").is_err()); + + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + quick_test( + sql, + r#"Insert( + InsertPlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + rows: RowGroup { + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 
4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + rows: [ + Row { + cols: [ + Varbinary( + b"tagk", + ), + Timestamp( + Timestamp( + 1638428434000, + ), + ), + Double( + 100.0, + ), + String( + StringBytes( + b"hello3", + ), + ), + ], + }, + ], + min_timestamp: Timestamp( + 1638428434000, + ), + max_timestamp: Timestamp( + 1638428434000, + ), + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_drop_statement_to_plan() { + let sql = "drop table test_table;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: false, + table: "test_table", + }, +)"#, + ) + .unwrap(); + + let sql = "drop table test_tablex;"; + assert!(quick_test(sql, "",).is_err()); + + let sql = "drop table if exists test_tablex;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: true, + table: "test_tablex", + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_desc_statement_to_plan() { + let sql = "desc test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "desc test_table;"; + quick_test( + sql, + r#"Describe( + DescribeTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + }, +)"#, + ) + .unwrap(); + } + + 
#[test] + fn test_alter_column_statement_to_plan() { + let sql = "ALTER TABLE test_tablex ADD column add_col string;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table ADD column add_col string;"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: AddColumn( + [ + ColumnSchema { + id: 0, + name: "add_col", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_alter_option_statement_to_plan() { + let sql = "ALTER TABLE test_tablex modify SETTING ttl='9d';"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table modify SETTING ttl='9d';"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, 
+ comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: ModifySetting( + { + "ttl": "9d", + }, + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_show_create_statement_to_plan() { + let sql = "show create table test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "show create table test_table;"; + quick_test( + sql, + r#"ShowCreate( + ShowCreatePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + obj_type: Table, + }, +)"#, + ) + .unwrap(); + } +} diff --git a/sql/src/promql.rs b/sql/src/promql.rs new file mode 100644 index 0000000000..2113681eea --- /dev/null +++ b/sql/src/promql.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +mod convert; +mod datafusion_util; +mod pushdown; +mod udf; + +pub use convert::{Error, Expr}; +pub use datafusion_util::{ColumnNames, PromAlignNode}; +pub use pushdown::{AlignParameter, Func}; diff --git a/sql/src/promql/convert.rs b/sql/src/promql/convert.rs new file mode 100644 index 0000000000..005f2ebeb1 --- /dev/null +++ b/sql/src/promql/convert.rs @@ -0,0 +1,673 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + convert::{TryFrom, TryInto}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + logical_plan::{ + avg, col, combine_filters, count, lit, max, min, plan::Extension, sum, + Expr as DataFusionExpr, LogicalPlan, LogicalPlanBuilder, + }, + sql::planner::ContextProvider, +}; +use ceresdbproto::prometheus::{ + Expr as ExprPb, Filter as FilterPb, FilterType as FilterPbType, Operand as OperandPb, + Selector as PbSelector, SubExpr as PbSubExpr, SubExpr_OperatorType, +}; +use common_types::{ + schema::{Schema, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + plan::{Plan, QueryPlan}, + promql::{ + datafusion_util::{default_sort_exprs, timerange_to_expr}, + pushdown::{AlignParameter, Func}, + udf::{create_unique_id, regex_match_expr}, + ColumnNames, PromAlignNode, + }, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +const INIT_LEVEL: usize = 1; +const DEFAULT_LOOKBACK: i64 = 300_000; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid expr, expected: {}, actual:{:?}", expected, actual))] + UnexpectedExpr { expected: String, actual: String }, + + #[snafu(display("Expr pushdown not implemented. 
expr:{:?}", expr))] + NotImplemented { expr: String }, + + #[snafu(display("MetaProvider {}, err:{}", msg, source))] + MetaProviderError { + msg: String, + source: crate::provider::Error, + }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build plan, source:{}", source,))] + BuildPlanError { source: DataFusionError }, + + #[snafu(display("Invalid expr, msg:{}\nBacktrace:\n{}", msg, backtrace))] + InvalidExpr { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to pushdown, source:{}", source))] + PushdownError { + source: crate::promql::pushdown::Error, + }, +} + +define_result!(Error); + +impl From for Error { + fn from(df_err: DataFusionError) -> Self { + Error::BuildPlanError { source: df_err } + } +} + +#[derive(Debug, Clone)] +pub enum Expr { + SimpleExpr(Operand), + RecursiveExpr(SubExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_operand: OperandPb) -> Result { + let op = if pb_operand.has_selector() { + let PbSelector { + measurement: table, + start, + end, + align_start, + align_end, + filters, + range, + field, + offset, + step, + .. 
+ } = pb_operand.take_selector(); + let filters = Into::>::into(filters) + .into_iter() + .map(Filter::from) + .collect::>(); + Operand::Selector(Selector { + table, + filters, + field, + query_range: TimeRange::new_unchecked( + Timestamp::new(start), + Timestamp::new(end + 1), + ), /* [start, end] */ + align_range: TimeRange::new_unchecked( + Timestamp::new(align_start), + Timestamp::new(align_end + 1), + ), /* [align_start, align_end] */ + step, + range, + offset, + }) + } else if pb_operand.has_float_val() { + Operand::Float(pb_operand.get_float_val()) + } else if pb_operand.has_string_val() { + Operand::String(pb_operand.take_string_val()) + } else { + return InvalidExpr { + msg: format!("unknown operand:{:?}", pb_operand), + } + .fail(); + }; + + Ok(Expr::SimpleExpr(op)) + } +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut expr: ExprPb) -> Result { + if expr.has_operand() { + let operand = expr.take_operand(); + return operand.try_into(); + } else if expr.has_sub_expr() { + let sub_expr = expr.take_sub_expr(); + return sub_expr.try_into(); + } + + InvalidExpr { + msg: format!("unknown expr:{:?}", expr), + } + .fail() + } +} + +impl Expr { + pub fn get_selector(&self) -> &Selector { + match self { + Expr::SimpleExpr(se) => match se { + Operand::Selector(sel) => sel, + _ => unreachable!(), + }, + Expr::RecursiveExpr(re) => re.get_selector(), + } + } + + pub fn is_selector(&self) -> bool { + matches!(self, Expr::SimpleExpr(e) if matches!(e, Operand::Selector(_))) + } + + /// For now, only filters and timestamp are pushdown, we translate it + /// into plan like: + /// Aggregate: (when needed) + /// PromAlign: + /// Sort: (tsid, timestamp) asc + /// Project: + /// Filter: + /// TableScan + pub fn to_plan( + self, + meta_provider: ContextProviderAdapter<'_, P>, + read_parallelism: usize, + ) -> Result<(Plan, Arc)> { + let (logic_plan, column_name, _) = + self.build_plan_iter(&meta_provider, INIT_LEVEL, read_parallelism)?; + let tables = 
Arc::new( + meta_provider + .try_into_container() + .context(MetaProviderError { + msg: "Failed to find meta", + })?, + ); + Ok(( + Plan::Query(QueryPlan { + df_plan: logic_plan, + tables, + }), + column_name, + )) + } + + fn build_plan_iter( + self, + meta_provider: &ContextProviderAdapter<'_, P>, + level: usize, + read_parallelism: usize, + ) -> Result<(LogicalPlan, Arc, String)> { + match self { + Expr::SimpleExpr(simple_expr) => match simple_expr { + Operand::Selector(selector) => { + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + if level == INIT_LEVEL { + // when only selector is pushdown, align is done in Prometheus itself + // since maybe there are subquery inside one query which require complex + // align logic. + return Ok((sub_plan, column_name, table_name)); + } + // insert PromAlignNode into plan with Func::Instant + let Selector { + align_range, + step, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: DEFAULT_LOOKBACK.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + func: Func::Instant, + table_name: table_name.clone(), + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + Ok((align_plan, column_name, table_name)) + } + Operand::Float(_) | Operand::String(_) => InvalidExpr { + msg: "scalar value not allowed in plan node", + } + .fail(), + }, + // New plan like: + // PromAlign: + // SubPlan + Expr::RecursiveExpr(recursive_expr) => match recursive_expr { + SubExpr::Func(FuncExpr { op, operands }) => { + assert!(!operands.is_empty()); + let func = Func::try_from(op.as_str()).context(PushdownError {})?; + let first_arg = &operands[0]; + if first_arg.is_selector() { + let selector = first_arg.get_selector(); + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + let 
Selector { + align_range, + step, + range, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range: *align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: range.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + table_name: table_name.clone(), + func, + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + return Ok((align_plan, column_name, table_name)); + } + InvalidExpr { + msg: "first arg of func must be selector", + } + .fail() + } + + // New plan like: + // Sort: + // Projection + // Aggregate + // SubPlan + SubExpr::Aggr(AggrExpr { + op, + operands, + group_by, + without, + }) => { + assert!(!operands.is_empty()); + let next_level = level + 1; + // aggregators don't have args, only need to deal with sub_node now. + let sub_node = operands.into_iter().next().unwrap(); + let (sub_plan, column_name, table_name) = + sub_node.build_plan_iter(meta_provider, next_level, read_parallelism)?; + // filter out nonexistent tags + let group_by = group_by + .into_iter() + .filter(|by| column_name.tag_keys.contains(by)) + .collect::>(); + let groupby_columns = if without { + column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + if group_by.contains(tag_key) { + None + } else { + Some(tag_key.as_str()) + } + }) + .collect::>() + } else { + group_by.iter().map(|s| (s.as_str())).collect::>() + }; + let aggr_expr = + Self::aggr_op_expr(&op, &column_name.field, column_name.field.clone())?; + let tag_exprs = groupby_columns.iter().map(|v| col(v)).collect::>(); + let udf_args = tag_exprs.clone(); + let mut groupby_expr = vec![col(&column_name.timestamp)]; + groupby_expr.extend(udf_args); + let unique_id_expr = + // TSID is lost after aggregate, but PromAlignNode need a unique id, so + // mock UUID as tsid based on groupby keys + DataFusionExpr::Alias( + Box::new(DataFusionExpr::ScalarUDF { + fun: 
Arc::new(create_unique_id(tag_exprs.len())), + args: tag_exprs.clone(), + }), + TSID_COLUMN.to_string(), + ); + let mut projection = tag_exprs.clone(); + projection.extend(vec![ + col(&column_name.timestamp), + col(&column_name.field), + unique_id_expr.clone(), + ]); + let sort_exprs = if tag_exprs.is_empty() { + vec![col(&column_name.timestamp).sort(true, true)] + } else { + vec![ + unique_id_expr.sort(true, true), + col(&column_name.timestamp).sort(true, true), + ] + }; + let builder = LogicalPlanBuilder::from(sub_plan); + let plan = builder + .aggregate(groupby_expr, vec![aggr_expr])? + .project(projection)? + .sort(sort_exprs)? + .build()?; + + Ok((plan, column_name, table_name)) + } + SubExpr::Binary(_) => InvalidExpr { + msg: "Binary Expr not supported", + } + .fail(), + }, + } + } + + fn aggr_op_expr(aggr_op: &str, field: &str, alias: String) -> Result { + let expr = match aggr_op { + "sum" => sum(col(field)), + "max" => max(col(field)), + "min" => min(col(field)), + "count" => count(col(field)), + "avg" => avg(col(field)), + _ => { + return InvalidExpr { + msg: format!("aggr {} not supported now", aggr_op), + } + .fail() + } + }; + + Ok(DataFusionExpr::Alias(Box::new(expr), alias)) + } +} + +#[derive(Debug, Clone)] +pub enum Operand { + String(String), + Float(f64), + Selector(Selector), +} + +#[derive(Debug, Clone)] +pub enum SubExpr { + Aggr(AggrExpr), + Func(FuncExpr), + Binary(BinaryExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_sub_expr: PbSubExpr) -> Result { + let op_type = pb_sub_expr.get_op_type(); + + let operator = pb_sub_expr.take_operator(); + let operands = pb_sub_expr + .take_operands() + .into_iter() + .map(Expr::try_from) + .collect::>>()?; + let sub_expr = match op_type { + SubExpr_OperatorType::AGGR => SubExpr::Aggr(AggrExpr { + op: operator, + operands, + group_by: pb_sub_expr.take_group().into_vec(), + without: pb_sub_expr.get_without(), + }), + SubExpr_OperatorType::FUNC => SubExpr::Func(FuncExpr { + 
op: operator, + operands, + }), + SubExpr_OperatorType::BINARY => { + return NotImplemented { + expr: format!("{:?}", pb_sub_expr), + } + .fail() + } + }; + + Ok(Expr::RecursiveExpr(sub_expr)) + } +} + +impl SubExpr { + pub fn get_selector(&self) -> &Selector { + match self { + SubExpr::Aggr(AggrExpr { operands, .. }) => operands[0].get_selector(), + SubExpr::Func(FuncExpr { operands, .. }) => operands[0].get_selector(), + SubExpr::Binary(BinaryExpr { operands, .. }) => operands[0].get_selector(), + } + } + + pub fn is_range_fn(&self) -> bool { + match self { + Self::Func(FuncExpr { operands, .. }) => match &operands[0] { + Expr::SimpleExpr(Operand::Selector(sel)) => sel.range > 0, + _ => false, + }, + _ => false, + } + } +} + +#[derive(Debug, Clone)] +pub struct AggrExpr { + op: String, + operands: Vec, + group_by: Vec, + without: bool, +} + +#[derive(Debug, Clone)] +pub struct FuncExpr { + op: String, + operands: Vec, +} + +#[derive(Debug, Clone)] +pub struct BinaryExpr { + _op: String, + operands: Vec, + _return_bool: bool, +} + +#[derive(Debug, Clone)] +pub enum FilterType { + LiteralOr, + NotLiteralOr, + Regexp, + NotRegexpMatch, +} + +impl From for FilterType { + fn from(pb_type: FilterPbType) -> Self { + match pb_type { + FilterPbType::LITERAL_OR => FilterType::LiteralOr, + FilterPbType::NOT_LITERAL_OR => FilterType::NotLiteralOr, + FilterPbType::REGEXP => FilterType::Regexp, + FilterPbType::NOT_REGEXP_MATCH => FilterType::NotRegexpMatch, + } + } +} + +#[derive(Debug, Clone)] +pub struct FilterOperator { + typ: FilterType, + params: Vec, +} + +#[derive(Debug, Clone)] +pub struct Filter { + tag_key: String, + operators: Vec, +} + +impl From for DataFusionExpr { + fn from(mut f: Filter) -> DataFusionExpr { + let tag_key = col(&f.tag_key); + // TODO(chenxiang): only compare first op now + let mut first_op = f.operators.remove(0); + match first_op.typ { + // regepx filter only have one param + FilterType::Regexp => regex_match_expr(tag_key, 
first_op.params.remove(0), true), + FilterType::NotRegexpMatch => { + regex_match_expr(tag_key, first_op.params.remove(0), false) + } + FilterType::LiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + false, + ), + FilterType::NotLiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + true, + ), + } + } +} + +impl From for Filter { + fn from(mut pb_filter: FilterPb) -> Self { + Self { + tag_key: pb_filter.take_tag_key(), + operators: Into::>::into(pb_filter.take_operators()) + .into_iter() + .map(|mut f| FilterOperator { + typ: f.get_filter_type().into(), + params: f.take_params().into(), + }) + .collect::>(), + } + } +} + +#[derive(Debug, Clone)] +pub struct Selector { + // query params + pub query_range: TimeRange, + pub table: String, + pub filters: Vec, + pub field: String, + + // align params + pub align_range: TimeRange, + pub step: i64, + pub range: i64, + pub offset: i64, +} + +impl Selector { + fn into_scan_plan( + self, + meta_provider: &ContextProviderAdapter<'_, P>, + ) -> Result<(LogicalPlan, Arc, String)> { + let Selector { + query_range, + field, + filters, + table, + .. + } = self; + let table_ref = meta_provider + .table(table.as_str().into()) + .context(MetaProviderError { + msg: "failed to find table".to_string(), + })? 
+ .context(TableNotFound { name: &table })?; + + let table_provider = meta_provider + .get_table_provider(table_ref.name().into()) + .context(TableNotFound { name: &table })?; + let schema = Schema::try_from(table_provider.schema()).context(BuildTableSchema)?; + let timestamp_column_name = schema.timestamp_name().to_string(); + let (projection, tag_keys) = Self::build_projection_tag_keys(&schema, &field)?; + let mut filter_exprs = filters + .iter() + .filter_map(|f| { + // drop non_exist filter + if tag_keys.contains(&f.tag_key) { + Some(DataFusionExpr::from(f.clone())) + } else { + None + } + }) + .collect::>(); + filter_exprs.push(timerange_to_expr(query_range, ×tamp_column_name)); + + let builder = LogicalPlanBuilder::scan(table.clone(), table_provider, None)? + .filter(combine_filters(&filter_exprs).expect("at least one filter(timestamp)"))? + .project(projection)? + .sort(default_sort_exprs(×tamp_column_name))?; + let column_name = Arc::new(ColumnNames { + timestamp: timestamp_column_name, + tag_keys, + field, + }); + let scan_plan = builder.build().context(BuildPlanError)?; + Ok((scan_plan, column_name, table)) + } + + fn build_projection_tag_keys( + schema: &Schema, + field: &str, + ) -> Result<(Vec, Vec)> { + if let Some(f) = schema.column_with_name(field) { + ensure!( + f.data_type.is_f64_castable(), + InvalidExpr { + msg: "field type must be f64-compatibile type", + } + ); + } else { + return InvalidExpr { + msg: format!("field:{} not found", field), + } + .fail(); + }; + let mut tag_keys = Vec::new(); + let mut projection = schema + .columns() + .iter() + .filter_map(|column| { + if column.is_tag { + tag_keys.push(column.name.clone()); + Some(col(&column.name)) + } else { + None + } + }) + .collect::>(); + + let timestamp_expr = col(&schema.column(schema.timestamp_index()).name); + let tsid_expr = schema + .tsid_column() + .map(|c| col(&c.name)) + .context(InvalidExpr { + msg: format!("{} not found", TSID_COLUMN), + })?; + let field_expr = col(field); + 
projection.extend(vec![timestamp_expr, tsid_expr, field_expr]); + + Ok((projection, tag_keys)) + } +} diff --git a/sql/src/promql/datafusion_util.rs b/sql/src/promql/datafusion_util.rs new file mode 100644 index 0000000000..4e5003e963 --- /dev/null +++ b/sql/src/promql/datafusion_util.rs @@ -0,0 +1,105 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{ + col, lit, DFSchemaRef, Expr as DataFusionExpr, Expr, LogicalPlan, UserDefinedLogicalNode, +}; +use common_types::{schema::TSID_COLUMN, time::TimeRange}; + +use crate::promql::pushdown::{AlignParameter, Func}; + +/// ColumnNames represents meaning of columns in one table. +#[derive(Debug)] +pub struct ColumnNames { + pub timestamp: String, + pub tag_keys: Vec, + pub field: String, +} + +/// Translate to `column_name BETWEEN start AND end` expr +pub fn timerange_to_expr(query_range: TimeRange, column_name: &str) -> DataFusionExpr { + DataFusionExpr::Between { + expr: Box::new(col(column_name)), + negated: false, + low: Box::new(lit(query_range.inclusive_start().as_i64())), + high: Box::new(lit(query_range.exclusive_end().as_i64() + 1)), + } +} + +pub fn default_sort_exprs(timestamp_column: &str) -> Vec { + vec![ + col(TSID_COLUMN).sort(true, true), + col(timestamp_column).sort(true, true), + ] +} + +pub struct PromAlignNode { + pub input: LogicalPlan, + pub column_name: Arc, + pub table_name: String, + pub func: Func, + pub align_param: AlignParameter, + pub read_parallelism: usize, +} + +impl fmt::Debug for PromAlignNode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNode for PromAlignNode { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + let qualified_name = |n| 
col(&format!("{}.{}", self.table_name, n)); + + let mut exprs = self + .column_name + .tag_keys + .iter() + .map(qualified_name) + .collect::>(); + + exprs.extend(vec![ + qualified_name(&self.column_name.timestamp), + qualified_name(&self.column_name.field), + ]); + + exprs + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlign: align_param={:?}, column_name={:?}, read_parallelism={}", + self.align_param, self.column_name, self.read_parallelism + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + inputs: &[LogicalPlan], + ) -> std::sync::Arc { + Arc::new(PromAlignNode { + input: inputs[0].clone(), + func: self.func, + table_name: self.table_name.clone(), + column_name: self.column_name.clone(), + align_param: self.align_param, + read_parallelism: self.read_parallelism, + }) + } +} diff --git a/sql/src/promql/pushdown.rs b/sql/src/promql/pushdown.rs new file mode 100644 index 0000000000..f9c0a279d9 --- /dev/null +++ b/sql/src/promql/pushdown.rs @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+
+use std::convert::TryFrom;
+
+use common_types::time::{TimeRange, Timestamp};
+use snafu::Snafu;
+
+#[derive(Debug, Snafu)]
+pub enum Error {
+    #[snafu(display("Func {} is not supported yet", func))]
+    NotSupportedFunc { func: String },
+}
+
+define_result!(Error);
+
+#[derive(Debug, Clone, Copy)]
+pub enum Func {
+    Instant, // used to simulate instant query
+    Rate,
+    Irate,
+    Delta,
+    Idelta,
+    Increase,
+}
+
+impl TryFrom<&str> for Func {
+    type Error = Error;
+
+    fn try_from(op: &str) -> Result<Self> {
+        let t = match op {
+            "rate" => Func::Rate,
+            "delta" => Func::Delta,
+            "irate" => Func::Irate,
+            "idelta" => Func::Idelta,
+            "increase" => Func::Increase,
+            func => return NotSupportedFunc { func }.fail(),
+        };
+
+        Ok(t)
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct AlignParameter {
+    pub align_range: TimeRange,
+    pub step: Timestamp,
+    pub offset: Timestamp,
+    /// 0 for no look back
+    pub lookback_delta: Timestamp,
+}
diff --git a/sql/src/promql/udf.rs b/sql/src/promql/udf.rs
new file mode 100644
index 0000000000..8928f6f790
--- /dev/null
+++ b/sql/src/promql/udf.rs
@@ -0,0 +1,300 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+// Copy from IOx
+// https://github.com/influxdata/influxdb_iox/blob/d0f588d3b800894fe0ebd06b6f9a184ca6a603d7/predicate/src/regex.rs
+
+use std::sync::Arc;
+
+use arrow_deps::{
+    arrow::{
+        array::{ArrayRef, BooleanArray, StringArray, UInt64Array},
+        datatypes::DataType,
+    },
+    datafusion::{
+        error::{DataFusionError, Result as DataFusionResult},
+        logical_plan::{create_udf, Expr},
+        physical_plan::{
+            functions::{make_scalar_function, Volatility},
+            udf::ScalarUDF,
+        },
+    },
+};
+use common_types::hash::hash64;
+use common_util::codec::{compact::MemCompactEncoder, Encoder};
+
+/// The name of the regex_match UDF given to DataFusion.
+pub const REGEX_MATCH_UDF_NAME: &str = "RegexMatch";
+pub const REGEX_NOT_MATCH_UDF_NAME: &str = "RegexNotMatch";
+
+/// Given a column containing string values and a single regex pattern,
+/// `regex_match_expr` determines which values satisfy the pattern and which do
+/// not.
+///
+/// If `matches` is true then this expression will filter values that do not
+/// satisfy the regex (equivalent to `col ~= /pattern/`). If `matches` is
+/// `false` then the expression will filter values that *do* match the regex,
+/// which is equivalent to `col !~ /pattern/`.
+///
+/// This UDF is designed to support the regex operator that can be pushed down
+/// via the InfluxRPC API.
+pub fn regex_match_expr(input: Expr, pattern: String, matches: bool) -> Expr {
+    // N.B., this function does not utilise the Arrow regexp compute kernel because
+    // in order to act as a filter it needs to return a boolean array of comparison
+    // results, not an array of strings as the regex compute kernel does.
+    let func = move |args: &[ArrayRef]| {
+        assert_eq!(args.len(), 1); // only works over a single column at a time.
+
+        let input_arr = &args[0].as_any().downcast_ref::<StringArray>().unwrap();
+
+        let pattern = regex::Regex::new(&pattern).map_err(|e| {
+            DataFusionError::Internal(format!("error compiling regex pattern: {}", e))
+        })?;
+
+        let results = input_arr
+            .iter()
+            .map(|row| {
+                // in arrow, any value can be null.
+                // Here we decide to make our UDF to return null when either base or exponent is
+                // null.
+                row.map(|v| pattern.is_match(v) == matches)
+            })
+            .collect::<BooleanArray>();
+
+        Ok(Arc::new(results) as ArrayRef)
+    };
+
+    // make_scalar_function is a helper to support accepting scalar values as
+    // well as arrays.
+ let func = make_scalar_function(func); + + let udf_name = if matches { + REGEX_MATCH_UDF_NAME + } else { + REGEX_NOT_MATCH_UDF_NAME + }; + + let udf = create_udf( + udf_name, + vec![DataType::Utf8], + Arc::new(DataType::Boolean), + Volatility::Stable, + func, + ); + + udf.call(vec![input]) +} + +pub fn create_unique_id(input_len: usize) -> ScalarUDF { + let func = move |args: &[ArrayRef]| { + if args.is_empty() { + let builder = UUIDBuilder::new(); + let tsid: UInt64Array = [Some(builder.finish())].iter().collect(); + return Ok(Arc::new(tsid) as ArrayRef); + } + let array_len = args[0].len(); + let inputs = args + .iter() + .map(|a| { + a.as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("tag column not string".to_string())) + }) + .collect::>>()?; + + let mut builders = Vec::new(); + builders.resize_with(array_len, UUIDBuilder::new); + for array in &inputs { + array + .iter() + .zip(builders.iter_mut()) + .for_each(|(v, builder)| { + builder.write(v); + }); + } + let results: UInt64Array = builders.into_iter().map(|b| Some(b.finish())).collect(); + Ok(Arc::new(results) as ArrayRef) + }; + + create_udf( + "create_unique_id", + vec![DataType::Utf8; input_len], + Arc::new(DataType::UInt64), + Volatility::Stable, + make_scalar_function(func), + ) +} + +struct UUIDBuilder { + encoder: MemCompactEncoder, + buf: Vec, +} + +impl UUIDBuilder { + fn new() -> Self { + Self { + encoder: MemCompactEncoder, + buf: Vec::new(), + } + } + + fn write(&mut self, value: Option<&str>) { + let value = value.unwrap_or(""); + self.encoder + .encode(&mut self.buf, value.as_bytes()) + .unwrap(); // write mem is safe + } + + fn finish(self) -> u64 { + hash64(&self.buf) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::{ + arrow::{ + array::{StringArray, UInt64Array}, + record_batch::RecordBatch, + util::pretty::pretty_format_batches, + }, + datafusion::{ + datasource::MemTable, + error::DataFusionError, + logical_plan::{col, Expr as 
DataFusionExpr}, + prelude::ExecutionContext, + }, + }; + use common_types::schema::{ArrowSchema, ArrowSchemaRef, DataType, Field}; + + #[tokio::test] + async fn regex_match_expr() { + let cases = vec![ + ( + ".*", // match everything except NULL values + true, // keep the values matched + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".*", // match everything except NULL values + false, // filter away all the values matched + vec!["++", "++"], + ), + ( + "", // an empty pattern also matches everything except NULL + true, + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".+O.*", // match just words containing "O". 
+ true, + vec![ + "+--------------+--------+", + "| words | length |", + "+--------------+--------+", + "| Blood Orange | 12 |", + "+--------------+--------+", + ], + ), + ( + "^(a|b).*", // match everything beginning with "a" or "b" + false, // negate expression and filter away anything that matches + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ]; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("words", DataType::Utf8, true), + Field::new("length", DataType::UInt64, false), + ])); + + // define data for table + let words = vec![ + Some("air"), + Some("aphex twin"), + Some("bruce"), + Some("Blood Orange"), + None, + None, + Some("cocteau twins"), + ]; + let rb = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(words.clone())), + Arc::new( + words + .iter() + .map(|word| word.map(|word| word.len() as u64)) + .collect::(), + ), + ], + ) + .unwrap(); + let rb = vec![vec![rb]]; + for (pattern, matches, expected) in cases.into_iter() { + let regex_expr = super::regex_match_expr(col("words"), pattern.to_string(), matches); + let actual = run_plan(schema.clone(), rb.clone(), regex_expr) + .await + .unwrap(); + + assert_eq!( + expected, actual, + "\n\nEXPECTED:\n{:#?}\nACTUAL:\n{:#?}\n", + expected, actual + ); + } + } + + // Run a plan against the following input table as "t" + async fn run_plan( + schema: ArrowSchemaRef, + rb: Vec>, + op: DataFusionExpr, + ) -> Result, DataFusionError> { + let provider = MemTable::try_new(Arc::clone(&schema), rb).unwrap(); + let mut ctx = ExecutionContext::new(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + + let df = ctx.table("t").unwrap(); + let df = df.filter(op).unwrap(); + + // execute the query + let record_batches = df.collect().await?; + + Ok(pretty_format_batches(&record_batches) + .unwrap() + .to_string() + .split('\n') + 
.map(|s| s.to_owned()) + .collect()) + } +} diff --git a/sql/src/provider.rs b/sql/src/provider.rs new file mode 100644 index 0000000000..fee689c411 --- /dev/null +++ b/sql/src/provider.rs @@ -0,0 +1,345 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Adapter to providers in datafusion + +use std::{any::Any, cell::RefCell, collections::HashMap, sync::Arc}; + +use arrow_deps::datafusion::{ + catalog::{catalog::CatalogProvider, schema::SchemaProvider}, + datasource::TableProvider, + physical_plan::{udaf::AggregateUDF, udf::ScalarUDF}, + sql::planner::ContextProvider, +}; +use catalog::manager::Manager; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::{provider::TableProviderAdapter, table::TableRef}; +use udf::{registry::FunctionRegistry, scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::container::{TableContainer, TableReference}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Failed to find table, name:{}, err:{}", name, source))] + FindTable { + name: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to find udf, err:{}", source))] + FindUdf { source: udf::registry::Error }, +} + +define_result!(Error); + +/// MetaProvider provides meta info needed by Frontend +pub trait MetaProvider { + /// Default catalog name + fn default_catalog_name(&self) -> &str; + + /// Default schema name + fn default_schema_name(&self) -> &str; + + /// Get table meta by table reference + /// + /// Note that this function may block current thread. We can't make this + /// function async as the underlying (aka. datafusion) planner needs a + /// sync provider. 
+ fn table(&self, name: TableReference) -> Result>; + + /// Get udf by name. + fn scalar_udf(&self, name: &str) -> Result>; + + /// Get udaf by name. + fn aggregate_udf(&self, name: &str) -> Result>; +} + +/// We use an adapter instead of using [catalog::Manager] directly, because +/// - MetaProvider provides blocking method, but catalog::Manager may provide +/// async method +/// - Other meta data like default catalog and schema are needed +// TODO(yingwen): Maybe support schema searching instead of using a fixed +// default schema +pub struct CatalogMetaProvider<'a, M> { + pub manager: &'a M, + pub default_catalog: &'a str, + pub default_schema: &'a str, + pub function_registry: &'a (dyn FunctionRegistry + Send + Sync), +} + +impl<'a, M: Manager> MetaProvider for CatalogMetaProvider<'a, M> { + fn default_catalog_name(&self) -> &str { + self.default_catalog + } + + fn default_schema_name(&self) -> &str { + self.default_schema + } + + fn table(&self, name: TableReference) -> Result> { + let resolved = name.resolve(self.default_catalog, self.default_schema); + + let catalog = match self + .manager + .catalog_by_name(resolved.catalog) + .context(FindCatalog { + name: resolved.catalog, + })? { + Some(c) => c, + None => return Ok(None), + }; + + let schema = match catalog + .schema_by_name(resolved.schema) + .context(FindSchema { + name: resolved.schema, + })? 
{ + Some(s) => s, + None => return Ok(None), + }; + + schema.table_by_name(resolved.table).context(FindTable { + name: resolved.table, + }) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.function_registry.find_udf(name).context(FindUdf) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.function_registry.find_udaf(name).context(FindUdf) + } +} + +/// An adapter to ContextProvider, not thread safe +pub struct ContextProviderAdapter<'a, P> { + /// Local cache for TableProvider to avoid create multiple adapter for the + /// same table, also save all the table needed during planning + table_cache: RefCell, + /// Store the first error MetaProvider returns + err: RefCell>, + meta_provider: &'a P, + request_id: RequestId, + /// Read parallelism for each table. + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> ContextProviderAdapter<'a, P> { + /// Create a adapter from meta provider + pub fn new(meta_provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + let default_catalog = meta_provider.default_catalog_name().to_string(); + let default_schema = meta_provider.default_schema_name().to_string(); + + Self { + table_cache: RefCell::new(TableContainer::new(default_catalog, default_schema)), + err: RefCell::new(None), + meta_provider, + request_id, + read_parallelism, + } + } + + /// Consumes the adapter, returning the tables used during planning if no + /// error occurs, otherwise returning the error + pub fn try_into_container(self) -> Result { + if let Some(e) = self.err.into_inner() { + return Err(e); + } + + Ok(self.table_cache.into_inner()) + } + + /// Save error if there is no existing error. + /// + /// The datafusion's ContextProvider can't return error, so here we save the + /// error in the adapter and return None, also let datafusion + /// return a provider not found error and abort the planning + /// procedure. 
+ fn maybe_set_err(&self, err: Error) { + if self.err.borrow().is_none() { + *self.err.borrow_mut() = Some(err); + } + } +} + +impl<'a, P: MetaProvider> MetaProvider for ContextProviderAdapter<'a, P> { + fn default_catalog_name(&self) -> &str { + self.meta_provider.default_catalog_name() + } + + fn default_schema_name(&self) -> &str { + self.meta_provider.default_schema_name() + } + + fn table(&self, name: TableReference) -> Result> { + self.meta_provider.table(name) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.meta_provider.scalar_udf(name) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.meta_provider.aggregate_udf(name) + } +} + +impl<'a, P: MetaProvider> ContextProvider for ContextProviderAdapter<'a, P> { + fn get_table_provider(&self, name: TableReference) -> Option> { + // Find in local cache + if let Some(p) = self.table_cache.borrow().get(name) { + return Some(p); + } + + // Find in meta provider + match self.meta_provider.table(name) { + Ok(Some(table)) => { + let table_adapter = Arc::new(TableProviderAdapter::new( + table, + self.request_id, + self.read_parallelism, + )); + // Put into cache + self.table_cache + .borrow_mut() + .insert(name, table_adapter.clone()); + + Some(table_adapter) + } + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // ScalarUDF is not supported now + fn get_function_meta(&self, name: &str) -> Option> { + // We don't cache udf used by the query because now we will register all udf to + // datafusion's context. 
+ match self.meta_provider.scalar_udf(name) { + Ok(Some(udf)) => Some(udf.to_datafusion_udf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // AggregateUDF is not supported now + fn get_aggregate_meta(&self, name: &str) -> Option> { + match self.meta_provider.aggregate_udf(name) { + Ok(Some(udaf)) => Some(udaf.to_datafusion_udaf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } +} + +struct SchemaProviderAdapter { + catalog: String, + schema: String, + tables: Arc, +} + +impl SchemaProvider for SchemaProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let mut names = Vec::new(); + let _ = self.tables.visit::<_, ()>(|name, table| { + if name.catalog == self.catalog && name.schema == self.schema { + names.push(table.as_table_ref().name().to_string()); + } + Ok(()) + }); + names + } + + fn table(&self, name: &str) -> Option> { + let name_ref = TableReference::Full { + catalog: &self.catalog, + schema: &self.schema, + table: name, + }; + self.tables + .get(name_ref) + .map(|v| v as Arc) + } + + fn table_exist(&self, name: &str) -> bool { + self.table(name).is_some() + } +} + +#[derive(Default)] +pub struct CatalogProviderAdapter { + schemas: HashMap>, +} + +impl CatalogProviderAdapter { + pub fn new_adapters(tables: Arc) -> HashMap { + let mut catalog_adapters = HashMap::with_capacity(tables.num_catalogs()); + let _ = tables.visit::<_, ()>(|name, _| { + // Get or create catalog + let catalog = match catalog_adapters.get_mut(name.catalog) { + Some(v) => v, + None => catalog_adapters + .entry(name.catalog.to_string()) + .or_insert_with(CatalogProviderAdapter::default), + }; + // Get or create schema + if catalog.schemas.get(name.schema).is_none() { + catalog.schemas.insert( + name.schema.to_string(), + Arc::new(SchemaProviderAdapter { + catalog: name.catalog.to_string(), + schema: name.schema.to_string(), + tables: tables.clone(), + }), + ); + } + + Ok(()) 
+ }); + + catalog_adapters + } +} + +impl CatalogProvider for CatalogProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas + .get(name) + .cloned() + .map(|v| v as Arc) + } +} diff --git a/sql/src/tests.rs b/sql/src/tests.rs new file mode 100644 index 0000000000..bd49bded4b --- /dev/null +++ b/sql/src/tests.rs @@ -0,0 +1,69 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::catalog::TableReference; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use common_types::tests::build_schema; +use table_engine::{ + memory::MemoryTable, + table::{Table, TableId, TableRef}, + ANALYTIC_ENGINE_TYPE, +}; +use udf::{scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::provider::MetaProvider; + +pub struct MockMetaProvider { + tables: Vec>, +} + +impl Default for MockMetaProvider { + fn default() -> Self { + Self { + tables: vec![ + Arc::new(MemoryTable::new( + "test_table".to_string(), + TableId::from(100), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + Arc::new(MemoryTable::new( + "test_table2".to_string(), + TableId::from(101), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + ], + } + } +} + +impl MetaProvider for MockMetaProvider { + fn default_catalog_name(&self) -> &str { + DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> &str { + DEFAULT_SCHEMA + } + + fn table(&self, name: TableReference) -> crate::provider::Result> { + let resolved = name.resolve(self.default_catalog_name(), self.default_schema_name()); + for table in &self.tables { + if resolved.table == table.name() { + return Ok(Some(table.clone())); + } + } + + Ok(None) + } + + fn scalar_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } + + fn aggregate_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } +} diff 
--git a/src/bin/ceresdb-server.rs b/src/bin/ceresdb-server.rs new file mode 100644 index 0000000000..627e9ab296 --- /dev/null +++ b/src/bin/ceresdb-server.rs @@ -0,0 +1,83 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The main entry point to start the server + +// TODO(yingwen): ceresdb-server is a legacy name, maybe use a new name + +use std::env; + +use ceresdbx::setup; +use clap::{App, Arg}; +use common_util::{panic, toml}; +use log::info; +use server::config::Config; + +/// The ip address of current node. +const NODE_ADDR: &str = "CSE_CERES_META_NODE_ADDR"; +const META_PEERS: &str = "META_PEERS"; +const CLUSTER_NAME: &str = "CLUSTER_NAME"; +/// Enable communication with meta node. +const ENABLE_META: &str = "ENABLE_META"; + +fn fetch_version() -> String { + let build_version = env!("VERGEN_BUILD_SEMVER"); + let git_branch = env!("VERGEN_GIT_BRANCH"); + let git_commit_id = env!("VERGEN_GIT_SHA_SHORT"); + let build_time = env!("VERGEN_BUILD_TIMESTAMP"); + + format!( + "\nCeresDB Version: {}\nGit branch: {}\nGit commit: {}\nBuild: {}", + build_version, git_branch, git_commit_id, build_time + ) +} + +fn main() { + let version = fetch_version(); + let matches = App::new("CeresDB Server") + .version(version.as_str()) + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(false) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let mut config = match matches.value_of("config") { + Some(path) => { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") + } + None => Config::default(), + }; + + // Combine configs from env. 
+    if let Ok(enable_meta) = env::var(ENABLE_META) {
+        if let Ok(enable_meta) = enable_meta.parse::<bool>() {
+            config.meta_client.enable_meta = enable_meta;
+        }
+    }
+    if let Ok(node_addr) = env::var(NODE_ADDR) {
+        config.meta_client.node = node_addr;
+    }
+    if let Ok(meta_addr) = env::var(META_PEERS) {
+        config.meta_client.meta_addr = meta_addr;
+    }
+    if let Ok(cluster) = env::var(CLUSTER_NAME) {
+        config.meta_client.cluster = cluster;
+    }
+
+    // Setup log.
+    let _runtime_level = setup::setup_log(&config);
+    // Setup tracing.
+    let _writer_guard = setup::setup_tracing(&config);
+
+    panic::set_panic_hook(false);
+
+    // Log version.
+    info!("version:{}", version);
+
+    setup::run_server(config);
+}
diff --git a/src/docs/config.toml b/src/docs/config.toml
new file mode 100644
index 0000000000..5a2ede377c
--- /dev/null
+++ b/src/docs/config.toml
@@ -0,0 +1,27 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+bind_addr = "0.0.0.0"
+http_port = 5000
+grpc_port = 8831
+log_level = "debug"
+
+[analytic]
+data_path = "/tmp/ceresdbx/"
+
+[analytic.table_opts]
+arena_block_size = 128
+
+[[meta_client.cluster_view.shards]]
+shard_id = 0
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 38082
+
+[[meta_client.cluster_view.shards]]
+shard_id = 1
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 48082
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 58082
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000000..22fed20ac2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,6 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+//! ceresdbx
+
+pub mod setup;
+mod signal_handler;
diff --git a/src/setup.rs b/src/setup.rs
new file mode 100644
index 0000000000..6c2d8263d4
--- /dev/null
+++ b/src/setup.rs
@@ -0,0 +1,127 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+//!
Setup server + +use std::sync::Arc; + +use analytic_engine::{self, setup}; +use catalog_impls::{table_based::TableBasedManager, CatalogManagerImpl}; +use common_util::runtime; +use log::info; +use logger::RuntimeLevel; +use query_engine::executor::ExecutorImpl; +use server::{ + config::{Config, RuntimeConfig}, + server::Builder, + table_engine::{MemoryTableEngine, TableEngineProxy}, +}; +use table_engine::engine::EngineRuntimes; +use tracing_util::{ + self, + tracing_appender::{non_blocking::WorkerGuard, rolling::Rotation}, +}; +use udf::registry::FunctionRegistryImpl; + +use crate::signal_handler; + +/// Setup log with given `config`, returns the runtime log level switch. +pub fn setup_log(config: &Config) -> RuntimeLevel { + server::logger::init_log(config).expect("Failed to init log.") +} + +/// Setup tracing with given `config`, returns the writer guard. +pub fn setup_tracing(config: &Config) -> WorkerGuard { + tracing_util::init_tracing_with_file( + &config.tracing_log_name, + &config.tracing_log_dir, + &config.tracing_level, + Rotation::NEVER, + ) +} + +fn build_runtime(name: &str, threads_num: usize) -> runtime::Runtime { + runtime::Builder::default() + .worker_threads(threads_num) + .thread_name(name) + .enable_all() + .build() + .unwrap_or_else(|e| { + //TODO(yingwen) replace panic with fatal + panic!("Failed to create runtime, err:{}", e); + }) +} + +fn build_engine_runtimes(config: &RuntimeConfig) -> EngineRuntimes { + EngineRuntimes { + read_runtime: Arc::new(build_runtime("cse-read", config.read_thread_num)), + write_runtime: Arc::new(build_runtime("cse-write", config.write_thread_num)), + bg_runtime: Arc::new(build_runtime("cse-bg", config.background_thread_num)), + } +} + +/// Run a server, returns when the server is shutdown by user +pub fn run_server(config: Config) { + let runtimes = Arc::new(build_engine_runtimes(&config.runtime)); + let engine_runtimes = runtimes.clone(); + + info!("Server starts up, config:{:#?}", config); + + 
runtimes.bg_runtime.block_on(async { + // Build all table engine + // Create memory engine + let memory = MemoryTableEngine; + // Create analytic engine + let analytic_config = config.analytic.clone(); + let analytic = setup::open_analytic_table_engine(analytic_config, engine_runtimes) + .await + .unwrap_or_else(|e| { + panic!("Failed to setup analytic engine, err:{}", e); + }); + + // Create table engine proxy + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + let catalog_manager = CatalogManagerImpl::new( + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }), + ); + + // Init function registry. + let mut function_registry = FunctionRegistryImpl::new(); + function_registry.load_functions().unwrap_or_else(|e| { + panic!("Failed to create function registry, err:{}", e); + }); + let function_registry = Arc::new(function_registry); + + // Create query executor + let query_executor = ExecutorImpl::new(); + + // Build and start server + let mut server = Builder::new(config) + .runtimes(runtimes.clone()) + .catalog_manager(catalog_manager) + .query_executor(query_executor) + .table_engine(engine_proxy) + .function_registry(function_registry) + .build() + .unwrap_or_else(|e| { + panic!("Failed to create server, err:{}", e); + }); + server.start().await.unwrap_or_else(|e| { + panic!("Failed to start server,, err:{}", e); + }); + + // Wait for signal + signal_handler::wait_for_signal(); + + // Stop server + server.stop(); + }); +} diff --git a/src/signal_handler.rs b/src/signal_handler.rs new file mode 100644 index 0000000000..39ad1733f4 --- /dev/null +++ b/src/signal_handler.rs @@ -0,0 +1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Signal handler +//! +//! 
Only works on unix like environments + +pub use self::details::wait_for_signal; + +#[cfg(unix)] +mod details { + use log::info; + use signal_hook::{consts::TERM_SIGNALS, iterator::Signals}; + + pub fn wait_for_signal() { + let mut sigs = Signals::new(TERM_SIGNALS).unwrap_or_else(|e| { + // TODO(yingwen): Log here + panic!("Failed to register signal handlers, err:{}", e); + }); + for signal in &mut sigs { + if TERM_SIGNALS.contains(&signal) { + info!("Received signal {}, stopping server...", signal); + break; + } + } + } +} + +#[cfg(not(unix))] +mod details { + pub fn wait_for_signal() {} +} diff --git a/system_catalog/Cargo.toml b/system_catalog/Cargo.toml new file mode 100644 index 0000000000..c6d4ff7b7a --- /dev/null +++ b/system_catalog/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "system_catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } diff --git a/system_catalog/src/lib.rs b/system_catalog/src/lib.rs new file mode 100644 index 0000000000..a0e1855a70 --- /dev/null +++ b/system_catalog/src/lib.rs @@ -0,0 +1,168 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
System catalog implementations + +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + record_batch::RecordBatch, + row::Row, + schema::{RecordSchema, Schema}, + time::Timestamp, +}; +use futures::Stream; +use table_engine::{ + stream, + stream::{PartitionedStreams, RecordBatchStream, SendableRecordBatchStream}, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Table, TableId, TableStats, + WriteRequest, + }, +}; + +pub mod sys_catalog_table; +pub mod tables; + +/// Timestamp of entry +pub const ENTRY_TIMESTAMP: Timestamp = Timestamp::new(0); + +/// The minimal thing that a system table needs to implement +#[async_trait] +pub trait SystemTable: Send + Sync + Debug { + /// System table name + fn name(&self) -> &str; + + /// System table name + fn id(&self) -> TableId; + + /// Produce the schema from this system table + fn schema(&self) -> Schema; + + /// Get the contents of the system table as a single RecordBatch + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result; +} + +#[derive(Debug)] +pub struct SystemTableAdapter { + inner: Arc, +} + +impl SystemTableAdapter { + pub fn new(inner: impl SystemTable + 'static) -> Self { + Self { + inner: Arc::new(inner), + } + } +} + +#[async_trait] +impl Table for SystemTableAdapter { + fn name(&self) -> &str { + self.inner.name() + } + + fn id(&self) -> TableId { + self.inner.id() + } + + fn schema(&self) -> Schema { + self.inner.schema() + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn engine_type(&self) -> &str { + "system" + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, _request: WriteRequest) -> table_engine::table::Result { + Ok(0) + } + + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + self.inner.read(request).await + } + + async fn get(&self, _request: 
GetRequest) -> table_engine::table::Result> { + Ok(None) + } + + async fn partitioned_read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let read_parallelism = request.opts.read_parallelism; + let stream = self.inner.read(request).await?; + let mut streams = Vec::with_capacity(read_parallelism); + streams.push(stream); + for _ in 0..read_parallelism - 1 { + streams.push(Box::pin(OneRecordBatchStream { + schema: self.schema().clone().to_record_schema(), + record_batch: None, + })); + } + Ok(PartitionedStreams { streams }) + } + + async fn alter_schema( + &self, + _request: AlterSchemaRequest, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn alter_options( + &self, + _options: HashMap, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn flush(&self, _request: FlushRequest) -> table_engine::table::Result<()> { + Ok(()) + } + + async fn compact(&self) -> table_engine::table::Result<()> { + Ok(()) + } +} + +pub struct OneRecordBatchStream { + schema: RecordSchema, + record_batch: Option, +} +impl Stream for OneRecordBatchStream { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + if self.record_batch.is_none() { + Poll::Ready(None) + } else { + Poll::Ready(Some(Ok(self.record_batch.take().unwrap()))) + } + } +} +impl RecordBatchStream for OneRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/system_catalog/src/sys_catalog_table.rs b/system_catalog/src/sys_catalog_table.rs new file mode 100644 index 0000000000..e1a4a004be --- /dev/null +++ b/system_catalog/src/sys_catalog_table.rs @@ -0,0 +1,1017 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table to store system catalog + +use std::{collections::HashMap, convert::TryFrom, mem}; + +use async_trait::async_trait; +use catalog::consts; +use common_types::{ + bytes::{Bytes, BytesMut, MemBuf, MemBufMut}, + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Encoder}, + define_result, +}; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use proto::sys_catalog::{CatalogEntry, SchemaEntry, TableEntry}; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + self, + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableRequestType, + TableState, + }, + predicate::PredicateBuilder, + table::{ + GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableInfo, TableRef, + TableSeq, WriteRequest, + }, +}; +use tokio::sync::Mutex; + +use crate::ENTRY_TIMESTAMP; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build schema for sys_catalog, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display( + "Failed to get column index for sys_catalog, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + GetColumnIndex { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to build table for sys_catalog, err:{}", source))] + BuildTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to open table for sys_catalog, err:{}", source))] + OpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to convert into RowGroup, err:{}", source))] + IntoRowGroup { source: common_types::row::Error }, + + #[snafu(display("Failed to persist catalog to table, err:{}", source))] + PersistCatalog { source: table_engine::table::Error }, + + 
#[snafu(display("Failed to persist schema to table, err:{}", source))] + PersistSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to persist tables to table, err:{}", source))] + PersistTables { source: table_engine::table::Error }, + + #[snafu(display("Failed to read table, err:{}", source))] + ReadTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to read stream, err:{}", source))] + ReadStream { source: table_engine::stream::Error }, + + #[snafu(display( + "Visitor catalog not found, catalog:{}.\nBacktrace:\n{}", + catalog, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorCatalogNotFound { + catalog: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Visitor schema not found, catalog:{}, schema:{}.\nBacktrace:\n{}", + catalog, + schema, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorSchemaNotFound { + catalog: String, + schema: String, + backtrace: Backtrace, + }, + + #[snafu(display("Visitor Failed to open table, err:{}", source))] + #[snafu(visibility(pub))] + VisitorOpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to encode entry key header, err:{}", source))] + EncodeKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode entry body, err:{}", source))] + EncodeKeyBody { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode table key type, err:{}", source))] + EncodeTableKeyType { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read entry key header, err:{}", source))] + ReadKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read table key header, err:{}", source))] + ReadTableKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid entry key header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidKeyHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Invalid table key type, 
value:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidTableKeyType { value: u8, backtrace: Backtrace }, + + #[snafu(display( + "Failed to encode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build row for entry, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display( + "Failed to decode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DecodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode table entry, err:{}", source))] + DecodeTableEntry { + source: table_engine::table::TryFromTableEntryError, + }, + + #[snafu(display( + "Failed to decode schema for table alter entry, table:{}, err:{}", + table, + source + ))] + DecodeSchema { + table: String, + source: common_types::schema::Error, + }, + + #[snafu(display("Table key type not found in key.\nBacktrace:\n{}", backtrace))] + EmptyTableKeyType { backtrace: Backtrace }, + + #[snafu(display( + "The row in the sys_catalog_table is invalid, row:{:?}.\nBacktrace:\n{}", + row, + backtrace + ))] + InvalidTableRow { row: Row, backtrace: Backtrace }, + + #[snafu(display( + "The fetched table is mismatched, expect:{}, given:{}.\nBacktrace:\n{}", + expect_table, + given_table, + backtrace + ))] + TableKeyMismatch { + expect_table: String, + given_table: String, + backtrace: Backtrace, + }, + + #[snafu(display("The table is not found, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Fail to get the table info, table:{}, err:{}.", table, source))] + GetTableInfo { + table: String, + source: table_engine::table::Error, + }, + + #[snafu(display("Invalid table state transition, table:{}, err:{}.", table, source))] + InvalidTableStateTransition { + table: String, + source: 
table_engine::engine::Error, + }, + + #[snafu(display("Invalid schema id, id:{}", id))] + InvalidSchemaId { id: u32 }, +} + +define_result!(Error); + +/// Table name of the sys catalog +pub const TABLE_NAME: &str = "sys_catalog"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys catalog table, always set to 1 +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(1); +/// Table id of the `sys_catalog` table. +pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); +/// Name of key column (field) +pub const KEY_COLUMN_NAME: &str = "key"; +/// Name of timestamp column (field) +pub const TIMESTAMP_COLUMN_NAME: &str = "timestamp"; +/// Name of value column (field) +pub const VALUE_COLUMN_NAME: &str = "value"; +/// Default enable ttl is false +pub const DEFAULT_ENABLE_TTL: &str = "false"; + +// TODO(yingwen): Add a type column once support int8 type and maybe split key +// into multiple columns. +/// SysCatalogTable is a special table to keep tracks of the system infomations +/// +/// Similar to kudu's SysCatalogTable +/// - see +/// - schema: (key, timestamp) -> metadata +/// +/// The timestamp is used to support metadata ttl in the future, now it can set +/// to 0. +#[derive(Debug)] +pub struct SysCatalogTable { + // TODO(yingwen): Table id + /// Underlying Table to actually store data + table: TableRef, + /// Index of the key column + key_column_index: usize, + /// Index of the value column + value_column_index: usize, + /// Protects table create/alter/drop + // TODO(xikai): A better way is to use a specific struct with the lock that takes + // responsibilities to update table. 
+ update_table_lock: Mutex<()>, +} + +impl SysCatalogTable { + /// Create a new [SysCatalogTable] + pub async fn new(table_engine: &T) -> Result { + let table_schema = new_sys_catalog_schema().context(BuildSchema)?; + let key_column_index = table_schema + .index_of(KEY_COLUMN_NAME) + .context(GetColumnIndex { + name: KEY_COLUMN_NAME, + })?; + let value_column_index = + table_schema + .index_of(VALUE_COLUMN_NAME) + .context(GetColumnIndex { + name: VALUE_COLUMN_NAME, + })?; + + let open_request = OpenTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_name: TABLE_NAME.to_string(), + engine: table_engine.engine_type().to_string(), + }; + + let table_opt = table_engine + .open_table(open_request) + .await + .context(OpenTable)?; + match table_opt { + Some(table) => { + info!("Sys catalog table open existing table"); + + // The sys_catalog table is already created + return Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }); + } + None => { + info!("Sys catalog table is not exists, try to create a new table"); + } + } + + let mut options = HashMap::new(); + options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + DEFAULT_ENABLE_TTL.to_string(), + ); + let create_request = CreateTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_id: TABLE_ID, + table_name: TABLE_NAME.to_string(), + table_schema, + partition_info: None, + engine: table_engine.engine_type().to_string(), + options, + state: TableState::Stable, + }; + + let table = table_engine + .create_table(create_request) + .await + .context(BuildTable)?; + + Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }) + } + + /// Returns the table id of the sys catalog table. 
+ #[inline] + pub fn table_id(&self) -> TableId { + TABLE_ID + } + + /// Add and store the catalog info + pub async fn create_catalog(&self, request: CreateCatalogRequest) -> Result<()> { + info!("Add catalog to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistCatalog)?; + + Ok(()) + } + + /// Add and store the schema info + pub async fn create_schema(&self, request: CreateSchemaRequest) -> Result<()> { + info!("Add schema to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistSchema)?; + + Ok(()) + } + + /// Create table in the catalog. + pub async fn create_table(&self, table_info: TableInfo) -> Result<()> { + info!( + "Create table to sys_catalog table, table_info:{:?}", + table_info + ); + + let _lock = self.update_table_lock.lock().await; + self.write_table_info(table_info, TableRequestType::Create) + .await?; + + Ok(()) + } + + /// Prepare to drop the table. + pub async fn prepare_drop_table(&self, request: DropTableRequest) -> Result<()> { + info!( + "Prepare to drop table to sys_catalog table, request:{:?}", + request + ); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the dropped flag the lock held. + { + let _lock = self.update_table_lock.lock().await; + if let Some(mut table_info) = self.get_table_info(table_key).await? 
{ + table_info.state.try_transit(TableState::Dropping).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Prepare to drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Drop the table. + /// + /// Note that [prepare_drop_table] should be called before this method. + pub async fn drop_table(&self, request: DropTableRequest) -> Result<()> { + info!("Drop table to sys_catalog table, request:{:?}", request); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the table state with the lock held. + { + if let Some(mut table_info) = self.get_table_info(table_key).await? { + table_info.state.try_transit(TableState::Dropped).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Returns the inner table of the sys catalog. + #[inline] + pub fn inner_table(&self) -> TableRef { + self.table.clone() + } + + /// Write the table info to the sys_catalog table without lock. 
+ async fn write_table_info(&self, table_info: TableInfo, typ: TableRequestType) -> Result<()> { + info!( + "Write table info to sys_catalog table, table_info:{:?}", + table_info + ); + + let table_writer = TableWriter { + catalog_table: self.table.clone(), + table_to_write: table_info, + typ, + }; + + table_writer.write().await?; + + Ok(()) + } + + async fn get_table_info<'a>(&'a self, table_key: TableKey<'a>) -> Result> { + let projected_schema = ProjectedSchema::no_projection(self.table.schema()); + let primary_key = TableWriter::build_table_primary_key(table_key.clone())?; + let get_req = GetRequest { + request_id: RequestId::next_id(), + projected_schema, + primary_key, + }; + + match self.table.get(get_req).await.context(GetTableInfo { + table: table_key.table, + })? { + Some(row) => { + let table_info = self.decode_table_info(row)?; + let decoded_table_key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + + ensure!( + table_key == decoded_table_key, + TableKeyMismatch { + expect_table: table_key.table, + given_table: decoded_table_key.table, + } + ); + + Ok(Some(table_info)) + } + None => Ok(None), + } + } + + fn decode_table_info(&self, row: Row) -> Result { + ensure!( + row.num_columns() > self.key_column_index, + InvalidTableRow { row } + ); + + ensure!( + row.num_columns() > self.value_column_index, + InvalidTableRow { row } + ); + + // Key and value column is always varbinary. + let key = &row[self.key_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + let value = &row[self.value_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + + match decode_one_request(key, value)? 
{ + DecodedRequest::TableEntry(request) => Ok(request), + _ => InvalidTableRow { row }.fail(), + } + } + + /// Visit all data in the sys catalog table + // TODO(yingwen): Expose read options + pub async fn visit(&self, opts: ReadOptions, visitor: &mut dyn Visitor) -> Result<()> { + let read_request = ReadRequest { + request_id: RequestId::next_id(), + opts, + // The schema of sys catalog table is never changed + projected_schema: ProjectedSchema::no_projection(self.table.schema()), + predicate: PredicateBuilder::default().build(), + order: ReadOrder::None, + }; + let mut batch_stream = self.table.read(read_request).await.context(ReadTable)?; + + info!("batch_stream schema is:{:?}", batch_stream.schema()); + // TODO(yingwen): Check stream schema and table schema? + while let Some(batch) = batch_stream.try_next().await.context(ReadStream)? { + // Visit all requests in the record batch + info!("real batch_stream schema is:{:?}", batch.schema()); + self.visit_record_batch(batch, visitor).await?; + } + + Ok(()) + } + + /// Visit the record batch + async fn visit_record_batch( + &self, + batch: RecordBatch, + visitor: &mut dyn Visitor, + ) -> Result<()> { + let key_column = batch.column(self.key_column_index); + let value_column = batch.column(self.value_column_index); + + info!( + "Sys catalog table visit record batch, column_num:{}, row_num:{}", + batch.num_columns(), + batch.num_rows() + ); + + let num_rows = batch.num_rows(); + for i in 0..num_rows { + // Key and value column is not nullable + let key = key_column.datum(i); + let value = value_column.datum(i); + + debug!( + "Sys catalog table visit row, i:{}, key:{:?}, value:{:?}", + i, key, value + ); + + // Key and value column is always varbinary. 
+ let request = + decode_one_request(key.as_varbinary().unwrap(), value.as_varbinary().unwrap())?; + + Self::call_visitor(request, visitor).await?; + } + + Ok(()) + } + + /// Invoke visitor + async fn call_visitor(request: DecodedRequest, visitor: &mut dyn Visitor) -> Result<()> { + match request { + DecodedRequest::CreateCatalog(req) => visitor.visit_catalog(req), + DecodedRequest::CreateSchema(req) => visitor.visit_schema(req), + DecodedRequest::TableEntry(req) => visitor.visit_tables(req).await, + } + } +} + +/// Visitor for sys catalog requests +// TODO(yingwen): Define an Error for visitor +#[async_trait] +pub trait Visitor { + // TODO(yingwen): Use enum another type if need more operation (delete/update) + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> Result<()>; + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> Result<()>; + + async fn visit_tables(&mut self, table_info: TableInfo) -> Result<()>; +} + +/// Build a new table schema for sys catalog +fn new_sys_catalog_schema() -> schema::Result { + // NOTICE: Both key and value must be non-nullable, the visit function takes + // this assumption + schema::Builder::with_capacity(3) + .auto_increment_column_id(true) + // key + .add_key_column( + column_schema::Builder::new(KEY_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // timestamp + .add_key_column( + column_schema::Builder::new(TIMESTAMP_COLUMN_NAME.to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // value + .add_normal_column( + column_schema::Builder::new(VALUE_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? 
+ .build() +} + +/// Request type, used as key header +/// +/// 0 is reserved +#[derive(Debug, Clone, Copy)] +enum KeyType { + CreateCatalog = 1, + CreateSchema = 2, + TableEntry = 3, +} + +impl KeyType { + fn to_u8(self) -> u8 { + self as u8 + } + + fn decode_from_bytes(mut buf: &[u8]) -> Result { + let v = buf.read_u8().context(ReadKeyHeader)?; + + match v { + v if v == Self::CreateCatalog as u8 => Ok(Self::CreateCatalog), + v if v == Self::CreateSchema as u8 => Ok(Self::CreateSchema), + v if v == Self::TableEntry as u8 => Ok(Self::TableEntry), + value => InvalidKeyHeader { value }.fail(), + } + } +} + +/// Catalog entry key +/// +/// Use catalog name as key +struct CatalogKey<'a>(&'a str); + +/// Schema entry key +/// +/// Use (catalog, schema) as key +struct SchemaKey<'a>(&'a str, &'a str); + +// TODO(yingwen): Maybe use same key for create/alter table. +/// Table entry key +/// +/// Use (catalog, schema, table_id) as key +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TableKey<'a> { + catalog: &'a str, + schema: &'a str, + table: &'a str, +} + +/// Encoder for entry key +struct EntryKeyEncoder; + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &CatalogKey) -> Result<()> { + buf.write_u8(KeyType::CreateCatalog.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody) + } + + fn estimate_encoded_size(&self, value: &CatalogKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + encoder.estimate_encoded_size(value.0.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &SchemaKey) -> Result<()> { + buf.write_u8(KeyType::CreateSchema.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.1.as_bytes()) + .context(EncodeKeyBody) 
+ } + + fn estimate_encoded_size(&self, value: &SchemaKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.0.as_bytes()) + + encoder.estimate_encoded_size(value.1.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &TableKey) -> Result<()> { + buf.write_u8(KeyType::TableEntry.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.catalog.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.schema.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.table.as_bytes()) + .context(EncodeKeyBody)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &TableKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.catalog.as_bytes()) + + encoder.estimate_encoded_size(value.schema.as_bytes()) + + encoder.estimate_encoded_size(value.table.as_bytes()) + } +} + +/// Information of the catalog to add +#[derive(Debug)] +pub struct CreateCatalogRequest { + /// Catalog name + pub catalog_name: String, +} + +impl CreateCatalogRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? 
+ .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = CatalogKey(&self.catalog_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> CatalogEntry { + let mut entry = CatalogEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl From for CreateCatalogRequest { + fn from(entry: CatalogEntry) -> Self { + Self { + catalog_name: entry.catalog_name, + } + } +} + +/// Information of the schema to add. +#[derive(Debug)] +pub struct CreateSchemaRequest { + pub catalog_name: String, + pub schema_name: String, + pub schema_id: SchemaId, +} + +impl CreateSchemaRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? 
+ .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = SchemaKey(&self.catalog_name, &self.schema_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> SchemaEntry { + let mut entry = SchemaEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_schema_name(self.schema_name); + entry.set_schema_id(self.schema_id.as_u32()); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl TryFrom for CreateSchemaRequest { + type Error = Error; + + fn try_from(entry: SchemaEntry) -> Result { + let schema_id = SchemaId::new(entry.schema_id).context(InvalidSchemaId { + id: entry.schema_id, + })?; + + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + schema_id, + }) + } +} + +/// Information of the alter operations to the table. +#[derive(Clone, Debug)] +pub struct AlterTableRequest { + pub catalog_name: String, + pub schema_name: String, + pub table_name: String, + /// Schema after alteration. + pub schema: Schema, +} + +/// Writer for writing the table information into the catalog table. +pub struct TableWriter { + catalog_table: TableRef, + table_to_write: TableInfo, + typ: TableRequestType, +} + +impl TableWriter { + async fn write(&self) -> Result<()> { + let row_group = self.convert_table_info_to_row_group()?; + let write_req = WriteRequest { row_group }; + self.catalog_table + .write(write_req) + .await + .context(PersistTables)?; + + Ok(()) + } + + /// Convert the table to write into [common_types::row::RowGroup]. 
+ fn convert_table_info_to_row_group(&self) -> Result { + let mut builder = RowGroupBuilder::new(self.catalog_table.schema()); + let key = Self::build_create_table_key(&self.table_to_write)?; + let value = Self::build_create_table_value(self.table_to_write.clone(), self.typ)?; + + debug!( + "TableWriter build key value, key:{:?}, value:{:?}", + key, value + ); + + Self::build_row(&mut builder, key, value)?; + + Ok(builder.build()) + } + + fn build_row(builder: &mut RowGroupBuilder, key: Bytes, value: Bytes) -> Result<()> { + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? + .finish() + .context(BuildRow)?; + Ok(()) + } + + fn build_create_table_key(table_info: &TableInfo) -> Result { + let key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + Self::encode_table_key(key) + } + + fn encode_table_key(key: TableKey) -> Result { + let encoder = EntryKeyEncoder; + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn build_create_table_value(table_info: TableInfo, typ: TableRequestType) -> Result { + let entry = table_info.into_pb(typ); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn build_table_primary_key(table_key: TableKey) -> Result> { + let encoded_key = Self::encode_table_key(table_key)?; + + Ok(vec![ + Datum::Varbinary(encoded_key), + Datum::Timestamp(ENTRY_TIMESTAMP), + ]) + } +} + +/// Decoded sys catalog request +#[derive(Debug)] +enum DecodedRequest { + CreateCatalog(CreateCatalogRequest), + CreateSchema(CreateSchemaRequest), + TableEntry(TableInfo), +} + +/// Decode request from key/value +fn decode_one_request(key: &[u8], value: &[u8]) -> Result { + let 
key_type = KeyType::decode_from_bytes(key)?; + let req = match key_type { + KeyType::CreateCatalog => { + let entry = CatalogEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateCatalog(CreateCatalogRequest::from(entry)) + } + KeyType::CreateSchema => { + let entry = SchemaEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateSchema(CreateSchemaRequest::try_from(entry)?) + } + KeyType::TableEntry => { + let entry = TableEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + let table_info = TableInfo::try_from(entry).context(DecodeTableEntry)?; + DecodedRequest::TableEntry(table_info) + } + }; + + Ok(req) +} diff --git a/system_catalog/src/tables.rs b/system_catalog/src/tables.rs new file mode 100644 index 0000000000..67edfeaa35 --- /dev/null +++ b/system_catalog/src/tables.rs @@ -0,0 +1,179 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +/// implementation of system table: Tables +/// For example `SELECT * FROM system.public.tables` +use std::fmt::{Debug, Formatter}; + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::SchemaRef, CatalogRef}; +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKeyBuilder, + row::Row, + schema, + schema::Schema, +}; +use snafu::ResultExt; +use table_engine::{ + stream::SendableRecordBatchStream, + table::{ReadRequest, SchemaId, TableId, TableRef, TableSeq}, +}; + +use crate::{OneRecordBatchStream, SystemTable, ENTRY_TIMESTAMP}; + +/// Table name of the sys tables +const TABLE_NAME: &str = "tables"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys tables +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(2); +/// Table id of the `sys_catalog` table. 
+pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); + +/// Build a new table schema for tables +fn tables_schema() -> Schema { + schema::Builder::with_capacity(6) + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("catalog".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("schema".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("table_name".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("table_id".to_string(), DatumKind::UInt64) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("engine".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() +} + +pub struct Tables { + schema: Schema, + catalog_manager: M, +} + +impl Debug for Tables { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SysTables") + .field("schema", &self.schema) + .finish() + } +} + +impl Tables { + pub fn new(catalog_manager: M) -> Self { + Self { + schema: tables_schema(), + catalog_manager, + } + } + + fn from_table(&self, catalog: CatalogRef, schema: SchemaRef, table: TableRef) -> Row { + let mut datums = Vec::with_capacity(self.schema.num_columns()); + datums.push(Datum::Timestamp(ENTRY_TIMESTAMP)); + datums.push(Datum::from(catalog.name())); + datums.push(Datum::from(schema.name())); + datums.push(Datum::from(table.name())); + 
datums.push(Datum::from(table.id().as_u64())); + datums.push(Datum::from(table.engine_type())); + Row::from_datums(datums) + } +} + +#[async_trait] +impl SystemTable for Tables { + fn name(&self) -> &str { + TABLE_NAME + } + + fn id(&self) -> TableId { + TABLE_ID + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let catalogs = self + .catalog_manager + .all_catalogs() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + let mut builder = + RecordBatchWithKeyBuilder::new(self.schema.clone().to_record_schema_with_key()); + + let projector = request + .projected_schema + .try_project_with_key(&self.schema) + .expect("Should succeed to try_project_key of sys_tables"); + for catalog in &catalogs { + for schema in &catalog + .all_schemas() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? + { + for table in &schema + .all_tables() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? 
+ { + let row = self.from_table(catalog.clone(), schema.clone(), table.clone()); + let projected_row = projector.project_row(&row, Vec::new()); + builder + .append_row(projected_row) + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + } + } + } + let record_batch = builder.build().unwrap().into_record_batch(); + Ok(Box::pin(OneRecordBatchStream { + schema: self.schema.clone().to_record_schema(), + record_batch: Some(record_batch), + })) + } +} diff --git a/table_engine/Cargo.toml b/table_engine/Cargo.toml new file mode 100644 index 0000000000..b617b9f7cc --- /dev/null +++ b/table_engine/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "table_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["sync"] } diff --git a/table_engine/src/engine.rs b/table_engine/src/engine.rs new file mode 100644 index 0000000000..b2aaeaaf6c --- /dev/null +++ b/table_engine/src/engine.rs @@ -0,0 +1,261 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table factory trait + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use common_types::{schema::Schema, time::Timestamp}; +use common_util::runtime::Runtime; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + partition::PartitionInfo, + table::{TableId, TableInfo, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Invalid table path, path:{}.\nBacktrace:\n{}", path, backtrace))] + InvalidTablePath { path: String, backtrace: Backtrace }, + + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: Box, + }, + + #[snafu(display("Failed to write meta data, err:{}", source))] + WriteMeta { + source: Box, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display( + "Unknown engine type, type:{}.\nBacktrace:\n{}", + engine_type, + backtrace + ))] + UnknownEngineType { + engine_type: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid table state transition, from:{:?}, to:{:?}.\nBacktrace:\n{}", + from, + to, + backtrace + ))] + InvalidTableStateTransition { + from: TableState, + to: TableState, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to close the table engine, err:{}", source))] + Close { + source: Box, + }, +} + +define_result!(Error); + +/// The state of table. +/// +/// Transition rule is defined in the validate function. 
+#[derive(Clone, Copy, Debug)] +pub enum TableState { + Stable = 0, + Dropping = 1, + Dropped = 2, +} + +impl TableState { + pub fn validate(&self, to: TableState) -> bool { + match self { + TableState::Stable => matches!(to, TableState::Stable | TableState::Dropping), + TableState::Dropping => matches!(to, TableState::Dropped), + TableState::Dropped => false, + } + } + + /// Try to transit from the self state to the `to` state. + /// + /// Returns error if it is an invalid transition. + pub fn try_transit(&mut self, to: TableState) -> Result<()> { + ensure!( + self.validate(to), + InvalidTableStateTransition { from: *self, to } + ); + *self = to; + + Ok(()) + } +} + +impl From for TableStatePb { + fn from(state: TableState) -> TableStatePb { + match state { + TableState::Stable => TableStatePb::STABLE, + TableState::Dropping => TableStatePb::DROPPING, + TableState::Dropped => TableStatePb::DROPPED, + } + } +} + +impl From for TableState { + fn from(state: TableStatePb) -> TableState { + match state { + TableStatePb::STABLE => TableState::Stable, + TableStatePb::DROPPING => TableState::Dropping, + TableStatePb::DROPPED => TableState::Dropped, + } + } +} + +#[derive(Copy, Clone)] +pub enum TableRequestType { + Create, + Drop, +} + +/// Create table request +// TODO(yingwen): Add option for create_if_not_exists? +#[derive(Debug, Clone)] +pub struct CreateTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + // TODO(yingwen): catalog and schema, or add a table path struct?
+ /// Table name + pub table_name: String, + /// Table schema + pub table_schema: Schema, + /// Partition info if this is a partitioned table + // TODO(yingwen): TableEngine should not have knowledge of partitioning + pub partition_info: Option, + /// Table engine type + pub engine: String, + /// Table options used by each engine + pub options: HashMap, + /// Tells state of the table + pub state: TableState, +} + +impl CreateTableRequest { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +impl From for TableEntry { + fn from(req: CreateTableRequest) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(req.catalog_name); + entry.set_schema_name(req.schema_name); + entry.set_table_id(req.table_id.as_u64()); + entry.set_table_name(req.table_name); + entry.set_engine(req.engine); + entry.set_state(TableStatePb::from(req.state)); + + entry + } +} + +impl From for TableInfo { + fn from(req: CreateTableRequest) -> Self { + Self { + catalog_name: req.catalog_name, + schema_name: req.schema_name, + table_id: req.table_id, + table_name: req.table_name, + engine: req.engine, + state: req.state, + } + } +} + +/// Drop table request +#[derive(Debug, Clone)] +pub struct DropTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +#[derive(Debug, Clone)] +pub struct OpenTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +impl From for OpenTableRequest { + fn 
from(table_info: TableInfo) -> Self { + Self { + catalog_name: table_info.catalog_name, + schema_name: table_info.schema_name, + table_name: table_info.table_name, + engine: table_info.engine, + } + } +} + +/// Table engine +// TODO(yingwen): drop table support to release resource owned by the table +#[async_trait] +pub trait TableEngine { + /// Returns the name of engine. + fn engine_type(&self) -> &str; + + /// Close the engine gracefully. + async fn close(&self) -> Result<()>; + + /// Create table + async fn create_table(&self, request: CreateTableRequest) -> Result; + + /// Drop table + async fn drop_table(&self, request: DropTableRequest) -> Result; + + /// Open table, return None if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result>; +} + +/// A reference counted pointer to table engine +pub type TableEngineRef = Arc; + +#[derive(Clone, Debug)] +pub struct EngineRuntimes { + pub read_runtime: Arc, + pub write_runtime: Arc, + pub bg_runtime: Arc, +} diff --git a/table_engine/src/lib.rs b/table_engine/src/lib.rs new file mode 100644 index 0000000000..ac60c1e8dc --- /dev/null +++ b/table_engine/src/lib.rs @@ -0,0 +1,20 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table engine facade, provides read/write interfaces of table + +#[macro_use] +extern crate common_util; + +pub mod engine; +pub mod memory; +pub mod partition; +pub mod predicate; +pub mod provider; +pub mod stream; +pub mod table; + +/// Enable ttl key +pub const OPTION_KEY_ENABLE_TTL: &str = "enable_ttl"; + +pub const MEMORY_ENGINE_TYPE: &str = "Memory"; +pub const ANALYTIC_ENGINE_TYPE: &str = "Analytic"; diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs new file mode 100644 index 0000000000..d26448fddf --- /dev/null +++ b/table_engine/src/memory.rs @@ -0,0 +1,252 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
In-memory table implementations + +use std::{ + collections::HashMap, + fmt, + pin::Pin, + sync::{Arc, RwLock}, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + row::{Row, RowGroup}, + schema::{RecordSchema, Schema}, +}; +use futures::stream::Stream; +use snafu::{OptionExt, ResultExt}; + +use crate::{ + stream::{ + self, ErrNoSource, ErrWithSource, PartitionedStreams, RecordBatchStream, + SendableRecordBatchStream, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Result, Table, TableId, + TableStats, UnsupportedMethod, WriteRequest, + }, +}; + +type RowGroupVec = Vec; + +/// In-memory table +/// +/// Mainly for test, DO NOT use it in production. All data inserted are buffered +/// in memory, does not support schema change. +pub struct MemoryTable { + /// Table name + name: String, + /// Table id + id: TableId, + /// Table schema + schema: Schema, + /// Rows + row_groups: Arc>, + /// Engine type + engine_type: String, +} + +impl MemoryTable { + pub fn new(name: String, id: TableId, schema: Schema, engine_type: String) -> Self { + Self { + name, + id, + schema, + row_groups: Arc::new(RwLock::new(Vec::new())), + engine_type, + } + } +} + +impl fmt::Debug for MemoryTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryTable") + .field("name", &self.name) + .field("id", &self.id) + .field("schema", &self.schema) + // row_groups is ignored + .finish() + } +} + +#[async_trait] +impl Table for MemoryTable { + fn name(&self) -> &str { + &self.name + } + + fn id(&self) -> TableId { + self.id + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, request: 
WriteRequest) -> Result { + // TODO(yingwen) Maybe check schema? + let mut row_groups = self.row_groups.write().unwrap(); + let n = request.row_group.num_rows(); + row_groups.push(request.row_group); + + Ok(n) + } + + // batch_size is ignored now + async fn read(&self, request: ReadRequest) -> Result { + let scan = MemoryScan { + schema: request.projected_schema.to_record_schema(), + row_groups: self.row_groups.clone(), + index: 0, + }; + + Ok(Box::pin(scan)) + } + + async fn get(&self, _request: GetRequest) -> Result> { + // Alter schema is not supported now. + UnsupportedMethod { + table: &self.name, + method: "get", + } + .fail() + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let stream = self.read(request).await?; + + Ok(PartitionedStreams::one_stream(stream)) + } + + // TODO: Alter schema is not supported now + async fn alter_schema(&self, _request: AlterSchemaRequest) -> Result { + Ok(1) + } + + // TODO: Alter modify setting is not supported now + async fn alter_options(&self, _options: HashMap) -> Result { + Ok(1) + } + + async fn flush(&self, _request: FlushRequest) -> Result<()> { + // Flush is not supported now. + UnsupportedMethod { + table: self.name(), + method: "flush", + } + .fail() + } + + async fn compact(&self) -> Result<()> { + // Compact is not supported now. 
+ UnsupportedMethod { + table: self.name(), + method: "compact", + } + .fail() + } +} + +#[derive(Debug)] +struct MemoryScan { + // The schema of projected column indexed by ReadRequest::projection + schema: RecordSchema, + row_groups: Arc>, + index: usize, +} + +impl Stream for MemoryScan { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _ctx: &mut Context<'_>) -> Poll> { + // TODO(yingwen): Batch row groups + let record_batch = { + let row_groups = self.row_groups.read().unwrap(); + if self.index >= row_groups.len() { + return Poll::Ready(None); + } + + let rows = &row_groups[self.index]; + // Because the row group inserted may have different column order, so we cannot + // reuse the projection index, and must find projection index for each row + // group, which is inefficient + row_group_to_record_batch(rows, &self.schema) + }; + + self.index += 1; + Poll::Ready(Some(record_batch)) + } +} + +impl RecordBatchStream for MemoryScan { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} + +// REQUIRE: The schema is the projected schema +fn row_group_to_record_batch( + rows: &RowGroup, + record_schema: &RecordSchema, +) -> stream::Result { + if rows.is_empty() { + return Ok(RecordBatch::new_empty(record_schema.clone())); + } + + let num_cols = record_schema.num_columns(); + let mut column_blocks = Vec::with_capacity(num_cols); + // For each column, create an array for that column + for column in record_schema.columns().iter() { + let rows_schema = rows.schema(); + let col_index = rows_schema + .index_of(&column.name) + .with_context(|| ErrNoSource { + msg: format!( + "Failed to convert RowGroup to RecordBatch, column not found, column:{}", + &column.name + ), + })?; + let cols = rows.iter_column(col_index); + let column_block = build_column_block(&column.data_type, cols)?; + column_blocks.push(column_block); + } + + RecordBatch::new(record_schema.clone(), column_blocks) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: 
"Failed to create RecordBatch", + }) +} + +fn build_column_block<'a, I: Iterator>( + data_type: &DatumKind, + iter: I, +) -> stream::Result { + let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0); + for datum in iter { + builder + .append(datum.clone()) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Append datum", + })?; + } + Ok(builder.build()) +} diff --git a/table_engine/src/partition/expression.rs b/table_engine/src/partition/expression.rs new file mode 100644 index 0000000000..ae89d3a099 --- /dev/null +++ b/table_engine/src/partition/expression.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Partition expression + +use std::ops::Deref; + +use common_types::datum::Datum; +use common_util::define_result; +use snafu::{Backtrace, OptionExt, Snafu}; + +use crate::partition::PartitionInfo; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No datums for eval.\nBacktrace:\n{}", backtrace))] + EmptyDatums { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Partition expression +#[derive(Debug)] +pub enum Expression { + ColumnExpr(ColumnExpr), +} + +impl Expression { + pub fn new(partition_info: &PartitionInfo) -> Self { + Self::parse_expr(partition_info.expr.to_string()) + } + + /// Extract column name in expression + pub fn extract_column_name(&self) -> impl Iterator { + match self { + Expression::ColumnExpr(col_expr) => col_expr.extract_column_name(), + } + } + + fn parse_expr(expr_str: String) -> Expression { + Expression::ColumnExpr(ColumnExpr::new(expr_str)) + } + + pub fn eval_uint>(&self, datums: &[T]) -> Result { + match self { + Expression::ColumnExpr(column_expr) => { + column_expr.eval_uint(datums.get(0).context(EmptyDatums)?) 
+ } + } + } +} + +/// Column +#[derive(Debug)] +pub struct ColumnExpr { + column_name: String, +} + +impl ColumnExpr { + fn new(column_name: String) -> Self { + Self { column_name } + } + + fn extract_column_name(&self) -> impl Iterator { + std::iter::once(self.column_name.as_str()) + } + + // TODO: handle error + fn eval_uint(&self, datum: &Datum) -> Result { + Ok(datum.convert_to_uint64()) + } +} diff --git a/table_engine/src/partition/mod.rs b/table_engine/src/partition/mod.rs new file mode 100644 index 0000000000..e419b3ef72 --- /dev/null +++ b/table_engine/src/partition/mod.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Partitioned table supports + +mod expression; +pub mod rule; + +/// Partition type of table +#[derive(Clone, Debug, PartialEq)] +pub enum PartitionType { + None = 0, + Hash = 1, +} + +/// Size type of partition num +pub type PartitionNum = u16; + +/// Info for how to partition table +#[derive(Debug, Clone)] +pub struct PartitionInfo { + /// Partition type + pub partition_type: PartitionType, + /// Partition expression + pub expr: String, + /// Partition num + pub partition_num: PartitionNum, +} diff --git a/table_engine/src/partition/rule.rs b/table_engine/src/partition/rule.rs new file mode 100644 index 0000000000..28b31401c5 --- /dev/null +++ b/table_engine/src/partition/rule.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Partition rules + +use common_types::{datum::Datum, row::Row, schema::Schema}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::partition::{expression::Expression, PartitionInfo, PartitionType}; + +const HASH_COLUMN_NUM: usize = 1; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No column for hash partitioning.\nBacktrace:\n{}", backtrace))] + NoColumnForHash { backtrace: Backtrace }, + + #[snafu(display("Only support one hash column.\nBacktrace:\n{}", backtrace))] + TooMuchHashColumn { backtrace: Backtrace }, + + #[snafu(display("Failed to eval partition expr, err:{}", source))] + EvalExpr { + source: crate::partition::expression::Error, + }, +} + +define_result!(Error); + +/// Partition rule locate partition by input records +// TODO(yingwen): Recreate partition rule once the schema of the table is changed +#[derive(Debug)] +pub enum PartitionRule { + None, + Hash(HashPartitionRule), +} + +impl PartitionRule { + pub fn new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + match partition_info.partition_type { + PartitionType::None => Ok(PartitionRule::None), + PartitionType::Hash => { + let rule = HashPartitionRule::new(partition_info, schema)?; + Ok(PartitionRule::Hash(rule)) + } + } + } + + /// Return the index of partition + pub fn locate_partition(&self, row: &Row) -> Result { + match self { + // Always return the first partition + PartitionRule::None => Ok(0), + PartitionRule::Hash(rule) => rule.eval_partition_index(row), + } + } +} + +/// Partition rule based on hash +#[derive(Debug)] +pub struct HashPartitionRule { + /// Total number of partitions + partition_num: u16, + /// Expression to evaluate a hash value + expression: Expression, + /// Offsets of columns for evaluate + // TODO(yingwen): The column index may be invalid after schema change (add/del column) + column_index: SmallVec<[usize; HASH_COLUMN_NUM]>, +} + +impl HashPartitionRule { + pub fn 
new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + let expr = Expression::new(partition_info); + + let col_name_list = expr.extract_column_name(); + let mut column_index = SmallVec::with_capacity(col_name_list.size_hint().0); + for col_name in col_name_list { + for (i, v) in schema.columns().iter().enumerate() { + if col_name == v.name { + column_index.push(i); + break; + } + } + } + + ensure!(!column_index.is_empty(), NoColumnForHash); + ensure!(column_index.len() == 1, TooMuchHashColumn); + + Ok(Self { + partition_num: partition_info.partition_num, + expression: expr, + column_index, + }) + } + + // TODO(yingwen): Also pass schema? + pub fn eval_partition_index(&self, row: &Row) -> Result { + let mut col_vals: SmallVec<[&Datum; HASH_COLUMN_NUM]> = + SmallVec::with_capacity(self.column_index.len()); + for i in &self.column_index { + // TODO(yingwen): Check index? + col_vals.push(&row[*i]); + } + let eval_uint = self.expression.eval_uint(&col_vals).context(EvalExpr)?; + + Ok((eval_uint % self.partition_num as u64) as usize) + } +} diff --git a/table_engine/src/predicate/filter_record_batch.rs b/table_engine/src/predicate/filter_record_batch.rs new file mode 100644 index 0000000000..cafbd960da --- /dev/null +++ b/table_engine/src/predicate/filter_record_batch.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use arrow_deps::datafusion::{ + logical_plan::{Expr, Operator}, + scalar::ScalarValue, +}; +use common_types::{datum::DatumView, record_batch::RecordBatchWithKey}; + +#[derive(Debug)] +struct ColumnFilter { + name: String, + op: Operator, + literal: ScalarValue, +} + +fn evaluate_by_operator(lhs: &T, rhs: &T, op: &Operator) -> Option { + let cmp_res = lhs.partial_cmp(rhs)?; + let v = match op { + Operator::Lt => cmp_res.is_lt(), + Operator::LtEq => cmp_res.is_le(), + Operator::Gt => cmp_res.is_gt(), + Operator::GtEq => cmp_res.is_ge(), + Operator::NotEq => cmp_res.is_ne(), + Operator::Eq => cmp_res.is_eq(), + _ => return None, + }; + Some(v) +} + +fn evaluate_datums_by_operator<'a>( + lhs: &DatumView<'a>, + rhs: &DatumView<'a>, + op: &Operator, +) -> Option { + macro_rules! impl_evaluate { + ($($Kind: ident), *) => { + match (lhs, rhs){ + (DatumView::Null, DatumView::Null) => Some(true), + $((DatumView::$Kind(v1), DatumView::$Kind(v2)) => evaluate_by_operator(v1, v2, op),)* + _ => None, + } + }; + } + + impl_evaluate!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean + ) +} + +impl ColumnFilter { + fn filter(&self, record_batch: &RecordBatchWithKey, selected_buf: &mut [bool]) -> Option<()> { + let filter_datum_view = DatumView::from_scalar_value(&self.literal)?; + + let column_idx = record_batch.schema_with_key().index_of(&self.name)?; + let column_data = record_batch.column(column_idx); + + assert!(selected_buf.len() >= column_data.num_rows()); + for (i, selected) in selected_buf + .iter_mut() + .enumerate() + .take(column_data.num_rows()) + { + if *selected { + let datum_view = column_data.datum_view(i); + *selected = evaluate_datums_by_operator(&datum_view, &filter_datum_view, &self.op) + .unwrap_or(true); + } + } + + Some(()) + } +} + +/// Filter record batch by applying the `column_filters`. 
+pub struct RecordBatchFilter { + column_filters: Vec, +} + +impl RecordBatchFilter { + /// Create filter according to the `exprs` whose logical relationship is + /// `AND` between each other. Note that the created filter is not + /// equivalent to the original `exprs` and actually only a subset of the + /// exprs is chosen to create the [`RecordBatchFilter`]. + pub fn new(exprs: &[Expr]) -> Self { + let mut filters = Vec::with_capacity(exprs.len()); + for expr in exprs { + if let Expr::BinaryExpr { left, op, right } = expr { + let (column_name, literal) = match (left.as_ref(), right.as_ref()) { + (Expr::Column(col), Expr::Literal(v)) + | (Expr::Literal(v), Expr::Column(col)) => (col.name.to_string(), v.clone()), + _ => continue, + }; + + if matches!( + op, + Operator::NotEq + | Operator::Eq + | Operator::Gt + | Operator::GtEq + | Operator::Lt + | Operator::LtEq + ) { + filters.push(ColumnFilter { + name: column_name, + op: *op, + literal, + }) + } + } + } + + RecordBatchFilter { + column_filters: filters, + } + } + + /// Filter `record_batch` and save the filtering results into the + /// `selected_rows_buf`. + /// + /// Requires: `selected_rows_buf.len() == record_batch.num_rows()`. 
+ pub fn filter( + &self, + record_batch: &RecordBatchWithKey, + selected_rows_buf: &mut [bool], + ) -> usize { + assert_eq!(record_batch.num_rows(), selected_rows_buf.len()); + + for selected in &mut *selected_rows_buf { + *selected = true; + } + + for column_filter in &self.column_filters { + column_filter.filter(record_batch, selected_rows_buf.as_mut()); + } + + selected_rows_buf + .iter() + .map(|selected| if *selected { 1 } else { 0 }) + .sum() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.column_filters.is_empty() + } +} + +impl From<&[Expr]> for RecordBatchFilter { + fn from(exprs: &[Expr]) -> Self { + Self::new(exprs) + } +} + +#[cfg(test)] +mod test { + use arrow_deps::datafusion::prelude::Column; + use common_types::{ + row::Row, + tests::{build_record_batch_with_key_by_rows, build_row}, + }; + + use super::*; + + fn build_record_batch(rows: Vec) -> RecordBatchWithKey { + build_record_batch_with_key_by_rows(rows) + } + + fn build_filter_expr(column_name: &str, literal: ScalarValue, op: Operator) -> Expr { + Expr::BinaryExpr { + left: Box::new(Expr::Column(Column::from_name(column_name.to_string()))), + op, + right: Box::new(Expr::Literal(literal)), + } + } + + #[test] + fn test_empty_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + ]; + let batch = build_record_batch(rows); + + let filter = RecordBatchFilter::new(&[]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_all_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr = build_filter_expr("key2", 
ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_partial_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let expr2 = build_filter_expr( + "key1", + ScalarValue::Binary(Some(b"aabb".to_vec())), + Operator::GtEq, + ); + let filter = RecordBatchFilter::new(&[expr1, expr2]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + let expect_selected_rows = vec![false, false, false, true, true]; + + assert_eq!(selected_num, 2); + assert_eq!(selected_rows, expect_selected_rows); + } + + #[test] + fn test_filter_empty_batch() { + let batch = build_record_batch(vec![]); + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr1]); + let mut selected_rows = vec![false; batch.num_rows()]; + filter.filter(&batch, &mut selected_rows); + + assert!(selected_rows.is_empty()); + } +} diff --git a/table_engine/src/predicate/mod.rs b/table_engine/src/predicate/mod.rs new file mode 100644 index 0000000000..2758dac513 --- /dev/null +++ b/table_engine/src/predicate/mod.rs @@ -0,0 +1,540 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Predict for query table. +//! 
Reference to: https://github.com/influxdata/influxdb_iox/blob/29b10413051f8c4a2193e8633aa133e45b0e505a/query/src/predicate.rs + +use std::{collections::HashSet, convert::TryInto, sync::Arc}; + +use arrow_deps::{ + arrow::{ + array::ArrayRef, + datatypes::{Schema as ArrowSchema, SchemaRef}, + }, + datafusion::{ + logical_plan::{Column, Expr, Operator}, + optimizer::utils as datafusion_util, + parquet::file::metadata::RowGroupMetaData, + physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, + scalar::ScalarValue, + }, + parquet::file::statistics::Statistics as ParquetStatistics, +}; +use common_types::{ + schema::Schema, + time::{TimeRange, Timestamp}, +}; +use log::{debug, error}; +use snafu::{ResultExt, Snafu}; + +pub mod filter_record_batch; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Failed to do pruning, err:{}", source))] + Prune { + source: arrow_deps::datafusion::error::DataFusionError, + }, +} + +define_result!(Error); + +/// port from datafusion. +/// Extract the min/max statistics from a `ParquetStatistics` object +macro_rules!
get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))), + ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))), + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + Some(ScalarValue::Utf8(s)) + } + // type not supported yet + ParquetStatistics::FixedLenByteArray(_) => None, + } + }}; +} + +/// port from datafusion. +// Extract the min or max value calling `func` or `bytes_func` on the +// ParquetStatistics as appropriate +macro_rules! get_min_max_values { + ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ + let (column_index, field) = + if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) { + (v, f) + } else { + // Named column was not present + return None; + }; + + let data_type = field.data_type(); + let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() { + v + } else { + // DataFusion doesn't have support for ScalarValues of the column type + return None; + }; + + let scalar_values: Vec = $self + .row_group_metadata + .iter() + .flat_map(|meta| meta.column(column_index).statistics()) + .map(|stats| get_statistic!(stats, $func, $bytes_func)) + .map(|maybe_scalar| { + // column either did't have statistics at all or didn't have min/max values + maybe_scalar.unwrap_or_else(|| null_scalar.clone()) + }) + .collect(); + + // ignore errors converting to arrays (e.g. 
different types) + ScalarValue::iter_to_array(scalar_values).ok() + }}; +} + +/// Wraps parquet statistics in a way +/// that implements [`PruningStatistics`] +struct RowGroupPruningStatistics<'a> { + row_group_metadata: &'a [RowGroupMetaData], + parquet_schema: &'a ArrowSchema, +} + +impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { + fn min_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, min, min_bytes) + } + + fn max_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, max, max_bytes) + } + + fn num_containers(&self) -> usize { + self.row_group_metadata.len() + } +} + +fn build_row_group_predicate( + predicate_builder: &PruningPredicate, + row_group_metadata: &[RowGroupMetaData], +) -> Result> { + let parquet_schema = predicate_builder.schema().as_ref(); + + let pruning_stats = RowGroupPruningStatistics { + row_group_metadata, + parquet_schema, + }; + + predicate_builder + .prune(&pruning_stats) + .map_err(|e| { + error!("Error evaluating row group predicate values {}", e); + e + }) + .context(Prune) +} + +/// Predicate helps determine whether specific row group should be read. +#[derive(Debug, Clone)] +pub struct Predicate { + /// Predicates in the query for filter out the columns that meet all the + /// exprs. + pub exprs: Vec, + /// The time range involved by the query. + pub time_range: TimeRange, +} + +pub type PredicateRef = Arc; + +impl Predicate { + pub fn empty() -> Self { + Self::new(TimeRange::min_to_max()) + } + + pub fn new(time_range: TimeRange) -> Self { + Self { + exprs: Vec::new(), + time_range, + } + } + + /// Determine whether a row group should be read according to the meta data + /// in the `row_groups`. + /// + /// The boolean value in the returned vector denotes the corresponding row + /// group in the `row_groups` whether should be read. 
+ pub fn filter_row_groups(&self, schema: &Schema, row_groups: &[RowGroupMetaData]) -> Vec { + let mut results = vec![true; row_groups.len()]; + let arrow_schema: SchemaRef = schema.clone().into_arrow_schema_ref(); + for expr in &self.exprs { + match PruningPredicate::try_new(expr, arrow_schema.clone()) { + Ok(pruning_predicate) => { + debug!("pruning_predicate is:{:?}", pruning_predicate); + + if let Ok(values) = build_row_group_predicate(&pruning_predicate, row_groups) { + for (curr_val, result_val) in values.into_iter().zip(results.iter_mut()) { + *result_val = curr_val && *result_val + } + }; + // if fail to build, just ignore this filter so that all the + // row groups should be read for this + // filter. + } + Err(e) => { + // for any error just ignore it and that is to say, for this filter all the row + // groups should be read. + error!("fail to build pruning predicate, err:{}", e); + } + } + } + + results + } +} + +/// Builder for [Predicate] +#[derive(Debug, Clone, Default)] +#[must_use] +pub struct PredicateBuilder { + time_range: Option, + exprs: Vec, +} + +impl PredicateBuilder { + /// Adds the expressions from `filter_exprs` that can be pushed down to + /// query engine. + pub fn add_pushdown_exprs(mut self, filter_exprs: &[Expr]) -> Self { + // For each expression of the filter_exprs, recursively split it if it is an + // AND conjunction. For example, expression (x AND y) is split into [x, + // y]. + let mut split_exprs = vec![]; + for filter_expr in filter_exprs { + Self::split_and_expr(filter_expr, &mut split_exprs) + } + + // Only keep single_column and primitive binary expressions + let pushdown_exprs: Vec<_> = split_exprs + .into_iter() + .filter(Self::is_able_to_pushdown) + .collect(); + + self.exprs = pushdown_exprs; + + self + } + + /// Extract the time range from the `filter_exprs` and set it as + /// `TimeRange::min_to_max()` if no timestamp predicate is found.
+ pub fn set_time_range(mut self, schema: &Schema, filter_exprs: &[Expr]) -> Self { + let time_range_extractor = TimeRangeExtractor { + timestamp_column_name: schema.timestamp_name(), + filters: filter_exprs, + }; + + let time_range = time_range_extractor.extract(); + debug!( + "finish extract time range from the filters, time_range:{:?}, filters:{:?}", + time_range, filter_exprs + ); + + self.time_range = Some(time_range); + + self + } + + pub fn build(self) -> PredicateRef { + Arc::new(Predicate { + exprs: self.exprs, + time_range: self.time_range.unwrap_or_else(TimeRange::min_to_max), + }) + } + + /// Determine whether the `expr` can be pushed down. + /// Returns false if any error occurs. + fn is_able_to_pushdown(expr: &Expr) -> bool { + let mut columns = HashSet::new(); + if let Err(e) = datafusion_util::expr_to_columns(expr, &mut columns) { + error!( + "Failed to extract columns from the expr, ignore this expr:{:?}, err:{}", + expr, e + ); + return false; + } + + columns.len() == 1 && Self::is_primitive_binary_expr(expr) + } + + /// Recursively split all "AND" expressions into smaller one + /// Example: "A AND B AND C" => [A, B, C] + fn split_and_expr(expr: &Expr, predicates: &mut Vec) { + match expr { + Expr::BinaryExpr { + right, + op: Operator::And, + left, + } => { + Self::split_and_expr(left, predicates); + Self::split_and_expr(right, predicates); + } + other => predicates.push(other.clone()), + } + } + + /// Return true if the given expression is in a primitive binary in the + /// form: `column op constant` and op must be a comparison one. 
+ fn is_primitive_binary_expr(expr: &Expr) -> bool { + match expr { + Expr::BinaryExpr { left, op, right } => { + matches!( + (&**left, &**right), + (Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_)) + ) && matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) + } + _ => false, + } + } +} + +struct TimeRangeExtractor<'a> { + timestamp_column_name: &'a str, + filters: &'a [Expr], +} + +impl<'a> TimeRangeExtractor<'a> { + /// Do extraction from the `self.filters` for TimeRange. + /// + /// Returns `TimeRange::zero_to_max()` if no timestamp predicate is found. + fn extract(&self) -> TimeRange { + let mut time_range = TimeRange::min_to_max(); + for expr in self.filters { + let sub_time_range = self.extract_time_range_from_expr(expr); + let new_time_range = Self::and_time_ranges(&time_range, &sub_time_range); + + debug!( + "do and logic for time range, left:{:?}, right:{:?}, output:{:?}, expr:{:?}", + time_range, sub_time_range, new_time_range, expr + ); + time_range = new_time_range + } + + time_range + } + + /// Extract timestamp from the literal scalar expression. + fn timestamp_from_scalar_expr(expr: &Expr) -> Option { + if let Expr::Literal(ScalarValue::TimestampMillisecond(v, _)) = expr { + return v.map(Timestamp::new); + } + + None + } + + /// Compute the intersection of the two time ranges. + fn and_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().max(right.inclusive_start()); + let end = left.exclusive_end().min(right.exclusive_end()); + TimeRange::new(start, end).unwrap_or_else(TimeRange::empty) + } + + /// Compute the union of the two time ranges and the union is defined as the + /// [min(left.start(), right.start()), max(left.end(), right.end())). 
+ fn or_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().min(right.inclusive_start()); + let end = left.exclusive_end().max(right.exclusive_end()); + TimeRange::new_unchecked(start, end) + } + + /// Extract the timestamp from the column expression and its corresponding + /// literal expression. Returns `None` if the expression pair is not + /// involved with timestamp column. No assumption on the order of the + /// `left` and `right`. + fn timestamp_from_column_and_value_expr(&self, left: &Expr, right: &Expr) -> Option { + let (column, val) = match (left, right) { + (Expr::Column(column), Expr::Literal(_)) => (column, right), + (Expr::Literal(_), Expr::Column(column)) => (column, left), + _ => return None, + }; + + if column.name == self.timestamp_column_name { + Self::timestamp_from_scalar_expr(val) + } else { + None + } + } + + /// Extract time range from the binary expression. + fn extract_time_range_from_binary_expr( + &self, + left: &Expr, + right: &Expr, + op: &Operator, + ) -> TimeRange { + match op { + Operator::And => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::and_time_ranges(&time_range_left, &time_range_right) + } + Operator::Or => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::or_time_ranges(&time_range_left, &time_range_right) + } + Operator::Eq => self + .timestamp_from_column_and_value_expr(left, right) + .map(TimeRange::from_timestamp) + .unwrap_or_else(TimeRange::min_to_max), + Operator::NotEq => TimeRange::min_to_max(), + Operator::Lt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| TimeRange::new_unchecked(Timestamp::MIN, right_t)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::LtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| { + let 
right_t = right_t.checked_add_i64(1).unwrap_or(right_t); + TimeRange::new_unchecked(Timestamp::MIN, right_t) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Gt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| { + let left_t = left_t.checked_add_i64(1).unwrap_or(left_t); + TimeRange::new_unchecked(left_t, Timestamp::MAX) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::GtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| TimeRange::new_unchecked(left_t, Timestamp::MAX)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Plus + | Operator::Minus + | Operator::Multiply + | Operator::Divide + | Operator::Modulo + | Operator::Like + | Operator::NotLike + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + | Operator::RegexMatch + | Operator::RegexNotMatch + | Operator::RegexIMatch + | Operator::RegexNotIMatch => TimeRange::min_to_max(), + } + } + + /// Extract time range from the between expression. + fn time_range_from_between_expr(low: &Expr, high: &Expr, negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + let low_t = Self::timestamp_from_scalar_expr(low).unwrap_or(Timestamp::MIN); + // the two operands are inclusive in the `between` expression. + let high_t = { + let t = Self::timestamp_from_scalar_expr(high).unwrap_or(Timestamp::MAX); + t.checked_add_i64(1).unwrap_or(Timestamp::MAX) + }; + TimeRange::new(low_t, high_t).unwrap_or_else(TimeRange::empty) + } + + /// Extract time range from the list expressions. 
+ fn time_range_from_list_expr(list: &[Expr], negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + if list.is_empty() { + return TimeRange::empty(); + } + + let (mut inclusive_start, mut inclusive_end) = (Timestamp::MAX, Timestamp::MIN); + for expr in list { + match Self::timestamp_from_scalar_expr(expr) { + Some(t) => { + inclusive_start = inclusive_start.min(t); + inclusive_end = inclusive_end.max(t); + } + None => return TimeRange::min_to_max(), + } + } + + TimeRange::new(inclusive_start, inclusive_end).unwrap_or_else(TimeRange::empty) + } + + /// Extract the time range recursively from the `expr`. + /// + /// Now the strategy is conservative: for the sub-expr which we are not sure + /// how to handle it, returns `TimeRange::zero_to_max()`. + fn extract_time_range_from_expr(&self, expr: &Expr) -> TimeRange { + match expr { + Expr::BinaryExpr { left, op, right } => { + self.extract_time_range_from_binary_expr(left, right, op) + } + Expr::Between { + expr, + negated, + low, + high, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_between_expr(&*low, &*high, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::InList { + expr, + list, + negated, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_list_expr(list, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::Not(_) + | Expr::Alias(_, _) + | Expr::ScalarVariable(_) + | Expr::Column(_) + | Expr::Literal(_) + | Expr::IsNotNull(_) + | Expr::IsNull(_) + | Expr::Negative(_) + | Expr::Case { .. } + | Expr::Cast { .. } + | Expr::TryCast { .. } + | Expr::Sort { .. } + | Expr::ScalarFunction { .. } + | Expr::ScalarUDF { .. } + | Expr::AggregateFunction { .. } + | Expr::WindowFunction { .. } + | Expr::AggregateUDF { .. } + | Expr::Wildcard { .. } + | Expr::GetIndexedField { .. 
} => TimeRange::min_to_max(), + } + } +} diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs new file mode 100644 index 0000000000..92e2ed57e0 --- /dev/null +++ b/table_engine/src/provider.rs @@ -0,0 +1,275 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datafusion `TableProvider` adapter + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::datasource::{TableProvider, TableProviderFilterPushDown}, + error::{DataFusionError, Result}, + execution::runtime_env::RuntimeEnv, + logical_plan::Expr, + physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use log::debug; +use tokio::sync::Mutex; + +use crate::{ + predicate::{PredicateBuilder, PredicateRef}, + stream::{SendableRecordBatchStream, ToDfStream}, + table::{self, ReadOptions, ReadOrder, ReadRequest, TableRef}, +}; + +/// An adapter to [TableProvider] with schema snapshot. +/// +/// This adapter holds a schema snapshot of the table and always returns that +/// schema to caller. 
+#[derive(Debug)] +pub struct TableProviderAdapter { + table: TableRef, + /// The schema of the table when this adapter is created, used as schema + /// snapshot for read to avoid the reader sees different schema during + /// query + read_schema: Schema, + request_id: RequestId, + read_parallelism: usize, +} + +impl TableProviderAdapter { + pub fn new(table: TableRef, request_id: RequestId, read_parallelism: usize) -> Self { + // Take a snapshot of the schema + let read_schema = table.schema(); + + Self { + table, + read_schema, + request_id, + read_parallelism, + } + } + + pub fn as_table_ref(&self) -> &TableRef { + &self.table + } + + pub fn scan_table( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + read_order: ReadOrder, + ) -> Result> { + debug!( + "scan table, table:{}, request_id:{}, projection:{:?}, filters:{:?}, limit:{:?}, read_order:{:?}", + self.table.name(), + self.request_id, + projection, + filters, + limit, + read_order, + ); + + // Forbid the parallel reading if the data order is required. 
+ let read_parallelism = if read_order.is_in_order() { + 1 + } else { + self.read_parallelism + }; + + let predicate = self.predicate_from_filters(filters); + Ok(Arc::new(ScanTable { + projected_schema: ProjectedSchema::new(self.read_schema.clone(), projection.clone()) + .map_err(|e| { + DataFusionError::Internal(format!( + "Invalid projection, plan:{:?}, projection:{:?}, err:{:?}", + self, projection, e + )) + })?, + table: self.table.clone(), + request_id: self.request_id, + read_order, + read_parallelism, + predicate, + stream_state: Mutex::new(ScanStreamState::default()), + })) + } + + fn predicate_from_filters(&self, filters: &[Expr]) -> PredicateRef { + PredicateBuilder::default() + .add_pushdown_exprs(filters) + .set_time_range(&self.read_schema, filters) + .build() + } +} + +#[async_trait] +impl TableProvider for TableProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + // We use the `read_schema` as the schema of this `TableProvider` + self.read_schema.clone().into_arrow_schema_ref() + } + + async fn scan( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + ) -> Result> { + self.scan_table(projection, filters, limit, ReadOrder::None) + } + + fn supports_filter_pushdown(&self, _filter: &Expr) -> Result { + Ok(TableProviderFilterPushDown::Inexact) + } +} + +#[derive(Default)] +struct ScanStreamState { + inited: bool, + err: Option, + streams: Vec>, +} + +impl ScanStreamState { + fn take_stream(&mut self, index: usize) -> Result { + if let Some(e) = &self.err { + return Err(DataFusionError::Execution(format!( + "Failed to read table, partition:{}, err:{}", + index, e + ))); + } + + // TODO(yingwen): Return an empty stream if index is out of bound. + self.streams[index].take().ok_or_else(|| { + DataFusionError::Execution(format!( + "Read partition multiple times is not supported, partition:{}", + index + )) + }) + } +} + +/// Physical plan of scanning table. 
+struct ScanTable { + projected_schema: ProjectedSchema, + table: TableRef, + request_id: RequestId, + read_order: ReadOrder, + read_parallelism: usize, + predicate: PredicateRef, + + stream_state: Mutex, +} + +impl ScanTable { + async fn maybe_init_stream(&self, runtime: Arc) -> Result<()> { + let mut stream_state = self.stream_state.lock().await; + if stream_state.inited { + return Ok(()); + } + + let req = ReadRequest { + request_id: self.request_id, + opts: ReadOptions { + batch_size: runtime.batch_size(), + read_parallelism: self.read_parallelism, + }, + projected_schema: self.projected_schema.clone(), + predicate: self.predicate.clone(), + order: self.read_order, + }; + + let read_res = self.table.partitioned_read(req).await; + match read_res { + Ok(partitioned_streams) => { + assert_eq!(self.read_parallelism, partitioned_streams.streams.len()); + stream_state.streams = partitioned_streams.streams.into_iter().map(Some).collect(); + } + Err(e) => { + stream_state.err = Some(e); + } + } + stream_state.inited = true; + + Ok(()) + } +} + +#[async_trait] +impl ExecutionPlan for ScanTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.projected_schema.to_projected_arrow_schema() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::RoundRobinBatch(self.read_parallelism) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn with_new_children(&self, _: Vec>) -> Result> { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> Result { + self.maybe_init_stream(runtime).await?; + + let mut stream_state = self.stream_state.lock().await; + let stream = stream_state.take_stream(partition)?; + + Ok(Box::pin(ToDfStream(stream))) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "ScanTable: 
table={}, parallelism={}, order={:?}, ", + self.table.name(), + self.read_parallelism, + self.read_order, + ) + } + + fn statistics(&self) -> Statistics { + // TODO(yingwen): Implement this + Statistics::default() + } +} + +impl fmt::Debug for ScanTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ScanTable") + .field("projected_schema", &self.projected_schema) + .field("table", &self.table.name()) + .field("read_order", &self.read_order) + .field("read_parallelism", &self.read_parallelism) + .field("predicate", &self.predicate) + .finish() + } +} diff --git a/table_engine/src/stream.rs b/table_engine/src/stream.rs new file mode 100644 index 0000000000..fc8245d07c --- /dev/null +++ b/table_engine/src/stream.rs @@ -0,0 +1,128 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table record stream + +use std::{ + convert::TryFrom, + pin::Pin, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch as ArrowRecordBatch, + }, + datafusion::physical_plan::{ + RecordBatchStream as DfRecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, + }, +}; +use common_types::{record_batch::RecordBatch, schema::RecordSchema}; +use common_util::define_result; +use futures::stream::Stream; +use snafu::{Backtrace, ResultExt, Snafu}; + +// TODO(yingwen): Classify the error. +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Stream error, msg:{}, err:{}", msg, source))] + ErrWithSource { + msg: String, + source: Box, + }, + + #[snafu(display("Stream error, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + ErrNoSource { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait RecordBatchStream: Stream> { + fn schema(&self) -> &RecordSchema; +} + +pub type SendableRecordBatchStream = Pin>; + +/// Record batch streams divided by time range. 
+pub struct PartitionedStreams { + pub streams: Vec, +} + +impl PartitionedStreams { + pub fn one_stream(stream: SendableRecordBatchStream) -> Self { + Self { + streams: vec![stream], + } + } +} + +pub struct ToDfStream(pub SendableRecordBatchStream); + +impl Stream for ToDfStream { + type Item = ArrowResult; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.0.as_mut().poll_next(ctx) { + Poll::Ready(Some(Ok(record_batch))) => { + Poll::Ready(Some(Ok(record_batch.into_arrow_record_batch()))) + } + Poll::Ready(Some(Err(e))) => { + Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new(e))))) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl DfRecordBatchStream for ToDfStream { + fn schema(&self) -> SchemaRef { + self.0.schema().to_arrow_schema_ref() + } +} + +pub struct FromDfStream { + schema: RecordSchema, + df_stream: DfSendableRecordBatchStream, +} + +impl FromDfStream { + pub fn new(df_stream: DfSendableRecordBatchStream) -> Result { + let df_schema = df_stream.schema(); + let schema = RecordSchema::try_from(df_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Convert record schema", + })?; + + Ok(Self { schema, df_stream }) + } +} + +impl Stream for FromDfStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.df_stream.as_mut().poll_next(ctx) { + Poll::Ready(Some(record_batch_res)) => Poll::Ready(Some( + record_batch_res + .map_err(|e| Box::new(e) as _) + .and_then(|batch| RecordBatch::try_from(batch).map_err(|e| Box::new(e) as _)) + .context(ErrWithSource { + msg: "Convert from arrow record batch", + }), + )), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl RecordBatchStream for FromDfStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs new file mode 
100644 index 0000000000..b361756e8d --- /dev/null +++ b/table_engine/src/table.rs @@ -0,0 +1,608 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table abstraction + +use std::{ + collections::HashMap, + convert::TryFrom, + fmt, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, +}; + +use async_trait::async_trait; +use common_types::{ + column_schema::ColumnSchema, + datum::Datum, + projected_schema::ProjectedSchema, + request_id::RequestId, + row::{Row, RowGroup}, + schema::{RecordSchemaWithKey, Schema, Version}, + time::Timestamp, +}; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use serde_derive::Deserialize; +use snafu::{Backtrace, Snafu}; + +use crate::{ + engine::{TableRequestType, TableState}, + predicate::PredicateRef, + stream::{PartitionedStreams, SendableRecordBatchStream}, +}; + +/// Contains common error variant, implementation specific error should +/// be cast into Box +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Unsupported table method, table:{}, method:{}.\nBacktrace:\n{}", + table, + method, + backtrace + ))] + UnsupportedMethod { + table: String, + method: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get Invalid primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetInvalidPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get null primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetNullPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: 
Box, + }, + + #[snafu(display("Failed to write table, table:{}, err:{}", table, source))] + Write { + table: String, + source: Box, + }, + + #[snafu(display("Failed to scan table, table:{}, err:{}", table, source))] + Scan { + table: String, + source: Box, + }, + + #[snafu(display("Failed to get table, table:{}, err:{}", table, source))] + Get { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter schema, table:{}, err:{}", table, source))] + AlterSchema { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter options, table:{}, err:{}", table, source))] + AlterOptions { + table: String, + source: Box, + }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + Flush { + table: String, + source: Box, + }, + + #[snafu(display("Failed to compact table, table:{}, err:{}", table, source))] + Compact { + table: String, + source: Box, + }, +} + +define_result!(Error); + +/// Default partition num to scan in parallelism. +pub const DEFAULT_READ_PARALLELISM: usize = 8; + +/// Schema id (24 bits) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SchemaId(u32); + +impl SchemaId { + /// Bits of schema id. + const BITS: u32 = 24; + /// 24 bits mask (0xffffff) + const MASK: u32 = (1 << Self::BITS) - 1; + /// Max schema id. + pub const MAX: SchemaId = SchemaId(Self::MASK); + /// Min schema id. + pub const MIN: SchemaId = SchemaId(0); + + /// Create a new schema id from u32, return None if `id` is invalid. + pub fn new(id: u32) -> Option { + // Only need to check max as min is 0. + if id <= SchemaId::MAX.0 { + Some(Self(id)) + } else { + None + } + } + + // It is safe to convert u16 into schema id. + pub const fn from_u16(id: u16) -> Self { + Self(id as u32) + } + + /// Convert the schema id into u32. 
+ #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl PartialEq for SchemaId { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl From for SchemaId { + fn from(id: u16) -> SchemaId { + SchemaId::from_u16(id) + } +} + +/// Sequence of a table under a schema (40 bits). +#[derive(Debug, Clone, Copy)] +pub struct TableSeq(u64); + +impl TableSeq { + /// Bits of schema id. + const BITS: u64 = 40; + /// 40 bits mask (0xffffffffff). + const MASK: u64 = (1 << Self::BITS) - 1; + /// Max sequence of table in a schema. + pub const MAX: TableSeq = TableSeq(Self::MASK); + /// Min sequence of table in a schema. + pub const MIN: TableSeq = TableSeq(0); + + /// Create a new table sequence from u64, return None if `seq` is invalid. + pub const fn new(seq: u64) -> Option { + // Only need to check max as min is 0. + if seq <= TableSeq::MAX.0 { + Some(Self(seq)) + } else { + None + } + } + + // It is safe to convert u32 into table seq. + pub const fn from_u32(id: u32) -> Self { + Self(id as u64) + } + + /// Convert the table sequence into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableSeq { + fn from(id: u32) -> TableSeq { + TableSeq::from_u32(id) + } +} + +/// Table Id (64 bits) +/// +/// Table id is constructed via schema id (24 bits) and a table sequence (40 +/// bits). +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Deserialize)] +pub struct TableId(u64); + +impl TableId { + /// Min table id. + pub const MIN: TableId = TableId(0); + + /// Create a new table id from `schema_id` and `table_seq`. + pub const fn new(schema_id: SchemaId, table_seq: TableSeq) -> Self { + let schema_id_data = schema_id.0 as u64; + let schema_id_part = schema_id_data << TableSeq::BITS; + let table_id_data = schema_id_part | table_seq.0; + + Self(table_id_data) + } + + /// Get the schema id part of the table id. 
+ #[inline] + pub fn schema_id(&self) -> SchemaId { + let schema_id_part = self.0 >> TableSeq::BITS; + + SchemaId(schema_id_part as u32) + } + + /// Get the sequence part of the table id. + #[inline] + pub fn table_seq(&self) -> TableSeq { + let seq_part = self.0 & TableSeq::MASK; + + TableSeq(seq_part) + } + + /// Convert table id into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableId { + fn from(id: u64) -> TableId { + TableId(id) + } +} + +impl fmt::Debug for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "TableId({}, {}, {})", + self.0, + self.schema_id().as_u32(), + self.table_seq().as_u64() + ) + } +} + +impl fmt::Display for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +// TODO(yingwen): Support DELETE/UPDATE... , a mutation type is needed. +#[derive(Debug)] +pub struct WriteRequest { + /// rows to write + pub row_group: RowGroup, +} + +#[derive(Debug)] +pub struct ReadOptions { + pub batch_size: usize, + /// Suggested read parallelism, the actual returned stream should equal to + /// `read_parallelism`. + pub read_parallelism: usize, +} + +impl Default for ReadOptions { + fn default() -> Self { + Self { + batch_size: 10000, + read_parallelism: DEFAULT_READ_PARALLELISM, + } + } +} + +#[derive(Debug)] +pub struct GetRequest { + /// Query request id. + pub request_id: RequestId, + /// The schema and projection for get, the output data should match this + /// schema. + pub projected_schema: ProjectedSchema, + /// The primary key of the row to get. + pub primary_key: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub enum ReadOrder { + /// No order requirements from the read request. 
+ None, + Asc, + Desc, +} + +impl ReadOrder { + pub fn from_is_asc(is_asc: Option) -> Self { + match is_asc { + Some(true) => ReadOrder::Asc, + Some(false) => ReadOrder::Desc, + None => ReadOrder::None, + } + } + + #[inline] + pub fn is_out_of_order(&self) -> bool { + matches!(self, ReadOrder::None) + } + + #[inline] + pub fn is_in_order(&self) -> bool { + !self.is_out_of_order() + } + + #[inline] + pub fn is_in_desc_order(&self) -> bool { + matches!(self, ReadOrder::Desc) + } +} + +#[derive(Debug)] +pub struct ReadRequest { + /// Read request id. + pub request_id: RequestId, + /// Read options. + pub opts: ReadOptions, + /// The schema and projection for read, the output data should match this + /// schema. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + /// Read the rows in reverse order. + pub order: ReadOrder, +} + +#[derive(Debug)] +pub struct AlterSchemaRequest { + /// The new schema. + pub schema: Schema, + /// Previous schema version before alteration. + pub pre_schema_version: Version, +} + +#[derive(Debug)] +pub struct FlushRequest { + /// Trigger a compaction after flush, default is true. + pub compact_after_flush: bool, + /// Whether to wait flush task finishes, default is true. + pub sync: bool, +} + +impl Default for FlushRequest { + fn default() -> Self { + Self { + compact_after_flush: true, + sync: true, + } + } +} + +/// Table abstraction +/// +/// We do not let Table trait extends datafusion's TableProvider, since +/// that will tie out abstraction with datafusion. However, we still use +/// datafusion's RecordBatchStream trait. +#[async_trait] +pub trait Table: std::fmt::Debug { + /// Returns table name. + fn name(&self) -> &str; + + /// Returns the id of this table. + fn id(&self) -> TableId; + + /// Schema of this table. + fn schema(&self) -> Schema; + + /// Options of this table. + fn options(&self) -> HashMap; + + /// Engine type of this table. 
+ fn engine_type(&self) -> &str; + + /// Get table's statistics. + fn stats(&self) -> TableStats; + + /// Write to table. + async fn write(&self, request: WriteRequest) -> Result; + + /// Read from table. + async fn read(&self, request: ReadRequest) -> Result; + + /// Get the specific row according to the primary key. + /// TODO(xikai): object-safety is not ensured by now if the default + /// implementation is provided. Actually it is better to use the read + /// method to implement the get method. + async fn get(&self, request: GetRequest) -> Result>; + + /// Read multiple partition of the table in parallel. + async fn partitioned_read(&self, request: ReadRequest) -> Result; + + /// Alter table schema to the schema specific in [AlterSchemaRequest] if + /// the `pre_schema_version` is equal to current schema version. + /// + /// Returns the affected rows (always 1). + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result; + + /// Alter table options. + /// + /// Returns the affected rows (always 1). + async fn alter_options(&self, options: HashMap) -> Result; + + /// Flush this table. + async fn flush(&self, request: FlushRequest) -> Result<()>; + + /// Compact this table and wait until compaction completes. + async fn compact(&self) -> Result<()>; +} + +/// Basic statistics of table. +#[derive(Debug, Clone, Copy, Default)] +pub struct TableStats { + /// Total write request + pub num_write: u64, + /// Total read request + pub num_read: u64, + /// Total flush request + pub num_flush: u64, +} + +/// A reference-counted pointer to Table +pub type TableRef = Arc; + +/// Helper to generate a schema id. 
+pub struct SchemaIdGenerator { + last_schema_id: AtomicU32, +} + +impl SchemaIdGenerator { + pub fn last_schema_id_u32(&self) -> u32 { + self.last_schema_id.load(Ordering::Relaxed) + } + + pub fn set_last_schema_id(&self, last_schema_id: SchemaId) { + self.last_schema_id + .store(last_schema_id.as_u32(), Ordering::Relaxed); + } + + pub fn alloc_schema_id(&self) -> Option { + let last = self.last_schema_id.fetch_add(1, Ordering::Relaxed); + + SchemaId::new(last + 1) + } +} + +impl Default for SchemaIdGenerator { + fn default() -> Self { + Self { + last_schema_id: AtomicU32::new(SchemaId::MIN.as_u32()), + } + } +} + +/// Helper to generate a table sequence. +pub struct TableSeqGenerator { + last_table_seq: AtomicU64, +} + +impl TableSeqGenerator { + pub fn last_table_seq_u64(&self) -> u64 { + self.last_table_seq.load(Ordering::Relaxed) + } + + pub fn set_last_table_seq(&self, last_table_seq: TableSeq) { + self.last_table_seq + .store(last_table_seq.as_u64(), Ordering::Relaxed); + } + + pub fn alloc_table_seq(&self) -> Option { + let last = self.last_table_seq.fetch_add(1, Ordering::Relaxed); + + TableSeq::new(last + 1) + } +} + +impl Default for TableSeqGenerator { + fn default() -> Self { + Self { + last_table_seq: AtomicU64::new(TableSeq::MIN.as_u64()), + } + } +} + +/// Create table request in catalog +#[derive(Debug, Clone)] +pub struct TableInfo { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, + /// Tells state of the table + pub state: TableState, +} + +#[derive(Debug, Snafu)] +pub struct TryFromTableEntryError(common_types::schema::Error); + +impl TryFrom for TableInfo { + type Error = TryFromTableEntryError; + + fn try_from(entry: TableEntry) -> std::result::Result { + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + table_id: entry.table_id.into(), + 
table_name: entry.table_name, + engine: entry.engine, + state: TableState::from(entry.state), + }) + } +} + +impl From for TableEntry { + fn from(table_info: TableInfo) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(table_info.catalog_name); + entry.set_schema_name(table_info.schema_name); + entry.set_table_id(table_info.table_id.as_u64()); + entry.set_table_name(table_info.table_name); + entry.set_engine(table_info.engine); + entry.set_state(TableStatePb::from(table_info.state)); + + entry + } +} + +impl TableInfo { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_schema_id() { + assert_eq!(0, SchemaId::MIN.as_u32()); + assert_eq!(0xffffff, SchemaId::MAX.as_u32()); + } + + #[test] + fn test_table_seq() { + assert_eq!(0, TableSeq::MIN.as_u64()); + assert_eq!(0xffffffffff, TableSeq::MAX.as_u64()); + } +} diff --git a/udf/Cargo.toml b/udf/Cargo.toml new file mode 100644 index 0000000000..a4895e787d --- /dev/null +++ b/udf/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "udf" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../arrow_deps" } +base64 = "0.13" +chrono = "0.4" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +hyperloglog = { path = "../components/rust-hyperloglog" } +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/udf/src/aggregate.rs b/udf/src/aggregate.rs new file mode 100644 index 0000000000..45fa24b73b --- /dev/null +++ b/udf/src/aggregate.rs @@ -0,0 
+1,164 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Aggregate functions. + +use std::{fmt, ops::Deref}; + +use arrow_deps::{ + arrow::array::ArrayRef as DfArrayRef, + datafusion::{ + error::{DataFusionError, Result as DfResult}, + physical_plan::Accumulator as DfAccumulator, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::functions::{ScalarValue, ScalarValueRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to get state, err:{}", source))] + GetState { + source: Box, + }, + + #[snafu(display("Failed to merge state, err:{}", source))] + MergeState { + source: Box, + }, +} + +define_result!(Error); + +pub struct State(Vec); + +impl State { + fn into_df_scalar_values(self) -> Vec { + self.0 + } +} + +impl From for State { + fn from(value: ScalarValue) -> Self { + Self(vec![value.into_df_scalar_value()]) + } +} + +pub struct Input<'a>(&'a [DfScalarValue]); + +impl<'a> Input<'a> { + pub fn iter(&self) -> impl Iterator { + self.0.iter().map(ScalarValueRef::from) + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn value(&self, index: usize) -> ScalarValueRef { + ScalarValueRef::from(&self.0[index]) + } +} + +pub struct StateRef<'a>(Input<'a>); + +impl<'a> Deref for StateRef<'a> { + type Target = Input<'a>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// An accumulator represents a stateful object that lives throughout the +/// evaluation of multiple rows and generically accumulates values. 
+/// +/// An accumulator knows how to: +/// * update its state from inputs via `update` +/// * convert its internal state to a vector of scalar values +/// * update its state from multiple accumulators' states via `merge` +/// * compute the final value from its internal state via `evaluate` +pub trait Accumulator: Send + Sync + fmt::Debug { + /// Returns the state of the accumulator at the end of the accumulation. + // in the case of an average on which we track `sum` and `n`, this function + // should return a vector of two values, sum and n. + fn state(&self) -> Result; + + /// updates the accumulator's state from a vector of scalars. + fn update(&mut self, values: Input) -> Result<()>; + + /// updates the accumulator's state from a vector of scalars. + fn merge(&mut self, states: StateRef) -> Result<()>; + + /// returns its value based on its current state. + fn evaluate(&self) -> Result; +} + +#[derive(Debug)] +pub struct ToDfAccumulator { + accumulator: T, +} + +impl ToDfAccumulator { + pub fn new(accumulator: T) -> Self { + Self { accumulator } + } +} + +impl DfAccumulator for ToDfAccumulator { + fn state(&self) -> DfResult> { + let state = self.accumulator.state().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to get state, err:{}", e)) + })?; + Ok(state.into_df_scalar_values()) + } + + fn update_batch(&mut self, values: &[DfArrayRef]) -> DfResult<()> { + if values.is_empty() { + return Ok(()); + }; + (0..values[0].len()).try_for_each(|index| { + let v = values + .iter() + .map(|array| DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let input = Input(&v); + + self.accumulator.update(input).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to update, err:{}", e)) + }) + }) + } + + fn merge_batch(&mut self, states: &[DfArrayRef]) -> DfResult<()> { + if states.is_empty() { + return Ok(()); + }; + (0..states[0].len()).try_for_each(|index| { + let v = states + .iter() + .map(|array| 
DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let state_ref = StateRef(Input(&v)); + + self.accumulator.merge(state_ref).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to merge, err:{}", e)) + }) + }) + } + + fn evaluate(&self) -> DfResult { + let value = self.accumulator.evaluate().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to evaluate, err:{}", e)) + })?; + + Ok(value.into_df_scalar_value()) + } +} diff --git a/udf/src/functions.rs b/udf/src/functions.rs new file mode 100644 index 0000000000..6fcd2df4be --- /dev/null +++ b/udf/src/functions.rs @@ -0,0 +1,326 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Functions. + +use std::{ + hash::{Hash, Hasher}, + sync::Arc, +}; + +use arrow_deps::{ + arrow::datatypes::DataType, + datafusion::{ + error::DataFusionError, + physical_plan::{ + aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}, + functions::{ + ReturnTypeFunction, ScalarFunctionImplementation, Signature as DfSignature, + TypeSignature as DfTypeSignature, Volatility, + }, + ColumnarValue as DfColumnarValue, + }, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_types::{column::ColumnBlock, datum::DatumKind}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ResultExt, Snafu}; + +use crate::aggregate::{Accumulator, ToDfAccumulator}; + +// Most functions have no more than 5 args. +const FUNC_ARG_NUM: usize = 5; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to convert array to ColumnarValue, err:{}", source))] + InvalidArray { source: common_types::column::Error }, + + #[snafu(display("Invalid function arguments, err:{}", source))] + InvalidArguments { + source: Box, + }, + + #[snafu(display("Failed to execute function, err:{}", source))] + CallFunction { + source: Box, + }, +} + +define_result!(Error); + +/// A dynamically typed, nullable single value. 
+// TODO(yingwen): Can we use Datum? +#[derive(Debug)] +pub struct ScalarValue(DfScalarValue); + +impl ScalarValue { + pub(crate) fn into_df_scalar_value(self) -> DfScalarValue { + self.0 + } + + fn from_df_scalar_value(df_scalar: &DfScalarValue) -> Self { + Self(df_scalar.clone()) + } + + pub fn as_str(&self) -> Option<&str> { + match &self.0 { + DfScalarValue::Utf8(value_opt) => value_opt.as_ref().map(|v| v.as_str()), + _ => None, + } + } +} + +impl From for ScalarValue { + fn from(value: String) -> Self { + Self(DfScalarValue::Utf8(Some(value))) + } +} + +impl From for ScalarValue { + fn from(value: u64) -> Self { + Self(value.into()) + } +} + +pub struct ScalarValueRef<'a>(&'a DfScalarValue); + +impl<'a> ScalarValueRef<'a> { + pub fn as_str(&self) -> Option<&str> { + match self.0 { + DfScalarValue::Utf8(value_opt) | DfScalarValue::LargeUtf8(value_opt) => { + value_opt.as_ref().map(|v| v.as_str()) + } + _ => None, + } + } +} + +impl<'a> From<&'a DfScalarValue> for ScalarValueRef<'a> { + fn from(value: &DfScalarValue) -> ScalarValueRef { + ScalarValueRef(value) + } +} + +impl<'a> Hash for ScalarValueRef<'a> { + fn hash(&self, state: &mut H) { + self.0.hash(state) + } +} + +/// Represent a value of function result. +#[derive(Debug)] +pub enum ColumnarValue { + /// Array of values. + Array(ColumnBlock), + /// A single value. 
+ Scalar(ScalarValue), +} + +impl ColumnarValue { + fn into_df_columnar_value(self) -> DfColumnarValue { + match self { + ColumnarValue::Array(v) => DfColumnarValue::Array(v.to_arrow_array_ref()), + ColumnarValue::Scalar(v) => DfColumnarValue::Scalar(v.into_df_scalar_value()), + } + } + + fn try_from_df_columnar_value(df_value: &DfColumnarValue) -> Result { + let columnar_value = match df_value { + DfColumnarValue::Array(array) => { + let column_block = + ColumnBlock::try_cast_arrow_array_ref(array).context(InvalidArray)?; + ColumnarValue::Array(column_block) + } + DfColumnarValue::Scalar(v) => { + ColumnarValue::Scalar(ScalarValue::from_df_scalar_value(v)) + } + }; + + Ok(columnar_value) + } +} + +/// A function's TypeSignature. +#[derive(Debug)] +pub enum TypeSignature { + /// exact number of arguments of an exact type + Exact(Vec), + /// fixed number of arguments of an arbitrary but equal type out of a list + /// of valid types + // A function of one argument of double is `Uniform(1, vec![DatumKind::Double])` + // A function of one argument of double or uint64 is `Uniform(1, vec![DatumKind::Double, + // DatumKind::UInt64])` + Uniform(usize, Vec), + /// One of a list of signatures + OneOf(Vec), +} + +impl TypeSignature { + pub(crate) fn to_datafusion_signature(&self) -> DfSignature { + DfSignature::new(self.to_datafusion_type_signature(), Volatility::Immutable) + } + + fn to_datafusion_type_signature(&self) -> DfTypeSignature { + match self { + TypeSignature::Exact(kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Exact(data_types) + } + TypeSignature::Uniform(num, kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Uniform(*num, data_types) + } + TypeSignature::OneOf(sigs) => { + let df_sigs = sigs + .iter() + .map(|v| v.to_datafusion_type_signature()) + .collect(); + DfTypeSignature::OneOf(df_sigs) + } + } + } +} + +/// A scalar function's return type. 
+#[derive(Debug)] +pub struct ReturnType { + kind: DatumKind, +} + +impl ReturnType { + pub(crate) fn to_datafusion_return_type(&self) -> ReturnTypeFunction { + let data_type = Arc::new(DataType::from(self.kind)); + Arc::new(move |_| Ok(data_type.clone())) + } +} + +pub struct ScalarFunction { + signature: TypeSignature, + return_type: ReturnType, + df_scalar_fn: ScalarFunctionImplementation, +} + +impl ScalarFunction { + pub fn make_by_fn(signature: TypeSignature, return_type: DatumKind, func: F) -> Self + where + F: Fn(&[ColumnarValue]) -> Result + Send + Sync + 'static, + { + let return_type = ReturnType { kind: return_type }; + + // Adapter to map func to Fn(&[DfColumnarValue]) -> Result + let df_adapter = move |df_args: &[DfColumnarValue]| { + // Convert df_args from DfColumnarValue to ColumnarValue. + let mut values: SmallVec<[ColumnarValue; FUNC_ARG_NUM]> = + SmallVec::with_capacity(df_args.len()); + for df_arg in df_args { + let value = ColumnarValue::try_from_df_columnar_value(df_arg).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to convert datafusion columnar value, err:{}", + e + )) + })?; + values.push(value); + } + + // Execute our function. + let result_value = func(&values).map_err(|e| { + DataFusionError::Execution(format!("Failed to execute function, err:{}", e)) + })?; + + // Convert the result value to DfColumnarValue. 
+ Ok(result_value.into_df_columnar_value()) + }; + + let df_scalar_fn = Arc::new(df_adapter); + + Self { + signature, + return_type, + df_scalar_fn, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_function(&self) -> ScalarFunctionImplementation { + self.df_scalar_fn.clone() + } +} + +pub struct AggregateFunction { + type_signature: TypeSignature, + return_type: ReturnType, + df_accumulator: AccumulatorFunctionImplementation, + state_type: Vec, +} + +impl AggregateFunction { + pub fn make_by_fn( + type_signature: TypeSignature, + return_type: DatumKind, + state_type: Vec, + accumulator_fn: F, + ) -> Self + where + F: Fn() -> Result + Send + Sync + 'static, + A: Accumulator + 'static, + { + // Create accumulator. + let df_adapter = move || { + let accumulator = accumulator_fn().map_err(|e| { + DataFusionError::Execution(format!("Failed to create accumulator, err:{}", e)) + })?; + let accumulator = Box::new(ToDfAccumulator::new(accumulator)); + + Ok(accumulator as _) + }; + let df_accumulator = Arc::new(df_adapter); + + // Create return type. 
+ let return_type = ReturnType { kind: return_type }; + + Self { + type_signature, + return_type, + df_accumulator, + state_type, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.type_signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_accumulator(&self) -> AccumulatorFunctionImplementation { + self.df_accumulator.clone() + } + + pub(crate) fn to_datafusion_state_type(&self) -> StateTypeFunction { + let data_types = Arc::new( + self.state_type + .iter() + .map(|kind| DataType::from(*kind)) + .collect::>(), + ); + Arc::new(move |_| Ok(data_types.clone())) + } +} diff --git a/udf/src/lib.rs b/udf/src/lib.rs new file mode 100644 index 0000000000..36d5f32fdf --- /dev/null +++ b/udf/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDF support. + +pub mod aggregate; +pub mod functions; +pub mod registry; +pub mod scalar; +pub mod udaf; +pub mod udfs; diff --git a/udf/src/registry.rs b/udf/src/registry.rs new file mode 100644 index 0000000000..34e0af7051 --- /dev/null +++ b/udf/src/registry.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Function registry. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{scalar::ScalarUdf, udaf::AggregateUdf, udfs}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Udf already exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + UdfExists { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// A registry knows how to build logical expressions out of user-defined +/// function' names +pub trait FunctionRegistry { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()>; + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()>; + + fn find_udf(&self, name: &str) -> Result>; + + fn find_udaf(&self, name: &str) -> Result>; + + fn list_udfs(&self) -> Result>; +} + +/// Default function registry. +#[derive(Debug, Default)] +pub struct FunctionRegistryImpl { + scalar_functions: HashMap, + aggregate_functions: HashMap, +} + +impl FunctionRegistryImpl { + pub fn new() -> Self { + Self::default() + } + + /// Load all provided udfs. 
+ pub fn load_functions(&mut self) -> Result<()> { + udfs::register_all_udfs(self) + } +} + +impl FunctionRegistry for FunctionRegistryImpl { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()> { + ensure!( + !self.scalar_functions.contains_key(udf.name()), + UdfExists { name: udf.name() } + ); + + self.scalar_functions.insert(udf.name().to_string(), udf); + + Ok(()) + } + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()> { + ensure!( + !self.aggregate_functions.contains_key(udaf.name()), + UdfExists { name: udaf.name() } + ); + + self.aggregate_functions + .insert(udaf.name().to_string(), udaf); + + Ok(()) + } + + fn find_udf(&self, name: &str) -> Result> { + let udf = self.scalar_functions.get(name).cloned(); + Ok(udf) + } + + fn find_udaf(&self, name: &str) -> Result> { + let udaf = self.aggregate_functions.get(name).cloned(); + Ok(udaf) + } + + fn list_udfs(&self) -> Result> { + let udfs = self.scalar_functions.values().cloned().collect(); + Ok(udfs) + } +} + +pub type FunctionRegistryRef = Arc; diff --git a/udf/src/scalar.rs b/udf/src/scalar.rs new file mode 100644 index 0000000000..2ce056c3f3 --- /dev/null +++ b/udf/src/scalar.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scalar udfs. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udf::ScalarUDF; + +use crate::functions::ScalarFunction; + +/// Logical representation of a UDF. +#[derive(Debug, Clone)] +pub struct ScalarUdf { + /// DataFusion UDF. 
+ df_udf: Arc, +} + +impl ScalarUdf { + pub fn create(name: &str, func: ScalarFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let scalar_fn = func.to_datafusion_function(); + + let df_udf = Arc::new(ScalarUDF::new(name, &signature, &return_type, &scalar_fn)); + + Self { df_udf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udf.name + } + + /// Convert into datafusion's udf + #[inline] + pub fn to_datafusion_udf(&self) -> Arc { + self.df_udf.clone() + } +} diff --git a/udf/src/udaf.rs b/udf/src/udaf.rs new file mode 100644 index 0000000000..06f8983460 --- /dev/null +++ b/udf/src/udaf.rs @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDAF support. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udaf::AggregateUDF; + +use crate::functions::AggregateFunction; + +/// Logical representation of a UDAF. +#[derive(Debug, Clone)] +pub struct AggregateUdf { + /// DataFusion UDAF. + df_udaf: Arc, +} + +impl AggregateUdf { + pub fn create(name: &str, func: AggregateFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let accumulator = func.to_datafusion_accumulator(); + let state_type = func.to_datafusion_state_type(); + + let df_udaf = Arc::new(AggregateUDF::new( + name, + &signature, + &return_type, + &accumulator, + &state_type, + )); + + Self { df_udaf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udaf.name + } + + #[inline] + pub fn to_datafusion_udaf(&self) -> Arc { + self.df_udaf.clone() + } +} diff --git a/udf/src/udfs/mod.rs b/udf/src/udfs/mod.rs new file mode 100644 index 0000000000..5d64edf237 --- /dev/null +++ b/udf/src/udfs/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
UDFs + +use crate::registry::{FunctionRegistry, Result}; + +mod thetasketch_distinct; +mod time_bucket; + +pub fn register_all_udfs(registry: &mut dyn FunctionRegistry) -> Result<()> { + // Register all udfs + time_bucket::register_to_registry(registry)?; + thetasketch_distinct::register_to_registry(registry)?; + + Ok(()) +} diff --git a/udf/src/udfs/thetasketch_distinct.rs b/udf/src/udfs/thetasketch_distinct.rs new file mode 100644 index 0000000000..90ef3aefa5 --- /dev/null +++ b/udf/src/udfs/thetasketch_distinct.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! thetasketch_distinct() udaf. + +use std::{fmt, mem}; + +use common_types::datum::DatumKind; +use common_util::define_result; +use hyperloglog::HyperLogLog; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + aggregate::{self, Accumulator, GetState, Input, MergeState, State, StateRef}, + functions::{AggregateFunction, ScalarValue, TypeSignature}, + registry::{self, FunctionRegistry}, + udaf::AggregateUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid state len."))] + InvalidStateLen, + + #[snafu(display("Invalid state, state is not string."))] + StateNotString, + + #[snafu(display("Failed to decode base64 of hll, err:{}.", source))] + DecodeBase64 { source: base64::DecodeError }, + + #[snafu(display("Invalid state, failed to decode hll, err:{}.", source))] + DecodeHll { source: hyperloglog::Error }, +} + +define_result!(Error); + +const HLL_ERROR_RATE: f64 = 0.01; +// Hll seeds: +const HLL_KEY0: u64 = 0; +const HLL_KEY1: u64 = 0; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udaf(new_udaf()) +} + +fn new_udaf() -> AggregateUdf { + let aggregate_function = new_function(); + + AggregateUdf::create("thetasketch_distinct", aggregate_function) +} + +pub(crate) fn new_function() -> 
AggregateFunction { + // Aways use the same hasher with same keys. + let hll = HyperLogLog::new_with_keys(HLL_ERROR_RATE, HLL_KEY0, HLL_KEY1); + + let accumulator_fn = move || { + let distinct = HllDistinct { + hll: HyperLogLog::new_from_template(&hll), + }; + + Ok(distinct) + }; + + let type_signature = make_type_signature(); + let state_type = make_state_type(); + + AggregateFunction::make_by_fn( + type_signature, + DatumKind::UInt64, + state_type, + accumulator_fn, + ) +} + +fn make_type_signature() -> TypeSignature { + TypeSignature::Uniform( + 1, + vec![ + DatumKind::Timestamp, + DatumKind::Double, + DatumKind::Varbinary, + DatumKind::String, + DatumKind::UInt64, + ], + ) +} + +fn make_state_type() -> Vec { + vec![DatumKind::String] +} + +/// Distinct counter based on HyperLogLog. +/// +/// The HyperLogLogs must be initialized with same hash seeds (new from same +/// template). +struct HllDistinct { + hll: HyperLogLog, +} + +// TODO(yingwen): Avoid base64 encode/decode if datafusion supports converting +// binary datatype to scalarvalue. +impl HllDistinct { + fn merge_impl(&mut self, states: StateRef) -> Result<()> { + // The states are serialize from hll. + ensure!(states.len() == 1, InvalidStateLen); + let value_ref = states.value(0); + let hll_string = value_ref.as_str().context(StateNotString)?; + let hll_bytes = base64::decode(hll_string).context(DecodeBase64)?; + let mut buf = &hll_bytes[..]; + // Try to deserialize the hll. + let hll = HyperLogLog::read_from_buf(&mut buf).context(DecodeHll)?; + + // Merge the hll, note that the two hlls must created or serialized from the + // same template hll. + self.hll.merge(&hll); + + Ok(()) + } +} + +impl fmt::Debug for HllDistinct { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("HllDistinct") + .field("len", &self.hll.len()) + .finish() + } +} + +impl Accumulator for HllDistinct { + fn state(&self) -> aggregate::Result { + // Serialize `self.hll` to bytes. 
+ let mut buf = Vec::with_capacity(mem::size_of::()); + self.hll + .write_to_buf(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(GetState)?; + // HACK: DataFusion does not support creating a scalar from binary, so we need + // to use base64 to convert a binary into string. + let hll_string = base64::encode(buf); + + Ok(State::from(ScalarValue::from(hll_string))) + } + + fn update(&mut self, values: Input) -> aggregate::Result<()> { + for value_ref in values.iter() { + // Insert value into hll. + self.hll.insert(&value_ref); + } + + Ok(()) + } + + fn merge(&mut self, states: StateRef) -> aggregate::Result<()> { + self.merge_impl(states) + .map_err(|e| Box::new(e) as _) + .context(MergeState) + } + + fn evaluate(&self) -> aggregate::Result { + let count = self.hll.len() as u64; + + Ok(ScalarValue::from(count)) + } +} diff --git a/udf/src/udfs/time_bucket.rs b/udf/src/udfs/time_bucket.rs new file mode 100644 index 0000000000..40e428ec5a --- /dev/null +++ b/udf/src/udfs/time_bucket.rs @@ -0,0 +1,324 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! time_bucket UDF. 
+ +use std::time::Duration; + +use chrono::{Datelike, FixedOffset, TimeZone}; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder, TimestampColumn}, + datum::{Datum, DatumKind}, + time::Timestamp, +}; +use common_util::define_result; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + functions::{CallFunction, ColumnarValue, InvalidArguments, ScalarFunction, TypeSignature}, + registry::{self, FunctionRegistry}, + scalar::ScalarUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid period, period:{}", period))] + InvalidPeriod { period: String }, + + #[snafu(display("Invalid period number, period:{}, err:{}", period, source))] + InvalidPeriodNumber { + period: String, + source: std::num::ParseIntError, + }, + + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid arguments, require timestamp column."))] + NotTimestampColumn, + + #[snafu(display("Invalid arguments, require period."))] + NotPeriod, + + #[snafu(display("Period of week only support P1W."))] + UnsupportedWeek, + + #[snafu(display("Period of month only support P1M."))] + UnsupportedMonth, + + #[snafu(display("Period of year only support P1Y."))] + UnsupportedYear, + + #[snafu(display( + "Failed to truncate timestamp, timestamp:{}, period:{:?}", + timestamp, + period + ))] + TruncateTimestamp { timestamp: i64, period: Period }, + + #[snafu(display("Failed to build result column, err:{}", source))] + BuildColumn { source: common_types::column::Error }, +} + +define_result!(Error); + +/// Default timezone: +08:00 +const DEFAULT_TIMEZONE_OFFSET_SECS: i32 = 8 * 3600; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udf(new_udf()) +} + +fn new_udf() -> ScalarUdf { + // args: + // - timestamp column. + // - period. + // - input timestamp format in PARTITION BY (unsed now). + // - input timezone (ignored now). + // - timestamp output format (ignored now). 
+ let func = |args: &[ColumnarValue]| { + let bucket = TimeBucket::parse_args(args) + .map_err(|e| Box::new(e) as _) + .context(InvalidArguments)?; + + let result_column = bucket + .call() + .map_err(|e| Box::new(e) as _) + .context(CallFunction)?; + + Ok(ColumnarValue::Array(result_column)) + }; + + let signature = make_signature(); + let scalar_function = ScalarFunction::make_by_fn(signature, DatumKind::Timestamp, func); + + ScalarUdf::create("time_bucket", scalar_function) +} + +fn make_signature() -> TypeSignature { + let sigs = vec![ + TypeSignature::Exact(vec![DatumKind::Timestamp, DatumKind::String]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + ]; + TypeSignature::OneOf(sigs) +} + +struct TimeBucket<'a> { + column: &'a TimestampColumn, + period: Period, +} + +impl<'a> TimeBucket<'a> { + fn parse_args(args: &[ColumnarValue]) -> Result { + ensure!(args.len() >= 2, InvalidArgNum); + + let column = match &args[0] { + ColumnarValue::Array(block) => block.as_timestamp().context(NotTimestampColumn)?, + _ => return NotTimestampColumn.fail(), + }; + let period = match &args[1] { + ColumnarValue::Scalar(value) => { + let period_str = value.as_str().context(NotPeriod)?; + Period::parse(period_str)? 
+ } + _ => return NotPeriod.fail(), + }; + + Ok(TimeBucket { column, period }) + } + + fn call(&self) -> Result { + let mut out_column_builder = + ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows()); + for ts_opt in self.column.iter() { + match ts_opt { + Some(ts) => { + let truncated = self.period.truncate(ts).context(TruncateTimestamp { + timestamp: ts, + period: self.period, + })?; + out_column_builder + .append(Datum::Timestamp(truncated)) + .context(BuildColumn)?; + } + None => { + out_column_builder + .append(Datum::Null) + .context(BuildColumn)?; + } + } + } + Ok(out_column_builder.build()) + } +} + +/// A time bucket period. +/// +/// e.g. +/// - PT1S +/// - PT1M +/// - PT1H +/// - P1D +/// - P1W +/// - P1M +/// - P1Y +#[derive(Debug, Clone, Copy)] +pub enum Period { + Second(u16), + Minute(u16), + Hour(u16), + Day(u16), + Week, + Month, + Year, +} + +impl Period { + fn parse(period: &str) -> Result { + ensure!(period.len() >= 3, InvalidPeriod { period }); + let is_pt = if period.starts_with("PT") { + true + } else if period.starts_with('P') { + false + } else { + return InvalidPeriod { period }.fail(); + }; + + let back = period.chars().last().context(InvalidPeriod { period })?; + let parsed = if is_pt { + let number = &period[2..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'S' => Period::Second(number), + 'M' => Period::Minute(number), + 'H' => Period::Hour(number), + _ => return InvalidPeriod { period }.fail(), + } + } else { + let number = &period[1..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'D' => Period::Day(number), + 'W' => { + ensure!(number == 1, UnsupportedWeek); + Period::Week + } + 'M' => { + ensure!(number == 1, UnsupportedMonth); + Period::Month + } + 'Y' => { + ensure!(number == 1, UnsupportedYear); + Period::Year + } + _ => return InvalidPeriod { period }.fail(), + } + 
}; + + Ok(parsed) + } + + fn truncate(&self, ts: Timestamp) -> Option { + const MINUTE_SECONDS: u64 = 60; + const HOUR_SECONDS: u64 = 60 * MINUTE_SECONDS; + + let truncated_ts = match self { + Period::Second(period) => { + let duration = Duration::from_secs(u64::from(*period)); + ts.truncate_by(duration) + } + Period::Minute(period) => { + let duration = Duration::from_secs(u64::from(*period) * MINUTE_SECONDS); + ts.truncate_by(duration) + } + Period::Hour(period) => { + let duration = Duration::from_secs(u64::from(*period) * HOUR_SECONDS); + ts.truncate_by(duration) + } + Period::Day(period) => Self::truncate_day(ts, *period)?, + Period::Week => Self::truncate_week(ts), + Period::Month => Self::truncate_month(ts), + Period::Year => Self::truncate_year(ts), + }; + + Some(truncated_ts) + } + + fn truncate_day(ts: Timestamp, period: u16) -> Option { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate day + let day = datetime.day(); + let day = day - (day % u32::from(period)); + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), day) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Some(Timestamp::new(truncated_ts)) + } + + fn truncate_week(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate week. 
+ let week_offset = datetime.weekday().num_days_from_monday(); + let week_millis = 7 * 24 * 3600 * 1000; + let ts_offset = week_offset * week_millis; + // TODO(yingwen): Impl sub/divide for Timestamp + let week_millis = i64::from(week_millis); + let truncated_ts = (ts.as_i64() - i64::from(ts_offset)) / week_millis * week_millis; + + Timestamp::new(truncated_ts) + } + + fn truncate_month(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate month + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), 1) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } + + fn truncate_year(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate year + let truncated_datetime = offset.ymd(datetime.year(), 1, 1).and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } +} diff --git a/wal/Cargo.toml b/wal/Cargo.toml new file mode 100644 index 0000000000..574cffa9e2 --- /dev/null +++ b/wal/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "wal" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +common_util = {path = "../common_util"} +common_types = {path = "../common_types"} +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"] } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +tempfile = "3.1.0" +futures = { version = "0.3", features = ["async-await"] } + +[dependencies.rocksdb] +git = "https://github.com/tikv/rust-rocksdb.git" +branch = "tikv-5.2" +features = ["portable"] diff 
--git a/wal/src/lib.rs b/wal/src/lib.rs new file mode 100644 index 0000000000..440edb2d1e --- /dev/null +++ b/wal/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write Ahead Log + +pub mod log_batch; +pub mod manager; +pub mod rocks_impl; + +#[cfg(test)] +mod tests; diff --git a/wal/src/log_batch.rs b/wal/src/log_batch.rs new file mode 100644 index 0000000000..7e08c6c10d --- /dev/null +++ b/wal/src/log_batch.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Log entries definition. + +use std::fmt::Debug; + +use common_types::{ + bytes::{MemBuf, MemBufMut}, + SequenceNumber, +}; + +use crate::manager::RegionId; + +pub trait Payload: Send + Sync + Debug { + type Error: std::error::Error + Send + Sync + 'static; + /// Compute size of the encoded payload. + fn encode_size(&self) -> usize; + /// Append the encoded payload to the `buf`. + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error>; +} + +#[derive(Debug)] +pub struct LogEntry

{ + pub sequence: SequenceNumber, + pub payload: P, +} + +/// An entry to be written into the Wal. +/// +/// Generally, the `payload` is a lazily encoder whose constraint is +/// `PayloadEncoder`. `region_id` is a logically region and set it as 0 if +/// unnecessary. +#[derive(Debug)] +pub struct LogWriteEntry

{ + pub payload: P, +} + +/// A batch of `LogWriteEntry`s. +#[derive(Debug)] +pub struct LogWriteBatch

{ + pub(crate) region_id: RegionId, + pub(crate) entries: Vec>, +} + +impl LogWriteBatch

{ + pub fn new(region_id: RegionId) -> Self { + Self::with_capacity(region_id, 0) + } + + pub fn with_capacity(region_id: RegionId, cap: usize) -> Self { + Self { + region_id, + entries: Vec::with_capacity(cap), + } + } + + #[inline] + pub fn push(&mut self, entry: LogWriteEntry

) { + self.entries.push(entry) + } + + #[inline] + pub fn len(&self) -> usize { + self.entries.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + #[inline] + pub fn clear(&mut self) { + self.entries.clear() + } +} + +impl Default for LogWriteBatch

{ + fn default() -> Self { + Self::new(0) + } +} + +pub trait PayloadDecoder: Send + Sync { + type Error: std::error::Error + Send + Sync + 'static; + type Target: Send + Sync; + /// Decode `Target` from the `bytes`. + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/wal/src/manager.rs b/wal/src/manager.rs new file mode 100644 index 0000000000..4ea8fe97ab --- /dev/null +++ b/wal/src/manager.rs @@ -0,0 +1,237 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager abstraction + +use std::{fmt, time::Duration}; + +use async_trait::async_trait; +pub use common_types::SequenceNumber; + +use crate::log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + use crate::manager::RegionId; + + // Now most error from manage implementation don't have backtrace, so we add + // backtrace here. + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display( + "Failed to open wal, path:{}, err:{}.\nBacktrace:\n{}", + wal_path, + source, + backtrace + ))] + Open { + wal_path: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to initialize wal, err:{}.\nBacktrace:\n{}", source, backtrace))] + Initialization { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Region is not found, region_id:{}.\nBacktrace:\n{}", + region_id, + backtrace + ))] + RegionNotFound { + region_id: RegionId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to write log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Write { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Read { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to delete log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Delete { + source: Box, + backtrace: 
Backtrace, + }, + + #[snafu(display("Failed to encode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Encoding { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Decoding { + source: Box, + backtrace: Backtrace, + }, + } + + define_result!(Error); +} + +use common_types::{MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +pub use error::*; + +pub type RegionId = u64; +pub const MAX_REGION_ID: RegionId = u64::MAX; + +#[derive(Debug, Clone)] +pub struct WriteContext { + /// Timeout to write wal and it only takes effect when writing to a Wal on a + /// remote machine (writing to the local disk does not have timeout). + pub timeout: Duration, +} + +impl Default for WriteContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(1), + } + } +} + +/// Write abstraction for log entries in Wal. +#[async_trait] +pub trait LogWriter { + /// Write a batch of log entries to log. + /// + /// Returns the max sequence number for the batch of log entries. + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch

, + ) -> Result; +} + +#[derive(Debug, Clone)] +pub struct ReadContext { + /// Timeout to read log entries and it only takes effect when reading from a + /// Wal on a remote machine (reading from the local disk does not have + /// timeout). + pub timeout: Duration, +} + +impl Default for ReadContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(5), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum ReadBoundary { + Max, + Min, + Included(SequenceNumber), + Excluded(SequenceNumber), +} + +impl ReadBoundary { + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MAX_SEQUENCE_NUM)` + pub fn as_start_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MAX_SEQUENCE_NUMBER { + None + } else { + Some(n + 1) + } + } + } + } + + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MIN_SEQUENCE_NUM)` + pub fn as_end_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MIN_SEQUENCE_NUMBER { + None + } else { + Some(n - 1) + } + } + } + } +} + +#[derive(Debug, Clone)] +pub struct ReadRequest { + /// Region id of the wal to read + pub region_id: RegionId, + // TODO(yingwen): Or just rename to ReadBound? + /// Start bound + pub start: ReadBoundary, + /// End bound + pub end: ReadBoundary, +} + +/// Iterator abstraction for log entry. +pub trait LogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>>; +} + +/// Read abstraction for log entries in the Wal. +pub trait LogReader { + /// Iterator over log entries. 
+ type Iterator: LogIterator + Send; + /// Provide iterator on necessary entries according to `ReadRequest`. + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result; +} + +// TODO(xikai): define Error as associate type. +/// Management of multi-region Wals. +/// +/// Every region has its own increasing (and maybe hallow) sequence number +/// space. +#[async_trait] +pub trait WalManager: LogWriter + LogReader + fmt::Debug { + /// Get current sequence number. + fn sequence_num(&self, region_id: RegionId) -> Result; + + /// Mark the entries whose sequence number is in [0, `sequence_number`] to + /// be deleted in the future. + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()>; +} diff --git a/wal/src/rocks_impl/encoding.rs b/wal/src/rocks_impl/encoding.rs new file mode 100644 index 0000000000..727b5715f2 --- /dev/null +++ b/wal/src/rocks_impl/encoding.rs @@ -0,0 +1,533 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Encoding for Wal logs + +use common_types::{ + bytes::{self, BytesMut, MemBuf, MemBufMut}, + SequenceNumber, +}; +use common_util::{ + codec::{Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + log_batch::{Payload, PayloadDecoder}, + manager::{self, RegionId}, +}; + +const LOG_KEY_ENCODING_V0: u8 = 0; +const NEWEST_LOG_KEY_ENCODING_VERSION: u8 = LOG_KEY_ENCODING_V0; + +const LOG_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_LOG_VALUE_ENCODING_VERSION: u8 = LOG_VALUE_ENCODING_V0; + +const META_KEY_ENCODING_V0: u8 = 0; +const NEWEST_META_KEY_ENCODING_VERSION: u8 = META_KEY_ENCODING_V0; + +const META_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_META_VALUE_ENCODING_VERSION: u8 = META_VALUE_ENCODING_V0; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode log key, err:{}", source))] + EncodeLogKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode log value header, err:{}", source))] + EncodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to encode log value payload, err:{}", source))] + EncodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to decode log key, err:{}", source))] + DecodeLogKey { source: bytes::Error }, + + #[snafu(display("Failed to decode log value header, err:{}", source))] + DecodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to decode log value payload, err:{}", source))] + DecodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to encode meta key, err:{}", source))] + EncodeMetaKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode meta value, err:{}", source))] + EncodeMetaValue { source: bytes::Error }, + + #[snafu(display("Failed to decode meta key, err:{}", source))] + DecodeMetaKey { source: bytes::Error }, + + #[snafu(display("Failed to decode meta value, err:{}", source))] + DecodeMetaValue { source: 
bytes::Error }, + + #[snafu(display( + "Found invalid meta key type, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKeyType { + expect: MetaKeyType, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid namespace, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidNamespace { + expect: Namespace, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid version, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidVersion { + expect: u8, + given: u8, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug, Copy, Clone)] +pub enum Namespace { + Meta = 0, + Log = 1, +} + +#[derive(Debug, Clone)] +pub struct LogEncoding { + key_enc: LogKeyEncoder, + value_enc: LogValueEncoder, + // value decoder is created dynamically from the version, + value_enc_version: u8, +} + +impl LogEncoding { + pub fn newest() -> Self { + Self { + key_enc: LogKeyEncoder { + version: NEWEST_LOG_KEY_ENCODING_VERSION, + namespace: Namespace::Log, + }, + value_enc: LogValueEncoder { + version: NEWEST_LOG_VALUE_ENCODING_VERSION, + }, + value_enc_version: NEWEST_LOG_VALUE_ENCODING_VERSION, + } + } + + // Encode [LogKey] into `buf` and caller should knows that the keys are ordered + // by ([RegionId], [SequenceNum]) so the caller can use this method to + // generate min/max key in specific scope(global or in some region). 
+ pub fn encode_key(&self, buf: &mut BytesMut, log_key: &LogKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(log_key)); + self.key_enc + .encode(buf, log_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value(&self, buf: &mut BytesMut, payload: &impl Payload) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(payload)); + self.value_enc + .encode(buf, payload) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn is_log_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value( + &self, + mut buf: &[u8], + decoder: &D, + ) -> manager::Result { + let value_dec = LogValueDecoder { + version: self.value_enc_version, + payload_dec: decoder, + }; + + value_dec + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } +} + +pub type LogKey = (RegionId, SequenceNumber); + +#[derive(Debug, Clone)] +struct LogKeyEncoder { + version: u8, + namespace: Namespace, +} + +impl LogKeyEncoder { + /// Determine whether the raw bytes is a log key. 
+ pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeLogKey)?; + Ok(namespace == self.namespace as u8) + } +} + +impl Encoder for LogKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+----------------+-------------------+--------------------+ + /// | namespace(u8) | region_id(u64) | sequence_num(u64) | version header(u8) | + /// +---------------+----------------+-------------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. + fn encode(&self, buf: &mut B, log_key: &LogKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeLogKey)?; + buf.write_u64(log_key.0).context(EncodeLogKey)?; + buf.write_u64(log_key.1).context(EncodeLogKey)?; + buf.write_u8(self.version).context(EncodeLogKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &LogKey) -> usize { + // Refer to key format. + 1 + 8 + 8 + 1 + } +} + +impl Decoder for LogKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeLogKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let log_key = ( + buf.read_u64().context(DecodeLogKey)?, + buf.read_u64().context(DecodeLogKey)?, + ); + + // check version + let version = buf.read_u8().context(DecodeLogKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(log_key) + } +} + +#[derive(Debug, Clone)] +struct LogValueEncoder { + version: u8, +} + +impl Encoder for LogValueEncoder { + type Error = Error; + + /// Value format: + /// +--------------------+---------+ + /// | version_header(u8) | payload | + /// +--------------------+---------+ + fn encode(&self, buf: &mut B, payload: &T) -> Result<()> { + 
buf.write_u8(self.version).context(EncodeLogValueHeader)?; + + payload + .encode_to(buf) + .map_err(|e| Box::new(e) as _) + .context(EncodeLogValuePayload) + } + + fn estimate_encoded_size(&self, payload: &T) -> usize { + // Refer to value format. + 1 + payload.encode_size() + } +} + +struct LogValueDecoder<'a, D: PayloadDecoder> { + version: u8, + payload_dec: &'a D, +} + +impl<'a, D: PayloadDecoder> Decoder for LogValueDecoder<'a, D> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let version = buf.read_u8().context(DecodeLogValueHeader)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + self.payload_dec + .decode(buf) + .map_err(|e| Box::new(e) as _) + .context(DecodeLogValuePayload) + } +} + +#[derive(Clone, Copy, Debug)] +pub enum MetaKeyType { + MaxSeq = 0, +} + +#[derive(Clone, Debug)] +pub struct MetaKeyEncoder { + version: u8, + key_type: MetaKeyType, + namespace: Namespace, +} + +#[derive(Clone, Debug)] +pub struct MetaKey { + pub region_id: RegionId, +} + +impl MetaKeyEncoder { + /// Determine whether the raw bytes is a valid meta key. + pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeMetaKey)?; + let key_type = buf.read_u8().context(DecodeMetaKey)?; + Ok(namespace == self.namespace as u8 && key_type == self.key_type as u8) + } +} + +impl Encoder for MetaKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+--------------+----------------+--------------------+ + /// | namespace(u8) | key_type(u8) | region_id(u64) | version header(u8) | + /// +---------------+--------------+----------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. 
+ fn encode(&self, buf: &mut B, meta_key: &MetaKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeMetaKey)?; + buf.write_u8(self.key_type as u8).context(EncodeMetaKey)?; + buf.write_u64(meta_key.region_id).context(EncodeMetaKey)?; + buf.write_u8(self.version).context(EncodeMetaKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &MetaKey) -> usize { + // Refer to key format. + 1 + 1 + 8 + 1 + } +} + +impl Decoder for MetaKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let key_type = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + key_type == self.key_type as u8, + InvalidMetaKeyType { + expect: self.key_type, + given: key_type, + } + ); + + let region_id = buf.read_u64().context(DecodeMetaKey)?; + + // check version + let version = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(MetaKey { region_id }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValue { + pub max_seq: SequenceNumber, +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValueEncoder { + version: u8, +} + +impl Encoder for MaxSeqMetaValueEncoder { + type Error = Error; + + /// Value format: + /// + /// ```text + /// +--------------------+--------------+ + /// | version header(u8) | max_seq(u64) | + /// +--------------------+--------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. 
+ fn encode(&self, buf: &mut B, meta_value: &MaxSeqMetaValue) -> Result<()> { + buf.write_u8(self.version).context(EncodeMetaValue)?; + buf.write_u64(meta_value.max_seq).context(EncodeMetaValue)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _meta_value: &MaxSeqMetaValue) -> usize { + // Refer to value format. + 1 + 8 + } +} + +impl Decoder for MaxSeqMetaValueEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check version + let version = buf.read_u8().context(DecodeMetaValue)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + let max_seq = buf.read_u64().context(DecodeMetaValue)?; + Ok(MaxSeqMetaValue { max_seq }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaEncoding { + key_enc: MetaKeyEncoder, + value_enc: MaxSeqMetaValueEncoder, +} + +impl MaxSeqMetaEncoding { + pub fn newest() -> Self { + Self { + key_enc: MetaKeyEncoder { + version: NEWEST_META_KEY_ENCODING_VERSION, + key_type: MetaKeyType::MaxSeq, + namespace: Namespace::Meta, + }, + value_enc: MaxSeqMetaValueEncoder { + version: NEWEST_META_VALUE_ENCODING_VERSION, + }, + } + } + + pub fn is_max_seq_meta_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn encode_key(&self, buf: &mut BytesMut, meta_key: &MetaKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(meta_key)); + self.key_enc + .encode(buf, meta_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value( + &self, + buf: &mut BytesMut, + meta_value: &MaxSeqMetaValue, + ) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(meta_value)); + self.value_enc + .encode(buf, meta_value) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> 
manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value(&self, mut buf: &[u8]) -> manager::Result { + self.value_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } +} diff --git a/wal/src/rocks_impl/manager.rs b/wal/src/rocks_impl/manager.rs new file mode 100644 index 0000000000..bdf71eba0e --- /dev/null +++ b/wal/src/rocks_impl/manager.rs @@ -0,0 +1,621 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +use std::{ + collections::HashMap, + fmt, + fmt::Formatter, + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, RwLock, + }, +}; + +use async_trait::async_trait; +use common_types::{bytes::BytesMut, SequenceNumber, MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +use common_util::runtime::Runtime; +use log::{debug, info, warn}; +use rocksdb::{DBIterator, DBOptions, ReadOptions, SeekKey, Writable, WriteBatch, DB}; +use snafu::ResultExt; +use tokio::sync::Mutex; + +use crate::{ + log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}, + manager::{ + error::*, LogIterator, LogReader, LogWriter, ReadContext, ReadRequest, RegionId, + WalManager, WriteContext, MAX_REGION_ID, + }, + rocks_impl::encoding::{LogEncoding, LogKey, MaxSeqMetaEncoding, MaxSeqMetaValue, MetaKey}, +}; + +/// Region in the Wal. 
+struct Region { + /// id of the Region + id: RegionId, + /// RocksDB instance + db: Arc, + /// `next_sequence_num` is ensured to be positive + next_sequence_num: AtomicU64, + /// Encoding for log entries + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Runtime for write requests + runtime: Arc, + /// Ensure the delete procedure to be sequential + delete_lock: Mutex<()>, +} + +impl Region { + /// Allocate a continuous range of [SequenceNumber] and returns + /// the start [SequenceNumber] of the range [start, start+`number`). + #[inline] + fn alloc_sequence_num(&self, number: u64) -> SequenceNumber { + self.next_sequence_num.fetch_add(number, Ordering::Relaxed) + } + + #[inline] + /// Generate [LogKey] from `region_id` and `sequence_num` + fn log_key(&self, sequence_num: SequenceNumber) -> LogKey { + (self.id, sequence_num) + } + + /// Returns the current sequence number which must be positive. + fn sequence_num(&self) -> Result { + let next_seq_num = self.next_sequence_num.load(Ordering::Relaxed); + debug_assert!(next_seq_num > 0); + + Ok(next_seq_num - 1) + } + + /// Delete entries in the range `[0, sequence_num]`. + /// + /// The delete procedure is ensured to be sequential. 
+ async fn delete_entries_up_to(&self, mut sequence_num: SequenceNumber) -> Result<()> { + debug!( + "Wal Region delete entries begin deleting, sequence_num:{:?}", + sequence_num + ); + + let _delete_guard = self.delete_lock.lock().await; + let max_seq = self.sequence_num()?; + if sequence_num > max_seq { + warn!( + "Try to delete entries up to sequence number({}) greater than current max sequence \ + number({})", + sequence_num, + max_seq + ); + sequence_num = max_seq; + } + + let wb = { + let wb = WriteBatch::default(); + + // Delete the range [0, sequence_num] + let start_log_key = (self.id, 0); + let end_log_key = if sequence_num < MAX_SEQUENCE_NUMBER { + (self.id, sequence_num + 1) + } else { + // Region id is unlikely to overflow. + (self.id + 1, 0) + }; + let (mut start_key_buf, mut end_key_buf) = (BytesMut::new(), BytesMut::new()); + self.log_encoding + .encode_key(&mut start_key_buf, &start_log_key)?; + self.log_encoding + .encode_key(&mut end_key_buf, &end_log_key)?; + wb.delete_range(&start_key_buf, &end_key_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + // Update the max sequence number. + let meta_key = MetaKey { region_id: self.id }; + let meta_value = MaxSeqMetaValue { max_seq }; + let (mut meta_key_buf, mut meta_value_buf) = (BytesMut::new(), BytesMut::new()); + self.max_seq_meta_encoding + .encode_key(&mut meta_key_buf, &meta_key)?; + self.max_seq_meta_encoding + .encode_value(&mut meta_value_buf, &meta_value)?; + wb.put(&meta_key_buf, &meta_value_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + wb + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || db.write(&wb).map_err(|e| e.into()).context(Delete)) + .await + .map_err(|e| Box::new(e) as _) + .context(Delete)? 
+ } + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + debug!("Wal region begin reading, ctx:{:?}, req:{:?}", ctx, req); + + let read_opts = ReadOptions::default(); + let iter = DBIterator::new(self.db.clone(), read_opts); + + let start_sequence = if let Some(n) = req.start.as_start_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let end_sequence = if let Some(n) = req.end.as_end_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let (min_log_key, max_log_key) = (self.log_key(start_sequence), self.log_key(end_sequence)); + + let log_iter = + RocksLogIterator::with_data(self.log_encoding.clone(), iter, min_log_key, max_log_key); + Ok(log_iter) + } + + async fn write(&self, ctx: &WriteContext, batch: &LogWriteBatch

) -> Result { + debug!( + "Wal region begin writing, ctx:{:?}, log_entries_num:{}", + ctx, + batch.entries.len() + ); + + let entries_num = batch.len() as u64; + let (wb, max_sequence_num) = { + let wb = WriteBatch::default(); + let mut next_sequence_num = self.alloc_sequence_num(entries_num); + let mut key_buf = BytesMut::new(); + let mut value_buf = BytesMut::new(); + + for entry in &batch.entries { + self.log_encoding + .encode_key(&mut key_buf, &(batch.region_id, next_sequence_num))?; + self.log_encoding + .encode_value(&mut value_buf, &entry.payload)?; + wb.put(&key_buf, &value_buf) + .map_err(|e| e.into()) + .context(Write)?; + + next_sequence_num += 1; + } + + (wb, next_sequence_num - 1) + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || { + db.write(&wb) + .map(|_| max_sequence_num) + .map_err(|e| e.into()) + .context(Write) + }) + .await + .map_err(|e| Box::new(e) as _) + .context(Write)? + } +} + +/// [WalManager] implementation based on RocksDB. +/// A [RocksImpl] consists of multiple [Region]s and any read/write/delete +/// request is delegated to specific [Region]. +pub struct RocksImpl { + /// Wal data path + wal_path: String, + /// RocksDB instance + db: Arc, + /// Runtime for read/write log entries + runtime: Arc, + /// Encoding for log entry + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Regions + regions: RwLock>>, +} + +impl Drop for RocksImpl { + fn drop(&mut self) { + // Clear all regions. 
+ { + let mut regions = self.regions.write().unwrap(); + regions.clear(); + } + + info!("RocksImpl dropped, wal_path:{}", self.wal_path); + } +} + +impl RocksImpl { + fn build_regions(&self) -> Result<()> { + let region_seqs = self.find_region_seqs_from_db()?; + + info!( + "RocksImpl build regions, wal_path:{}, region_seqs:{:?}", + self.wal_path, region_seqs + ); + + let mut regions = self.regions.write().unwrap(); + for (region_id, sequence_number) in region_seqs { + let region = Region { + id: region_id, + db: self.db.clone(), + next_sequence_num: AtomicU64::new(sequence_number + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }; + + regions.insert(region_id, Arc::new(region)); + } + + Ok(()) + } + + fn find_region_seqs_from_region_data( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let mut current_region_id = MAX_REGION_ID; + let mut end_boundary_key_buf = BytesMut::new(); + loop { + let log_key = (current_region_id, MAX_SEQUENCE_NUMBER); + self.log_encoding + .encode_key(&mut end_boundary_key_buf, &log_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&end_boundary_key_buf); + + let found = iter + .seek_for_prev(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + if !found { + debug!("RocksImpl find region pairs stop scanning, because of no entries to scan"); + break; + } + + if !self.log_encoding.is_log_key(iter.key())? 
{ + debug!("RocksImpl find region pairs stop scanning, because log keys are exhausted"); + break; + } + + let log_key = self.log_encoding.decode_key(iter.key())?; + region_max_seqs.insert(log_key.0, log_key.1); + + if log_key.0 == 0 { + debug!("RocksImpl find region pairs stop scanning, because region 0 is reached"); + break; + } + current_region_id = log_key.0 - 1; + } + + Ok(()) + } + + fn find_region_seqs_from_region_meta( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let meta_key = MetaKey { region_id: 0 }; + let mut start_boundary_key_buf = BytesMut::new(); + self.max_seq_meta_encoding + .encode_key(&mut start_boundary_key_buf, &meta_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&start_boundary_key_buf); + iter.seek(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + loop { + if !iter.valid().map_err(|e| e.into()).context(Initialization)? { + debug!("RocksImpl exhausts the iterator for meta information"); + break; + } + if !self.max_seq_meta_encoding.is_max_seq_meta_key(iter.key())? { + debug!("RocksImpl exhausts max sequence meta key"); + break; + } + + let meta_key = self.max_seq_meta_encoding.decode_key(iter.key())?; + let meta_value = self.max_seq_meta_encoding.decode_value(iter.value())?; + region_max_seqs + .entry(meta_key.region_id) + .and_modify(|v| { + *v = meta_value.max_seq.max(*v); + }) + .or_insert(meta_value.max_seq); + + iter.next().map_err(|e| e.into()).context(Initialization)?; + } + + Ok(()) + } + + /// Collect all the regions with its max sequence number from the db. + /// + /// Returns the mapping: region_id -> max_sequence_number + fn find_region_seqs_from_db(&self) -> Result> { + // build the mapping: region_id -> max_sequence_number + let mut region_max_seqs = HashMap::new(); + + // scan the region information from the data part. + self.find_region_seqs_from_region_data(&mut region_max_seqs)?; + + // scan the region information from the meta part. 
+ self.find_region_seqs_from_region_meta(&mut region_max_seqs)?; + + Ok(region_max_seqs) + } + + /// Get the region and create it if not found. + fn get_or_create_region(&self, region_id: RegionId) -> Arc { + { + let regions = self.regions.read().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + } + + let mut regions = self.regions.write().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + + info!( + "RocksImpl create new region, wal_path:{}, region_id:{}", + self.wal_path, region_id + ); + + // create a new region + let region = Arc::new(Region { + id: region_id, + db: self.db.clone(), + // ensure `next_sequence_number` to start from 1 (larger than MIN_SEQUENCE_NUMBER) + next_sequence_num: AtomicU64::new(MIN_SEQUENCE_NUMBER + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }); + + regions.insert(region_id, region.clone()); + region + } + + /// Get the region + fn region(&self, region_id: RegionId) -> Option> { + let regions = self.regions.read().unwrap(); + regions.get(®ion_id).cloned() + } +} + +/// Builder for `RocksImpl`. +pub struct Builder { + wal_path: String, + rocksdb_config: DBOptions, + runtime: Arc, +} + +impl Builder { + pub fn with_default_rocksdb_config( + wal_path: impl Into, + runtime: Arc, + ) -> Self { + let mut rocksdb_config = DBOptions::default(); + // TODO(yingwen): Move to another function? 
+ rocksdb_config.create_if_missing(true); + Self::new(wal_path, runtime, rocksdb_config) + } + + pub fn new( + wal_path: impl Into, + runtime: Arc, + rocksdb_config: DBOptions, + ) -> Self { + let wal_path: PathBuf = wal_path.into(); + Self { + wal_path: wal_path.to_str().unwrap().to_owned(), + rocksdb_config, + runtime, + } + } + + pub fn build(self) -> Result { + let db = DB::open(self.rocksdb_config, &self.wal_path) + .map_err(|e| e.into()) + .context(Open { + wal_path: self.wal_path.clone(), + })?; + let rocks_impl = RocksImpl { + wal_path: self.wal_path, + db: Arc::new(db), + runtime: self.runtime, + log_encoding: LogEncoding::newest(), + max_seq_meta_encoding: MaxSeqMetaEncoding::newest(), + regions: RwLock::new(HashMap::new()), + }; + rocks_impl.build_regions()?; + + Ok(rocks_impl) + } +} + +/// Iterator over log entries based on RocksDB iterator. +pub struct RocksLogIterator { + log_encoding: LogEncoding, + /// denotes no more data to iterate and it is set to true when: + /// - initialized as no data iterator, or + /// - iterate to the end. + no_more_data: bool, + min_log_key: LogKey, + max_log_key: LogKey, + /// denote whether `iter` is seeked + seeked: bool, + /// RocksDB iterator + iter: DBIterator>, +} + +impl RocksLogIterator { + /// Create iterator maybe containing data. + fn with_data( + log_encoding: LogEncoding, + iter: DBIterator>, + min_log_key: LogKey, + max_log_key: LogKey, + ) -> Self { + Self { + log_encoding, + no_more_data: false, + min_log_key, + max_log_key, + seeked: false, + iter, + } + } + + /// Create empty iterator. + fn new_empty(log_encoding: LogEncoding, iter: DBIterator>) -> Self { + Self { + log_encoding, + no_more_data: true, + min_log_key: (0, 0), + max_log_key: (0, 0), + seeked: false, + iter, + } + } + + /// it's a valid log key if it is in the range `[self.min_log_key, + /// self.max_log_key]`. 
+ fn is_valid_log_key(&self, curr_log_key: &LogKey) -> bool { + curr_log_key <= &self.max_log_key && curr_log_key >= &self.min_log_key + } + + /// End is reached iteration if `curr_log_key` is greater than + /// `max_log_key`. + fn is_end_reached(&self, curr_log_key: &LogKey) -> bool { + curr_log_key >= &self.max_log_key + } + + /// let `iter` seek to `min_log_key` + /// no guarantee on that `self.iter` is valid + fn seek(&mut self) -> Result<()> { + self.seeked = true; + + let mut seek_key_buf = BytesMut::new(); + self.log_encoding + .encode_key(&mut seek_key_buf, &self.min_log_key)?; + let seek_key = SeekKey::Key(&seek_key_buf); + self.iter + .seek(seek_key) + .map_err(|e| e.into()) + .context(Read)?; + + Ok(()) + } +} + +impl LogIterator for RocksLogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>> { + if self.no_more_data { + return Ok(None); + } + + if !self.seeked { + self.seek()?; + + let valid = self.iter.valid().map_err(|e| e.into()).context(Read)?; + if !valid { + self.no_more_data = true; + return Ok(None); + } + } else { + let found = self.iter.next().map_err(|e| e.into()).context(Read)?; + if !found { + self.no_more_data = true; + return Ok(None); + } + } + + let curr_log_key = self.log_encoding.decode_key(self.iter.key())?; + self.no_more_data = self.is_end_reached(&curr_log_key); + + if self.is_valid_log_key(&curr_log_key) { + let payload = self.log_encoding.decode_value(self.iter.value(), decoder)?; + let log_entry = LogEntry { + sequence: curr_log_key.1, + payload, + }; + Ok(Some(log_entry)) + } else { + Ok(None) + } + } +} + +impl LogReader for RocksImpl { + type Iterator = RocksLogIterator; + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + if let Some(region) = self.region(req.region_id) { + region.read(ctx, req) + } else { + let iter = DBIterator::new(self.db.clone(), ReadOptions::default()); + Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)) + } + } +} + +#[async_trait] +impl 
LogWriter for RocksImpl { + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch

, + ) -> Result { + let region = self.get_or_create_region(batch.region_id); + region.write(ctx, batch).await + } +} + +#[async_trait] +impl WalManager for RocksImpl { + fn sequence_num(&self, region_id: RegionId) -> Result { + if let Some(region) = self.region(region_id) { + return region.sequence_num(); + } + + Ok(MIN_SEQUENCE_NUMBER) + } + + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()> { + if let Some(region) = self.region(region_id) { + return region.delete_entries_up_to(sequence_num).await; + } + + Ok(()) + } +} + +impl fmt::Debug for RocksImpl { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("RocksImpl") + .field("wal_path", &self.wal_path) + .finish() + } +} diff --git a/wal/src/rocks_impl/mod.rs b/wal/src/rocks_impl/mod.rs new file mode 100644 index 0000000000..e25bca788a --- /dev/null +++ b/wal/src/rocks_impl/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +pub mod encoding; +pub mod manager; diff --git a/wal/src/tests/mod.rs b/wal/src/tests/mod.rs new file mode 100644 index 0000000000..c52a689521 --- /dev/null +++ b/wal/src/tests/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! integration tests for wal + +mod read_write; +pub mod util; diff --git a/wal/src/tests/read_write.rs b/wal/src/tests/read_write.rs new file mode 100644 index 0000000000..a38bb1282c --- /dev/null +++ b/wal/src/tests/read_write.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ops::Deref, sync::Arc}; + +use common_types::SequenceNumber; + +use crate::{ + log_batch::LogWriteBatch, + manager::{LogReader, LogWriter, ReadBoundary, ReadRequest, RegionId, WalManager}, + tests::util::{RocksTestEnv, TestEnv, TestPayload, WalBuilder}, +}; + +fn check_write_batch_with_read_request( + env: &TestEnv, + wal: Arc, + read_req: ReadRequest, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(max_seq, write_batch, iter); +} + +fn check_write_batch( + env: &TestEnv, + wal: Arc, + region_id: RegionId, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(max_seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(max_seq), + }; + check_write_batch_with_read_request(env, wal, read_req, max_seq, write_batch) +} + +async fn simple_read_write_with_wal( + env: impl Deref>, + wal: Arc, + region_id: RegionId, +) { + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(&env, wal, region_id, seq, &write_batch) +} + +async fn simple_read_write(env: &TestEnv, region_id: RegionId) { + let wal = env.build_wal(); + simple_read_write_with_wal(env, wal.clone(), region_id).await; +} + +/// Test the read with different kinds of boundaries. 
+async fn read_with_boundary(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + let write_batch = env.build_log_batch(region_id, 0, 10); + let end_seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let start_seq = end_seq + 1 - write_batch.entries.len() as u64; + + // [min, max] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Included(end_seq), + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // (0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Included(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 10); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 0, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } + + // (0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } +} + +/// Test read and write across multiple regions parallely. 
+async fn write_multiple_regions_parallelly(env: Arc>) { + let wal = env.build_wal(); + let mut handles = Vec::with_capacity(10); + for i in 0..5 { + let read_write_0 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + let read_write_1 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + handles.push(read_write_0); + handles.push(read_write_1); + } + futures::future::join_all(handles) + .await + .into_iter() + .for_each(|res| { + res.expect("should succeed to join the write"); + }); +} + +/// Test whether the written logs can be read after reopen. +async fn reopen(env: &TestEnv) { + let region_id = 0; + let (write_batch, seq) = { + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + (write_batch, seq) + }; + + // reopen the wal + let wal = env.build_wal(); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(seq), + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(seq, &write_batch, iter); +} + +/// A complex test case for read and write: +/// - Write two log batch +/// - Read the first batch and then read the second batch. +/// - Read the whole batch. +/// - Read the part of first batch and second batch. 
+async fn complex_read_write(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + + // write two batches + let (start_val, mid_val, end_val) = (0, 10, 50); + let write_batch_1 = env.build_log_batch(region_id, start_val, mid_val); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + let write_batch_2 = env.build_log_batch(region_id, mid_val, end_val); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // read the first batch + check_write_batch(env, wal.clone(), region_id, seq_1, &write_batch_1); + // read the second batch + check_write_batch(env, wal.clone(), region_id, seq_2, &write_batch_2); + + // read the whole batch + let (seq_3, write_batch_3) = (seq_2, env.build_log_batch(region_id, start_val, end_val)); + check_write_batch(env, wal.clone(), region_id, seq_3, &write_batch_3); + + // read the part of batch1 and batch2 + let (seq_4, write_batch_4) = { + let new_start = (start_val + mid_val) / 2; + let new_end = (mid_val + end_val) / 2; + let seq = seq_2 - (end_val - new_end) as u64; + (seq, env.build_log_batch(region_id, new_start, new_end)) + }; + check_write_batch(env, wal.clone(), region_id, seq_4, &write_batch_4); +} + +/// Test whether data can be deleted. 
+async fn simple_write_delete(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete all logs + wal.mark_delete_entries_up_to(region_id, seq) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.clear(); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Delete half of the written data and check the remaining half can be read. +async fn write_delete_half(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete the first half of the logs (up to seq / 2) + wal.mark_delete_entries_up_to(region_id, seq / 2) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.drain(..write_batch.entries.len() / 2); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Test delete across multiple regions. 
+async fn write_delete_multiple_regions(env: &TestEnv) { + let (region_id_1, region_id_2) = (1, 2); + let wal = env.build_wal(); + let mut write_batch_1 = env.build_log_batch(region_id_1, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + + let write_batch_2 = env.build_log_batch(region_id_2, 10, 20); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // delete all logs of region 1. + wal.mark_delete_entries_up_to(region_id_1, seq_1) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id: region_id_1, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch_1.entries.clear(); + env.check_log_entries(seq_1, &write_batch_1, iter); + + check_write_batch(env, wal.clone(), region_id_2, seq_2, &write_batch_2); +} + +/// The sequence number should increase monotonically after multiple writes. +async fn sequence_increase_monotonically_multiple_writes(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_3 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); + assert!(seq_3 > seq_2); +} + +/// The sequence number should increase monotonically after write, delete and +/// one more write. 
+async fn sequence_increase_monotonically_delete_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +/// The sequence number should increase monotonically after write, delete, +/// reopen and write. +async fn sequence_increase_monotonically_delete_reopen_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // restart + drop(wal); + let wal = env.build_wal(); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +#[test] +fn test_simple_read_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_read_write(&rocks_env, 0)); +} + +#[test] +fn test_read_with_boundary() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(read_with_boundary(&rocks_env)); +} + +#[test] +fn test_write_multiple_regions() { + let rocks_env = Arc::new(RocksTestEnv::new(4)); + rocks_env + .runtime + .block_on(write_multiple_regions_parallelly(rocks_env.clone())); +} + +#[test] +fn test_reopen() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(reopen(&rocks_env)); +} + +#[test] +fn test_complex_read_write() { + let rocks_env = RocksTestEnv::new(2); + 
rocks_env.runtime.block_on(complex_read_write(&rocks_env)); +} + +#[test] +fn test_simple_write_delete() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_write_delete(&rocks_env)); +} + +#[test] +fn test_write_delete_half() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(write_delete_half(&rocks_env)); +} +#[test] +fn test_write_delete_multiple_regions() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(write_delete_multiple_regions(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_multiple_writes() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_multiple_writes(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_write(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_reopen_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_reopen_write( + &rocks_env, + )); +} diff --git a/wal/src/tests/util.rs b/wal/src/tests/util.rs new file mode 100644 index 0000000000..cd631363f6 --- /dev/null +++ b/wal/src/tests/util.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for testing wal module. 
+ +use std::{path::Path, sync::Arc}; + +use common_types::bytes::{MemBuf, MemBufMut}; +use common_util::runtime::{self, Runtime}; +use tempfile::TempDir; + +use crate::{ + log_batch::{LogWriteBatch, LogWriteEntry, Payload, PayloadDecoder}, + manager::{LogIterator, LogReader, ReadContext, RegionId, WalManager, WriteContext}, + rocks_impl::{self, manager::RocksImpl}, +}; + +pub trait WalBuilder: Default + Send + Sync { + type Wal: WalManager + Send + Sync; + fn build(&self, data_path: &Path, runtime: Arc) -> Arc; +} +use common_types::SequenceNumber; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +#[derive(Default)] +pub struct RocksWalBuilder; + +impl WalBuilder for RocksWalBuilder { + type Wal = RocksImpl; + + fn build(&self, data_path: &Path, runtime: Arc) -> Arc { + let wal_builder = + rocks_impl::manager::Builder::with_default_rocksdb_config(data_path, runtime); + + Arc::new( + wal_builder + .build() + .expect("should succeed to build rocksimpl wal"), + ) + } +} + +pub type RocksTestEnv = TestEnv; + +/// The environment for testing wal. +pub struct TestEnv { + pub dir: TempDir, + pub runtime: Arc, + pub write_ctx: WriteContext, + pub read_ctx: ReadContext, + /// Builder for a specific wal. + builder: B, +} + +impl TestEnv { + pub fn new(num_workers: usize) -> Self { + let runtime = runtime::Builder::default() + .worker_threads(num_workers) + .enable_all() + .build() + .unwrap(); + + Self { + dir: tempfile::tempdir().unwrap(), + runtime: Arc::new(runtime), + write_ctx: WriteContext::default(), + read_ctx: ReadContext::default(), + builder: B::default(), + } + } + + pub fn build_wal(&self) -> Arc { + self.builder.build(self.dir.path(), self.runtime.clone()) + } + + /// Build the log batch with [TestPayload].val range [start, end). 
+ pub fn build_log_batch( + &self, + region_id: RegionId, + start: u32, + end: u32, + ) -> LogWriteBatch { + let mut write_batch = LogWriteBatch::new(region_id); + for val in start..end { + let payload = TestPayload { val }; + write_batch.entries.push(LogWriteEntry { payload }); + } + + write_batch + } + + /// Check whether the log entries from the iterator equals the + /// `write_batch`. + pub fn check_log_entries( + &self, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, + mut iter: ::Iterator, + ) { + let dec = TestPayloadDecoder; + let mut log_entries = Vec::with_capacity(write_batch.entries.len()); + loop { + let log_entry = iter + .next_log_entry(&dec) + .expect("should succeed to fetch next log entry"); + if log_entry.is_none() { + break; + } + + log_entries.push(log_entry.unwrap()); + } + + assert_eq!(write_batch.entries.len(), log_entries.len()); + for (idx, (expect_log_write_entry, log_entry)) in write_batch + .entries + .iter() + .zip(log_entries.iter()) + .rev() + .enumerate() + { + assert_eq!(max_seq - idx as u64, log_entry.sequence); + assert_eq!(expect_log_write_entry.payload, log_entry.payload); + } + } +} + +/// The payload for Wal log entry for testing. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TestPayload { + val: u32, +} + +impl Payload for TestPayload { + type Error = Error; + + fn encode_size(&self) -> usize { + 4 + } + + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error> { + buf.write_u32(self.val).expect("must write"); + Ok(()) + } +} + +pub struct TestPayloadDecoder; + +impl PayloadDecoder for TestPayloadDecoder { + type Error = Error; + type Target = TestPayload; + + fn decode(&self, buf: &mut B) -> Result { + let val = buf.read_u32().expect("should succeed to read u32"); + Ok(TestPayload { val }) + } +}

, + /// Object metadata for the listing + pub objects: Vec>, +} + +/// The metadata that describes an object. +#[derive(Debug)] +pub struct ObjectMeta { + /// The full path to the object + pub location: P, + /// The last modified time + pub last_modified: SystemTime, + /// The size in bytes of the object + pub size: usize, +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use futures::{stream, StreamExt, TryStreamExt}; + + use super::*; + use crate::path::{file::FilePath, parsed::DirsAndFileName}; + + type Error = Box; + type Result = std::result::Result; + + async fn flatten_list_stream< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + prefix: Option<&P>, + ) -> Result> { + storage + .list(prefix) + .await? + .map_ok(|v| stream::iter(v).map(Ok)) + .try_flatten() + .try_collect() + .await + } + + pub(crate) async fn put_get_delete_list< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let data = Bytes::from("arbitrary data"); + let mut location = storage.new_path(); + location.push_dir("test_dir"); + location.set_file_name("test_file.json"); + + storage + .put(&location, data.as_ref(), Some(data.len())) + .await?; + + // List everything + let content_list = flatten_list_stream(storage, None).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that should return results + let mut prefix = storage.new_path(); + prefix.push_dir("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a 
prefix that shouldn't return results + let mut prefix = storage.new_path(); + prefix.push_dir("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert!(content_list.is_empty()); + + let mut read_data = Vec::with_capacity(data.len()); + + storage.get(&location).await?.read_to_end(&mut read_data)?; + assert_eq!(&*read_data, data); + + storage.delete(&location).await?; + + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + pub(crate) async fn list_with_delimiter< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + storage + .put(f, data.as_ref(), Some(data.len())) + .await + .unwrap(); + } + + // ==================== check: prefix-list `mydb/wb` (directory) + // ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb"]); + + let mut expected_000 = prefix.clone(); + expected_000.push_dir("000"); + let mut expected_001 = prefix.clone(); + expected_001.push_dir("001"); + let mut expected_location = prefix.clone(); + expected_location.set_file_name("foo.json"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + 
assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial + // filename) ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb", "000", "000"]); + prefix.set_file_name("001"); + + let mut expected_location = storage.new_path(); + expected_location.push_all_dirs(&["mydb", "wb", "000", "000"]); + expected_location.set_file_name("001.segment"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + + // ==================== check: prefix-list `not_there` (non-existing prefix) + // ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["not_there"]); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + /// Parse a str as a `CloudPath` into a `DirAndFileName`, even though the + /// associated storage might not be cloud storage, to reuse the cloud + /// path parsing logic. Then convert into the correct type of path for + /// the given storage. 
+ fn str_to_path< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + val: &str, + ) -> P { + let cloud_path = FilePath::raw(val, false); + let parsed: DirsAndFileName = cloud_path.into(); + + let mut new_path = storage.new_path(); + for part in parsed.directories { + new_path.push_dir(part.to_string()); + } + + if let Some(file_name) = parsed.file_name { + new_path.set_file_name(file_name.to_string()); + } + new_path + } + + async fn delete_fixtures< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) { + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + // don't care if it errors, should fail elsewhere + let _ = storage.delete(f).await; + } + } + + // Tests TODO: + // GET nonexisting location (in_memory/file) + // DELETE nonexisting location + // PUT overwriting +} diff --git a/components/object_store/src/path/file.rs b/components/object_store/src/path/file.rs new file mode 100644 index 0000000000..acdae35f69 --- /dev/null +++ b/components/object_store/src/path/file.rs @@ -0,0 +1,518 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + mem, + path::{is_separator, PathBuf}, +}; + +use crate::path::{parsed::DirsAndFileName, parts::PathPart, ObjectStorePath}; + +/// An object storage location suitable for passing to disk based object +/// storage. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Ord, PartialOrd)] +pub struct FilePath { + inner: FilePathRepresentation, +} + +impl ObjectStorePath for FilePath { + fn set_file_name(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).set_file_name(part); + } + + fn push_dir(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).push_dir(part); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.inner = mem::take(&mut self.inner).push_all_dirs(parts); + } + + fn display(&self) -> String { + self.to_raw().display().to_string() + } +} + +impl FilePath { + /// Creates a file storage location from a `PathBuf` without parsing or + /// allocating unless other methods are called on this instance that + /// need it. + /// + /// The "nature" of path (i.e. if it is a directory or file) will be + /// guessed. So paths ending with a separator (e.g. `/foo/bar/` on + /// Linux) are treated as a directory. However for all other paths (like + /// `/foo/bar` on Linux) it is not clear if a directory or file is meant + /// w/o inspecting the underlying store. To workaround that there is the + /// `assume_directory` flag which will treat ambiguous paths as directories. + /// If set to `false`, these cases will be treated as files. + pub fn raw(path: impl Into, assume_directory: bool) -> Self { + let path = path.into(); + Self { + inner: FilePathRepresentation::Raw(path, assume_directory), + } + } + + /// Creates a filesystem `PathBuf` location by using the standard library's + /// `PathBuf` building implementation appropriate for the current + /// platform. 
+ pub fn to_raw(&self) -> PathBuf { + use FilePathRepresentation::*; + + match &self.inner { + Raw(path, _) => path.to_owned(), + Parsed(dirs_and_file_name) => { + let mut path: PathBuf = dirs_and_file_name + .directories + .iter() + .map(PathPart::encoded) + .collect(); + if let Some(file_name) = &dirs_and_file_name.file_name { + path.push(file_name.encoded()); + } + path + } + } + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + pub fn push_path(&mut self, path: &Self) { + self.inner = mem::take(&mut self.inner).push_path(path) + } + + /// Add a `PathPart` to the end of the path's directories. + pub fn push_part_as_dir(&mut self, part: &PathPart) { + self.inner = mem::take(&mut self.inner).push_part_as_dir(part); + } + + /// Whether the prefix is the start of this path or not. + pub fn prefix_matches(&self, prefix: &Self) -> bool { + self.inner.prefix_matches(&prefix.inner) + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` dosen't start with `prefix`. + pub fn parts_after_prefix(&self, prefix: &Self) -> Option> { + self.inner.parts_after_prefix(&prefix.inner) + } + + /// Remove this path's file name, if there is one. 
+ pub fn unset_file_name(&mut self) { + self.inner = mem::take(&mut self.inner).unset_file_name(); + } +} + +impl From for DirsAndFileName { + fn from(file_path: FilePath) -> Self { + file_path.inner.into() + } +} + +impl From for FilePath { + fn from(dirs_and_file_name: DirsAndFileName) -> Self { + Self { + inner: FilePathRepresentation::Parsed(dirs_and_file_name), + } + } +} + +#[derive(Debug, Clone, Eq)] +enum FilePathRepresentation { + // raw: native path representation and also remember if we always assume it is a directory + // assume_directory: bool + Raw(PathBuf, bool), + Parsed(DirsAndFileName), +} + +impl Default for FilePathRepresentation { + fn default() -> Self { + Self::Parsed(DirsAndFileName::default()) + } +} + +impl PartialEq for FilePathRepresentation { + fn eq(&self, other: &Self) -> bool { + matches!(self.cmp(other), std::cmp::Ordering::Equal) + } +} +impl PartialOrd for FilePathRepresentation { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for FilePathRepresentation { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use FilePathRepresentation::*; + match (self, other) { + (Parsed(self_parts), Parsed(other_parts)) => self_parts.cmp(other_parts), + (Parsed(self_parts), _) => { + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + (_, Parsed(other_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.cmp(other_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + } + } +} + +impl FilePathRepresentation { + fn push_dir(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_dir(part); + Self::Parsed(dirs_and_file_name) + } + + fn push_all_dirs<'a>(self, parts: impl AsRef<[&'a str]>) -> Self { + let mut dirs_and_file_name: 
DirsAndFileName = self.into(); + + dirs_and_file_name.push_all_dirs(parts); + Self::Parsed(dirs_and_file_name) + } + + fn set_file_name(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.set_file_name(part); + Self::Parsed(dirs_and_file_name) + } + + fn unset_file_name(self) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.unset_file_name(); + Self::Parsed(dirs_and_file_name) + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + fn push_path(self, path: &FilePath) -> Self { + let DirsAndFileName { + directories: path_dirs, + file_name: path_file_name, + } = path.inner.to_owned().into(); + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.directories.extend(path_dirs); + dirs_and_file_name.file_name = path_file_name; + + Self::Parsed(dirs_and_file_name) + } + + /// Add a `PathPart` to the end of the path's directories. 
+ fn push_part_as_dir(self, part: &PathPart) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_part_as_dir(part); + + Self::Parsed(dirs_and_file_name) + } + + fn prefix_matches(&self, prefix: &Self) -> bool { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => self_parts.prefix_matches(prefix_parts), + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.prefix_matches(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` doesn't start with `prefix`. 
+ fn parts_after_prefix(&self, prefix: &Self) -> Option> { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => { + self_parts.parts_after_prefix(prefix_parts) + } + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.parts_after_prefix(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + } + } +} + +impl From for DirsAndFileName { + fn from(file_path_rep: FilePathRepresentation) -> Self { + use FilePathRepresentation::*; + + match file_path_rep { + Raw(path, assume_directory) => { + let mut parts: Vec = path + .iter() + .flat_map(|s| s.to_os_string().into_string().map(PathPart)) + .collect(); + + if !assume_directory && !parts.is_empty() && !is_directory(&path) { + let file_name = Some(parts.pop().expect("cannot be empty")); + Self { + directories: parts, + file_name, + } + } else { + Self { + directories: parts, + file_name: None, + } + } + } + Parsed(dirs_and_file_name) => dirs_and_file_name, + } + } +} + +/// Checks if the path is for sure a directory (i.e. ends with a separator). 
+fn is_directory(path: &std::path::Path) -> bool { + if let Some(s) = path.to_str() { + if let Some(c) = s.chars().last() { + return is_separator(c); + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parsed_path; + + #[test] + fn path_buf_to_dirs_and_file_name_conversion() { + // Last section ending in `.json` is a file name + let path_buf: PathBuf = "/one/two/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.json"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.segment` is a file name + let path_buf: PathBuf = "/one/two/blah.segment".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.segment"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.parquet` is a file name + let path_buf: PathBuf = "/one/two/blah.parquet".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.parquet"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.txt` is NOT a file name; we don't recognize that + // extension + let path_buf: PathBuf = "/one/two/blah.txt".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.txt"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section containing a `.` isn't a file name + 
let path_buf: PathBuf = "/one/two/blah.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section starting with a `.` isn't a file name (macos temp dirs do this) + let path_buf: PathBuf = "/one/two/.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], ".blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/d".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b"], "d"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/c".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b", "c"]); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + } + + #[test] + fn conversions() { + // dir and file name + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"], "blah.json"); + assert_eq!(parts, expected_parts); + + // dir, no file name + let path_buf: PathBuf = "foo/bar/".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // same but w/o the 
final marker + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo"], "bar"); + assert_eq!(parts, expected_parts); + + // same but w/o the final marker, but forced to be a directory + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // no dir, file name + let path_buf: PathBuf = "blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!([], "blah.json"); + assert_eq!(parts, expected_parts); + + // empty + let path_buf: PathBuf = "".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(); + assert_eq!(parts, expected_parts); + + // weird file name + let path_buf: PathBuf = "blah.x".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!("blah.x"); + assert_eq!(parts, expected_parts); + } + + #[test] + fn equality() { + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.clone().into(); + let parsed: FilePath = parts.into(); + + assert_eq!(file_path, parsed); + } + + #[test] + fn ordering() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let a_file_path = FilePath::raw(&a_path_buf, false); + let a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + let b_path_buf: PathBuf = "foo/bar/b.json".into(); + let b_file_path = FilePath::raw(&b_path_buf, false); + + assert!(a_path_buf < b_path_buf); + assert!( + a_parsed < b_file_path, + "a was not less 
than b: a = {:#?}\nb = {:#?}", + a_parsed, + b_file_path + ); + } + + #[test] + fn path_display() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let expected_display = a_path_buf.display().to_string(); + let a_file_path = FilePath::raw(&a_path_buf, false); + + assert_eq!(a_file_path.display(), expected_display); + + let a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + assert_eq!(a_parsed.display(), expected_display); + } + + #[test] + fn test_file_path_represent_ord() { + let file1 = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file1_bak = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file2 = FilePathRepresentation::Raw(PathBuf::from("/zz/aa/bb"), false); + + assert!(file1 == file1_bak); + assert!(file1 < file2) + } + + #[test] + fn test_file_path_parts_after_prefix() { + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, Some(vec![PathPart("c".to_string())])); + + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", false); + let ret = file.parts_after_prefix(&file2); + assert_eq!( + ret, + Some(vec![PathPart("b".to_string()), PathPart("c".to_string())]) + ); + + let file = FilePath::raw("/a/b/d", false); + let file2 = FilePath::raw("/a/b/c/dd", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + + let file = FilePath::raw("/a/b/d", true); + let file2 = FilePath::raw("/a/b/c", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + } +} diff --git a/components/object_store/src/path/mod.rs b/components/object_store/src/path/mod.rs new file mode 100644 index 0000000000..e5922d6df8 --- /dev/null +++ b/components/object_store/src/path/mod.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
This module contains code for abstracting object locations that work +//! across different backing implementations and platforms. + +pub mod file; +pub mod parsed; +pub mod parts; + +/// The delimiter to separate object namespaces, creating a directory structure. +pub const DELIMITER: &str = "/"; + +/// Universal interface for handling paths and locations for objects and +/// directories in the object store. +/// +/// +/// Deliberately does not implement `Display` or `ToString`! +pub trait ObjectStorePath: + std::fmt::Debug + Clone + PartialEq + Eq + Send + Sync + 'static +{ + /// Set the file name of this path + fn set_file_name(&mut self, part: impl Into); + + /// Add a part to the end of the path's directories, encoding any restricted + /// characters. + fn push_dir(&mut self, part: impl Into); + + /// Push a bunch of parts as directories in one go. + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>); + + /// Like `std::path::Path::display`, converts an `ObjectStorePath` to a + /// `String` suitable for printing; not suitable for sending to + /// APIs. + fn display(&self) -> String; +} diff --git a/components/object_store/src/path/parsed.rs b/components/object_store/src/path/parsed.rs new file mode 100644 index 0000000000..0c9781a9b6 --- /dev/null +++ b/components/object_store/src/path/parsed.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use itertools::Itertools; + +use crate::path::{parts::PathPart, ObjectStorePath, DELIMITER}; + +/// A path stored as a collection of 0 or more directories and 0 or 1 file name +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct DirsAndFileName { + /// Directory hierarchy. + pub directories: Vec, + + /// Filename, if set. 
+ pub file_name: Option, +} + +impl ObjectStorePath for DirsAndFileName { + fn set_file_name(&mut self, part: impl Into) { + let part = part.into(); + self.file_name = Some((&*part).into()); + } + + fn push_dir(&mut self, part: impl Into) { + let part = part.into(); + self.directories.push((&*part).into()); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.directories + .extend(parts.as_ref().iter().map(|&v| v.into())); + } + + fn display(&self) -> String { + let mut s = self + .directories + .iter() + .map(PathPart::encoded) + .join(DELIMITER); + + if !s.is_empty() { + s.push_str(DELIMITER); + } + if let Some(file_name) = &self.file_name { + s.push_str(file_name.encoded()); + } + s + } +} + +impl DirsAndFileName { + pub(crate) fn prefix_matches(&self, prefix: &Self) -> bool { + let diff = itertools::diff_with( + self.directories.iter(), + prefix.directories.iter(), + |a, b| a == b, + ); + + use itertools::Diff; + match diff { + None => match (self.file_name.as_ref(), prefix.file_name.as_ref()) { + (Some(self_file), Some(prefix_file)) => { + self_file.encoded().starts_with(prefix_file.encoded()) + } + (Some(_self_file), None) => true, + (None, Some(_prefix_file)) => false, + (None, None) => true, + }, + Some(Diff::Shorter(_, mut remaining_self)) => { + let next_dir = remaining_self + .next() + .expect("must have at least one mismatch to be in this case"); + match prefix.file_name.as_ref() { + Some(prefix_file) => next_dir.encoded().starts_with(prefix_file.encoded()), + None => true, + } + } + Some(Diff::FirstMismatch(_, mut remaining_self, mut remaining_prefix)) => { + let first_prefix = remaining_prefix + .next() + .expect("must have at least one mismatch to be in this case"); + + // There must not be any other remaining parts in the prefix + remaining_prefix.next().is_none() + // and the next item in self must start with the last item in the prefix + && remaining_self + .next() + .expect("must be at least one value") + .encoded() + 
.starts_with(first_prefix.encoded()) + } + _ => false, + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` doesn't start with `prefix`. + pub(crate) fn parts_after_prefix(&self, prefix: &Self) -> Option> { + if self.directories.len() < prefix.directories.len() { + return None; + } + + let mut dirs_iter = self.directories.iter(); + let mut prefix_dirs_iter = prefix.directories.iter(); + + let mut parts = vec![]; + + for dir in &mut dirs_iter { + let pre = prefix_dirs_iter.next(); + + match pre { + None => { + parts.push(dir.to_owned()); + break; + } + Some(p) if p == dir => continue, + Some(_) => return None, + } + } + + parts.extend(dirs_iter.cloned()); + + if let Some(file_name) = &self.file_name { + parts.push(file_name.to_owned()); + } + + Some(parts) + } + + /// Add a `PathPart` to the end of the path's directories. + pub(crate) fn push_part_as_dir(&mut self, part: &PathPart) { + self.directories.push(part.to_owned()); + } + + /// Remove the file name, if any. + pub(crate) fn unset_file_name(&mut self) { + self.file_name = None; + } +} + +/// Short-cut macro to create [`DirsAndFileName`] instances. +/// +/// # Example +/// ``` +/// use object_store::parsed_path; +/// +/// // empty path +/// parsed_path!(); +/// +/// // filename only +/// parsed_path!("test.txt"); +/// +/// // directories only +/// parsed_path!(["path", "to"]); +/// +/// // filename + directories +/// parsed_path!(["path", "to"], "test.txt"); +/// ``` +#[macro_export] +macro_rules! 
parsed_path { + ([$($dir:expr),*], $file:expr) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: Some($crate::path::parts::PathPart::from($file)), + } + }; + ([$($dir:expr),*]) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: None, + } + }; + ($file:expr) => { + parsed_path!([], $file) + }; + () => { + parsed_path!([]) + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parts_after_prefix_behavior() { + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + existing_path.file_name = Some("egg.json".into()); + + // Prefix with one directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("apple"); + let expected_parts: Vec = vec!["bear", "cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Prefix with two directories + let mut prefix = DirsAndFileName::default(); + prefix.push_all_dirs(&["apple", "bear"]); + let expected_parts: Vec = vec!["cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Not a prefix + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("cow"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix with a partial directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("ap"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix matches but there aren't any parts after it + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + let prefix = existing_path.clone(); + let parts = 
existing_path.parts_after_prefix(&prefix).unwrap(); + assert!(parts.is_empty()); + } + + #[test] + fn prefix_matches() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); + + // self starts with self + assert!( + haystack.prefix_matches(&haystack), + "{:?} should have started with {:?}", + haystack, + haystack + ); + + // a longer prefix doesn't match + let mut needle = haystack.clone(); + needle.push_dir("longer now"); + assert!( + !haystack.prefix_matches(&needle), + "{:?} shouldn't have started with {:?}", + haystack, + needle + ); + + // one dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("foo/bar"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // two dir prefix matches + needle.push_dir("baz%2Ftest"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // partial dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("f"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // one dir and one partial dir matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz"]); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn prefix_matches_with_file_name() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); + + let mut needle = haystack.clone(); + + // All directories match and file name is a prefix + haystack.set_file_name("foo.segment"); + needle.set_file_name("foo"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // All directories match but file name is not a prefix + 
needle.set_file_name("e"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is a prefix of the next directory; this + // matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz%2Ftest"]); + needle.set_file_name("s"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is NOT a prefix of the next directory; + // no match + needle.set_file_name("p"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn test_macro() { + let actual = parsed_path!(["foo", "bar"], "baz"); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: Some(PathPart::from("baz")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([], "foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!("foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(["foo", "bar"]); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([]); + let expected = DirsAndFileName { + directories: vec![], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(); + let expected = DirsAndFileName { + directories: vec![], + file_name: None, + }; + assert_eq!(actual, expected); + } +} diff --git a/components/object_store/src/path/parts.rs b/components/object_store/src/path/parts.rs new file mode 100644 index 
0000000000..b9e69becfb --- /dev/null +++ b/components/object_store/src/path/parts.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; + +use super::DELIMITER; + +// percent_encode's API needs this as a byte +const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0]; + +// special encoding of the empty string part. +// Using '%' is the safest character since it will always be used in the +// output of percent_encode no matter how we evolve the INVALID AsciiSet over +// time. +const EMPTY: &str = "%"; + +/// The PathPart type exists to validate the directory/file names that form part +/// of a path. +/// +/// A PathPart instance is guaranteed to be non-empty and to contain no `/` +/// characters as it can only be constructed by going through the `from` impl. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct PathPart(pub(super) String); + +/// Characters we want to encode. 
+const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + // TODO: Non-printable ASCII characters (128–255 decimal characters) + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') // " <-- my editor is confused about double quotes within single quotes + .add(b'>') + .add(b'[') + .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +impl From<&str> for PathPart { + fn from(v: &str) -> Self { + match v { + // We don't want to encode `.` generally, but we do want to disallow parts of paths + // to be equal to `.` or `..` to prevent file system traversal shenanigans. + "." => Self(String::from("%2E")), + ".." => Self(String::from("%2E%2E")), + + // Every string except the empty string will be percent encoded. + // The empty string will be transformed into a sentinel value EMPTY + // which can safely be a prefix of an encoded value since it will be + // fully matched at decode time (see impl Display for PathPart). + "" => Self(String::from(EMPTY)), + other => Self(percent_encode(other.as_bytes(), INVALID).to_string()), + } + } +} + +impl std::fmt::Display for PathPart { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0[..] { + EMPTY => "".fmt(f), + _ => percent_decode_str(&self.0) + .decode_utf8() + .expect("Valid UTF-8 that came from String") + .fmt(f), + } + } +} + +impl PathPart { + /// Encode as string. 
+ pub fn encoded(&self) -> &str { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn path_part_delimiter_gets_encoded() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part, PathPart(String::from("foo%2Fbar"))); + } + + #[test] + fn path_part_gets_decoded_for_display() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part.to_string(), "foo/bar"); + } + + #[test] + fn path_part_given_already_encoded_string() { + let part: PathPart = "foo%2Fbar".into(); + assert_eq!(part, PathPart(String::from("foo%252Fbar"))); + assert_eq!(part.to_string(), "foo%2Fbar"); + } + + #[test] + fn path_part_cant_be_one_dot() { + let part: PathPart = ".".into(); + assert_eq!(part, PathPart(String::from("%2E"))); + assert_eq!(part.to_string(), "."); + } + + #[test] + fn path_part_cant_be_two_dots() { + let part: PathPart = "..".into(); + assert_eq!(part, PathPart(String::from("%2E%2E"))); + assert_eq!(part.to_string(), ".."); + } + + #[test] + fn path_part_cant_be_empty() { + let part: PathPart = "".into(); + assert_eq!(part, PathPart(String::from(EMPTY))); + assert_eq!(part.to_string(), ""); + } + + #[test] + fn empty_is_safely_encoded() { + let part: PathPart = EMPTY.into(); + assert_eq!( + part, + PathPart(percent_encode(EMPTY.as_bytes(), INVALID).to_string()) + ); + assert_eq!(part.to_string(), EMPTY); + } +} diff --git a/components/parquet/Cargo.toml b/components/parquet/Cargo.toml new file mode 100644 index 0000000000..c33523280e --- /dev/null +++ b/components/parquet/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "parquet" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../../arrow_deps" } +lru = "0.7.0" +parquet-format = "4.0.0" +thrift = "0.13" \ No newline at end of file diff --git a/components/parquet/src/cache.rs b/components/parquet/src/cache.rs new file mode 100644 index 0000000000..393d49b63e --- 
/dev/null +++ b/components/parquet/src/cache.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::Debug, + sync::{Arc, RwLock}, +}; + +use arrow_deps::parquet::file::metadata::ParquetMetaData; +use lru::LruCache; + +pub trait MetaCache: Debug { + fn get(&self, key: &str) -> Option>; + + fn put(&self, key: String, value: Arc); +} + +pub trait DataCache: Debug { + fn get(&self, key: &str) -> Option>>; + + fn put(&self, key: String, value: Arc>); +} + +#[derive(Debug)] +pub struct LruMetaCache { + cache: RwLock>>, +} + +impl LruMetaCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl MetaCache for LruMetaCache { + fn get(&self, key: &str) -> Option> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc) { + self.cache.write().unwrap().put(key, value); + } +} + +#[derive(Debug)] +pub struct LruDataCache { + cache: RwLock>>>, +} + +impl LruDataCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl DataCache for LruDataCache { + fn get(&self, key: &str) -> Option>> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc>) { + self.cache.write().unwrap().put(key, value); + } +} diff --git a/components/parquet/src/lib.rs b/components/parquet/src/lib.rs new file mode 100644 index 0000000000..b2b8d28c46 --- /dev/null +++ b/components/parquet/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +pub mod cache; +pub mod reverse_reader; +mod serialized_reader; +#[cfg(test)] +pub mod tests; + +// use cache::Cache; +use std::sync::Arc; + +pub use serialized_reader::CachableSerializedFileReader; + +use crate::cache::{DataCache, MetaCache}; + +pub type MetaCacheRef = Arc; +pub type DataCacheRef = Arc; diff --git a/components/parquet/src/reverse_reader.rs b/components/parquet/src/reverse_reader.rs new file mode 100644 index 0000000000..ca201c3bea --- /dev/null +++ b/components/parquet/src/reverse_reader.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::VecDeque, sync::Arc}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::Result as ArrowResult, + record_batch::{RecordBatch, RecordBatchReader}, + }, + parquet::{ + arrow::{ + self, arrow_reader::ParquetRecordBatchReader, ArrowReader, ParquetFileArrowReader, + }, + errors::Result, + file::{ + metadata::{FileMetaData, ParquetMetaData}, + reader::{FileReader, RowGroupReader}, + }, + record::reader::RowIter, + schema::types::Type as SchemaType, + }, +}; + +/// The reverse reader for [FileReader]. +/// +/// The details of implementation is: +/// - Split the original [FileReader] into [RowGroup]s. +/// - Reverse all the [RowGroup]s into `reversed_readers` so the order of +/// [RowGroup] is already reversed. +/// - Reverse all the [RecordBatch]es of the [RowGroup] into the +/// `current_reversed_batches`. +/// - Pop one [RecordBatch] from the `current_reversed_batches`and reverse its +/// data and send it to caller. +pub struct ReversedFileReader { + schema: SchemaRef, + /// The readers are arranged in reversed order and built from the + /// [RowGroup]. + reversed_readers: Vec, + /// Buffer all the record batches of one reader and every record batch is + /// reversed. 
+ current_reversed_batches: VecDeque>, + next_reader_idx: usize, +} + +impl ReversedFileReader { + fn fetch_next_batches_if_necessary(&mut self) { + if !self.current_reversed_batches.is_empty() { + // current reader is not exhausted and no need to fetch data. + return; + } + + if self.next_reader_idx >= self.reversed_readers.len() { + // all the readers have been exhausted. + return; + } + + let reader = &mut self.reversed_readers[self.next_reader_idx]; + for batch in reader { + // reverse the order of the data of every record batch. + let reversed_batch = match batch { + Ok(v) => arrow_deps::util::reverse_record_batch(&v), + Err(e) => Err(e), + }; + // reverse the order of the record batches. + self.current_reversed_batches.push_front(reversed_batch); + } + + self.next_reader_idx += 1; + } +} + +impl Iterator for ReversedFileReader { + type Item = ArrowResult; + + fn next(&mut self) -> Option { + self.fetch_next_batches_if_necessary(); + self.current_reversed_batches.pop_front() + } +} + +impl RecordBatchReader for ReversedFileReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// Reader for one [RowGroup] of the [FileReader]. +struct SingleRowGroupFileReader { + file_reader: Arc, + /// The index of row group in `file_reader` to read. + row_group_idx: usize, + /// The meta data for the reader of the one row group. + meta_data: ParquetMetaData, +} + +impl SingleRowGroupFileReader { + fn new(file_reader: Arc, row_group_idx: usize) -> Self { + let meta_data = { + let orig_meta_data = file_reader.metadata(); + let orig_file_meta_data = orig_meta_data.file_metadata(); + let row_group_meta_data = orig_meta_data.row_group(row_group_idx); + let file_meta_data = FileMetaData::new( + orig_file_meta_data.version(), + // provide the row group's row number because of the reader only contains one row + // group. 
+ row_group_meta_data.num_rows(), + orig_file_meta_data.created_by().clone(), + orig_file_meta_data.key_value_metadata().clone(), + orig_file_meta_data.schema_descr_ptr(), + orig_file_meta_data.column_orders().cloned(), + ); + ParquetMetaData::new(file_meta_data, vec![row_group_meta_data.clone()]) + }; + + Self { + file_reader, + row_group_idx, + meta_data, + } + } +} + +impl FileReader for SingleRowGroupFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.meta_data + } + + fn num_row_groups(&self) -> usize { + 1 + } + + fn get_row_group(&self, i: usize) -> Result> { + self.file_reader.get_row_group(self.row_group_idx + i) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// Builder for [ReverseRecordBatchReader] from the `file_reader`. +#[must_use] +pub struct Builder { + file_reader: Arc, + batch_size: usize, + projection: Option>, +} + +impl Builder { + pub fn new(file_reader: Arc, batch_size: usize) -> Self { + Self { + file_reader, + batch_size, + projection: None, + } + } + + pub fn projection(mut self, projection: Option>) -> Self { + self.projection = projection; + + self + } + + pub fn build(self) -> Result { + let mut reversed_readers = Vec::with_capacity(self.file_reader.num_row_groups()); + for row_group_idx in (0..self.file_reader.num_row_groups()).rev() { + let row_group_file_reader = + SingleRowGroupFileReader::new(self.file_reader.clone(), row_group_idx); + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(row_group_file_reader)); + let batch_reader = if let Some(proj) = &self.projection { + arrow_reader.get_record_reader_by_columns(proj.iter().cloned(), self.batch_size)? + } else { + arrow_reader.get_record_reader(self.batch_size)? 
+ }; + reversed_readers.push(batch_reader); + } + + let schema = { + let file_metadata = self.file_reader.metadata().file_metadata(); + Arc::new(arrow::parquet_to_arrow_schema( + file_metadata.schema_descr(), + file_metadata.key_value_metadata(), + )?) + }; + + Ok(ReversedFileReader { + schema, + reversed_readers, + current_reversed_batches: VecDeque::new(), + next_reader_idx: 0, + }) + } +} + +#[cfg(test)] +mod tests { + use arrow_deps::parquet::file::reader::SerializedFileReader; + + use super::*; + + const TEST_FILE: &str = "binary.parquet"; + const TEST_BATCH_SIZE: usize = 1000; + + fn check_reversed_row_iter(original: RowIter, reversed: ReversedFileReader) { + let mut original_reversed_rows: Vec<_> = original.into_iter().collect(); + original_reversed_rows.reverse(); + + let reversed_record_batches: Vec<_> = reversed + .into_iter() + .map(|v| v.expect("Fail to fetch record batch")) + .collect(); + + crate::tests::check_rows_and_record_batches( + &original_reversed_rows, + &reversed_record_batches, + ); + } + + #[test] + fn test_reverse_file_reader() { + let test_file = crate::tests::get_test_file(TEST_FILE); + let file_reader: Arc = Arc::new( + SerializedFileReader::new(test_file).expect("Should succeed to init file reader"), + ); + let reversed_reader = Builder::new(file_reader.clone(), TEST_BATCH_SIZE) + .build() + .expect("Should succeed to build reversed file reader"); + check_reversed_row_iter(file_reader.get_row_iter(None).unwrap(), reversed_reader); + } +} diff --git a/components/parquet/src/serialized_reader.rs b/components/parquet/src/serialized_reader.rs new file mode 100644 index 0000000000..a79c13ed07 --- /dev/null +++ b/components/parquet/src/serialized_reader.rs @@ -0,0 +1,738 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! fork from https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs + +//! Contains implementations of the reader traits FileReader, RowGroupReader and +//! PageReader Also contains implementations of the ChunkReader for files (with +//! buffering) and byte arrays (RAM) + +use std::{fs::File, io::Read, option::Option::Some, sync::Arc}; + +use arrow_deps::parquet::{ + basic::{Compression, Encoding, Type}, + column::page::{Page, PageReader}, + compression::{create_codec, Codec}, + errors::{ParquetError, Result}, + file::{footer, metadata::*, reader::*, statistics}, + record::{reader::RowIter, Row}, + schema::types::Type as SchemaType, + util::{cursor::SliceableCursor, memory::ByteBufferPtr}, +}; +use parquet_format::{PageHeader, PageType}; +use thrift::protocol::TCompactInputProtocol; + +use crate::{DataCacheRef, MetaCacheRef}; + +fn format_page_data_key(name: &str, col_start: u64, col_length: u64) -> String { + format!("{}_{}_{}", name, col_start, col_length) +} + +/// Conversion into a [`RowIter`](crate::record::reader::RowIter) +/// using the full file schema over all row groups. 
+impl IntoIterator for CachableSerializedFileReader { + type IntoIter = RowIter<'static>; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + RowIter::from_file_into(Box::new(self)) + } +} + +// ---------------------------------------------------------------------- +// Implementations of file & row group readers + +/// A serialized with cache implementation for Parquet [`FileReader`]. +/// Two kinds of items are cacheable: +/// - [`ParquetMetaData`]: only used for creating the reader. +/// - Column chunk bytes: used for reading data by +/// [`SerializedRowGroupReader`]. +/// +/// Note: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs. +pub struct CachableSerializedFileReader { + name: String, + chunk_reader: Arc, + metadata: Arc, + data_cache: Option, +} + +impl CachableSerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. + pub fn new( + name: String, + chunk_reader: R, + meta_cache: Option, + data_cache: Option, + ) -> Result { + // MODIFICATION START: consider cache for meta data. + let metadata = if let Some(meta_cache) = meta_cache { + if let Some(v) = meta_cache.get(&name) { + v + } else { + let meta_data = Arc::new(footer::parse_metadata(&chunk_reader)?); + meta_cache.put(name.clone(), meta_data.clone()); + meta_data + } + } else { + Arc::new(footer::parse_metadata(&chunk_reader)?) + }; + // MODIFICATION END. 
+ + Ok(Self { + name, + chunk_reader: Arc::new(chunk_reader), + metadata, + data_cache, + }) + } + + /// Filters row group metadata to only those row groups, + /// for which the predicate function returns true + pub fn filter_row_groups(&mut self, predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool) { + let mut filtered_row_groups = Vec::::new(); + for (i, row_group_metadata) in self.metadata.row_groups().iter().enumerate() { + if predicate(row_group_metadata, i) { + filtered_row_groups.push(row_group_metadata.clone()); + } + } + self.metadata = Arc::new(ParquetMetaData::new( + self.metadata.file_metadata().clone(), + filtered_row_groups, + )); + } +} + +impl FileReader for CachableSerializedFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.metadata + } + + fn num_row_groups(&self) -> usize { + self.metadata.num_row_groups() + } + + fn get_row_group(&self, i: usize) -> Result> { + let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. + let f = Arc::clone(&self.chunk_reader); + Ok(Box::new(SerializedRowGroupReader::new( + f, + row_group_metadata, + self.name.clone(), + self.data_cache.clone(), + ))) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// A serialized with cache implementation for Parquet [`RowGroupReader`]. +/// +/// The cache is used for column data chunk when building [`PageReader`]. +/// +/// NOTE: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs +pub struct SerializedRowGroupReader<'a, R: ChunkReader> { + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, +} + +impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { + /// Creates new row group reader from a file and row group metadata. 
+ fn new( + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, + ) -> Self { + Self { + chunk_reader, + metadata, + name, + data_cache, + } + } + + fn get_data(&self, col_start: u64, col_length: u64) -> Result> { + let mut file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?; + let mut buf = Vec::with_capacity(col_length as usize); + file_chunk.read_to_end(&mut buf).unwrap(); + Ok(buf) + } + + fn get_file_chunk(&self, col_start: u64, col_length: u64) -> Result { + if let Some(data_cache) = &self.data_cache { + let key = format_page_data_key(&self.name, col_start, col_length); + if let Some(v) = data_cache.get(&key) { + Ok(SliceableCursor::new(v)) + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + data_cache.put(key, buf_arc.clone()); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } +} + +impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'a, R> { + fn metadata(&self) -> &RowGroupMetaData { + self.metadata + } + + fn num_columns(&self) -> usize { + self.metadata.num_columns() + } + + // TODO: fix PARQUET-816 + fn get_column_page_reader(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + let (col_start, col_length) = col.byte_range(); + + // MODIFICATION START: consider the cache for the data chunk: [col_start, + // col_start+col_length). + let file_chunk = self.get_file_chunk(col_start, col_length)?; + // MODIFICATION END. + + let page_reader = SerializedPageReader::new( + file_chunk, + col.num_values(), + col.compression(), + col.column_descr().physical_type(), + )?; + Ok(Box::new(page_reader)) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_row_group(projection, self) + } +} + +/// A serialized implementation for Parquet [`PageReader`]. 
+pub struct SerializedPageReader { + // The file source buffer which references exactly the bytes for the column trunk + // to be read by this page reader. + buf: T, + + // The compression codec for this column chunk. Only set for non-PLAIN codec. + decompressor: Option>, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + // Column chunk type. + physical_type: Type, +} + +impl SerializedPageReader { + /// Creates a new serialized page reader from file source. + pub fn new( + buf: T, + total_num_values: i64, + compression: Compression, + physical_type: Type, + ) -> Result { + let decompressor = create_codec(compression)?; + let result = Self { + buf, + total_num_values, + seen_num_values: 0, + decompressor, + physical_type, + }; + Ok(result) + } + + /// Reads Page header from Thrift. + fn read_page_header(&mut self) -> Result { + let mut prot = TCompactInputProtocol::new(&mut self.buf); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) + } +} + +impl Iterator for SerializedPageReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.get_next_page().transpose() + } +} + +impl PageReader for SerializedPageReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.total_num_values { + let page_header = self.read_page_header()?; + + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. 
+ // + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + let compressed_len = page_header.compressed_page_size as usize - offset; + let uncompressed_len = page_header.uncompressed_page_size as usize - offset; + // We still need to read all bytes from buffered stream + let mut buffer = vec![0; offset + compressed_len]; + self.buf.read_exact(&mut buffer)?; + + // TODO: page header could be huge because of statistics. We should set a + // maximum page header size and abort if that is exceeded. + if let Some(decompressor) = self.decompressor.as_mut() { + if can_decompress { + let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); + let decompressed_size = + decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + if decompressed_size != uncompressed_len { + return Err(ParquetError::General(format!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed_size, uncompressed_len, + ))); + } + if offset == 0 { + buffer = decompressed_buffer; + } else { + // Prepend saved offsets to the buffer + buffer.truncate(offset); + buffer.append(&mut decompressed_buffer); + } + } + } + + let result = match page_header.type_ { + PageType::DictionaryPage => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: ByteBufferPtr::new(buffer), + num_values: dict_header.num_values as u32, + encoding: 
Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DataPage => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + self.seen_num_values += header.num_values as i64; + Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + PageType::DataPageV2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + self.seen_num_values += header.num_values as i64; + Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. 
+ Ok(None) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::parquet::basic::ColumnOrder; + + use super::*; + use crate::cache::{LruDataCache, LruMetaCache}; + + #[test] + fn test_cursor_and_file_has_the_same_behaviour() { + let mut buf: Vec = Vec::new(); + crate::tests::get_test_file("alltypes_plain.parquet") + .read_to_end(&mut buf) + .unwrap(); + let cursor = SliceableCursor::new(buf); + let read_from_cursor = + CachableSerializedFileReader::new("read_from_cursor".to_string(), cursor, None, None) + .unwrap(); + + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let read_from_file = + CachableSerializedFileReader::new("read_from_file".to_string(), test_file, None, None) + .unwrap(); + + let file_iter = read_from_file.get_row_iter(None).unwrap(); + let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); + + assert!(file_iter.eq(cursor_iter)); + } + + #[test] + fn test_reuse_file_chunk() { + // This test covers the case of maintaining the correct start position in a file + // stream for each column reader after initializing and moving to the next one + // (without necessarily reading the entire column). 
+ let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None).unwrap(); + let row_group = reader.get_row_group(0).unwrap(); + + let mut page_readers = Vec::new(); + for i in 0..row_group.num_columns() { + page_readers.push(row_group.get_column_page_reader(i).unwrap()); + } + + // Now buffer each col reader, we do not expect any failures like: + // General("underlying Thrift error: end of file") + for mut page_reader in page_readers { + assert!(page_reader.get_next_page().is_ok()); + } + } + + fn new_filer_reader_with_cache() -> CachableSerializedFileReader { + let data_cache: Option = Some(Arc::new(LruDataCache::new(1000))); + let meta_cache: Option = Some(Arc::new(LruMetaCache::new(1000))); + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader_result = CachableSerializedFileReader::new( + "test".to_string(), + test_file, + meta_cache.clone(), + data_cache.clone(), + ); + assert!(reader_result.is_ok()); + reader_result.unwrap() + } + + fn test_with_file_reader(reader: &CachableSerializedFileReader) { + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); + assert!(file_metadata.key_value_metadata().is_none()); + assert_eq!(file_metadata.num_rows(), 8); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + // Test contents in row group metadata + let row_group_metadata = metadata.row_group(0); + assert_eq!(row_group_metadata.num_columns(), 11); + assert_eq!(row_group_metadata.num_rows(), 8); + assert_eq!(row_group_metadata.total_byte_size(), 671); 
+ // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 32); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert!(!is_sorted); + true + } + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics, + } => { + assert_eq!(buf.len(), 11); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(def_level_encoding, Encoding::RLE); + assert_eq!(rep_level_encoding, Encoding::BIT_PACKED); + assert!(statistics.is_none()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_file_reader() { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = CachableSerializedFileReader::new("test".to_string(), test_file, None, None) + .expect("Should succeed to build test reader"); + test_with_file_reader(&reader); + } + + #[test] + fn test_file_reader_with_cache() { + let reader = new_filer_reader_with_cache(); + let 
test_num = 10usize; + for _ in 0..test_num { + test_with_file_reader(&reader); + } + } + + #[test] + fn test_file_reader_datapage_v2() { + let test_file = crate::tests::get_test_file("datapage_v2.snappy.parquet"); + let reader_result = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)" + ); + assert!(file_metadata.key_value_metadata().is_some()); + assert_eq!( + file_metadata.key_value_metadata().to_owned().unwrap().len(), + 1 + ); + + assert_eq!(file_metadata.num_rows(), 5); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + let row_group_metadata = metadata.row_group(0); + + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let 
is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 7); + assert_eq!(num_values, 1); + assert_eq!(encoding, Encoding::PLAIN); + assert!(!is_sorted); + true + } + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + statistics, + } => { + assert_eq!(buf.len(), 4); + assert_eq!(num_values, 5); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + assert_eq!(num_nulls, 1); + assert_eq!(num_rows, 5); + assert_eq!(def_levels_byte_len, 2); + assert_eq!(rep_levels_byte_len, 0); + assert!(is_compressed); + assert!(statistics.is_some()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_page_iterator() { + let file = crate::tests::get_test_file("alltypes_plain.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let mut page_iterator = FilePageIterator::new(0, file_reader.clone()).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + + let row_group_indices = Box::new(0..1); + let mut page_iterator = + FilePageIterator::with_row_groups(0, row_group_indices, file_reader).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + } + + #[test] + fn test_file_reader_key_value_metadata() { + let file = crate::tests::get_test_file("binary.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let metadata = file_reader + .metadata + .file_metadata() + 
.key_value_metadata() + .as_ref() + .unwrap(); + + assert_eq!(metadata.len(), 3); + + assert_eq!(metadata.get(0).unwrap().key, "parquet.proto.descriptor"); + + assert_eq!(metadata.get(1).unwrap().key, "writer.model.name"); + assert_eq!(metadata.get(1).unwrap().value, Some("protobuf".to_owned())); + + assert_eq!(metadata.get(2).unwrap().key, "parquet.proto.class"); + assert_eq!( + metadata.get(2).unwrap().value, + Some("foo.baz.Foobaz$Event".to_owned()) + ); + } + + #[test] + fn test_file_reader_filter_row_groups() -> Result<()> { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let mut reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None)?; + + // test initial number of row groups + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // test filtering out all row groups + reader.filter_row_groups(&|_, _| false); + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + + Ok(()) + } +} diff --git a/components/parquet/src/tests.rs b/components/parquet/src/tests.rs new file mode 100644 index 0000000000..69d6904e8f --- /dev/null +++ b/components/parquet/src/tests.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{env, error::Error, fs, path::PathBuf, str::FromStr}; + +use arrow_deps::{ + arrow::{array::*, datatypes::DataType, record_batch::RecordBatch}, + parquet::record::{Field, Row}, +}; + +fn get_data_dir( + udf_env: &str, + submodule_data: &str, +) -> std::result::Result> { + // Try user defined env. + if let Ok(dir) = env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. 
+ + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your + // package", set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display(), + ).into()) + } +} + +fn parquet_test_data() -> String { + match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get parquet data dir: {}", err), + } +} + +/// Returns path to the test parquet file in 'data' directory +fn get_test_path(file_name: &str) -> PathBuf { + let mut pathbuf = PathBuf::from_str(&parquet_test_data()).unwrap(); + pathbuf.push(file_name); + pathbuf +} + +/// Returns file handle for a test parquet file from 'data' directory +pub fn get_test_file(file_name: &str) -> fs::File { + let path = get_test_path(file_name); + fs::File::open(path.as_path()).unwrap_or_else(|err| { + panic!( + "Test file {} could not be opened, did you do `git submodule update`?: {}", + path.display(), + err + ) + }) +} + +struct RowViewOfRecordBatch<'a> { + record_batch: &'a RecordBatch, + row_idx: usize, +} + +impl<'a> RowViewOfRecordBatch<'a> { + fn check_row(&self, expect_row: &Row) { + for (col_idx, (_, field)) in expect_row.get_column_iter().enumerate() { + let array_ref = self.record_batch.column(col_idx); + + match array_ref.data_type() { + DataType::Binary => { + let array = array_ref.as_any().downcast_ref::().unwrap(); + let v = array.value(self.row_idx); + + if let Field::Bytes(field_value) = field { + assert_eq!(v, field_value.data()); + } else { + panic!("different value type"); + } + } + _ => unimplemented!("not support {:?}", array_ref.data_type()), + } + 
} + } +} + +pub fn check_rows_and_record_batches(rows: &[Row], record_batches: &[RecordBatch]) { + let mut row_idx = 0; + for record_batch in record_batches { + for row_idx_in_batch in 0..record_batch.num_rows() { + let expect_row = &rows[row_idx]; + let row_view = RowViewOfRecordBatch { + record_batch, + row_idx: row_idx_in_batch, + }; + row_view.check_row(expect_row); + row_idx += 1; + } + } +} diff --git a/components/profile/Cargo.toml b/components/profile/Cargo.toml new file mode 100644 index 0000000000..044fb5685a --- /dev/null +++ b/components/profile/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "profile" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] +log = "0.4" +tempfile = "3.0" +jemallocator = "0.3.2" +jemalloc-ctl = "0.3.2" + +[dependencies.jemalloc-sys] +version = "0.3.2" +features = ["stats", "profiling", "unprefixed_malloc_on_supported_platforms"] diff --git a/components/profile/src/lib.rs b/components/profile/src/lib.rs new file mode 100644 index 0000000000..2f63f8c536 --- /dev/null +++ b/components/profile/src/lib.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memory profiler for running application based on jemalloc features. 
+ +use std::{ + fmt::Formatter, + fs::File, + io, + io::Read, + sync::{Mutex, MutexGuard}, + thread, time, +}; + +use jemalloc_ctl::{Access, AsName}; +use jemallocator; +use log::{error, info}; + +#[derive(Debug)] +pub enum Error { + Internal { msg: String }, + IO(io::Error), + Jemalloc(jemalloc_ctl::Error), +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Profile Error: {:?}", self) + } +} + +impl std::error::Error for Error {} + +pub type Result = std::result::Result; + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +const PROF_ACTIVE: &'static [u8] = b"prof.active\0"; +const PROF_DUMP: &'static [u8] = b"prof.dump\0"; +const PROFILE_OUTPUT: &'static [u8] = b"profile.out\0"; +const PROFILE_OUTPUT_FILE_PATH: &str = "/tmp/profile.out"; + +fn set_prof_active(active: bool) -> Result<()> { + let name = PROF_ACTIVE.name(); + name.write(active).map_err(|e| Error::Jemalloc(e)) +} + +fn dump_profile() -> Result<()> { + let name = PROF_DUMP.name(); + name.write(PROFILE_OUTPUT).map_err(|e| Error::Jemalloc(e)) +} + +struct ProfLockGuard<'a>(MutexGuard<'a, ()>); + +/// ProfLockGuard hold the profile lock and take responsibilities for +/// (de)activating mem profiling. NOTE: Keeping mem profiling on may cause some +/// extra runtime cost so we choose to activating it dynamically. +impl<'a> ProfLockGuard<'a> { + pub fn new(guard: MutexGuard<'a, ()>) -> Result { + set_prof_active(true)?; + Ok(Self(guard)) + } +} + +impl<'a> Drop for ProfLockGuard<'a> { + fn drop(&mut self) { + if let Err(e) = set_prof_active(false) { + error!("Fail to deactivate profiling, err:{}", e); + } + } +} + +pub struct Profiler { + mem_prof_lock: Mutex<()>, +} + +impl Default for Profiler { + fn default() -> Self { + Self::new() + } +} + +impl Profiler { + pub fn new() -> Self { + Self { + mem_prof_lock: Mutex::new(()), + } + } + + // dump_mem_prof collects mem profiling data in `seconds`. 
+ // TODO(xikai): limit the profiling duration + pub fn dump_mem_prof(&self, seconds: u64) -> Result> { + // concurrent profiling is disabled. + let lock_guard = self.mem_prof_lock.try_lock().map_err(|e| Error::Internal { + msg: format!("failed to acquire mem_prof_lock, err:{}", e), + })?; + + let _guard = ProfLockGuard::new(lock_guard)?; + + info!( + "Profiler::dump_mem_prof start memory profiling {} seconds", + seconds + ); + // wait for seconds for collect the profiling data + thread::sleep(time::Duration::from_secs(seconds)); + + // clearing the profile output file before dumping profile results. + { + let f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + f.set_len(0).map_err(|e| { + error!("Failed to truncate profile output file, err:{}", e); + Error::IO(e) + })?; + } + + // dump the profile results to profile output file. + dump_profile().map_err(|e| { + error!( + "Failed to dump prof to {}, err:{}", + PROFILE_OUTPUT_FILE_PATH, e + ); + e + })?; + + // read the profile results into buffer + let mut f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + + let mut buffer = Vec::new(); + f.read_to_end(&mut buffer).map_err(|e| { + error!("Failed to read prof data file, err:{}", e); + Error::IO(e) + })?; + + Ok(buffer) + } +} diff --git a/components/rust-hyperloglog/.github/dependabot.yml b/components/rust-hyperloglog/.github/dependabot.yml new file mode 100644 index 0000000000..66cef947a2 --- /dev/null +++ b/components/rust-hyperloglog/.github/dependabot.yml @@ -0,0 +1,10 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +version: 2 +updates: +- package-ecosystem: cargo + directory: "/" + schedule: + interval: daily + time: "04:00" + open-pull-requests-limit: 10 diff --git a/components/rust-hyperloglog/.gitignore b/components/rust-hyperloglog/.gitignore new file mode 100644 index 0000000000..4468cbfb1c --- /dev/null +++ b/components/rust-hyperloglog/.gitignore @@ -0,0 +1,7 @@ +*.dSYM +*~ +.rust +build +Cargo.lock +src/hyperloglog/hyperloglog +target diff --git a/components/rust-hyperloglog/.travis.yml b/components/rust-hyperloglog/.travis.yml new file mode 100644 index 0000000000..52635e58a8 --- /dev/null +++ b/components/rust-hyperloglog/.travis.yml @@ -0,0 +1,6 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +language: rust +rust: + - nightly + - stable diff --git a/components/rust-hyperloglog/Cargo.toml b/components/rust-hyperloglog/Cargo.toml new file mode 100644 index 0000000000..40c7cb83f1 --- /dev/null +++ b/components/rust-hyperloglog/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "hyperloglog" +version = "1.0.0" +authors = ["Frank Denis "] +description = "Hyperloglog implementation in Rust" +license = "ISC" +homepage = "https://github.com/jedisct1/rust-hyperloglog" +repository = "https://github.com/jedisct1/rust-hyperloglog" +edition = "2018" + +[lib] +name = "hyperloglog" +path = "src/hyperloglog/lib.rs" + +[dependencies] +bytecount = "0.6" +bytes = { path = "../bytes" } +rand = "0.8.0" +siphasher = "0.3" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/rust-hyperloglog/LICENSE b/components/rust-hyperloglog/LICENSE new file mode 100644 index 0000000000..ab647ead82 --- /dev/null +++ b/components/rust-hyperloglog/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2013-2016, Frank Denis +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/components/rust-hyperloglog/README.md b/components/rust-hyperloglog/README.md new file mode 100644 index 0000000000..f104f9d59a --- /dev/null +++ b/components/rust-hyperloglog/README.md @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +hyperloglog +=========== + +A [HyperLogLog](https://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/40671.pdf) implementation in Rust, with bias correction. 
+ +Installation: use [Cargo](http://crates.io): + +```toml +[dependencies] +hyperloglog = "0" +``` + +Usage: + +```rust +let mut hll = HyperLogLog::new(error_rate); +hll.insert(&"test1"); +hll.insert(&"test2"); +let card_estimation = hll.len(); + +let mut hll2 = HyperLogLog::new_from_template(&hll); +hll2.insert(&"test3"); + +hll.merge(&hll2); +``` diff --git a/components/rust-hyperloglog/THANKS b/components/rust-hyperloglog/THANKS new file mode 100644 index 0000000000..091c37fc33 --- /dev/null +++ b/components/rust-hyperloglog/THANKS @@ -0,0 +1,3 @@ +Nelson Gonçalves (@goncalvesnelson) +Vasily Evseenko (@svpcom) +for Python's hyperloglog implementation this code is based on. diff --git a/components/rust-hyperloglog/src/hyperloglog/lib.rs b/components/rust-hyperloglog/src/hyperloglog/lib.rs new file mode 100644 index 0000000000..242ae9980e --- /dev/null +++ b/components/rust-hyperloglog/src/hyperloglog/lib.rs @@ -0,0 +1,4264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// (C)opyleft 2013-2019 Frank Denis + +//! HyperLogLog implementation for Rust +//! +//! 
Forked from +#![crate_name = "hyperloglog"] +#![warn(non_camel_case_types, non_upper_case_globals, unused_qualifications)] +#![allow(non_snake_case)] +#![allow(clippy::unreadable_literal)] + +use std::{ + cmp::Ordering::{Equal, Greater, Less}, + hash::{Hash, Hasher}, + iter::repeat, +}; + +use bytes::{MemBuf, MemBufMut}; +use siphasher::sip::SipHasher13; +use snafu::{ResultExt, Snafu}; + +static TRESHOLD_DATA: [f64; 15] = [ + 10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 11500.0, 20000.0, 50000.0, + 120000.0, 350000.0, +]; + +static RAW_ESTIMATE_DATA: &[&[f64]] = &[ + &[ + 11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, + 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, + 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, + 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, + 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, + 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, + 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, + 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394, + ], + &[ + 23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, + 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, + 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, + 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, + 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, + 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, + 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, + 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 
77.4916, 78.1524, 79.1892, 79.8414, 80.8798, + 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, + 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, + 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, + 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, + 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, + 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, + 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, + 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, + 154.7146, 155.807, 156.9228, 157.0372, 158.5852, + ], + &[ + 46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, + 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, + 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, + 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, + 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, + 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, + 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, + 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, + 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, + 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, + 156.3714, 157.7216, 158.7328, 160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, + 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, + 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, + 
197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, + 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, + 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, + 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, + 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, + 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, + 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, + 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, + 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858, + ], + &[ + 92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, + 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, + 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, + 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, + 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, + 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, + 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, + 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, + 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, + 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, + 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, + 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, + 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, + 382.7328, 386.4538, 388.1136, 
391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, + 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, + 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, + 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, + 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, + 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, + 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, + 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, + 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, + 638.6102, + ], + &[ + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, + 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, + 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, + 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, + 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, + 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, + 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, + 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, + 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, + 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, + 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, + 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, + 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, + 756.5102, 762.6066, 769.0184, 
775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, + 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, + 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, + 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, + 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, + 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, + 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, + 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, + 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, + 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192, + ], + &[ + 369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, + 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, + 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, + 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, + 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, + 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, + 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, + 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, + 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, + 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, + 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, + 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, + 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 
1428.9728, 1440.9228, + 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, + 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, + 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, + 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, + 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, + 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, + 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, + 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, + 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, + 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, + 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, + 2553.768, + ], + &[ + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, + 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, + 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, + 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, + 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, + 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, + 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, + 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, + 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, + 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, + 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, + 2384.0264, 
2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, + 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, + 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, + 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, + 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, + 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, + 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, + 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, + 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, + 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, + 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, + 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, + 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, + 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828, + ], + &[ + 1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, + 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, + 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, + 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, + 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, + 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, + 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, + 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, + 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, + 
3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, + 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, + 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, + 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, + 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, + 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, + 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, + 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, + 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, + 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, + 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, + 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, + 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, + 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, + 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, + 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, + 10229.9176, + ], + &[ + 2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, + 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, + 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, + 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, + 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, + 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, + 5980.0434, 6056.6264, 6134.3192, 
6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, + 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, + 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, + 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, + 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, + 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, + 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, + 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, + 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, + 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, + 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, + 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, + 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, + 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, + 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, + 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, + 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, + 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, + 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, + 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, + 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22, + ], + &[ + 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, + 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, + 7633.3802, 7751.2962, 7870.3784, 
7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, + 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, + 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, + 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, + 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, + 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, + 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, + 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, + 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, + 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, + 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, + 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, + 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, + 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, + 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, + 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, + 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, + 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, + 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, + 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, + 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, + 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, + 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, + 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 
38059.305, + 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, + 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424, + ], + &[ + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, + 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, + 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, + 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, + 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, + 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, + 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, + 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, + 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, + 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, + 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, + 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, + 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, + 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, + 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, + 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, + 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, + 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, + 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, + 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, + 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, + 60349.3788, 60755.0212, 61147.6144, 
61548.194, 61946.0696, 62348.6042, 62763.603, + 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, + 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, + 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, + 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, + 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, + 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, + 81035.6436, 81460.0448, 81876.3884, + ], + &[ + 23635.0036, + 24030.8034, + 24431.4744, + 24837.1524, + 25246.7928, + 25661.326, + 26081.3532, + 26505.2806, + 26933.9892, + 27367.7098, + 27805.318, + 28248.799, + 28696.4382, + 29148.8244, + 29605.5138, + 30066.8668, + 30534.2344, + 31006.32, + 31480.778, + 31962.2418, + 32447.3324, + 32938.0232, + 33432.731, + 33930.728, + 34433.9896, + 34944.1402, + 35457.5588, + 35974.5958, + 36497.3296, + 37021.9096, + 37554.326, + 38088.0826, + 38628.8816, + 39171.3192, + 39723.2326, + 40274.5554, + 40832.3142, + 41390.613, + 41959.5908, + 42532.5466, + 43102.0344, + 43683.5072, + 44266.694, + 44851.2822, + 45440.7862, + 46038.0586, + 46640.3164, + 47241.064, + 47846.155, + 48454.7396, + 49076.9168, + 49692.542, + 50317.4778, + 50939.65, + 51572.5596, + 52210.2906, + 52843.7396, + 53481.3996, + 54127.236, + 54770.406, + 55422.6598, + 56078.7958, + 56736.7174, + 57397.6784, + 58064.5784, + 58730.308, + 59404.9784, + 60077.0864, + 60751.9158, + 61444.1386, + 62115.817, + 62808.7742, + 63501.4774, + 64187.5454, + 64883.6622, + 65582.7468, + 66274.5318, + 66976.9276, + 67688.7764, + 68402.138, + 69109.6274, + 69822.9706, + 70543.6108, + 71265.5202, + 71983.3848, + 72708.4656, + 73433.384, + 74158.4664, + 74896.4868, + 75620.9564, + 76362.1434, + 77098.3204, + 77835.7662, + 78582.6114, + 79323.9902, + 80067.8658, + 80814.9246, + 81567.0136, + 82310.8536, + 
83061.9952, + 83821.4096, + 84580.8608, + 85335.547, + 86092.5802, + 86851.6506, + 87612.311, + 88381.2016, + 89146.3296, + 89907.8974, + 90676.846, + 91451.4152, + 92224.5518, + 92995.8686, + 93763.5066, + 94551.2796, + 95315.1944, + 96096.1806, + 96881.0918, + 97665.679, + 98442.68, + 99229.3002, + 100011.0994, + 100790.6386, + 101580.1564, + 102377.7484, + 103152.1392, + 103944.2712, + 104730.216, + 105528.6336, + 106324.9398, + 107117.6706, + 107890.3988, + 108695.2266, + 109485.238, + 110294.7876, + 111075.0958, + 111878.0496, + 112695.2864, + 113464.5486, + 114270.0474, + 115068.608, + 115884.3626, + 116673.2588, + 117483.3716, + 118275.097, + 119085.4092, + 119879.2808, + 120687.5868, + 121499.9944, + 122284.916, + 123095.9254, + 123912.5038, + 124709.0454, + 125503.7182, + 126323.259, + 127138.9412, + 127943.8294, + 128755.646, + 129556.5354, + 130375.3298, + 131161.4734, + 131971.1962, + 132787.5458, + 133588.1056, + 134431.351, + 135220.2906, + 136023.398, + 136846.6558, + 137667.0004, + 138463.663, + 139283.7154, + 140074.6146, + 140901.3072, + 141721.8548, + 142543.2322, + 143356.1096, + 144173.7412, + 144973.0948, + 145794.3162, + 146609.5714, + 147420.003, + 148237.9784, + 149050.5696, + 149854.761, + 150663.1966, + 151494.0754, + 152313.1416, + 153112.6902, + 153935.7206, + 154746.9262, + 155559.547, + 156401.9746, + 157228.7036, + 158008.7254, + 158820.75, + 159646.9184, + 160470.4458, + 161279.5348, + 162093.3114, + 162918.542, + 163729.2842, + ], + &[ + 47271.0, + 48062.3584, + 48862.7074, + 49673.152, + 50492.8416, + 51322.9514, + 52161.03, + 53009.407, + 53867.6348, + 54734.206, + 55610.5144, + 56496.2096, + 57390.795, + 58297.268, + 59210.6448, + 60134.665, + 61068.0248, + 62010.4472, + 62962.5204, + 63923.5742, + 64895.0194, + 65876.4182, + 66862.6136, + 67862.6968, + 68868.8908, + 69882.8544, + 70911.271, + 71944.0924, + 72990.0326, + 74040.692, + 75100.6336, + 76174.7826, + 77252.5998, + 78340.2974, + 79438.2572, + 80545.4976, + 81657.2796, 
+ 82784.6336, + 83915.515, + 85059.7362, + 86205.9368, + 87364.4424, + 88530.3358, + 89707.3744, + 90885.9638, + 92080.197, + 93275.5738, + 94479.391, + 95695.918, + 96919.2236, + 98148.4602, + 99382.3474, + 100625.6974, + 101878.0284, + 103141.6278, + 104409.4588, + 105686.2882, + 106967.5402, + 108261.6032, + 109548.1578, + 110852.0728, + 112162.231, + 113479.0072, + 114806.2626, + 116137.9072, + 117469.5048, + 118813.5186, + 120165.4876, + 121516.2556, + 122875.766, + 124250.5444, + 125621.2222, + 127003.2352, + 128387.848, + 129775.2644, + 131181.7776, + 132577.3086, + 133979.9458, + 135394.1132, + 136800.9078, + 138233.217, + 139668.5308, + 141085.212, + 142535.2122, + 143969.0684, + 145420.2872, + 146878.1542, + 148332.7572, + 149800.3202, + 151269.66, + 152743.6104, + 154213.0948, + 155690.288, + 157169.4246, + 158672.1756, + 160160.059, + 161650.6854, + 163145.7772, + 164645.6726, + 166159.1952, + 167682.1578, + 169177.3328, + 170700.0118, + 172228.8964, + 173732.6664, + 175265.5556, + 176787.799, + 178317.111, + 179856.6914, + 181400.865, + 182943.4612, + 184486.742, + 186033.4698, + 187583.7886, + 189148.1868, + 190688.4526, + 192250.1926, + 193810.9042, + 195354.2972, + 196938.7682, + 198493.5898, + 200079.2824, + 201618.912, + 203205.5492, + 204765.5798, + 206356.1124, + 207929.3064, + 209498.7196, + 211086.229, + 212675.1324, + 214256.7892, + 215826.2392, + 217412.8474, + 218995.6724, + 220618.6038, + 222207.1166, + 223781.0364, + 225387.4332, + 227005.7928, + 228590.4336, + 230217.8738, + 231805.1054, + 233408.9, + 234995.3432, + 236601.4956, + 238190.7904, + 239817.2548, + 241411.2832, + 243002.4066, + 244640.1884, + 246255.3128, + 247849.3508, + 249479.9734, + 251106.8822, + 252705.027, + 254332.9242, + 255935.129, + 257526.9014, + 259154.772, + 260777.625, + 262390.253, + 264004.4906, + 265643.59, + 267255.4076, + 268873.426, + 270470.7252, + 272106.4804, + 273722.4456, + 275337.794, + 276945.7038, + 278592.9154, + 280204.3726, + 281841.1606, + 
283489.171, + 285130.1716, + 286735.3362, + 288364.7164, + 289961.1814, + 291595.5524, + 293285.683, + 294899.6668, + 296499.3434, + 298128.0462, + 299761.8946, + 301394.2424, + 302997.6748, + 304615.1478, + 306269.7724, + 307886.114, + 309543.1028, + 311153.2862, + 312782.8546, + 314421.2008, + 316033.2438, + 317692.9636, + 319305.2648, + 320948.7406, + 322566.3364, + 324228.4224, + 325847.1542, + ], + &[ + 94542.0, + 96125.811, + 97728.019, + 99348.558, + 100987.9705, + 102646.7565, + 104324.5125, + 106021.7435, + 107736.7865, + 109469.272, + 111223.9465, + 112995.219, + 114787.432, + 116593.152, + 118422.71, + 120267.2345, + 122134.6765, + 124020.937, + 125927.2705, + 127851.255, + 129788.9485, + 131751.016, + 133726.8225, + 135722.592, + 137736.789, + 139770.568, + 141821.518, + 143891.343, + 145982.1415, + 148095.387, + 150207.526, + 152355.649, + 154515.6415, + 156696.05, + 158887.7575, + 161098.159, + 163329.852, + 165569.053, + 167837.4005, + 170121.6165, + 172420.4595, + 174732.6265, + 177062.77, + 179412.502, + 181774.035, + 184151.939, + 186551.6895, + 188965.691, + 191402.8095, + 193857.949, + 196305.0775, + 198774.6715, + 201271.2585, + 203764.78, + 206299.3695, + 208818.1365, + 211373.115, + 213946.7465, + 216532.076, + 219105.541, + 221714.5375, + 224337.5135, + 226977.5125, + 229613.0655, + 232270.2685, + 234952.2065, + 237645.3555, + 240331.1925, + 243034.517, + 245756.0725, + 248517.6865, + 251232.737, + 254011.3955, + 256785.995, + 259556.44, + 262368.335, + 265156.911, + 267965.266, + 270785.583, + 273616.0495, + 276487.4835, + 279346.639, + 282202.509, + 285074.3885, + 287942.2855, + 290856.018, + 293774.0345, + 296678.5145, + 299603.6355, + 302552.6575, + 305492.9785, + 308466.8605, + 311392.581, + 314347.538, + 317319.4295, + 320285.9785, + 323301.7325, + 326298.3235, + 329301.3105, + 332301.987, + 335309.791, + 338370.762, + 341382.923, + 344431.1265, + 347464.1545, + 350507.28, + 353619.2345, + 356631.2005, + 359685.203, + 362776.7845, + 
365886.488, + 368958.2255, + 372060.6825, + 375165.4335, + 378237.935, + 381328.311, + 384430.5225, + 387576.425, + 390683.242, + 393839.648, + 396977.8425, + 400101.9805, + 403271.296, + 406409.8425, + 409529.5485, + 412678.7, + 415847.423, + 419020.8035, + 422157.081, + 425337.749, + 428479.6165, + 431700.902, + 434893.1915, + 438049.582, + 441210.5415, + 444379.2545, + 447577.356, + 450741.931, + 453959.548, + 457137.0935, + 460329.846, + 463537.4815, + 466732.3345, + 469960.5615, + 473164.681, + 476347.6345, + 479496.173, + 482813.1645, + 486025.6995, + 489249.4885, + 492460.1945, + 495675.8805, + 498908.0075, + 502131.802, + 505374.3855, + 508550.9915, + 511806.7305, + 515026.776, + 518217.0005, + 521523.9855, + 524705.9855, + 527950.997, + 531210.0265, + 534472.497, + 537750.7315, + 540926.922, + 544207.094, + 547429.4345, + 550666.3745, + 553975.3475, + 557150.7185, + 560399.6165, + 563662.697, + 566916.7395, + 570146.1215, + 573447.425, + 576689.6245, + 579874.5745, + 583202.337, + 586503.0255, + 589715.635, + 592910.161, + 596214.3885, + 599488.035, + 602740.92, + 605983.0685, + 609248.67, + 612491.3605, + 615787.912, + 619107.5245, + 622307.9555, + 625577.333, + 628840.4385, + 632085.2155, + 635317.6135, + 638691.7195, + 641887.467, + 645139.9405, + 648441.546, + 651666.252, + 654941.845, + ], + &[ + 189084.0, + 192250.913, + 195456.774, + 198696.946, + 201977.762, + 205294.444, + 208651.754, + 212042.099, + 215472.269, + 218941.91, + 222443.912, + 225996.845, + 229568.199, + 233193.568, + 236844.457, + 240543.233, + 244279.475, + 248044.27, + 251854.588, + 255693.2, + 259583.619, + 263494.621, + 267445.385, + 271454.061, + 275468.769, + 279549.456, + 283646.446, + 287788.198, + 291966.099, + 296181.164, + 300431.469, + 304718.618, + 309024.004, + 313393.508, + 317760.803, + 322209.731, + 326675.061, + 331160.627, + 335654.47, + 340241.442, + 344841.833, + 349467.132, + 354130.629, + 358819.432, + 363574.626, + 368296.587, + 373118.482, + 377914.93, + 
382782.301, + 387680.669, + 392601.981, + 397544.323, + 402529.115, + 407546.018, + 412593.658, + 417638.657, + 422762.865, + 427886.169, + 433017.167, + 438213.273, + 443441.254, + 448692.421, + 453937.533, + 459239.049, + 464529.569, + 469910.083, + 475274.03, + 480684.473, + 486070.26, + 491515.237, + 496995.651, + 502476.617, + 507973.609, + 513497.19, + 519083.233, + 524726.509, + 530305.505, + 535945.728, + 541584.404, + 547274.055, + 552967.236, + 558667.862, + 564360.216, + 570128.148, + 575965.08, + 581701.952, + 587532.523, + 593361.144, + 599246.128, + 605033.418, + 610958.779, + 616837.117, + 622772.818, + 628672.04, + 634675.369, + 640574.831, + 646585.739, + 652574.547, + 658611.217, + 664642.684, + 670713.914, + 676737.681, + 682797.313, + 688837.897, + 694917.874, + 701009.882, + 707173.648, + 713257.254, + 719415.392, + 725636.761, + 731710.697, + 737906.209, + 744103.074, + 750313.39, + 756504.185, + 762712.579, + 768876.985, + 775167.859, + 781359.0, + 787615.959, + 793863.597, + 800245.477, + 806464.582, + 812785.294, + 819005.925, + 825403.057, + 831676.197, + 837936.284, + 844266.968, + 850642.711, + 856959.756, + 863322.774, + 869699.931, + 876102.478, + 882355.787, + 888694.463, + 895159.952, + 901536.143, + 907872.631, + 914293.672, + 920615.14, + 927130.974, + 933409.404, + 939922.178, + 946331.47, + 952745.93, + 959209.264, + 965590.224, + 972077.284, + 978501.961, + 984953.19, + 991413.271, + 997817.479, + 1004222.658, + 1010725.676, + 1017177.138, + 1023612.529, + 1030098.236, + 1036493.719, + 1043112.207, + 1049537.036, + 1056008.096, + 1062476.184, + 1068942.337, + 1075524.95, + 1081932.864, + 1088426.025, + 1094776.005, + 1101327.448, + 1107901.673, + 1114423.639, + 1120884.602, + 1127324.923, + 1133794.24, + 1140328.886, + 1146849.376, + 1153346.682, + 1159836.502, + 1166478.703, + 1172953.304, + 1179391.502, + 1185950.982, + 1192544.052, + 1198913.41, + 1205430.994, + 1212015.525, + 1218674.042, + 1225121.683, + 1231551.101, + 
1238126.379, + 1244673.795, + 1251260.649, + 1257697.86, + 1264320.983, + 1270736.319, + 1277274.694, + 1283804.95, + 1290211.514, + 1296858.568, + 1303455.691, + ], +]; + +static BIAS_DATA: &[&[f64]] = &[ + &[ + 10.0, + 9.717, + 9.207, + 8.7896, + 8.2882, + 7.8204, + 7.3772, + 6.9342, + 6.5202, + 6.161, + 5.7722, + 5.4636, + 5.0396, + 4.6766, + 4.3566, + 4.0454, + 3.7936, + 3.4856, + 3.2666, + 2.9946, + 2.766, + 2.4692, + 2.3638, + 2.0764, + 1.7864, + 1.7602, + 1.4814, + 1.433, + 1.2926, + 1.0664, + 0.999600000000001, + 0.7956, + 0.5366, + 0.589399999999998, + 0.573799999999999, + 0.269799999999996, + 0.368200000000002, + 0.0544000000000011, + 0.234200000000001, + 0.0108000000000033, + -0.203400000000002, + -0.0701999999999998, + -0.129600000000003, + -0.364199999999997, + -0.480600000000003, + -0.226999999999997, + -0.322800000000001, + -0.382599999999996, + -0.511200000000002, + -0.669600000000003, + -0.749400000000001, + -0.500399999999999, + -0.617600000000003, + -0.6922, + -0.601599999999998, + -0.416200000000003, + -0.338200000000001, + -0.782600000000002, + -0.648600000000002, + -0.919800000000002, + -0.851799999999997, + -0.962400000000002, + -0.6402, + -1.1922, + -1.0256, + -1.086, + -1.21899999999999, + -0.819400000000002, + -0.940600000000003, + -1.1554, + -1.2072, + -1.1752, + -1.16759999999999, + -1.14019999999999, + -1.3754, + -1.29859999999999, + -1.607, + -1.3292, + -1.7606, + ], + &[ + 22.0, + 21.1194, + 20.8208, + 20.2318, + 19.77, + 19.2436, + 18.7774, + 18.2848, + 17.8224, + 17.3742, + 16.9336, + 16.503, + 16.0494, + 15.6292, + 15.2124, + 14.798, + 14.367, + 13.9728, + 13.5944, + 13.217, + 12.8438, + 12.3696, + 12.0956, + 11.7044, + 11.324, + 11.0668, + 10.6698, + 10.3644, + 10.049, + 9.6918, + 9.4146, + 9.082, + 8.687, + 8.5398, + 8.2462, + 7.857, + 7.6606, + 7.4168, + 7.1248, + 6.9222, + 6.6804, + 6.447, + 6.3454, + 5.9594, + 5.7636, + 5.5776, + 5.331, + 5.19, + 4.9676, + 4.7564, + 4.5314, + 4.4442, + 4.3708, + 3.9774, + 3.9624, + 3.8796, + 
3.755, + 3.472, + 3.2076, + 3.1024, + 2.8908, + 2.7338, + 2.7728, + 2.629, + 2.413, + 2.3266, + 2.1524, + 2.2642, + 2.1806, + 2.0566, + 1.9192, + 1.7598, + 1.3516, + 1.5802, + 1.43859999999999, + 1.49160000000001, + 1.1524, + 1.1892, + 0.841399999999993, + 0.879800000000003, + 0.837599999999995, + 0.469800000000006, + 0.765600000000006, + 0.331000000000003, + 0.591399999999993, + 0.601200000000006, + 0.701599999999999, + 0.558199999999999, + 0.339399999999998, + 0.354399999999998, + 0.491200000000006, + 0.308000000000007, + 0.355199999999996, + -0.0254000000000048, + 0.205200000000005, + -0.272999999999996, + 0.132199999999997, + 0.394400000000005, + -0.241200000000006, + 0.242000000000004, + 0.191400000000002, + 0.253799999999998, + -0.122399999999999, + -0.370800000000003, + 0.193200000000004, + -0.0848000000000013, + 0.0867999999999967, + -0.327200000000005, + -0.285600000000002, + 0.311400000000006, + -0.128399999999999, + -0.754999999999995, + -0.209199999999996, + -0.293599999999998, + -0.364000000000004, + -0.253600000000006, + -0.821200000000005, + -0.253600000000006, + -0.510400000000004, + -0.383399999999995, + -0.491799999999998, + -0.220200000000006, + -0.0972000000000008, + -0.557400000000001, + -0.114599999999996, + -0.295000000000002, + -0.534800000000004, + 0.346399999999988, + -0.65379999999999, + 0.0398000000000138, + 0.0341999999999985, + -0.995800000000003, + -0.523400000000009, + -0.489000000000004, + -0.274799999999999, + -0.574999999999989, + -0.482799999999997, + 0.0571999999999946, + -0.330600000000004, + -0.628800000000012, + -0.140199999999993, + -0.540600000000012, + -0.445999999999998, + -0.599400000000003, + -0.262599999999992, + 0.163399999999996, + -0.100599999999986, + -0.39500000000001, + -1.06960000000001, + -0.836399999999998, + -0.753199999999993, + -0.412399999999991, + -0.790400000000005, + -0.29679999999999, + -0.28540000000001, + -0.193000000000012, + -0.0772000000000048, + -0.962799999999987, + -0.414800000000014, + ], + &[ 
+ 45.0, + 44.1902, + 43.271, + 42.8358, + 41.8142, + 41.2854, + 40.317, + 39.354, + 38.8924, + 37.9436, + 37.4596, + 36.5262, + 35.6248, + 35.1574, + 34.2822, + 33.837, + 32.9636, + 32.074, + 31.7042, + 30.7976, + 30.4772, + 29.6564, + 28.7942, + 28.5004, + 27.686, + 27.291, + 26.5672, + 25.8556, + 25.4982, + 24.8204, + 24.4252, + 23.7744, + 23.0786, + 22.8344, + 22.0294, + 21.8098, + 21.0794, + 20.5732, + 20.1878, + 19.5648, + 19.2902, + 18.6784, + 18.3352, + 17.8946, + 17.3712, + 17.0852, + 16.499, + 16.2686, + 15.6844, + 15.2234, + 14.9732, + 14.3356, + 14.2286, + 13.7262, + 13.3284, + 13.1048, + 12.5962, + 12.3562, + 12.1272, + 11.4184, + 11.4974, + 11.0822, + 10.856, + 10.48, + 10.2834, + 10.0208, + 9.637, + 9.51739999999999, + 9.05759999999999, + 8.74760000000001, + 8.42700000000001, + 8.1326, + 8.2372, + 8.2788, + 7.6776, + 7.79259999999999, + 7.1952, + 6.9564, + 6.6454, + 6.87, + 6.5428, + 6.19999999999999, + 6.02940000000001, + 5.62780000000001, + 5.6782, + 5.792, + 5.35159999999999, + 5.28319999999999, + 5.0394, + 5.07480000000001, + 4.49119999999999, + 4.84899999999999, + 4.696, + 4.54040000000001, + 4.07300000000001, + 4.37139999999999, + 3.7216, + 3.7328, + 3.42080000000001, + 3.41839999999999, + 3.94239999999999, + 3.27719999999999, + 3.411, + 3.13079999999999, + 2.76900000000001, + 2.92580000000001, + 2.68279999999999, + 2.75020000000001, + 2.70599999999999, + 2.3886, + 3.01859999999999, + 2.45179999999999, + 2.92699999999999, + 2.41720000000001, + 2.41139999999999, + 2.03299999999999, + 2.51240000000001, + 2.5564, + 2.60079999999999, + 2.41720000000001, + 1.80439999999999, + 1.99700000000001, + 2.45480000000001, + 1.8948, + 2.2346, + 2.30860000000001, + 2.15479999999999, + 1.88419999999999, + 1.6508, + 0.677199999999999, + 1.72540000000001, + 1.4752, + 1.72280000000001, + 1.66139999999999, + 1.16759999999999, + 1.79300000000001, + 1.00059999999999, + 0.905200000000008, + 0.659999999999997, + 1.55879999999999, + 1.1636, + 0.688199999999995, + 
0.712600000000009, + 0.450199999999995, + 1.1978, + 0.975599999999986, + 0.165400000000005, + 1.727, + 1.19739999999999, + -0.252600000000001, + 1.13460000000001, + 1.3048, + 1.19479999999999, + 0.313400000000001, + 0.878999999999991, + 1.12039999999999, + 0.853000000000009, + 1.67920000000001, + 0.856999999999999, + 0.448599999999999, + 1.2362, + 0.953399999999988, + 1.02859999999998, + 0.563199999999995, + 0.663000000000011, + 0.723000000000013, + 0.756599999999992, + 0.256599999999992, + -0.837600000000009, + 0.620000000000005, + 0.821599999999989, + 0.216600000000028, + 0.205600000000004, + 0.220199999999977, + 0.372599999999977, + 0.334400000000016, + 0.928400000000011, + 0.972800000000007, + 0.192400000000021, + 0.487199999999973, + -0.413000000000011, + 0.807000000000016, + 0.120600000000024, + 0.769000000000005, + 0.870799999999974, + 0.66500000000002, + 0.118200000000002, + 0.401200000000017, + 0.635199999999998, + 0.135400000000004, + 0.175599999999974, + 1.16059999999999, + 0.34620000000001, + 0.521400000000028, + -0.586599999999976, + -1.16480000000001, + 0.968399999999974, + 0.836999999999989, + 0.779600000000016, + 0.985799999999983, + ], + &[ + 91.0, + 89.4934, + 87.9758, + 86.4574, + 84.9718, + 83.4954, + 81.5302, + 80.0756, + 78.6374, + 77.1782, + 75.7888, + 73.9522, + 72.592, + 71.2532, + 69.9086, + 68.5938, + 66.9474, + 65.6796, + 64.4394, + 63.2176, + 61.9768, + 60.4214, + 59.2528, + 58.0102, + 56.8658, + 55.7278, + 54.3044, + 53.1316, + 52.093, + 51.0032, + 49.9092, + 48.6306, + 47.5294, + 46.5756, + 45.6508, + 44.662, + 43.552, + 42.3724, + 41.617, + 40.5754, + 39.7872, + 38.8444, + 37.7988, + 36.8606, + 36.2118, + 35.3566, + 34.4476, + 33.5882, + 32.6816, + 32.0824, + 31.0258, + 30.6048, + 29.4436, + 28.7274, + 27.957, + 27.147, + 26.4364, + 25.7592, + 25.3386, + 24.781, + 23.8028, + 23.656, + 22.6544, + 21.996, + 21.4718, + 21.1544, + 20.6098, + 19.5956, + 19.0616, + 18.5758, + 18.4878, + 17.5244, + 17.2146, + 16.724, + 15.8722, + 15.5198, + 
15.0414, + 14.941, + 14.9048, + 13.87, + 13.4304, + 13.028, + 12.4708, + 12.37, + 12.0624, + 11.4668, + 11.5532, + 11.4352, + 11.2564, + 10.2744, + 10.2118, + 9.74720000000002, + 10.1456, + 9.2928, + 8.75040000000001, + 8.55279999999999, + 8.97899999999998, + 8.21019999999999, + 8.18340000000001, + 7.3494, + 7.32499999999999, + 7.66140000000001, + 6.90300000000002, + 7.25439999999998, + 6.9042, + 7.21499999999997, + 6.28640000000001, + 6.08139999999997, + 6.6764, + 6.30099999999999, + 5.13900000000001, + 5.65800000000002, + 5.17320000000001, + 4.59019999999998, + 4.9538, + 5.08280000000002, + 4.92200000000003, + 4.99020000000002, + 4.7328, + 5.4538, + 4.11360000000002, + 4.22340000000003, + 4.08780000000002, + 3.70800000000003, + 4.15559999999999, + 4.18520000000001, + 3.63720000000001, + 3.68220000000002, + 3.77960000000002, + 3.6078, + 2.49160000000001, + 3.13099999999997, + 2.5376, + 3.19880000000001, + 3.21100000000001, + 2.4502, + 3.52820000000003, + 2.91199999999998, + 3.04480000000001, + 2.7432, + 2.85239999999999, + 2.79880000000003, + 2.78579999999999, + 1.88679999999999, + 2.98860000000002, + 2.50639999999999, + 1.91239999999999, + 2.66160000000002, + 2.46820000000002, + 1.58199999999999, + 1.30399999999997, + 2.27379999999999, + 2.68939999999998, + 1.32900000000001, + 3.10599999999999, + 1.69080000000002, + 2.13740000000001, + 2.53219999999999, + 1.88479999999998, + 1.33240000000001, + 1.45119999999997, + 1.17899999999997, + 2.44119999999998, + 1.60659999999996, + 2.16700000000003, + 0.77940000000001, + 2.37900000000002, + 2.06700000000001, + 1.46000000000004, + 2.91160000000002, + 1.69200000000001, + 0.954600000000028, + 2.49300000000005, + 2.2722, + 1.33500000000004, + 2.44899999999996, + 1.20140000000004, + 3.07380000000001, + 2.09739999999999, + 2.85640000000001, + 2.29960000000005, + 2.40899999999999, + 1.97040000000004, + 0.809799999999996, + 1.65279999999996, + 2.59979999999996, + 0.95799999999997, + 2.06799999999998, + 2.32780000000002, + 
4.20159999999998, + 1.96320000000003, + 1.86400000000003, + 1.42999999999995, + 3.77940000000001, + 1.27200000000005, + 1.86440000000005, + 2.20600000000002, + 3.21900000000005, + 1.5154, + 2.61019999999996, + ], + &[ + 183.2152, + 180.2454, + 177.2096, + 173.6652, + 170.6312, + 167.6822, + 164.249, + 161.3296, + 158.0038, + 155.2074, + 152.4612, + 149.27, + 146.5178, + 143.4412, + 140.8032, + 138.1634, + 135.1688, + 132.6074, + 129.6946, + 127.2664, + 124.8228, + 122.0432, + 119.6824, + 116.9464, + 114.6268, + 112.2626, + 109.8376, + 107.4034, + 104.8956, + 102.8522, + 100.7638, + 98.3552, + 96.3556, + 93.7526, + 91.9292, + 89.8954, + 87.8198, + 85.7668, + 83.298, + 81.6688, + 79.9466, + 77.9746, + 76.1672, + 74.3474, + 72.3028, + 70.8912, + 69.114, + 67.4646, + 65.9744, + 64.4092, + 62.6022, + 60.843, + 59.5684, + 58.1652, + 56.5426, + 55.4152, + 53.5388, + 52.3592, + 51.1366, + 49.486, + 48.3918, + 46.5076, + 45.509, + 44.3834, + 43.3498, + 42.0668, + 40.7346, + 40.1228, + 38.4528, + 37.7, + 36.644, + 36.0518, + 34.5774, + 33.9068, + 32.432, + 32.1666, + 30.434, + 29.6644, + 28.4894, + 27.6312, + 26.3804, + 26.292, + 25.5496000000001, + 25.0234, + 24.8206, + 22.6146, + 22.4188, + 22.117, + 20.6762, + 20.6576, + 19.7864, + 19.509, + 18.5334, + 17.9204, + 17.772, + 16.2924, + 16.8654, + 15.1836, + 15.745, + 15.1316, + 15.0386, + 14.0136, + 13.6342, + 12.6196, + 12.1866, + 12.4281999999999, + 11.3324, + 10.4794000000001, + 11.5038, + 10.129, + 9.52800000000002, + 10.3203999999999, + 9.46299999999997, + 9.79280000000006, + 9.12300000000005, + 8.74180000000001, + 9.2192, + 7.51020000000005, + 7.60659999999996, + 7.01840000000004, + 7.22239999999999, + 7.40139999999997, + 6.76179999999999, + 7.14359999999999, + 5.65060000000005, + 5.63779999999997, + 5.76599999999996, + 6.75139999999999, + 5.57759999999996, + 3.73220000000003, + 5.8048, + 5.63019999999995, + 4.93359999999996, + 3.47979999999995, + 4.33879999999999, + 3.98940000000005, + 3.81960000000004, + 
3.31359999999995, + 3.23080000000004, + 3.4588, + 3.08159999999998, + 3.4076, + 3.00639999999999, + 2.38779999999997, + 2.61900000000003, + 1.99800000000005, + 3.34820000000002, + 2.95060000000001, + 0.990999999999985, + 2.11440000000005, + 2.20299999999997, + 2.82219999999995, + 2.73239999999998, + 2.7826, + 3.76660000000004, + 2.26480000000004, + 2.31280000000004, + 2.40819999999997, + 2.75360000000001, + 3.33759999999995, + 2.71559999999999, + 1.7478000000001, + 1.42920000000004, + 2.39300000000003, + 2.22779999999989, + 2.34339999999997, + 0.87259999999992, + 3.88400000000001, + 1.80600000000004, + 1.91759999999999, + 1.16779999999994, + 1.50320000000011, + 2.52500000000009, + 0.226400000000012, + 2.31500000000005, + 0.930000000000064, + 1.25199999999995, + 2.14959999999996, + 0.0407999999999902, + 2.5447999999999, + 1.32960000000003, + 0.197400000000016, + 2.52620000000002, + 3.33279999999991, + -1.34300000000007, + 0.422199999999975, + 0.917200000000093, + 1.12920000000008, + 1.46060000000011, + 1.45779999999991, + 2.8728000000001, + 3.33359999999993, + -1.34079999999994, + 1.57680000000005, + 0.363000000000056, + 1.40740000000005, + 0.656600000000026, + 0.801400000000058, + -0.454600000000028, + 1.51919999999996, + ], + &[ + 368.0, + 361.8294, + 355.2452, + 348.6698, + 342.1464, + 336.2024, + 329.8782, + 323.6598, + 317.462, + 311.2826, + 305.7102, + 299.7416, + 293.9366, + 288.1046, + 282.285, + 277.0668, + 271.306, + 265.8448, + 260.301, + 254.9886, + 250.2422, + 244.8138, + 239.7074, + 234.7428, + 229.8402, + 225.1664, + 220.3534, + 215.594, + 210.6886, + 205.7876, + 201.65, + 197.228, + 192.8036, + 188.1666, + 184.0818, + 180.0824, + 176.2574, + 172.302, + 168.1644, + 164.0056, + 160.3802, + 156.7192, + 152.5234, + 149.2084, + 145.831, + 142.485, + 139.1112, + 135.4764, + 131.76, + 129.3368, + 126.5538, + 122.5058, + 119.2646, + 116.5902, + 113.3818, + 110.8998, + 107.9532, + 105.2062, + 102.2798, + 99.4728, + 96.9582, + 94.3292, + 92.171, + 
89.7809999999999, + 87.5716, + 84.7048, + 82.5322, + 79.875, + 78.3972, + 75.3464, + 73.7274, + 71.2834, + 70.1444, + 68.4263999999999, + 66.0166, + 64.018, + 62.0437999999999, + 60.3399999999999, + 58.6856, + 57.9836, + 55.0311999999999, + 54.6769999999999, + 52.3188, + 51.4846, + 49.4423999999999, + 47.739, + 46.1487999999999, + 44.9202, + 43.4059999999999, + 42.5342000000001, + 41.2834, + 38.8954000000001, + 38.3286000000001, + 36.2146, + 36.6684, + 35.9946, + 33.123, + 33.4338, + 31.7378000000001, + 29.076, + 28.9692, + 27.4964, + 27.0998, + 25.9864, + 26.7754, + 24.3208, + 23.4838, + 22.7388000000001, + 24.0758000000001, + 21.9097999999999, + 20.9728, + 19.9228000000001, + 19.9292, + 16.617, + 17.05, + 18.2996000000001, + 15.6128000000001, + 15.7392, + 14.5174, + 13.6322, + 12.2583999999999, + 13.3766000000001, + 11.423, + 13.1232, + 9.51639999999998, + 10.5938000000001, + 9.59719999999993, + 8.12220000000002, + 9.76739999999995, + 7.50440000000003, + 7.56999999999994, + 6.70440000000008, + 6.41419999999994, + 6.71019999999999, + 5.60940000000005, + 4.65219999999999, + 6.84099999999989, + 3.4072000000001, + 3.97859999999991, + 3.32760000000007, + 5.52160000000003, + 3.31860000000006, + 2.06940000000009, + 4.35400000000004, + 1.57500000000005, + 0.280799999999999, + 2.12879999999996, + -0.214799999999968, + -0.0378000000000611, + -0.658200000000079, + 0.654800000000023, + -0.0697999999999865, + 0.858400000000074, + -2.52700000000004, + -2.1751999999999, + -3.35539999999992, + -1.04019999999991, + -0.651000000000067, + -2.14439999999991, + -1.96659999999997, + -3.97939999999994, + -0.604400000000169, + -3.08260000000018, + -3.39159999999993, + -5.29640000000018, + -5.38920000000007, + -5.08759999999984, + -4.69900000000007, + -5.23720000000003, + -3.15779999999995, + -4.97879999999986, + -4.89899999999989, + -7.48880000000008, + -5.94799999999987, + -5.68060000000014, + -6.67180000000008, + -4.70499999999993, + -7.27779999999984, + -4.6579999999999, + 
-4.4362000000001, + -4.32139999999981, + -5.18859999999995, + -6.66879999999992, + -6.48399999999992, + -5.1260000000002, + -4.4032000000002, + -6.13500000000022, + -5.80819999999994, + -4.16719999999987, + -4.15039999999999, + -7.45600000000013, + -7.24080000000004, + -9.83179999999993, + -5.80420000000004, + -8.6561999999999, + -6.99940000000015, + -10.5473999999999, + -7.34139999999979, + -6.80999999999995, + -6.29719999999998, + -6.23199999999997, + ], + &[ + 737.1256, + 724.4234, + 711.1064, + 698.4732, + 685.4636, + 673.0644, + 660.488, + 647.9654, + 636.0832, + 623.7864, + 612.1992, + 600.2176, + 588.5228, + 577.1716, + 565.7752, + 554.899, + 543.6126, + 532.6492, + 521.9474, + 511.5214, + 501.1064, + 490.6364, + 480.2468, + 470.4588, + 460.3832, + 451.0584, + 440.8606, + 431.3868, + 422.5062, + 413.1862, + 404.463, + 395.339, + 386.1936, + 378.1292, + 369.1854, + 361.2908, + 353.3324, + 344.8518, + 337.5204, + 329.4854, + 321.9318, + 314.552, + 306.4658, + 299.4256, + 292.849, + 286.152, + 278.8956, + 271.8792, + 265.118, + 258.62, + 252.5132, + 245.9322, + 239.7726, + 233.6086, + 227.5332, + 222.5918, + 216.4294, + 210.7662, + 205.4106, + 199.7338, + 194.9012, + 188.4486, + 183.1556, + 178.6338, + 173.7312, + 169.6264, + 163.9526, + 159.8742, + 155.8326, + 151.1966, + 147.5594, + 143.07, + 140.037, + 134.1804, + 131.071, + 127.4884, + 124.0848, + 120.2944, + 117.333, + 112.9626, + 110.2902, + 107.0814, + 103.0334, + 99.4832000000001, + 96.3899999999999, + 93.7202000000002, + 90.1714000000002, + 87.2357999999999, + 85.9346, + 82.8910000000001, + 80.0264000000002, + 78.3834000000002, + 75.1543999999999, + 73.8683999999998, + 70.9895999999999, + 69.4367999999999, + 64.8701999999998, + 65.0408000000002, + 61.6738, + 59.5207999999998, + 57.0158000000001, + 54.2302, + 53.0962, + 50.4985999999999, + 52.2588000000001, + 47.3914, + 45.6244000000002, + 42.8377999999998, + 43.0072, + 40.6516000000001, + 40.2453999999998, + 35.2136, + 36.4546, + 33.7849999999999, + 
33.2294000000002, + 32.4679999999998, + 30.8670000000002, + 28.6507999999999, + 28.9099999999999, + 27.5983999999999, + 26.1619999999998, + 24.5563999999999, + 23.2328000000002, + 21.9484000000002, + 21.5902000000001, + 21.3346000000001, + 17.7031999999999, + 20.6111999999998, + 19.5545999999999, + 15.7375999999999, + 17.0720000000001, + 16.9517999999998, + 15.326, + 13.1817999999998, + 14.6925999999999, + 13.0859999999998, + 13.2754, + 10.8697999999999, + 11.248, + 7.3768, + 4.72339999999986, + 7.97899999999981, + 8.7503999999999, + 7.68119999999999, + 9.7199999999998, + 7.73919999999998, + 5.6224000000002, + 7.44560000000001, + 6.6601999999998, + 5.9058, + 4.00199999999995, + 4.51699999999983, + 4.68240000000014, + 3.86220000000003, + 5.13639999999987, + 5.98500000000013, + 2.47719999999981, + 2.61999999999989, + 1.62800000000016, + 4.65000000000009, + 0.225599999999758, + 0.831000000000131, + -0.359400000000278, + 1.27599999999984, + -2.92559999999958, + -0.0303999999996449, + 2.37079999999969, + -2.0033999999996, + 0.804600000000391, + 0.30199999999968, + 1.1247999999996, + -2.6880000000001, + 0.0321999999996478, + -1.18099999999959, + -3.9402, + -1.47940000000017, + -0.188400000000001, + -2.10720000000038, + -2.04159999999956, + -3.12880000000041, + -4.16160000000036, + -0.612799999999879, + -3.48719999999958, + -8.17900000000009, + -5.37780000000021, + -4.01379999999972, + -5.58259999999973, + -5.73719999999958, + -7.66799999999967, + -5.69520000000011, + -1.1247999999996, + -5.58520000000044, + -8.04560000000038, + -4.64840000000004, + -11.6468000000004, + -7.97519999999986, + -5.78300000000036, + -7.67420000000038, + -10.6328000000003, + -9.81720000000041, + ], + &[ + 1476.0, + 1449.6014, + 1423.5802, + 1397.7942, + 1372.3042, + 1347.2062, + 1321.8402, + 1297.2292, + 1272.9462, + 1248.9926, + 1225.3026, + 1201.4252, + 1178.0578, + 1155.6092, + 1132.626, + 1110.5568, + 1088.527, + 1066.5154, + 1045.1874, + 1024.3878, + 1003.37, + 982.1972, + 962.5728, + 
942.1012, + 922.9668, + 903.292, + 884.0772, + 864.8578, + 846.6562, + 828.041, + 809.714, + 792.3112, + 775.1806, + 757.9854, + 740.656, + 724.346, + 707.5154, + 691.8378, + 675.7448, + 659.6722, + 645.5722, + 630.1462, + 614.4124, + 600.8728, + 585.898, + 572.408, + 558.4926, + 544.4938, + 531.6776, + 517.282, + 505.7704, + 493.1012, + 480.7388, + 467.6876, + 456.1872, + 445.5048, + 433.0214, + 420.806, + 411.409, + 400.4144, + 389.4294, + 379.2286, + 369.651, + 360.6156, + 350.337, + 342.083, + 332.1538, + 322.5094, + 315.01, + 305.6686, + 298.1678, + 287.8116, + 280.9978, + 271.9204, + 265.3286, + 257.5706, + 249.6014, + 242.544, + 235.5976, + 229.583, + 220.9438, + 214.672, + 208.2786, + 201.8628, + 195.1834, + 191.505, + 186.1816, + 178.5188, + 172.2294, + 167.8908, + 161.0194, + 158.052, + 151.4588, + 148.1596, + 143.4344, + 138.5238, + 133.13, + 127.6374, + 124.8162, + 118.7894, + 117.3984, + 114.6078, + 109.0858, + 105.1036, + 103.6258, + 98.6018000000004, + 95.7618000000002, + 93.5821999999998, + 88.5900000000001, + 86.9992000000002, + 82.8800000000001, + 80.4539999999997, + 74.6981999999998, + 74.3644000000004, + 73.2914000000001, + 65.5709999999999, + 66.9232000000002, + 65.1913999999997, + 62.5882000000001, + 61.5702000000001, + 55.7035999999998, + 56.1764000000003, + 52.7596000000003, + 53.0302000000001, + 49.0609999999997, + 48.4694, + 44.933, + 46.0474000000004, + 44.7165999999997, + 41.9416000000001, + 39.9207999999999, + 35.6328000000003, + 35.5276000000003, + 33.1934000000001, + 33.2371999999996, + 33.3864000000003, + 33.9228000000003, + 30.2371999999996, + 29.1373999999996, + 25.2272000000003, + 24.2942000000003, + 19.8338000000003, + 18.9005999999999, + 23.0907999999999, + 21.8544000000002, + 19.5176000000001, + 15.4147999999996, + 16.9314000000004, + 18.6737999999996, + 12.9877999999999, + 14.3688000000002, + 12.0447999999997, + 15.5219999999999, + 12.5299999999997, + 14.5940000000001, + 14.3131999999996, + 9.45499999999993, + 
12.9441999999999, + 3.91139999999996, + 13.1373999999996, + 5.44720000000052, + 9.82779999999912, + 7.87279999999919, + 3.67760000000089, + 5.46980000000076, + 5.55099999999948, + 5.65979999999945, + 3.89439999999922, + 3.1275999999998, + 5.65140000000065, + 6.3062000000009, + 3.90799999999945, + 1.87060000000019, + 5.17020000000048, + 2.46680000000015, + 0.770000000000437, + -3.72340000000077, + 1.16400000000067, + 8.05340000000069, + 0.135399999999208, + 2.15940000000046, + 0.766999999999825, + 1.0594000000001, + 3.15500000000065, + -0.287399999999252, + 2.37219999999979, + -2.86620000000039, + -1.63199999999961, + -2.22979999999916, + -0.15519999999924, + -1.46039999999994, + -0.262199999999211, + -2.34460000000036, + -2.8078000000005, + -3.22179999999935, + -5.60159999999996, + -8.42200000000048, + -9.43740000000071, + 0.161799999999857, + -10.4755999999998, + -10.0823999999993, + ], + &[ + 2953.0, + 2900.4782, + 2848.3568, + 2796.3666, + 2745.324, + 2694.9598, + 2644.648, + 2595.539, + 2546.1474, + 2498.2576, + 2450.8376, + 2403.6076, + 2357.451, + 2311.38, + 2266.4104, + 2221.5638, + 2176.9676, + 2134.193, + 2090.838, + 2048.8548, + 2007.018, + 1966.1742, + 1925.4482, + 1885.1294, + 1846.4776, + 1807.4044, + 1768.8724, + 1731.3732, + 1693.4304, + 1657.5326, + 1621.949, + 1586.5532, + 1551.7256, + 1517.6182, + 1483.5186, + 1450.4528, + 1417.865, + 1385.7164, + 1352.6828, + 1322.6708, + 1291.8312, + 1260.9036, + 1231.476, + 1201.8652, + 1173.6718, + 1145.757, + 1119.2072, + 1092.2828, + 1065.0434, + 1038.6264, + 1014.3192, + 988.5746, + 965.0816, + 940.1176, + 917.9796, + 894.5576, + 871.1858, + 849.9144, + 827.1142, + 805.0818, + 783.9664, + 763.9096, + 742.0816, + 724.3962, + 706.3454, + 688.018, + 667.4214, + 650.3106, + 633.0686, + 613.8094, + 597.818, + 581.4248, + 563.834, + 547.363, + 531.5066, + 520.455400000001, + 505.583199999999, + 488.366, + 476.480799999999, + 459.7682, + 450.0522, + 434.328799999999, + 423.952799999999, + 408.727000000001, + 
399.079400000001, + 387.252200000001, + 373.987999999999, + 360.852000000001, + 351.6394, + 339.642, + 330.902400000001, + 322.661599999999, + 311.662200000001, + 301.3254, + 291.7484, + 279.939200000001, + 276.7508, + 263.215200000001, + 254.811400000001, + 245.5494, + 242.306399999999, + 234.8734, + 223.787200000001, + 217.7156, + 212.0196, + 200.793, + 195.9748, + 189.0702, + 182.449199999999, + 177.2772, + 170.2336, + 164.741, + 158.613600000001, + 155.311, + 147.5964, + 142.837, + 137.3724, + 132.0162, + 130.0424, + 121.9804, + 120.451800000001, + 114.8968, + 111.585999999999, + 105.933199999999, + 101.705, + 98.5141999999996, + 95.0488000000005, + 89.7880000000005, + 91.4750000000004, + 83.7764000000006, + 80.9698000000008, + 72.8574000000008, + 73.1615999999995, + 67.5838000000003, + 62.6263999999992, + 63.2638000000006, + 66.0977999999996, + 52.0843999999997, + 58.9956000000002, + 47.0912000000008, + 46.4956000000002, + 48.4383999999991, + 47.1082000000006, + 43.2392, + 37.2759999999998, + 40.0283999999992, + 35.1864000000005, + 35.8595999999998, + 32.0998, + 28.027, + 23.6694000000007, + 33.8266000000003, + 26.3736000000008, + 27.2008000000005, + 21.3245999999999, + 26.4115999999995, + 23.4521999999997, + 19.5013999999992, + 19.8513999999996, + 10.7492000000002, + 18.6424000000006, + 13.1265999999996, + 18.2436000000016, + 6.71860000000015, + 3.39459999999963, + 6.33759999999893, + 7.76719999999841, + 0.813999999998487, + 3.82819999999992, + 0.826199999999517, + 8.07440000000133, + -1.59080000000176, + 5.01780000000144, + 0.455399999998917, + -0.24199999999837, + 0.174800000000687, + -9.07640000000174, + -4.20160000000033, + -3.77520000000004, + -4.75179999999818, + -5.3724000000002, + -8.90680000000066, + -6.10239999999976, + -5.74120000000039, + -9.95339999999851, + -3.86339999999836, + -13.7304000000004, + -16.2710000000006, + -7.51359999999841, + -3.30679999999847, + -13.1339999999982, + -10.0551999999989, + -6.72019999999975, + -8.59660000000076, + 
-10.9307999999983, + -1.8775999999998, + -4.82259999999951, + -13.7788, + -21.6470000000008, + -10.6735999999983, + -15.7799999999988, + ], + &[ + 5907.5052, + 5802.2672, + 5697.347, + 5593.5794, + 5491.2622, + 5390.5514, + 5290.3376, + 5191.6952, + 5093.5988, + 4997.3552, + 4902.5972, + 4808.3082, + 4715.5646, + 4624.109, + 4533.8216, + 4444.4344, + 4356.3802, + 4269.2962, + 4183.3784, + 4098.292, + 4014.79, + 3932.4574, + 3850.6036, + 3771.2712, + 3691.7708, + 3615.099, + 3538.1858, + 3463.4746, + 3388.8496, + 3315.6794, + 3244.5448, + 3173.7516, + 3103.3106, + 3033.6094, + 2966.5642, + 2900.794, + 2833.7256, + 2769.81, + 2707.3196, + 2644.0778, + 2583.9916, + 2523.4662, + 2464.124, + 2406.073, + 2347.0362, + 2292.1006, + 2238.1716, + 2182.7514, + 2128.4884, + 2077.1314, + 2025.037, + 1975.3756, + 1928.933, + 1879.311, + 1831.0006, + 1783.2144, + 1738.3096, + 1694.5144, + 1649.024, + 1606.847, + 1564.7528, + 1525.3168, + 1482.5372, + 1443.9668, + 1406.5074, + 1365.867, + 1329.2186, + 1295.4186, + 1257.9716, + 1225.339, + 1193.2972, + 1156.3578, + 1125.8686, + 1091.187, + 1061.4094, + 1029.4188, + 1000.9126, + 972.3272, + 944.004199999999, + 915.7592, + 889.965, + 862.834200000001, + 840.4254, + 812.598399999999, + 785.924200000001, + 763.050999999999, + 741.793799999999, + 721.466, + 699.040799999999, + 677.997200000002, + 649.866999999998, + 634.911800000002, + 609.8694, + 591.981599999999, + 570.2922, + 557.129199999999, + 538.3858, + 521.872599999999, + 502.951400000002, + 495.776399999999, + 475.171399999999, + 459.751, + 439.995200000001, + 426.708999999999, + 413.7016, + 402.3868, + 387.262599999998, + 372.0524, + 357.050999999999, + 342.5098, + 334.849200000001, + 322.529399999999, + 311.613799999999, + 295.848000000002, + 289.273000000001, + 274.093000000001, + 263.329600000001, + 251.389599999999, + 245.7392, + 231.9614, + 229.7952, + 217.155200000001, + 208.9588, + 199.016599999999, + 190.839199999999, + 180.6976, + 176.272799999999, + 166.976999999999, 
+ 162.5252, + 151.196400000001, + 149.386999999999, + 133.981199999998, + 130.0586, + 130.164000000001, + 122.053400000001, + 110.7428, + 108.1276, + 106.232400000001, + 100.381600000001, + 98.7668000000012, + 86.6440000000002, + 79.9768000000004, + 82.4722000000002, + 68.7026000000005, + 70.1186000000016, + 71.9948000000004, + 58.998599999999, + 59.0492000000013, + 56.9818000000014, + 47.5338000000011, + 42.9928, + 51.1591999999982, + 37.2740000000013, + 42.7220000000016, + 31.3734000000004, + 26.8090000000011, + 25.8934000000008, + 26.5286000000015, + 29.5442000000003, + 19.3503999999994, + 26.0760000000009, + 17.9527999999991, + 14.8419999999969, + 10.4683999999979, + 8.65899999999965, + 9.86720000000059, + 4.34139999999752, + -0.907800000000861, + -3.32080000000133, + -0.936199999996461, + -11.9916000000012, + -8.87000000000262, + -6.33099999999831, + -11.3366000000024, + -15.9207999999999, + -9.34659999999712, + -15.5034000000014, + -19.2097999999969, + -15.357799999998, + -28.2235999999975, + -30.6898000000001, + -19.3271999999997, + -25.6083999999973, + -24.409599999999, + -13.6385999999984, + -33.4473999999973, + -32.6949999999997, + -28.9063999999998, + -31.7483999999968, + -32.2935999999972, + -35.8329999999987, + -47.620600000002, + -39.0855999999985, + -33.1434000000008, + -46.1371999999974, + -37.5892000000022, + -46.8164000000033, + -47.3142000000007, + -60.2914000000019, + -37.7575999999972, + ], + &[ + 11816.475, + 11605.0046, + 11395.3792, + 11188.7504, + 10984.1814, + 10782.0086, + 10582.0072, + 10384.503, + 10189.178, + 9996.2738, + 9806.0344, + 9617.9798, + 9431.394, + 9248.7784, + 9067.6894, + 8889.6824, + 8712.9134, + 8538.8624, + 8368.4944, + 8197.7956, + 8031.8916, + 7866.6316, + 7703.733, + 7544.5726, + 7386.204, + 7230.666, + 7077.8516, + 6926.7886, + 6778.6902, + 6631.9632, + 6487.304, + 6346.7486, + 6206.4408, + 6070.202, + 5935.2576, + 5799.924, + 5671.0324, + 5541.9788, + 5414.6112, + 5290.0274, + 5166.723, + 5047.6906, + 4929.162, + 
4815.1406, + 4699.127, + 4588.5606, + 4477.7394, + 4369.4014, + 4264.2728, + 4155.9224, + 4055.581, + 3955.505, + 3856.9618, + 3761.3828, + 3666.9702, + 3575.7764, + 3482.4132, + 3395.0186, + 3305.8852, + 3221.415, + 3138.6024, + 3056.296, + 2970.4494, + 2896.1526, + 2816.8008, + 2740.2156, + 2670.497, + 2594.1458, + 2527.111, + 2460.8168, + 2387.5114, + 2322.9498, + 2260.6752, + 2194.2686, + 2133.7792, + 2074.767, + 2015.204, + 1959.4226, + 1898.6502, + 1850.006, + 1792.849, + 1741.4838, + 1687.9778, + 1638.1322, + 1589.3266, + 1543.1394, + 1496.8266, + 1447.8516, + 1402.7354, + 1361.9606, + 1327.0692, + 1285.4106, + 1241.8112, + 1201.6726, + 1161.973, + 1130.261, + 1094.2036, + 1048.2036, + 1020.6436, + 990.901400000002, + 961.199800000002, + 924.769800000002, + 899.526400000002, + 872.346400000002, + 834.375, + 810.432000000001, + 780.659800000001, + 756.013800000001, + 733.479399999997, + 707.923999999999, + 673.858, + 652.222399999999, + 636.572399999997, + 615.738599999997, + 586.696400000001, + 564.147199999999, + 541.679600000003, + 523.943599999999, + 505.714599999999, + 475.729599999999, + 461.779600000002, + 449.750800000002, + 439.020799999998, + 412.7886, + 400.245600000002, + 383.188199999997, + 362.079599999997, + 357.533799999997, + 334.319000000003, + 327.553399999997, + 308.559399999998, + 291.270199999999, + 279.351999999999, + 271.791400000002, + 252.576999999997, + 247.482400000001, + 236.174800000001, + 218.774599999997, + 220.155200000001, + 208.794399999999, + 201.223599999998, + 182.995600000002, + 185.5268, + 164.547400000003, + 176.5962, + 150.689599999998, + 157.8004, + 138.378799999999, + 134.021200000003, + 117.614399999999, + 108.194000000003, + 97.0696000000025, + 89.6042000000016, + 95.6030000000028, + 84.7810000000027, + 72.635000000002, + 77.3482000000004, + 59.4907999999996, + 55.5875999999989, + 50.7346000000034, + 61.3916000000027, + 50.9149999999936, + 39.0384000000049, + 58.9395999999979, + 29.633600000001, + 
28.2032000000036, + 26.0078000000067, + 17.0387999999948, + 9.22000000000116, + 13.8387999999977, + 8.07240000000456, + 14.1549999999988, + 15.3570000000036, + 3.42660000000615, + 6.24820000000182, + -2.96940000000177, + -8.79940000000352, + -5.97860000000219, + -14.4048000000039, + -3.4143999999942, + -13.0148000000045, + -11.6977999999945, + -25.7878000000055, + -22.3185999999987, + -24.409599999999, + -31.9756000000052, + -18.9722000000038, + -22.8678000000073, + -30.8972000000067, + -32.3715999999986, + -22.3907999999938, + -43.6720000000059, + -35.9038, + -39.7492000000057, + -54.1641999999993, + -45.2749999999942, + -42.2989999999991, + -44.1089999999967, + -64.3564000000042, + -49.9551999999967, + -42.6116000000038, + ], + &[ + 23634.0036, + 23210.8034, + 22792.4744, + 22379.1524, + 21969.7928, + 21565.326, + 21165.3532, + 20770.2806, + 20379.9892, + 19994.7098, + 19613.318, + 19236.799, + 18865.4382, + 18498.8244, + 18136.5138, + 17778.8668, + 17426.2344, + 17079.32, + 16734.778, + 16397.2418, + 16063.3324, + 15734.0232, + 15409.731, + 15088.728, + 14772.9896, + 14464.1402, + 14157.5588, + 13855.5958, + 13559.3296, + 13264.9096, + 12978.326, + 12692.0826, + 12413.8816, + 12137.3192, + 11870.2326, + 11602.5554, + 11340.3142, + 11079.613, + 10829.5908, + 10583.5466, + 10334.0344, + 10095.5072, + 9859.694, + 9625.2822, + 9395.7862, + 9174.0586, + 8957.3164, + 8738.064, + 8524.155, + 8313.7396, + 8116.9168, + 7913.542, + 7718.4778, + 7521.65, + 7335.5596, + 7154.2906, + 6968.7396, + 6786.3996, + 6613.236, + 6437.406, + 6270.6598, + 6107.7958, + 5945.7174, + 5787.6784, + 5635.5784, + 5482.308, + 5337.9784, + 5190.0864, + 5045.9158, + 4919.1386, + 4771.817, + 4645.7742, + 4518.4774, + 4385.5454, + 4262.6622, + 4142.74679999999, + 4015.5318, + 3897.9276, + 3790.7764, + 3685.13800000001, + 3573.6274, + 3467.9706, + 3368.61079999999, + 3271.5202, + 3170.3848, + 3076.4656, + 2982.38400000001, + 2888.4664, + 2806.4868, + 2711.9564, + 2634.1434, + 2551.3204, + 
2469.7662, + 2396.61139999999, + 2318.9902, + 2243.8658, + 2171.9246, + 2105.01360000001, + 2028.8536, + 1960.9952, + 1901.4096, + 1841.86079999999, + 1777.54700000001, + 1714.5802, + 1654.65059999999, + 1596.311, + 1546.2016, + 1492.3296, + 1433.8974, + 1383.84600000001, + 1339.4152, + 1293.5518, + 1245.8686, + 1193.50659999999, + 1162.27959999999, + 1107.19439999999, + 1069.18060000001, + 1035.09179999999, + 999.679000000004, + 957.679999999993, + 925.300199999998, + 888.099400000006, + 848.638600000006, + 818.156400000007, + 796.748399999997, + 752.139200000005, + 725.271200000003, + 692.216, + 671.633600000001, + 647.939799999993, + 621.670599999998, + 575.398799999995, + 561.226599999995, + 532.237999999998, + 521.787599999996, + 483.095799999996, + 467.049599999998, + 465.286399999997, + 415.548599999995, + 401.047399999996, + 380.607999999993, + 377.362599999993, + 347.258799999996, + 338.371599999999, + 310.096999999994, + 301.409199999995, + 276.280799999993, + 265.586800000005, + 258.994399999996, + 223.915999999997, + 215.925399999993, + 213.503800000006, + 191.045400000003, + 166.718200000003, + 166.259000000005, + 162.941200000001, + 148.829400000002, + 141.645999999993, + 123.535399999993, + 122.329800000007, + 89.473399999988, + 80.1962000000058, + 77.5457999999926, + 59.1056000000099, + 83.3509999999951, + 52.2906000000075, + 36.3979999999865, + 40.6558000000077, + 42.0003999999899, + 19.6630000000005, + 19.7153999999864, + -8.38539999999921, + -0.692799999989802, + 0.854800000000978, + 3.23219999999856, + -3.89040000000386, + -5.25880000001052, + -24.9052000000083, + -22.6837999999989, + -26.4286000000138, + -34.997000000003, + -37.0216000000073, + -43.430400000012, + -58.2390000000014, + -68.8034000000043, + -56.9245999999985, + -57.8583999999973, + -77.3097999999882, + -73.2793999999994, + -81.0738000000129, + -87.4530000000086, + -65.0254000000132, + -57.296399999992, + -96.2746000000043, + -103.25, + -96.081600000005, + -91.5542000000132, + 
-102.465200000006, + -107.688599999994, + -101.458000000013, + -109.715800000005, + ], + &[ + 47270.0, + 46423.3584, + 45585.7074, + 44757.152, + 43938.8416, + 43130.9514, + 42330.03, + 41540.407, + 40759.6348, + 39988.206, + 39226.5144, + 38473.2096, + 37729.795, + 36997.268, + 36272.6448, + 35558.665, + 34853.0248, + 34157.4472, + 33470.5204, + 32793.5742, + 32127.0194, + 31469.4182, + 30817.6136, + 30178.6968, + 29546.8908, + 28922.8544, + 28312.271, + 27707.0924, + 27114.0326, + 26526.692, + 25948.6336, + 25383.7826, + 24823.5998, + 24272.2974, + 23732.2572, + 23201.4976, + 22674.2796, + 22163.6336, + 21656.515, + 21161.7362, + 20669.9368, + 20189.4424, + 19717.3358, + 19256.3744, + 18795.9638, + 18352.197, + 17908.5738, + 17474.391, + 17052.918, + 16637.2236, + 16228.4602, + 15823.3474, + 15428.6974, + 15043.0284, + 14667.6278, + 14297.4588, + 13935.2882, + 13578.5402, + 13234.6032, + 12882.1578, + 12548.0728, + 12219.231, + 11898.0072, + 11587.2626, + 11279.9072, + 10973.5048, + 10678.5186, + 10392.4876, + 10105.2556, + 9825.766, + 9562.5444, + 9294.2222, + 9038.2352, + 8784.848, + 8533.2644, + 8301.7776, + 8058.30859999999, + 7822.94579999999, + 7599.11319999999, + 7366.90779999999, + 7161.217, + 6957.53080000001, + 6736.212, + 6548.21220000001, + 6343.06839999999, + 6156.28719999999, + 5975.15419999999, + 5791.75719999999, + 5621.32019999999, + 5451.66, + 5287.61040000001, + 5118.09479999999, + 4957.288, + 4798.4246, + 4662.17559999999, + 4512.05900000001, + 4364.68539999999, + 4220.77720000001, + 4082.67259999999, + 3957.19519999999, + 3842.15779999999, + 3699.3328, + 3583.01180000001, + 3473.8964, + 3338.66639999999, + 3233.55559999999, + 3117.799, + 3008.111, + 2909.69140000001, + 2814.86499999999, + 2719.46119999999, + 2624.742, + 2532.46979999999, + 2444.7886, + 2370.1868, + 2272.45259999999, + 2196.19260000001, + 2117.90419999999, + 2023.2972, + 1969.76819999999, + 1885.58979999999, + 1833.2824, + 1733.91200000001, + 1682.54920000001, + 
1604.57980000001, + 1556.11240000001, + 1491.3064, + 1421.71960000001, + 1371.22899999999, + 1322.1324, + 1264.7892, + 1196.23920000001, + 1143.8474, + 1088.67240000001, + 1073.60380000001, + 1023.11660000001, + 959.036400000012, + 927.433199999999, + 906.792799999996, + 853.433599999989, + 841.873800000001, + 791.1054, + 756.899999999994, + 704.343200000003, + 672.495599999995, + 622.790399999998, + 611.254799999995, + 567.283200000005, + 519.406599999988, + 519.188400000014, + 495.312800000014, + 451.350799999986, + 443.973399999988, + 431.882199999993, + 392.027000000002, + 380.924200000009, + 345.128999999986, + 298.901400000002, + 287.771999999997, + 272.625, + 247.253000000026, + 222.490600000019, + 223.590000000026, + 196.407599999977, + 176.425999999978, + 134.725199999986, + 132.4804, + 110.445599999977, + 86.7939999999944, + 56.7038000000175, + 64.915399999998, + 38.3726000000024, + 37.1606000000029, + 46.170999999973, + 49.1716000000015, + 15.3362000000197, + 6.71639999997569, + -34.8185999999987, + -39.4476000000141, + 12.6830000000191, + -12.3331999999937, + -50.6565999999875, + -59.9538000000175, + -65.1054000000004, + -70.7576000000117, + -106.325200000021, + -126.852200000023, + -110.227599999984, + -132.885999999999, + -113.897200000007, + -142.713800000027, + -151.145399999979, + -150.799200000009, + -177.756200000003, + -156.036399999983, + -182.735199999996, + -177.259399999981, + -198.663600000029, + -174.577600000019, + -193.84580000001, + ], + &[ + 94541.0, + 92848.811, + 91174.019, + 89517.558, + 87879.9705, + 86262.7565, + 84663.5125, + 83083.7435, + 81521.7865, + 79977.272, + 78455.9465, + 76950.219, + 75465.432, + 73994.152, + 72546.71, + 71115.2345, + 69705.6765, + 68314.937, + 66944.2705, + 65591.255, + 64252.9485, + 62938.016, + 61636.8225, + 60355.592, + 59092.789, + 57850.568, + 56624.518, + 55417.343, + 54231.1415, + 53067.387, + 51903.526, + 50774.649, + 49657.6415, + 48561.05, + 47475.7575, + 46410.159, + 45364.852, + 44327.053, + 
43318.4005, + 42325.6165, + 41348.4595, + 40383.6265, + 39436.77, + 38509.502, + 37594.035, + 36695.939, + 35818.6895, + 34955.691, + 34115.8095, + 33293.949, + 32465.0775, + 31657.6715, + 30877.2585, + 30093.78, + 29351.3695, + 28594.1365, + 27872.115, + 27168.7465, + 26477.076, + 25774.541, + 25106.5375, + 24452.5135, + 23815.5125, + 23174.0655, + 22555.2685, + 21960.2065, + 21376.3555, + 20785.1925, + 20211.517, + 19657.0725, + 19141.6865, + 18579.737, + 18081.3955, + 17578.995, + 17073.44, + 16608.335, + 16119.911, + 15651.266, + 15194.583, + 14749.0495, + 14343.4835, + 13925.639, + 13504.509, + 13099.3885, + 12691.2855, + 12328.018, + 11969.0345, + 11596.5145, + 11245.6355, + 10917.6575, + 10580.9785, + 10277.8605, + 9926.58100000001, + 9605.538, + 9300.42950000003, + 8989.97850000003, + 8728.73249999998, + 8448.3235, + 8175.31050000002, + 7898.98700000002, + 7629.79100000003, + 7413.76199999999, + 7149.92300000001, + 6921.12650000001, + 6677.1545, + 6443.28000000003, + 6278.23450000002, + 6014.20049999998, + 5791.20299999998, + 5605.78450000001, + 5438.48800000001, + 5234.2255, + 5059.6825, + 4887.43349999998, + 4682.935, + 4496.31099999999, + 4322.52250000002, + 4191.42499999999, + 4021.24200000003, + 3900.64799999999, + 3762.84250000003, + 3609.98050000001, + 3502.29599999997, + 3363.84250000003, + 3206.54849999998, + 3079.70000000001, + 2971.42300000001, + 2867.80349999998, + 2727.08100000001, + 2630.74900000001, + 2496.6165, + 2440.902, + 2356.19150000002, + 2235.58199999999, + 2120.54149999999, + 2012.25449999998, + 1933.35600000003, + 1820.93099999998, + 1761.54800000001, + 1663.09350000002, + 1578.84600000002, + 1509.48149999999, + 1427.3345, + 1379.56150000001, + 1306.68099999998, + 1212.63449999999, + 1084.17300000001, + 1124.16450000001, + 1060.69949999999, + 1007.48849999998, + 941.194499999983, + 879.880500000028, + 836.007500000007, + 782.802000000025, + 748.385499999975, + 647.991500000004, + 626.730500000005, + 570.776000000013, + 
484.000500000024, + 513.98550000001, + 418.985499999952, + 386.996999999974, + 370.026500000036, + 355.496999999974, + 356.731499999994, + 255.92200000002, + 259.094000000041, + 205.434499999974, + 165.374500000034, + 197.347500000033, + 95.718499999959, + 67.6165000000037, + 54.6970000000438, + 31.7395000000251, + -15.8784999999916, + 8.42500000004657, + -26.3754999999655, + -118.425500000012, + -66.6629999999423, + -42.9745000000112, + -107.364999999991, + -189.839000000036, + -162.611499999999, + -164.964999999967, + -189.079999999958, + -223.931499999948, + -235.329999999958, + -269.639500000048, + -249.087999999989, + -206.475499999942, + -283.04449999996, + -290.667000000016, + -304.561499999953, + -336.784499999951, + -380.386500000022, + -283.280499999993, + -364.533000000054, + -389.059499999974, + -364.454000000027, + -415.748000000021, + -417.155000000028, + ], + &[ + 189083.0, + 185696.913, + 182348.774, + 179035.946, + 175762.762, + 172526.444, + 169329.754, + 166166.099, + 163043.269, + 159958.91, + 156907.912, + 153906.845, + 150924.199, + 147996.568, + 145093.457, + 142239.233, + 139421.475, + 136632.27, + 133889.588, + 131174.2, + 128511.619, + 125868.621, + 123265.385, + 120721.061, + 118181.769, + 115709.456, + 113252.446, + 110840.198, + 108465.099, + 106126.164, + 103823.469, + 101556.618, + 99308.004, + 97124.508, + 94937.803, + 92833.731, + 90745.061, + 88677.627, + 86617.47, + 84650.442, + 82697.833, + 80769.132, + 78879.629, + 77014.432, + 75215.626, + 73384.587, + 71652.482, + 69895.93, + 68209.301, + 66553.669, + 64921.981, + 63310.323, + 61742.115, + 60205.018, + 58698.658, + 57190.657, + 55760.865, + 54331.169, + 52908.167, + 51550.273, + 50225.254, + 48922.421, + 47614.533, + 46362.049, + 45098.569, + 43926.083, + 42736.03, + 41593.473, + 40425.26, + 39316.237, + 38243.651, + 37170.617, + 36114.609, + 35084.19, + 34117.233, + 33206.509, + 32231.505, + 31318.728, + 30403.404, + 29540.0550000001, + 28679.236, + 27825.862, + 26965.216, + 
26179.148, + 25462.08, + 24645.952, + 23922.523, + 23198.144, + 22529.128, + 21762.4179999999, + 21134.779, + 20459.117, + 19840.818, + 19187.04, + 18636.3689999999, + 17982.831, + 17439.7389999999, + 16874.547, + 16358.2169999999, + 15835.684, + 15352.914, + 14823.681, + 14329.313, + 13816.897, + 13342.874, + 12880.882, + 12491.648, + 12021.254, + 11625.392, + 11293.7610000001, + 10813.697, + 10456.209, + 10099.074, + 9755.39000000001, + 9393.18500000006, + 9047.57900000003, + 8657.98499999999, + 8395.85900000005, + 8033.0, + 7736.95900000003, + 7430.59699999995, + 7258.47699999996, + 6924.58200000005, + 6691.29399999999, + 6357.92500000005, + 6202.05700000003, + 5921.19700000004, + 5628.28399999999, + 5404.96799999999, + 5226.71100000001, + 4990.75600000005, + 4799.77399999998, + 4622.93099999998, + 4472.478, + 4171.78700000001, + 3957.46299999999, + 3868.95200000005, + 3691.14300000004, + 3474.63100000005, + 3341.67200000002, + 3109.14000000001, + 3071.97400000005, + 2796.40399999998, + 2756.17799999996, + 2611.46999999997, + 2471.93000000005, + 2382.26399999997, + 2209.22400000005, + 2142.28399999999, + 2013.96100000001, + 1911.18999999994, + 1818.27099999995, + 1668.47900000005, + 1519.65800000005, + 1469.67599999998, + 1367.13800000004, + 1248.52899999998, + 1181.23600000003, + 1022.71900000004, + 1088.20700000005, + 959.03600000008, + 876.095999999903, + 791.183999999892, + 703.337000000058, + 731.949999999953, + 586.86400000006, + 526.024999999907, + 323.004999999888, + 320.448000000091, + 340.672999999952, + 309.638999999966, + 216.601999999955, + 102.922999999952, + 19.2399999999907, + -0.114000000059605, + -32.6240000000689, + -89.3179999999702, + -153.497999999905, + -64.2970000000205, + -143.695999999996, + -259.497999999905, + -253.017999999924, + -213.948000000091, + -397.590000000084, + -434.006000000052, + -403.475000000093, + -297.958000000101, + -404.317000000039, + -528.898999999976, + -506.621000000043, + -513.205000000075, + -479.351000000024, 
+ -596.139999999898, + -527.016999999993, + -664.681000000099, + -680.306000000099, + -704.050000000047, + -850.486000000034, + -757.43200000003, + -713.308999999892, + ], +]; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + WriteHll { source: bytes::Error }, + + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + ReadHll { source: bytes::Error }, +} + +pub type Result = std::result::Result; + +pub struct HyperLogLog { + alpha: f64, + p: u8, + m: usize, + M: Vec, + sip: SipHasher13, +} + +impl HyperLogLog { + pub fn new(error_rate: f64) -> Self { + Self::new_with_keys(error_rate, rand::random(), rand::random()) + } + + pub fn new_with_keys(error_rate: f64, key0: u64, key1: u64) -> Self { + assert!(error_rate > 0.0 && error_rate < 1.0); + let sr = 1.04 / error_rate; + let p = f64::ln(sr * sr).ceil() as u8; + assert!(p <= 64); + let alpha = Self::get_alpha(p); + let m = 1usize << p; + HyperLogLog { + alpha, + p, + m, + M: repeat(0u8).take(m).collect(), + sip: SipHasher13::new_with_keys(key0, key1), + } + } + + pub fn new_from_template(hll: &HyperLogLog) -> Self { + HyperLogLog { + alpha: hll.alpha, + p: hll.p, + m: hll.m, + M: repeat(0u8).take(hll.m).collect(), + sip: hll.sip, + } + } + + pub fn insert(&mut self, value: &V) { + let sip = &mut self.sip.clone(); + value.hash(sip); + let x = sip.finish(); + self.insert_by_hash_value(x); + } + + pub fn insert_by_hash_value(&mut self, x: u64) { + let j = x as usize & (self.m - 1); + let w = x >> self.p; + let rho = Self::get_rho(w, 64 - self.p); + let mjr = &mut self.M[j]; + if rho > *mjr { + *mjr = rho; + } + } + + pub fn len(&self) -> f64 { + let V = Self::vec_count_zero(&self.M); + if V > 0 { + let H = self.m as f64 * (self.m as f64 / V as f64).ln(); + if H <= Self::get_treshold(self.p) { + H + } else { + self.ep() + } + } else { + self.ep() + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0.0 + } + + pub fn merge(&mut self, 
src: &HyperLogLog) { + assert!(src.p == self.p); + assert!(src.m == self.m); + let sip1 = &mut src.sip.clone(); + let sip2 = &mut self.sip.clone(); + 42.hash(sip1); + 42.hash(sip2); + assert!(sip1.finish() == sip2.finish()); + for i in 0..self.m { + let (src_mir, mir) = (src.M[i], &mut self.M[i]); + if src_mir > *mir { + *mir = src_mir; + } + } + } + + pub fn clear(&mut self) { + self.M.iter_mut().all(|x| { + *x = 0; + true + }); + } + + pub fn write_to_buf(&self, buf: &mut B) -> Result<()> { + buf.write_f64(self.alpha).context(WriteHll)?; + buf.write_u8(self.p).context(WriteHll)?; + // self.m is the length of self.M + buf.write_u64(self.m as u64).context(WriteHll)?; + buf.write_slice(&self.M).context(WriteHll)?; + // Store keys of hasher + let (key0, key1) = self.sip.keys(); + buf.write_u64(key0).context(WriteHll)?; + buf.write_u64(key1).context(WriteHll) + } + + pub fn read_from_buf(buf: &mut B) -> Result { + let alpha = buf.read_f64().context(ReadHll)?; + let p = buf.read_u8().context(ReadHll)?; + let m = buf.read_u64().context(ReadHll)? 
as usize; + let mut m_buf = vec![0u8; m]; + buf.read_to_slice(&mut m_buf).context(ReadHll)?; + let key0 = buf.read_u64().context(ReadHll)?; + let key1 = buf.read_u64().context(ReadHll)?; + + Ok(HyperLogLog { + alpha, + p, + m, + M: m_buf, + sip: SipHasher13::new_with_keys(key0, key1), + }) + } + + fn get_treshold(p: u8) -> f64 { + TRESHOLD_DATA[p as usize] + } + + fn get_alpha(p: u8) -> f64 { + assert!((4..=16).contains(&p)); + match p { + 4 => 0.673, + 5 => 0.697, + 6 => 0.709, + _ => 0.7213 / (1.0 + 1.079 / (1usize << (p as usize)) as f64), + } + } + + fn bit_length(x: u64) -> u8 { + let mut bits: u8 = 0; + let mut xm = x; + while xm != 0 { + bits += 1; + xm >>= 1; + } + bits + } + + fn get_rho(w: u64, max_width: u8) -> u8 { + let rho = max_width - Self::bit_length(w) + 1; + assert!(rho > 0); + rho + } + + fn vec_count_zero(v: &[u8]) -> usize { + bytecount::count(v, 0) + } + + fn estimate_bias(E: f64, p: u8) -> f64 { + let bias_vector = BIAS_DATA[(p - 4) as usize]; + let nearest_neighbors = Self::get_nearest_neighbors(E, RAW_ESTIMATE_DATA[(p - 4) as usize]); + let sum = nearest_neighbors + .iter() + .fold(0.0, |acc, &neighbor| acc + bias_vector[neighbor]); + sum / nearest_neighbors.len() as f64 + } + + fn get_nearest_neighbors(E: f64, estimate_vector: &[f64]) -> Vec { + let ev_len = estimate_vector.len(); + let mut r: Vec<(f64, usize)> = repeat((0.0f64, 0usize)).take(ev_len).collect(); + for i in 0..ev_len { + let dr = E - estimate_vector[i]; + r[i] = (dr * dr, i); + } + r.sort_by(|a, b| { + if a < b { + Less + } else if a > b { + Greater + } else { + Equal + } + }); + r.truncate(6); + r.iter() + .map(|&ez| { + let (_, b) = ez; + b + }) + .collect() + } + + fn ep(&self) -> f64 { + let sum = self + .M + .iter() + .fold(0.0, |acc, &x| acc + 2.0f64.powi(-(x as i32))); + let E = self.alpha * (self.m * self.m) as f64 / sum; + if E <= (5 * self.m) as f64 { + E - Self::estimate_bias(E, self.p) + } else { + E + } + } +} + +#[test] +fn hyperloglog_test_simple() { + let 
mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + assert!(!hll.is_empty()); + hll.clear(); + assert!(hll.is_empty()); + assert!(hll.len() == 0.0); +} + +#[test] +fn hyperloglog_test_merge() { + let mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + + let mut hll2 = HyperLogLog::new_from_template(&hll); + let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"]; + for k in &keys2 { + hll2.insert(k); + } + assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON); + + hll.merge(&hll2); + assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON); +} + +#[test] +fn hyperloglog_test_write_read() { + let mut hll = HyperLogLog::new(0.00408); + hll.insert(&123); + + let mut write_buf = Vec::new(); + hll.write_to_buf(&mut write_buf).unwrap(); + + let mut buf = &write_buf[..]; + let hll2 = HyperLogLog::read_from_buf(&mut buf).unwrap(); + + let error_margin = f64::EPSILON; + assert!((hll.alpha - hll2.alpha).abs() < error_margin); + assert_eq!(hll.p, hll2.p); + assert_eq!(hll.m, hll2.m); + assert_eq!(hll.M, hll2.M); + assert_eq!(hll.sip.keys(), hll2.sip.keys()); +} diff --git a/components/skiplist/Cargo.toml b/components/skiplist/Cargo.toml new file mode 100644 index 0000000000..f56e48d122 --- /dev/null +++ b/components/skiplist/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "skiplist" +version = "0.1.0" +authors = ["Jay Lee "] +edition = "2018" + +[dependencies] +rand = "0.7" +bytes = "1.0" +arena = { path = "../arena" } + +[dev-dependencies] +yatp = { git = "https://github.com/tikv/yatp.git", rev = "4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" } +criterion = "0.3" + +# [target.'cfg(not(target_env = "msvc"))'.dev-dependencies] +# 
tikv-jemallocator = "0.4.0" + +[[bench]] +name = "bench" +harness = false diff --git a/components/skiplist/benches/bench.rs b/components/skiplist/benches/bench.rs new file mode 100644 index 0000000000..4744bb558c --- /dev/null +++ b/components/skiplist/benches/bench.rs @@ -0,0 +1,181 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::*, + sync::{atomic::*, *}, + thread, +}; + +use arena::MonoIncArena; +use bytes::*; +use criterion::*; +use rand::prelude::*; +use skiplist::*; + +// #[cfg(not(target_env = "msvc"))] +// use tikv_jemallocator::Jemalloc; + +// #[cfg(not(target_env = "msvc"))] +// #[global_allocator] +// static GLOBAL: Jemalloc = Jemalloc; + +fn skiplist_round( + l: &Skiplist, + case: &(Bytes, bool), + exp: &Bytes, +) { + if case.1 { + if let Some(v) = l.get(&case.0) { + assert_eq!(v, exp); + } + } else { + l.put(&case.0, exp); + } +} + +fn append_ts(key: &mut BytesMut, ts: u64) { + key.put_u64(ts); +} + +fn random_key(rng: &mut ThreadRng) -> Bytes { + let mut key = BytesMut::with_capacity(16); + unsafe { + rng.fill_bytes(&mut *(&mut key.chunk_mut()[..8] as *mut _ as *mut [u8])); + key.advance_mut(8); + } + append_ts(&mut key, 0); + key.freeze() +} + +fn bench_read_write_skiplist_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let key = random_key(&mut rng); + let case = (key, frac > rng.gen_range(0, 11)); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || (random_key(&mut rng), frac > rng.gen_range(0, 11)), + |case| 
skiplist_round(&list, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_skiplist(c: &mut Criterion) { + let mut group = c.benchmark_group("skiplist_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_skiplist_frac, + ); + } + group.finish(); +} + +fn map_round(m: &Mutex>, case: &(Bytes, bool), exp: &Bytes) { + if case.1 { + let rm = m.lock().unwrap(); + let value = rm.get(&case.0); + if let Some(v) = value { + assert_eq!(v, exp); + } + } else { + let mut rm = m.lock().unwrap(); + rm.insert(case.0.clone(), exp.clone()); + } +} + +fn bench_read_write_map_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let map = Arc::new(Mutex::new(HashMap::with_capacity(512 << 10))); + let map_in_thread = map.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let thread_stop = stop.clone(); + + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !thread_stop.load(Ordering::SeqCst) { + let f = rng.gen_range(0, 11); + let case = (random_key(&mut rng), f < frac); + map_round(&map_in_thread, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || { + let f = rng.gen_range(0, 11); + (random_key(&mut rng), f < frac) + }, + |case| map_round(&map, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_map(c: &mut Criterion) { + let mut group = c.benchmark_group("map_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_map_frac, + ); + } + group.finish(); +} + +fn bench_write_skiplist(c: &mut Criterion) { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let 
value = Bytes::from_static(b"00123"); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let case = (random_key(&mut rng), false); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + c.bench_function("skiplist_write", |b| { + b.iter_batched( + || random_key(&mut rng), + |key| { + list.put(&key, &value); + }, + BatchSize::SmallInput, + ) + }); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +criterion_group!( + benches, + bench_read_write_skiplist, + bench_read_write_map, + bench_write_skiplist +); +criterion_main!(benches); diff --git a/components/skiplist/src/key.rs b/components/skiplist/src/key.rs new file mode 100644 index 0000000000..297e4e446d --- /dev/null +++ b/components/skiplist/src/key.rs @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp::Ordering; + +use bytes::Bytes; + +pub trait KeyComparator: Clone { + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering; + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool; +} + +#[derive(Default, Debug, Clone, Copy)] +pub struct FixedLengthSuffixComparator { + len: usize, +} + +impl FixedLengthSuffixComparator { + pub const fn new(len: usize) -> FixedLengthSuffixComparator { + FixedLengthSuffixComparator { len } + } +} + +impl KeyComparator for FixedLengthSuffixComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + if lhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(lhs) + ); + } + if rhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(rhs) + ); + } + let (l_p, l_s) = lhs.split_at(lhs.len() - self.len); + let (r_p, r_s) = rhs.split_at(rhs.len() - self.len); + let res = l_p.cmp(r_p); + match res { + Ordering::Greater | Ordering::Less => res, + Ordering::Equal => l_s.cmp(r_s), + } + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + let (l_p, _) = lhs.split_at(lhs.len() - self.len); + let (r_p, _) = rhs.split_at(rhs.len() - self.len); + l_p == r_p + } +} diff --git a/components/skiplist/src/lib.rs b/components/skiplist/src/lib.rs new file mode 100644 index 0000000000..ca7d13b1a8 --- /dev/null +++ b/components/skiplist/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Forked from +//! +//! Differences: +//! 1. Inline key and value in Node, so all memory of skiplist is allocated from +//! arena. Drawback: we have to copy the content of key/value +//! 2. Tower stores pointer to Node instead of offset, so we can use other arena +//! implementation +//! 3. Use [ArenaSlice] to replace Bytes +//! 4. 
//!    impl Send/Sync for the iterator

mod key;
mod list;
mod slice;

/// Maximum number of levels a node tower can have (valid indices 0..MAX_HEIGHT).
const MAX_HEIGHT: usize = 20;

pub use key::{FixedLengthSuffixComparator, KeyComparator};
pub use list::{IterRef, Skiplist};
pub use slice::ArenaSlice;

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

use std::{
    alloc::Layout,
    convert::TryInto,
    mem, ptr,
    ptr::NonNull,
    slice,
    sync::{
        atomic::{AtomicPtr, AtomicUsize, Ordering},
        Arc,
    },
};

use arena::{Arena, BasicStats};
use rand::Rng;

use super::{slice::ArenaSlice, KeyComparator, MAX_HEIGHT};

/// Promotion probability scaled to u32::MAX: a node keeps growing one more
/// level with probability ~1/3 per level (see `random_height`).
const HEIGHT_INCREASE: u32 = u32::MAX / 3;

// Length prefixes written in front of the inlined key/value bytes.
type KeySize = u16;
type ValueSize = u32;

/// The layout of Node
/// 1. height: usize
/// 2. tower: AtomicPtr<Node> x (height + 1)
/// 3. key_size: KeySize
/// 4. key: u8 x key_size
/// 5. value_size: ValueSize
/// 6. value: u8 x value_size
// Uses C layout to make sure tower is at the bottom
#[derive(Debug)]
#[repr(C)]
pub struct Node {
    /// Height of node, different from badger, The valid range of tower is [0,
    /// height]
    height: usize,
    /// The node tower
    ///
    /// Only [0, height] parts is utilized to store node pointer, the key and
    /// value block are start from tower[height + 1]
    tower: [AtomicPtr<Node>; MAX_HEIGHT],
}

impl Node {
    /// Allocate a new node from the arena, and copy the content of key/value
    /// into the node
    ///
    /// # Safety
    /// - from_size_align_unchecked: align is got from [mem::align_of].
    ///
    /// # Notice
    /// This will only allocate the *exact* amount of memory needed within the
    /// given height: unused tower slots are trimmed and the key/value bytes
    /// live directly behind the used part of the tower.
    fn alloc<A>(arena: &A, key: &[u8], value: &[u8], height: usize) -> *mut Node
    where
        A: Arena<Stats = BasicStats>,
    {
        // Calculate node size to alloc
        let size = mem::size_of::<Node>();
        // Not all values in Node::tower will be utilized.
        let not_used = (MAX_HEIGHT - height - 1) * mem::size_of::<AtomicPtr<Node>>();
        // Space to store key/value: (key size) + key + (value size) + value
        let kv_used =
            mem::size_of::<KeySize>() + key.len() + mem::size_of::<ValueSize>() + value.len();
        // UB in fact: the `not_used` size is able to be access in a "safe" way.
        // It is guaranteed by the user to not use those memory.
        let alloc_size = size - not_used + kv_used;
        let layout =
            unsafe { Layout::from_size_align_unchecked(alloc_size, mem::align_of::<Node>()) };
        let node_ptr = arena.alloc(layout).as_ptr() as *mut Node;
        unsafe {
            let node = &mut *node_ptr;
            node.height = height;
            // Zero only the used tower slots; slots above `height` were trimmed.
            ptr::write_bytes(node.tower.as_mut_ptr(), 0, height + 1);
            Self::init_key_value(node, key, value);

            node_ptr
        }
    }

    /// Fetch next node ptr in given height
    fn next_ptr(&self, height: usize) -> *mut Node {
        self.tower[height].load(Ordering::SeqCst)
    }

    /// Get key
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn key(&self) -> &[u8] {
        let (key_block, key_size) = self.load_key_size();

        slice::from_raw_parts(key_block, key_size as usize)
    }

    /// Get value
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn value(&self) -> &[u8] {
        let (key_block, key_size) = self.load_key_size();
        let (value_block, value_size) = self.load_value_size(key_block, key_size);

        slice::from_raw_parts(value_block, value_size as usize)
    }

    /// Set key and value parts of Node during creating Node
    ///
    /// Will copy the content of key and value to the Node
    ///
    /// REQUIRE: This Node is created via Arena and node.tower and node.height
    /// is already set to correct value
    ///
    /// Panic: The size of key/value must less than max value of
    /// KeySize/ValueSize (u16/u32), otherwise this function will panic
    unsafe fn init_key_value(node: &mut Node, key: &[u8], value: &[u8]) {
        // The key block starts right after the last used tower slot.
        let key_block = node.tower.as_mut_ptr().add(node.height + 1) as *mut u8;
        let key_size: KeySize = key.len().try_into().unwrap();
        let key_size_bytes = key_size.to_ne_bytes();

        ptr::copy_nonoverlapping(
            key_size_bytes.as_ptr(),
            key_block,
            mem::size_of::<KeySize>(),
        );
        let key_block = key_block.add(mem::size_of::<KeySize>());
        ptr::copy_nonoverlapping(key.as_ptr(), key_block, key.len());

        let value_block = key_block.add(key.len());
        let value_size: ValueSize = value.len().try_into().unwrap();
        let value_size_bytes = value_size.to_ne_bytes();

        ptr::copy_nonoverlapping(
            value_size_bytes.as_ptr(),
            value_block,
            mem::size_of::<ValueSize>(),
        );
        let value_block = value_block.add(mem::size_of::<ValueSize>());
        ptr::copy_nonoverlapping(value.as_ptr(), value_block, value.len());
    }

    /// Load key pointer and size of key
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn load_key_size(&self) -> (*const u8, KeySize) {
        let tower = self.tower.as_ptr();
        // Move to key block
        let key_block = tower.add(self.height + 1) as *const u8;
        // Load key size from key block (native endian, same as written above)
        let key_size = u16::from_ne_bytes(*(key_block as *const [u8; mem::size_of::<KeySize>()]));
        // Move key block to the start of key
        let key_block = key_block.add(mem::size_of::<KeySize>());

        (key_block, key_size)
    }

    /// Load value pointer and size of value
    ///
    /// Given key_block and key_size returned from `load_key_size()`, loads
    /// value pointer and value size
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn load_value_size(
        &self,
        key_block: *const u8,
        key_size: KeySize,
    ) -> (*const u8, ValueSize) {
        // Move to value block
        let value_block = key_block.add(key_size as usize);
        // Load value size from value block
        let value_size =
            u32::from_ne_bytes(*(value_block as *const [u8; mem::size_of::<ValueSize>()]));
        // Move value block to the start of value
        let value_block = value_block.add(mem::size_of::<ValueSize>());

        (value_block, value_size)
    }

    /// Get key with arena
    ///
    /// The returned slice keeps a clone of the arena alive, so it may outlive
    /// this node reference.
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn key_with_arena<A>(&self, arena: A) -> ArenaSlice<A>
    where
        A: Arena<Stats = BasicStats>,
    {
        let (key_block, key_size) = self.load_key_size();

        ArenaSlice::from_raw_parts(arena, key_block, key_size as usize)
    }

    /// Get value with arena
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn value_with_arena<A>(&self, arena: A) -> ArenaSlice<A>
    where
        A: Arena<Stats = BasicStats>,
    {
        let (key_block, key_size) = self.load_key_size();
        let (value_block, value_size) = self.load_value_size(key_block, key_size);

        ArenaSlice::from_raw_parts(arena, value_block, value_size as usize)
    }
}

/// Shared state of a skiplist: current height, sentinel head node and the
/// backing arena all nodes are allocated from.
struct SkiplistCore<A: Arena<Stats = BasicStats>> {
    height: AtomicUsize,
    head: NonNull<Node>,
    arena: A,
}

/// FIXME(yingwen): Modify the skiplist to support arena that supports growth,
/// otherwise it is hard to avoid memory usage not out of the arena capacity
#[derive(Clone)]
pub struct Skiplist<C, A: Arena<Stats = BasicStats> + Clone> {
    core: Arc<SkiplistCore<A>>,
    c: C,
}

impl<C, A: Arena<Stats = BasicStats> + Clone> Skiplist<C, A> {
    pub fn with_arena(c: C, arena: A) -> Skiplist<C, A> {
        // Head is a max-height sentinel with empty key/value; find functions
        // never return it.
        let head = Node::alloc(&arena, &[], &[], MAX_HEIGHT - 1);
        let head = unsafe { NonNull::new_unchecked(head) };
        Skiplist {
            core: Arc::new(SkiplistCore {
                height: AtomicUsize::new(0),
                head,
                arena,
            }),
            c,
        }
    }

    /// Draw a random tower height: level h is reached with probability
    /// ~(1/3)^h, capped at MAX_HEIGHT - 1.
    fn random_height(&self) -> usize {
        let mut rng = rand::thread_rng();
        for h in 0..(MAX_HEIGHT - 1) {
            if !rng.gen_ratio(HEIGHT_INCREASE, u32::MAX) {
                return h;
            }
        }
        MAX_HEIGHT - 1
    }

    fn height(&self) -> usize {
        self.core.height.load(Ordering::SeqCst)
    }
}

impl<C: KeyComparator, A: Arena<Stats = BasicStats> + Clone> Skiplist<C, A> {
    /// Finds the node near to key.
    ///
    /// If less=true, it finds rightmost node such that node.key < key (if
    /// allow_equal=false) or node.key <= key (if allow_equal=true).
    /// If less=false, it finds leftmost node such that node.key > key (if
    /// allowEqual=false) or node.key >= key (if allow_equal=true).
    /// Returns the node found.
    ///
    /// # Safety
    /// The returned raw pointer targets a node allocated from this list's
    /// arena; the caller must not use it after the list (and its arena) is
    /// dropped.
    unsafe fn find_near(&self, key: &[u8], less: bool, allow_equal: bool) -> *const Node {
        // Standard skiplist descent: move right while next.key < key,
        // otherwise drop one level, starting from the current top level.
        let mut cursor: *const Node = self.core.head.as_ptr();
        let mut level = self.height();
        loop {
            // Assume cursor.key < key
            let next_ptr = (&*cursor).next_ptr(level);
            if next_ptr.is_null() {
                // cursor.key < key < END OF LIST
                if level > 0 {
                    // Can descend further to iterate closer to the end
                    level -= 1;
                    continue;
                }
                // 1. Level=0. Cannot descend further. Let's return something that makes sense
                // 2. Try to return cursor. Make sure it is not a head node
                if !less || cursor == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return cursor;
            }

            let next = &*next_ptr;
            let res = self.c.compare_key(key, next.key());
            if res == std::cmp::Ordering::Greater {
                // cursor.key < next.key < key. We can continue to move right
                cursor = next_ptr;
                continue;
            }
            if res == std::cmp::Ordering::Equal {
                // cursor.key < key == next.key
                if allow_equal {
                    return next;
                }
                if !less {
                    // We want >, so go to base level to grab the next bigger node
                    return next.next_ptr(0);
                }
                // We want <. If not base level, we should go closer in the next level.
                if level > 0 {
                    level -= 1;
                    continue;
                }
                // On base level. Return cursor
                if cursor == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return cursor;
            }
            // cursor.key < key < next.key
            if level > 0 {
                level -= 1;
                continue;
            }
            // At base level. Need to return something
            if !less {
                return next;
            }
            // Try to return cursor. Make sure it is not a head node
            if cursor == self.core.head.as_ptr() {
                return ptr::null();
            }
            return cursor;
        }
    }

    /// Returns (out_before, out_after) with out_before.key <= key <=
    /// out_after.key
    ///
    /// The input `before` tells us where to start looking
    /// If we found a node with the same key, then we return out_before =
    /// out_after.
    /// Otherwise, out_before.key < key < out_after.key
    unsafe fn find_splice_for_level(
        &self,
        key: &[u8],
        mut before: *mut Node,
        level: usize,
    ) -> (*mut Node, *mut Node) {
        loop {
            // Assume before.key < key
            let next_ptr = (&*before).next_ptr(level);
            if next_ptr.is_null() {
                return (before, ptr::null_mut());
            }
            let next_node = &*next_ptr;
            match self.c.compare_key(key, next_node.key()) {
                // Equality case: both sides of the splice are the match
                std::cmp::Ordering::Equal => return (next_ptr, next_ptr),
                // before.key < key < next.key. We are done for this level
                std::cmp::Ordering::Less => return (before, next_ptr),
                // Keep moving right on this level
                _ => before = next_ptr,
            }
        }
    }

    /// Put the key-value into the skiplist if the key does not exists.
    ///
    /// The content of key and value will be copied into the list. Returns true
    /// if the node is inserted, otherwise return false (key is duplicated)
    ///
    /// Panic: The skiplist will panic if the allocated memory
    /// out of the capacity
    pub fn put(&self, key: &[u8], value: &[u8]) -> bool {
        let mut list_height = self.height();
        // prev[i]/next[i] bracket the insert position on level i.
        let mut prev = [ptr::null_mut(); MAX_HEIGHT + 1];
        let mut next = [ptr::null_mut(); MAX_HEIGHT + 1];
        prev[list_height + 1] = self.core.head.as_ptr();
        // Recompute splice levels
        for i in (0..=list_height).rev() {
            // Use higher level to speed up for current level
            let (p, n) = unsafe { self.find_splice_for_level(key, prev[i + 1], i) };
            prev[i] = p;
            next[i] = n;
            if p == n {
                // Key already exists
                return false;
            }
        }

        // Create a new node
        let height = self.random_height();
        let node_ptr = Node::alloc(&self.core.arena, key, value, height);

        // Try to increase skiplist height via CAS
        while height > list_height {
            match self.core.height.compare_exchange_weak(
                list_height,
                height,
                Ordering::SeqCst,
                Ordering::SeqCst,
            ) {
                // Successfully increased skiplist height
                Ok(_) => break,
                Err(h) => list_height = h,
            }
        }

        // We always insert from the base level and up. After you add a node in base
        // leve, we cannot create a node in the level above because it would
        // have discovered the node in the base level
        let x: &mut Node = unsafe { &mut *node_ptr };
        for i in 0..=height {
            loop {
                if prev[i].is_null() {
                    // This cannot happen in base level
                    assert!(i > 1);
                    // We haven't computed prev, next for this level because height exceeds old
                    // list_height. For these levels, we expect the lists to be
                    // sparse, so we can just search from head.
                    let (p, n) =
                        unsafe { self.find_splice_for_level(x.key(), self.core.head.as_ptr(), i) };
                    prev[i] = p;
                    next[i] = n;
                    // Someone adds the exact same key before we are able to do so. This can only
                    // happen on the base level. But we know we are not on the
                    // base level.
                    assert_ne!(p, n);
                }
                x.tower[i].store(next[i], Ordering::SeqCst);
                match unsafe { &*prev[i] }.tower[i].compare_exchange(
                    next[i],
                    node_ptr,
                    Ordering::SeqCst,
                    Ordering::SeqCst,
                ) {
                    // Managed to insert x between prev[i] and next[i]. Go to the next level.
                    Ok(_) => break,
                    Err(_) => {
                        // CAS failed. We need to recompute prev and next.
                        // It is unlikely to be helpful to try to use a different level as we redo
                        // the search, because it is unlikely that lots of
                        // nodes are inserted between prev[i] and next[i].
                        let (p, n) = unsafe { self.find_splice_for_level(x.key(), prev[i], i) };
                        if p == n {
                            // Lost the race to an identical key; only possible on level 0,
                            // where the node is not yet linked, so it is simply leaked
                            // into the arena.
                            assert_eq!(i, 0);
                            return false;
                        }
                        prev[i] = p;
                        next[i] = n;
                    }
                }
            }
        }
        true
    }

    /// Returns if the skiplist is empty
    pub fn is_empty(&self) -> bool {
        let node = self.core.head.as_ptr();
        let next_ptr = unsafe { (&*node).next_ptr(0) };
        next_ptr.is_null()
    }

    /// Returns len of the skiplist
    ///
    /// NOTE: O(n) — walks the entire base level; no cached counter.
    pub fn len(&self) -> usize {
        let mut node = self.core.head.as_ptr();
        let mut count = 0;
        loop {
            let next_ptr = unsafe { (&*node).next_ptr(0) };
            if !next_ptr.is_null() {
                count += 1;
                node = next_ptr;
                continue;
            }
            return count;
        }
    }

    /// Returns the last element. If head (empty list), we return null. All the
    /// find functions will NEVER return the head nodes
    fn find_last(&self) -> *const Node {
        let mut node = self.core.head.as_ptr();
        let mut level = self.height();
        loop {
            let next_ptr = unsafe { (&*node).next_ptr(level) };
            if !next_ptr.is_null() {
                node = next_ptr;
                continue;
            }
            // next is null
            if level == 0 {
                if node == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return node;
            }
            level -= 1;
        }
    }

    /// Gets the value associated with the key. It returns a valid value if it
    /// finds equal or earlier version of the same key.
    pub fn get(&self, key: &[u8]) -> Option<&[u8]> {
        if let Some((_, value)) = self.get_with_key(key) {
            Some(value)
        } else {
            None
        }
    }

    /// Gets the key and value associated with the key. It returns a valid value
    /// if it finds equal or earlier version of the same key.
    pub fn get_with_key(&self, key: &[u8]) -> Option<(&[u8], &[u8])> {
        // Find greater or equal
        let node = unsafe { self.find_near(key, false, true) };
        if node.is_null() {
            return None;
        }
        // Accept the hit only when it is the same logical key (per comparator).
        if self.c.same_key(unsafe { (*node).key() }, key) {
            return Some(unsafe { ((*node).key(), (*node).value()) });
        }
        None
    }

    /// Returns a skiplist iterator that borrows this list
    pub fn iter_ref(&self) -> IterRef<&Skiplist<C, A>, C, A> {
        IterRef {
            list: self,
            cursor: ptr::null(),
            _key_cmp: std::marker::PhantomData,
            _arena: std::marker::PhantomData,
        }
    }

    /// Returns a skiplist iterator that owns a clone of the (Arc-backed) list
    pub fn iter(&self) -> IterRef<Skiplist<C, A>, C, A> {
        IterRef {
            list: self.clone(),
            cursor: ptr::null(),
            _key_cmp: std::marker::PhantomData,
            _arena: std::marker::PhantomData,
        }
    }

    /// Consider the total bytes allocated by the arena (not the bytes used).
    pub fn mem_size(&self) -> u32 {
        self.core.arena.stats().bytes_allocated() as u32
    }
}

impl<C, A: Arena<Stats = BasicStats> + Clone> AsRef<Skiplist<C, A>> for Skiplist<C, A> {
    fn as_ref(&self) -> &Skiplist<C, A> {
        self
    }
}

// SAFETY(review): nodes are never mutated after being linked and all tower
// links are atomics, so sharing across threads appears sound provided the
// arena itself is Send/Sync — confirm against the arena implementation.
unsafe impl<C: Send, A: Arena<Stats = BasicStats> + Clone + Send> Send for Skiplist<C, A> {}
unsafe impl<C: Sync, A: Arena<Stats = BasicStats> + Clone + Sync> Sync for Skiplist<C, A> {}

/// Cursor over a skiplist. `T` is either a borrowed or an owned list (see
/// `iter_ref`/`iter`); a null `cursor` means the iterator is invalid.
pub struct IterRef<T, C, A>
where
    T: AsRef<Skiplist<C, A>>,
    A: Arena<Stats = BasicStats> + Clone,
{
    list: T,
    cursor: *const Node,
    _key_cmp: std::marker::PhantomData<C>,
    _arena: std::marker::PhantomData<A>,
}

impl<T: AsRef<Skiplist<C, A>>, C: KeyComparator, A: Arena<Stats = BasicStats> + Clone>
    IterRef<T, C, A>
{
    /// Whether the cursor currently points at a node.
    pub fn valid(&self) -> bool {
        !self.cursor.is_null()
    }

    /// Key of the current node. Panics when the iterator is invalid.
    pub fn key(&self) -> &[u8] {
        assert!(self.valid());
        unsafe { (*self.cursor).key() }
    }

    /// Value of the current node. Panics when the iterator is invalid.
    pub fn value(&self) -> &[u8] {
        assert!(self.valid());
        unsafe { (*self.cursor).value() }
    }

    /// Advance to the next node on the base level.
    pub fn next(&mut self) {
        assert!(self.valid());
        unsafe {
            self.cursor = (&*self.cursor).next_ptr(0);
        }
    }

    /// Step back to the rightmost node with key < current key.
    pub fn prev(&mut self) {
        assert!(self.valid());
        unsafe {
            self.cursor = self.list.as_ref().find_near(self.key(), true, false);
        }
    }

    /// Position at the leftmost node with key >= target.
    pub fn seek(&mut self, target: &[u8]) {
        unsafe {
            self.cursor = self.list.as_ref().find_near(target, false, true);
        }
    }

    /// Position at the rightmost node with key <= target.
    pub fn seek_for_prev(&mut self, target: &[u8]) {
        unsafe {
            self.cursor = self.list.as_ref().find_near(target, true, true);
        }
    }

    pub fn seek_to_first(&mut self) {
        unsafe {
            self.cursor = (&*self.list.as_ref().core.head.as_ptr()).next_ptr(0);
        }
    }

    pub fn seek_to_last(&mut self) {
        self.cursor = self.list.as_ref().find_last();
    }

    /// Key of the current node as an [ArenaSlice] that keeps the arena alive.
    pub fn key_with_arena(&self) -> ArenaSlice<A> {
        assert!(self.valid());
        unsafe { (*self.cursor).key_with_arena(self.list.as_ref().core.arena.clone()) }
    }

    /// Value of the current node as an [ArenaSlice] that keeps the arena alive.
    pub fn value_with_arena(&self) -> ArenaSlice<A> {
        assert!(self.valid());
        unsafe { (*self.cursor).value_with_arena(self.list.as_ref().core.arena.clone()) }
    }
}

unsafe impl<T: AsRef<Skiplist<C, A>>, C: Send, A: Arena<Stats = BasicStats> + Clone + Send> Send
    for IterRef<T, C, A>
{
}
unsafe impl<T: AsRef<Skiplist<C, A>>, C: Sync, A: Arena<Stats = BasicStats> + Clone + Sync> Sync
    for IterRef<T, C, A>
{
}

#[cfg(test)]
mod tests {
    use arena::MonoIncArena;
    use bytes::Bytes;

    use super::*;
    use crate::FixedLengthSuffixComparator;

    #[test]
    fn test_node_alloc() {
        let arena = MonoIncArena::new(1 << 10);
        let key = b"key of node";
        let value = b"value of node";
        let node_ptr = Node::alloc(&arena, key, value, 5);
        unsafe {
            let node = &*node_ptr;
            assert_eq!(5, node.height);
            for i in 0..=node.height {
                assert!(node.tower[i].load(Ordering::SeqCst).is_null());
            }
            assert_eq!(key, node.key());
            assert_eq!(value, node.value());
        }
    }

    #[test]
    fn test_find_near() {
        let comp = FixedLengthSuffixComparator::new(8);
        let arena = MonoIncArena::new(1 << 10);
        let list = Skiplist::with_arena(comp, arena);
        for i in 0..1000 {
            let key = Bytes::from(format!("{:05}{:08}", i * 10 + 5, 0));
            let value = Bytes::from(format!("{:05}", i));
            list.put(&key, &value);
        }
        // (seek key, less, allow_equal, expected key prefix)
        let mut cases = vec![
            ("00001", false, false, Some("00005")),
            ("00001", false, true, Some("00005")),
            ("00001", true, false, None),
            ("00001", true, true, None),
            ("00005", false, false, Some("00015")),
            ("00005", false, true, Some("00005")),
            ("00005", true, false, None),
            ("00005", true, true, Some("00005")),
            ("05555", false, false, Some("05565")),
            ("05555", false, true, Some("05555")),
            ("05555", true, false, Some("05545")),
            ("05555", true, true, Some("05555")),
            ("05558", false, false, Some("05565")),
            ("05558", false, true, Some("05565")),
            ("05558", true, false, Some("05555")),
            ("05558", true, true, Some("05555")),
            ("09995", false, false, None),
            ("09995", false, true, Some("09995")),
            ("09995", true, false, Some("09985")),
            ("09995", true, true, Some("09995")),
            ("59995", false, false, None),
            ("59995", false, true, None),
            ("59995", true, false, Some("09995")),
            ("59995", true, true, Some("09995")),
        ];
        for (i, (key, less, allow_equal, exp)) in cases.drain(..).enumerate() {
            let seek_key = Bytes::from(format!("{}{:08}", key, 0));
            let res = unsafe { list.find_near(&seek_key, less, allow_equal) };
            if exp.is_none() {
                assert!(res.is_null(), "{}", i);
                continue;
            }
            let e = format!("{}{:08}", exp.unwrap(), 0);
            assert_eq!(unsafe { (*res).key() }, e.as_bytes(), "{}", i);
        }
    }
}

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

//! Slice with arena

use std::{fmt, ops::Deref, slice};

use arena::{Arena, BasicStats};

/// Arena slice
///
/// A slice allocated from the arena, it will holds the reference to the arena
/// so it is safe to clone and deref the slice
#[derive(Clone)]
pub struct ArenaSlice<A: Arena<Stats = BasicStats>> {
    /// Arena the slice memory allocated from.
    _arena: A,
    /// The slice pointer.
    slice_ptr: *const u8,
    /// The slice len.
    slice_len: usize,
}

impl<A: Arena<Stats = BasicStats>> ArenaSlice<A> {
    /// Create a [ArenaSlice]
    ///
    /// See the documentation of [`slice::from_raw_parts`] for slice safety
    /// requirements.
    pub(crate) unsafe fn from_raw_parts(_arena: A, slice_ptr: *const u8, slice_len: usize) -> Self {
        Self {
            _arena,
            slice_ptr,
            slice_len,
        }
    }
}

// SAFETY(review): the pointed-to bytes are kept alive by `_arena`, which the
// slice holds, so Send/Sync follow the arena's own Send/Sync bounds — confirm
// the arena never frees individual allocations.
unsafe impl<A: Arena<Stats = BasicStats> + Send> Send for ArenaSlice<A> {}
unsafe impl<A: Arena<Stats = BasicStats> + Sync> Sync for ArenaSlice<A> {}

impl<A: Arena<Stats = BasicStats>> Deref for ArenaSlice<A> {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        unsafe { slice::from_raw_parts(self.slice_ptr, self.slice_len) }
    }
}

impl<A: Arena<Stats = BasicStats>> fmt::Debug for ArenaSlice<A> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug-print as a list of bytes, like `[u8]` would.
        f.debug_list().entries(self.iter()).finish()
    }
}

#[cfg(test)]
mod tests {
    use std::{alloc::Layout, mem, ptr};

    use arena::MonoIncArena;

    use super::*;

    #[test]
    fn test_arena_slice() {
        let hello = b"hello";
        let arena = MonoIncArena::new(1 << 10);
        let slice = unsafe {
            let data_ptr = arena
                .alloc(Layout::from_size_align(hello.len(), mem::align_of_val(hello)).unwrap());
            ptr::copy_nonoverlapping(hello.as_ptr(), data_ptr.as_ptr(), hello.len());
            ArenaSlice::from_raw_parts(arena, data_ptr.as_ptr(), hello.len())
        };
        assert_eq!(hello, &slice[..]);
    }
}

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+ +use std::{ + str, + sync::{atomic::*, *}, + thread::yield_now, + time::Duration, +}; + +use arena::MonoIncArena; +use bytes::*; +use skiplist::*; +use yatp::task::callback::Handle; + +fn new_value(v: usize) -> Bytes { + Bytes::from(format!("{:05}", v)) +} + +fn key_with_ts(key: &str, ts: u64) -> Bytes { + Bytes::from(format!("{}{:08}", key, ts)) +} + +#[test] +fn test_empty() { + let key = key_with_ts("aaa", 0); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let v = list.get(&key); + assert!(v.is_none()); + + let mut iter = list.iter_ref(); + assert!(!iter.valid()); + iter.seek_to_first(); + assert!(!iter.valid()); + iter.seek_to_last(); + assert!(!iter.valid()); + iter.seek(&key); + assert!(!iter.valid()); + assert!(list.is_empty()); +} + +#[test] +fn test_basic() { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let table = vec![ + ("key1", new_value(42)), + ("key2", new_value(52)), + ("key3", new_value(62)), + ("key5", Bytes::from(format!("{:0102400}", 1))), + ("key4", new_value(72)), + ]; + + for (key, value) in &table { + list.put(&key_with_ts(*key, 0), value); + } + + assert_eq!(list.get(&key_with_ts("key", 0)), None); + assert_eq!(list.len(), 5); + assert!(!list.is_empty()); + for (key, value) in &table { + let get_key = key_with_ts(*key, 0); + assert_eq!(list.get(&get_key), Some(&value[..]), "{}", key); + } +} + +fn test_concurrent_basic(n: usize, value_len: usize) { + let pool = yatp::Builder::new("concurrent_basic").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let kvs: Vec<_> = (0..n) + .map(|i| { + ( + key_with_ts(format!("{:05}", i).as_str(), 0), + Bytes::from(format!("{1:00$}", value_len, i)), + ) + }) + .collect(); + let (tx, rx) = 
mpsc::channel(); + for (k, v) in kvs.clone() { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + list.put(&k, &v); + tx.send(()).unwrap(); + }) + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + for (k, v) in kvs { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&k); + assert_eq!(val, Some(&v[..]), "{:?}", k); + tx.send(()).unwrap(); + }); + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + assert_eq!(list.len(), n); +} + +#[test] +fn test_concurrent_basic_small_value() { + test_concurrent_basic(1000, 5); +} + +#[test] +fn test_concurrent_basic_big_value() { + test_concurrent_basic(100, 1048576); +} + +#[test] +fn test_one_key() { + let n = 10000; + let write_pool = yatp::Builder::new("one_key_write").build_callback_pool(); + let read_pool = yatp::Builder::new("one_key_read").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let key = key_with_ts("thekey", 0); + let (tx, rx) = mpsc::channel(); + list.put(&key, &new_value(0)); + for i in 0..n { + let tx = tx.clone(); + let list = list.clone(); + let key = key.clone(); + let value = new_value(i); + write_pool.spawn(move |_: &mut Handle<'_>| { + list.put(&key, &value); + tx.send("w").unwrap(); + yield_now(); + }) + } + let mark = Arc::new(AtomicBool::new(false)); + for _ in 0..n { + let tx = tx.clone(); + let list = list.clone(); + let mark = mark.clone(); + let key = key.clone(); + read_pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&key); + if val.is_none() { + return; + } + let s = unsafe { str::from_utf8_unchecked(val.unwrap()) }; + let val: usize = s.parse().unwrap(); + assert!(val < n); + mark.store(true, Ordering::SeqCst); + tx.send("r").unwrap(); + yield_now(); + }); + } + let mut r = 0; + let mut w = 0; + for _ in 0..(n * 
2) { + match rx.recv_timeout(Duration::from_secs(3)) { + Ok("w") => w += 1, + Ok("r") => r += 1, + Err(err) => panic!("timeout on receiving r{} w{} msg {:?}", r, w, err), + _ => panic!("unexpected value"), + } + } + assert_eq!(list.len(), 1); + assert!(mark.load(Ordering::SeqCst)); +} + +#[test] +fn test_iterator_next() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_first(); + for i in 0..n { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.next(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_prev() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_last(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_last(); + for i in (0..n).rev() { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.prev(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_seek() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let v = i * 10 + 1000; + let key = key_with_ts(format!("{:05}", v).as_str(), 0); + list.put(&key, 
&new_value(v)); + } + iter_ref.seek_to_first(); + assert!(iter_ref.valid()); + assert_eq!(iter_ref.value(), b"01000" as &[u8]); + + let cases = vec![ + ("00000", Some(b"01000"), None), + ("01000", Some(b"01000"), Some(b"01000")), + ("01005", Some(b"01010"), Some(b"01000")), + ("01010", Some(b"01010"), Some(b"01010")), + ("99999", None, Some(b"01990")), + ]; + for (key, seek_expect, for_prev_expect) in cases { + let key = key_with_ts(key, 0); + iter_ref.seek(&key); + assert_eq!(iter_ref.valid(), seek_expect.is_some()); + if let Some(v) = seek_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + iter_ref.seek_for_prev(&key); + assert_eq!(iter_ref.valid(), for_prev_expect.is_some()); + if let Some(v) = for_prev_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + } +} diff --git a/components/tracing/Cargo.toml b/components/tracing/Cargo.toml new file mode 100644 index 0000000000..dc493f02cc --- /dev/null +++ b/components/tracing/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "tracing" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = "0.1.26", package = "tracing" } diff --git a/components/tracing/src/lib.rs b/components/tracing/src/lib.rs new file mode 100644 index 0000000000..5cdff967b6 --- /dev/null +++ b/components/tracing/src/lib.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tracing core lib re-export. 
+ +pub use upstream::*; diff --git a/components/tracing_examples/Cargo.toml b/components/tracing_examples/Cargo.toml new file mode 100644 index 0000000000..b8bea30722 --- /dev/null +++ b/components/tracing_examples/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "trace_examples" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tracing = { path = "../tracing" } +tracing_util = { path = "../tracing_util" } diff --git a/components/tracing_examples/examples/init_tracing_with_file.rs b/components/tracing_examples/examples/init_tracing_with_file.rs new file mode 100644 index 0000000000..75f89f6dca --- /dev/null +++ b/components/tracing_examples/examples/init_tracing_with_file.rs @@ -0,0 +1,41 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use tracing_util::{init_tracing_with_file, tracing_appender::rolling::Rotation}; + +#[tracing::instrument(level = "debug")] +fn nth_fibonacci(n: u64) -> u64 { + if n == 0 || n == 1 { + 1 + } else { + nth_fibonacci(n - 1) + nth_fibonacci(n - 2) + } +} + +// default leve info +#[tracing::instrument] +fn fibonacci_seq(to: u64) -> Vec { + let mut sequence = vec![]; + + for n in 0..=to { + sequence.push(nth_fibonacci(n)); + } + + sequence +} + +// cargo run --example init_tracing_with_file +// log file: /tmp/test_logs/init_tracing_with_file +// 2021-09-28T22:41:30.362078+08:00 INFO main ThreadId(01) fibonacci_seq{to=5}: +// init_tracing_with_file: enter 2021-09-28T22:41:30.364181+08:00 INFO main +// ThreadId(01) fibonacci_seq{to=5}: init_tracing_with_file: close +// time.busy=2.13ms time.idle=34.8µs +fn main() { + let _g = init_tracing_with_file( + "init_tracing_with_file", + "/tmp/test_logs", + "info", + Rotation::NEVER, + ); + let ret = fibonacci_seq(5); + println!("{:?}", ret); +} diff --git a/components/tracing_util/Cargo.toml b/components/tracing_util/Cargo.toml new file mode 100644 index 
0000000000..15eb11520a --- /dev/null +++ b/components/tracing_util/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "tracing_util" +version = "0.1.0" +authors = ["Databend Authors "] +license = "Apache-2.0" +publish = false +edition = "2018" + +[dependencies] # In alphabetical order +lazy_static = "1.4.0" +tracing = "0.1.26" +tracing-appender = "0.1.2" +tracing-subscriber = "0.2.20" diff --git a/components/tracing_util/src/lib.rs b/components/tracing_util/src/lib.rs new file mode 100644 index 0000000000..69c7432fd4 --- /dev/null +++ b/components/tracing_util/src/lib.rs @@ -0,0 +1,22 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// fork from:https://github.com/datafuselabs/databend/tree/master/common/tracing + +mod logging; + +pub use logging::{init_default_tracing, init_default_ut_tracing, init_tracing_with_file}; +pub use tracing_appender; diff --git a/components/tracing_util/src/logging.rs b/components/tracing_util/src/logging.rs new file mode 100644 index 0000000000..7a314608f5 --- /dev/null +++ b/components/tracing_util/src/logging.rs @@ -0,0 +1,147 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + fs::OpenOptions, + path::Path, + sync::{Arc, Mutex, Once}, +}; + +use fmt::format::FmtSpan; +use lazy_static::lazy_static; +use tracing::Subscriber; +use tracing_appender::{ + non_blocking::WorkerGuard, + rolling::{RollingFileAppender, Rotation}, +}; +use tracing_subscriber::{ + fmt, + fmt::{time::ChronoLocal, Layer}, + prelude::*, + registry::Registry, + EnvFilter, +}; + +/// Write logs to stdout. +pub fn init_default_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + init_tracing_stdout(); + }); +} + +/// Init tracing for unittest. +/// Write logs to file `unittest`. +pub fn init_default_ut_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + let mut g = GLOBAL_UT_LOG_GUARD.as_ref().lock().unwrap(); + let (work_guard, sub) = init_file_subscriber("unittest", "_logs"); + tracing::subscriber::set_global_default(sub) + .expect("error setting global tracing subscriber"); + + tracing::info!("init default ut tracing"); + *g = Some(work_guard); + }); +} + +lazy_static! { + static ref GLOBAL_UT_LOG_GUARD: Arc>> = Arc::new(Mutex::new(None)); +} + +fn init_tracing_stdout() { + let fmt_layer = Layer::default() + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(fmt::format::FmtSpan::FULL); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(fmt_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); +} + +/// Write logs to file and rotation. 
+pub fn init_tracing_with_file( + app_name: &str, + dir: impl AsRef, + level: &str, + rotation: Rotation, +) -> WorkerGuard { + let file_appender = RollingFileAppender::new(rotation, dir, app_name); + let (file_writer, file_guard) = tracing_appender::non_blocking(file_appender); + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(file_writer) + .with_thread_ids(true) + .with_thread_names(true) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::new(level)) + .with(f_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); + + file_guard +} + +/// Create a file based tracing/logging subscriber. +/// A guard must be held during using the logging. +fn init_file_subscriber(app_name: &str, dir: &str) -> (WorkerGuard, impl Subscriber) { + let path_str = dir.to_string() + "/" + app_name; + let path: &Path = path_str.as_ref(); + + // open log file + + let mut open_options = OpenOptions::new(); + open_options.append(true).create(true); + + let mut open_res = open_options.open(path); + if open_res.is_err() { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + open_res = open_options.open(path); + } + } + + let f = open_res.unwrap(); + + // build subscriber + + let (writer, writer_guard) = tracing_appender::non_blocking(f); + + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(writer) + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(f_layer); + + (writer_guard, subscriber) +} diff --git a/configs/ceresdb.toml b/configs/ceresdb.toml new file mode 100644 index 0000000000..7b1a216dfc --- /dev/null +++ b/configs/ceresdb.toml @@ -0,0 +1,23 @@ +bind_addr = "0.0.0.0" +http_port = 
${HTTP_PORT} +grpc_port = ${GRPC_PORT} +log_level = "info" + +[analytic] +data_path = "${DATA_PATH}" + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' +auto_create_tables = true + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "${NODE_ADDR}" +port = ${GRPC_PORT} + +[[route_rules.prefix_rules]] +schema = 'public' +prefix = 'special_prefix' +shard = 0 diff --git a/docker/entrypoint.py b/docker/entrypoint.py new file mode 100755 index 0000000000..35b3e12cdf --- /dev/null +++ b/docker/entrypoint.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python2 +import os +import sys +import commands + +ENABLE_DATA_NODE = os.getenv('ENABLE_DATANODE') == 'true' +HTTP_PORT = os.getenv('CERESDB_HTTP_PORT', '5000') +GRPC_PORT = os.getenv('CERESDB_GRPC_PORT', '8831') +DATA_PATH = '/home/admin/data/ceresdb' + +# hostname may return several IPs (an array); take the first one +def get_local_ip(): + return commands.getoutput('/usr/bin/localip').strip().split()[0] + +def create_datanode_config(): + config = open('/etc/ceresdb/ceresdb.toml', 'r').read() + config = config.replace("${HTTP_PORT}", HTTP_PORT) + config = config.replace("${GRPC_PORT}", GRPC_PORT) + config = config.replace("${NODE_ADDR}", get_local_ip()) + config = config.replace("${DATA_PATH}", DATA_PATH) + open('/etc/ceresdb/ceresdb.toml', 'w').write(config) + +def start_datanode(): + create_datanode_config() + + cmd = ''' +# load env +. 
/ceresdb.env +env +exec /usr/bin/ceresdb-server --config /etc/ceresdb/ceresdb.toml +''' + open('/usr/bin/ceresdb-start.sh', 'w').write(cmd) + +def start_supervisord(): + port = int(os.getenv('SUPERVISORD_HTTP_PORT', '9001')) + conf = '/etc/supervisor/supervisord.conf' + if port: + os.system(''' sed -i 's/:9001/:%d/g' %s ''' % (port, conf)) + open('/etc/supervisor/conf.d/touch-admin-cron.conf', 'a').write('\nkillasgroup=true\nstopasgroup=true\n') + os.system('/usr/bin/supervisord -c %s --nodaemon' % conf) + +def copy_environ(): + envs = [] + for k, v in os.environ.items(): + envs.append('export %s="%s"' % (k, v)) + # copy DATANODE_ to CSE_ + if 'DATANODE_' in k: + envs.append('export %s="%s"' % (k.replace('DATANODE_', 'CSE_'), v)) + + envs.append('export LOCAL_IP=%s' % get_local_ip()) + # support register ceres meta + envs.append('export CSE_CERES_META_NODE_ADDR=%s' % (get_local_ip())) + + envs.append('export MALLOC_CONF=prof:true,prof_active:false,lg_prof_sample:19') + + open('/ceresdb.env', 'w').write('\n'.join(envs)) + +def init_dir(): + cmd = ''' +mkdir -p /home/admin/logs /home/admin/data + +# set logdir +mkdir -p /home/admin/logs/ceresdb + +ln -nsf /data /home/admin/data + +chmod +777 -R /data /home/admin/data /home/admin/logs +chown -R admin.admin /data /home/admin/data /home/admin/logs +''' + open('/ceresdb-init.sh', 'w').write(cmd) + os.system('sh /ceresdb-init.sh') + +def main(): + print "copy_environ" + copy_environ() + + print "init_dir" + init_dir() + + if ENABLE_DATA_NODE: + print "start_datanode" + start_datanode() + + print "start_datanode" + start_supervisord() + +if __name__ == '__main__': + main() diff --git a/docker/supervisor/conf.d/ceresdb.conf b/docker/supervisor/conf.d/ceresdb.conf new file mode 100644 index 0000000000..3b956c3118 --- /dev/null +++ b/docker/supervisor/conf.d/ceresdb.conf @@ -0,0 +1,17 @@ +[program:ceresdbx] +command=sh /usr/bin/ceresdb-start.sh +autostart=true +startsecs=3 +startretries=9999 +autorestart=true +;exitcodes=0,2 
+;stopsignal=QUIT +;stopwaitsecs=10 +stopasgroup=true +killasgroup=true +user=admin +redirect_stderr=true +stdout_logfile=/home/admin/logs/ceresdb/out.log +stdout_logfile_maxbytes=200MB +stdout_logfile_backups=5 +;environment=A="1",B="2" diff --git a/docker/supervisor/supervisord.conf b/docker/supervisor/supervisord.conf new file mode 100644 index 0000000000..401fb2e363 --- /dev/null +++ b/docker/supervisor/supervisord.conf @@ -0,0 +1,24 @@ +[unix_http_server] +file=/tmp/supervisor.sock ; (the path to the socket file) + +[inet_http_server] ; inet (TCP) server disabled by default +port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) + +[supervisord] +logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) +logfile_maxbytes=500MB ; (max main logfile bytes b4 rotation;default 50MB) +logfile_backups=10 ; (num of main logfile rotation backups;default 10) +loglevel=info ; (log level;default info; others: debug,warn,trace) +pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) +nodaemon=false ; (start in foreground if true;default false) +minfds=1024 ; (min. avail startup file descriptors;default 1024) +minprocs=200 ; (min. avail process descriptors;default 200) + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=http://127.0.0.1:9001 + +[include] +files = /etc/supervisor/conf.d/*.conf diff --git a/docker/tini b/docker/tini new file mode 100644 index 0000000000..03af82f09e Binary files /dev/null and b/docker/tini differ diff --git a/docs/crate-deps.dot b/docs/crate-deps.dot new file mode 100644 index 0000000000..95622dc25d --- /dev/null +++ b/docs/crate-deps.dot @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// to update svg, run: +// ```bash +// dot -Tsvg crate-deps.dot > ./crate-deps.svg +// ``` + +digraph G { + + arrow_deps + + analytic_engine -> arrow_deps + analytic_engine -> proto + analytic_engine -> table_engine + analytic_engine -> wal + + catalog -> table_engine + + catalog_impls -> catalog + catalog_impls -> system_catalog + catalog_impls -> table_engine + + cluster -> analytic_engine + cluster -> catalog + cluster -> meta_client_v2 + + interpreters -> catalog + interpreters -> sql + interpreters -> table_engine + interpreters -> udf + interpreters -> query_engine + interpreters -> arrow_deps + + meta_client -> catalog + meta_client -> table_engine + + meta_client_v2 -> catalog + meta_client_v2 -> table_engine + + query_engine -> arrow_deps + query_engine -> sql + query_engine -> table_engine + query_engine -> udf + + server -> analytic_engine + server -> arrow_deps + server -> catalog + server -> interpreters + server -> meta_client + server -> query_engine + server -> sql + server -> system_catalog + server -> table_engine + server -> udf + + sql -> arrow_deps + sql -> catalog + sql -> table_engine + sql -> udf + + system_catalog -> arrow_deps + system_catalog -> catalog + system_catalog -> proto + system_catalog -> table_engine + + table_engine -> arrow_deps + table_engine -> proto + + udf -> arrow_deps + + ceresdb -> analytic_engine + ceresdb -> catalog + ceresdb -> 
catalog_impls + ceresdb -> query_engine + ceresdb -> server + ceresdb -> table_engine + ceresdb -> udf +} diff --git a/docs/crate-deps.svg b/docs/crate-deps.svg new file mode 100644 index 0000000000..a52863ea12 --- /dev/null +++ b/docs/crate-deps.svg @@ -0,0 +1,433 @@ + + + + + + +G + + + +arrow_deps + +arrow_deps + + + +analytic_engine + +analytic_engine + + + +analytic_engine->arrow_deps + + + + + +proto + +proto + + + +analytic_engine->proto + + + + + +table_engine + +table_engine + + + +analytic_engine->table_engine + + + + + +wal + +wal + + + +analytic_engine->wal + + + + + +table_engine->arrow_deps + + + + + +table_engine->proto + + + + + +catalog + +catalog + + + +catalog->table_engine + + + + + +catalog_impls + +catalog_impls + + + +catalog_impls->table_engine + + + + + +catalog_impls->catalog + + + + + +system_catalog + +system_catalog + + + +catalog_impls->system_catalog + + + + + +system_catalog->arrow_deps + + + + + +system_catalog->proto + + + + + +system_catalog->table_engine + + + + + +system_catalog->catalog + + + + + +cluster + +cluster + + + +cluster->analytic_engine + + + + + +cluster->catalog + + + + + +meta_client_v2 + +meta_client_v2 + + + +cluster->meta_client_v2 + + + + + +meta_client_v2->table_engine + + + + + +meta_client_v2->catalog + + + + + +interpreters + +interpreters + + + +interpreters->arrow_deps + + + + + +interpreters->table_engine + + + + + +interpreters->catalog + + + + + +sql + +sql + + + +interpreters->sql + + + + + +udf + +udf + + + +interpreters->udf + + + + + +query_engine + +query_engine + + + +interpreters->query_engine + + + + + +sql->arrow_deps + + + + + +sql->table_engine + + + + + +sql->catalog + + + + + +sql->udf + + + + + +udf->arrow_deps + + + + + +query_engine->arrow_deps + + + + + +query_engine->table_engine + + + + + +query_engine->sql + + + + + +query_engine->udf + + + + + +meta_client + +meta_client + + + +meta_client->table_engine + + + + + +meta_client->catalog + + + + + +server + +server + + + 
+server->arrow_deps + + + + + +server->analytic_engine + + + + + +server->table_engine + + + + + +server->catalog + + + + + +server->system_catalog + + + + + +server->interpreters + + + + + +server->sql + + + + + +server->udf + + + + + +server->query_engine + + + + + +server->meta_client + + + + + +ceresdb + +ceresdb + + + +ceresdb->analytic_engine + + + + + +ceresdb->table_engine + + + + + +ceresdb->catalog + + + + + +ceresdb->catalog_impls + + + + + +ceresdb->udf + + + + + +ceresdb->query_engine + + + + + +ceresdb->server + + + + + diff --git a/docs/example.toml b/docs/example.toml new file mode 100644 index 0000000000..2e0fdc5064 --- /dev/null +++ b/docs/example.toml @@ -0,0 +1,20 @@ +bind_addr = "0.0.0.0" +http_port = 5440 +grpc_port = 8831 +log_level = "info" +enable_cluster = true + +[analytic] +data_path = "/tmp/ceresdbx" +sst_data_cache_cap = 10000 +sst_meta_cache_cap = 10000 + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "127.0.0.1" +port = 8831 diff --git a/etc/license.template b/etc/license.template new file mode 100644 index 0000000000..377ec98bed --- /dev/null +++ b/etc/license.template @@ -0,0 +1 @@ +// Copyright {\d+} CeresDB Project Authors. Licensed under Apache-2.0. 
\ No newline at end of file diff --git a/grpcio/Cargo.toml b/grpcio/Cargo.toml new file mode 100644 index 0000000000..09a147a0d1 --- /dev/null +++ b/grpcio/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "grpcio" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +# Rename to workaround doctest bug +# See: https://github.com/rust-lang/cargo/issues/6819 + +[target.'cfg(target_os = "macos")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio" } + +[target.'cfg(target_os = "linux")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio", features = ["openssl"] } diff --git a/grpcio/src/lib.rs b/grpcio/src/lib.rs new file mode 100644 index 0000000000..99d9172ad7 --- /dev/null +++ b/grpcio/src/lib.rs @@ -0,0 +1,3 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub use upstream::*; diff --git a/interpreters/Cargo.toml b/interpreters/Cargo.toml new file mode 100644 index 0000000000..8d28241eef --- /dev/null +++ b/interpreters/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "interpreters" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +query_engine = { path = "../query_engine" } +arrow_deps = { path = "../arrow_deps" } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +catalog_impls = { path = "../catalog_impls" } +sql = { path = "../sql", features = ["test"] } +tokio = { version = "1.0", 
features = ["sync", "time"] } diff --git a/interpreters/src/alter_table.rs b/interpreters/src/alter_table.rs new file mode 100644 index 0000000000..acfce81adc --- /dev/null +++ b/interpreters/src/alter_table.rs @@ -0,0 +1,132 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{ + column_schema::{self, ColumnSchema}, + schema::{self, Schema}, +}; +use common_util::define_result; +use snafu::{ensure, ResultExt, Snafu}; +use sql::plan::{AlterTableOperation, AlterTablePlan}; +use table_engine::table::AlterSchemaRequest; + +use crate::interpreter::{self, AlterTable, Interpreter, InterpreterPtr, Output}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to add column to schema, err:{}", source))] + AddColumnSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to alter table schema, err:{}", source))] + AlterSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to alter table options, err:{}", source))] + AlterOptions { source: table_engine::table::Error }, + + #[snafu(display("Not allow to add a not null column, name:{}", name))] + AddNotNull { name: String }, +} + +define_result!(Error); + +pub struct AlterTableInterpreter { + plan: AlterTablePlan, +} + +impl AlterTableInterpreter { + pub fn create(plan: AlterTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } +} + +#[async_trait] +impl Interpreter for AlterTableInterpreter { + async fn execute(self: Box) -> interpreter::Result { + self.execute_alter().await.context(AlterTable) + } +} + +impl AlterTableInterpreter { + async fn execute_alter(self: Box) -> Result { + let AlterTablePlan { table, operations } = self.plan; + + match operations { + AlterTableOperation::AddColumn(columns) => { + let current_schema = 
table.schema(); + let new_schema = build_new_schema(¤t_schema, columns)?; + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: current_schema.version(), + }; + + let num_rows = table.alter_schema(request).await.context(AlterSchema)?; + + Ok(Output::AffectedRows(num_rows)) + } + AlterTableOperation::ModifySetting(options) => { + let num_rows = table.alter_options(options).await.context(AlterOptions)?; + Ok(Output::AffectedRows(num_rows)) + } + } + } +} + +fn build_new_schema(current_schema: &Schema, column_schemas: Vec) -> Result { + let current_version = current_schema.version(); + + let mut builder = + schema::Builder::with_capacity(current_schema.num_columns() + column_schemas.len()) + // Increment the schema version. + .version(current_version + 1); + // Add existing columns to builder. + for key_column in current_schema.key_columns() { + builder = builder + .add_key_column(key_column.clone()) + .context(AddColumnSchema)?; + } + for normal_column in current_schema.normal_columns() { + builder = builder + .add_normal_column(normal_column.clone()) + .context(AddColumnSchema)?; + } + + builder = builder + // Enable column id generation of the schema builder. + .auto_increment_column_id(true) + .enable_tsid_primary_key(current_schema.index_of_tsid().is_some()); + + // Add new columns + for mut column_schema in column_schemas { + // Uninit the id of the column schema. + column_schema.id = column_schema::COLUMN_ID_UNINIT; + + validate_add_column(&column_schema)?; + + // Only allow to add normal column. + builder = builder + .add_normal_column(column_schema) + .context(AddColumnSchema)?; + } + + // Build the final schema. 
+ let new_schema = builder.build().context(BuildSchema)?; + + Ok(new_schema) +} + +fn validate_add_column(column_schema: &ColumnSchema) -> Result<()> { + ensure!( + column_schema.is_nullable, + AddNotNull { + name: &column_schema.name + } + ); + + Ok(()) +} diff --git a/interpreters/src/context.rs b/interpreters/src/context.rs new file mode 100644 index 0000000000..2e46f07082 --- /dev/null +++ b/interpreters/src/context.rs @@ -0,0 +1,79 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter context + +use std::sync::Arc; + +use common_types::request_id::RequestId; +use query_engine::context::{Context as QueryContext, ContextRef as QueryContextRef}; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// Interpreter context +/// +/// Contains information that all interpreters need +pub struct Context { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Context { + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + default_catalog: String::new(), + default_schema: String::new(), + } + } + + /// Create a new context of query executor + pub fn new_query_context(&self) -> Result { + let ctx = QueryContext::builder(self.request_id) + .default_catalog_and_schema(self.default_catalog.clone(), self.default_schema.clone()) + .build(); + Ok(Arc::new(ctx)) + } + + #[inline] + pub fn default_catalog(&self) -> &str { + &self.default_catalog + } + + #[inline] + pub fn default_schema(&self) -> &str { + &self.default_schema + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } +} + +#[must_use] +pub struct Builder { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Builder { + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.default_catalog = catalog; + self.default_schema = schema; + self + } + + pub fn build(self) -> Context { 
+ Context { + request_id: self.request_id, + default_catalog: self.default_catalog, + default_schema: self.default_schema, + } + } +} diff --git a/interpreters/src/create.rs b/interpreters/src/create.rs new file mode 100644 index 0000000000..252b459732 --- /dev/null +++ b/interpreters/src/create.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for create statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::CreateOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::{CreateTableRequest, TableEngineRef, TableState}; + +use crate::{ + context::Context, + interpreter::{Create, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, name:{}, err:{}", table, source))] + SchemaCreateTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to allocate table id, err:{}", source))] + AllocTableId { source: catalog::schema::Error }, +} + +define_result!(Error); + +/// Create interpreter +pub struct CreateInterpreter { + ctx: Context, + plan: CreateTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl CreateInterpreter { + pub fn create( + ctx: Context, + plan: CreateTablePlan, + 
catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl CreateInterpreter { + async fn execute_create(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? + .context(SchemaNotExists { + name: default_schema, + })?; + + let CreateTablePlan { + engine, + table, + table_schema, + if_not_exists, + options, + } = self.plan; + + let table_id = schema.alloc_table_id(&table).context(AllocTableId)?; + let request = CreateTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_id, + table_name: table.clone(), + table_schema, + partition_info: None, + engine, + options, + state: TableState::Stable, + }; + + let opts = CreateOptions { + table_engine: self.table_engine, + create_if_not_exists: if_not_exists, + }; + + schema + .create_table(request, opts) + .await + .context(SchemaCreateTable { table })?; + + Ok(Output::AffectedRows(1)) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for CreateInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_create().await.context(Create) + } +} diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs new file mode 100644 index 0000000000..ca6266a872 --- /dev/null +++ b/interpreters/src/describe.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::{BooleanArray, StringArray}, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::DescribeTablePlan; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Describe, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct DescribeInterpreter { + plan: DescribeTablePlan, +} + +impl DescribeInterpreter { + pub fn create(plan: DescribeTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_describe(self: Box) -> Result { + let DescribeTablePlan { table } = self.plan; + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let table_schema = table_ref.schema(); + let num_columns = table_schema.num_columns(); + let num_key_columns = table_schema.num_key_columns(); + + let mut names = Vec::with_capacity(num_columns); + let mut types = Vec::with_capacity(num_columns); + let mut is_primary_keys = Vec::with_capacity(num_columns); + let mut is_nullables = Vec::with_capacity(num_columns); + let mut is_tags = Vec::with_capacity(num_columns); + for (idx, col) in table_schema.columns().iter().enumerate() { + names.push(col.name.to_string()); + types.push(col.data_type.to_string()); + is_primary_keys.push(idx < num_key_columns); + is_nullables.push(col.is_nullable); + is_tags.push(col.is_tag); + } + + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("type", DataType::Utf8, false), + Field::new("is_primary", DataType::Boolean, false), + Field::new("is_nullable", DataType::Boolean, false), + Field::new("is_tag", DataType::Boolean, false), + ]); + + let arrow_record_batch = RecordBatch::try_new( + 
Arc::new(schema), + vec![ + Arc::new(StringArray::from(names)), + Arc::new(StringArray::from(types)), + Arc::new(BooleanArray::from(is_primary_keys)), + Arc::new(BooleanArray::from(is_nullables)), + Arc::new(BooleanArray::from(is_tags)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } +} + +#[async_trait] +impl Interpreter for DescribeInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_describe().await.context(Describe) + } +} diff --git a/interpreters/src/drop.rs b/interpreters/src/drop.rs new file mode 100644 index 0000000000..7282ae3bc2 --- /dev/null +++ b/interpreters/src/drop.rs @@ -0,0 +1,126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for drop statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::DropOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::DropTablePlan; +use table_engine::engine::{DropTableRequest, TableEngineRef}; + +use crate::{ + context::Context, + interpreter::{Drop, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to drop table in schema, name:{}, err:{}", table, source))] + SchemaDropTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed 
to drop table, name:{}, err:{}", table, source))] + DropTable { + table: String, + source: table_engine::engine::Error, + }, +} + +define_result!(Error); + +/// Drop interpreter +pub struct DropInterpreter { + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl DropInterpreter { + pub fn create( + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl DropInterpreter { + async fn execute_drop(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? 
+ .context(SchemaNotExists { + name: default_schema, + })?; + + let table = self.plan.table; + let request = DropTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_name: table.clone(), + engine: self.plan.engine, + }; + + let opts = DropOptions { + table_engine: self.table_engine, + }; + + let dropped = schema + .drop_table(request, opts) + .await + .context(SchemaDropTable { table: &table })?; + + Ok(Output::AffectedRows(if dropped { 1 } else { 0 })) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for DropInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_drop().await.context(Drop) + } +} diff --git a/interpreters/src/exists.rs b/interpreters/src/exists.rs new file mode 100644 index 0000000000..f926a700c4 --- /dev/null +++ b/interpreters/src/exists.rs @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::UInt8Array, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::ExistsTablePlan; + +use crate::interpreter::{ + Exists, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct ExistsInterpreter { + plan: ExistsTablePlan, +} + +impl ExistsInterpreter { + pub fn create(plan: ExistsTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_exists(self: Box) -> Result { + let ExistsTablePlan { exists } = self.plan; + + exists_table_result(exists).map(Output::Records) + } +} + +fn exists_table_result(exists: bool) -> Result { + let schema = Schema::new(vec![Field::new("result", DataType::UInt8, false)]); + + let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(UInt8Array::from_value( + if exists { 1u8 } else { 0u8 }, + 1, + ))], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) +} + +#[async_trait] +impl Interpreter for ExistsInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_exists().await.context(Exists) + } +} diff --git a/interpreters/src/factory.rs b/interpreters/src/factory.rs new file mode 100644 index 0000000000..26b858723c --- /dev/null +++ b/interpreters/src/factory.rs @@ -0,0 +1,49 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter factory + +use catalog::manager::Manager as CatalogManager; +use query_engine::executor::Executor; +use sql::plan::Plan; +use table_engine::engine::TableEngineRef; + +use crate::{ + alter_table::AlterTableInterpreter, context::Context, create::CreateInterpreter, + describe::DescribeInterpreter, drop::DropInterpreter, exists::ExistsInterpreter, + insert::InsertInterpreter, interpreter::InterpreterPtr, select::SelectInterpreter, + show_create::ShowCreateInInterpreter, +}; + +/// A factory to create interpreters +pub struct Factory { + query_executor: Q, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl Factory { + pub fn new(query_executor: Q, catalog_manager: C, table_engine: TableEngineRef) -> Self { + Self { + query_executor, + catalog_manager, + table_engine, + } + } + + pub fn create(self, ctx: Context, plan: Plan) -> InterpreterPtr { + match plan { + Plan::Query(p) => SelectInterpreter::create(ctx, p, self.query_executor), + Plan::Insert(p) => InsertInterpreter::create(ctx, p), + Plan::Create(p) => { + CreateInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Drop(p) => { + DropInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Describe(p) => DescribeInterpreter::create(p), + Plan::AlterTable(p) => AlterTableInterpreter::create(p), + Plan::ShowCreate(p) => ShowCreateInInterpreter::create(p), + Plan::Exists(p) => ExistsInterpreter::create(p), + } + } +} diff --git a/interpreters/src/insert.rs b/interpreters/src/insert.rs new file mode 100644 index 0000000000..c2a2ddf636 --- /dev/null +++ b/interpreters/src/insert.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{column_schema::ColumnId, datum::Datum, hash::hash64}; +use common_util::codec::{compact::MemCompactEncoder, Encoder}; +use snafu::{ResultExt, Snafu}; +use sql::plan::InsertPlan; +use table_engine::table::WriteRequest; + +use crate::{ + context::Context, + interpreter::{Insert, Interpreter, InterpreterPtr, Output, Result}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write table, err:{}", source))] + WriteTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to encode tsid, err:{}", source))] + EncodeTsid { + source: common_util::codec::compact::Error, + }, +} + +pub struct InsertInterpreter { + ctx: Context, + plan: InsertPlan, +} + +impl InsertInterpreter { + pub fn create(ctx: Context, plan: InsertPlan) -> InterpreterPtr { + Box::new(Self { ctx, plan }) + } +} + +#[async_trait] +impl Interpreter for InsertInterpreter { + async fn execute(mut self: Box) -> Result { + // Generate tsid if needed. + self.maybe_generate_tsid()?; + let InsertPlan { table, rows } = self.plan; + + // Context is unused now + let _ctx = self.ctx; + + let request = WriteRequest { row_group: rows }; + + let num_rows = table + .write(request) + .await + .context(WriteTable) + .context(Insert)?; + + Ok(Output::AffectedRows(num_rows)) + } +} + +impl InsertInterpreter { + fn maybe_generate_tsid(&mut self) -> Result<()> { + let schema = self.plan.rows.schema(); + let tsid_idx = schema.index_of_tsid(); + + if let Some(idx) = tsid_idx { + // Vec of (`index of tag`, `column id of tag`). 
+ let tag_idx_column_ids: Vec<_> = schema + .columns() + .iter() + .enumerate() + .filter_map(|(i, column)| { + if column.is_tag { + Some((i, column.id)) + } else { + None + } + }) + .collect(); + + let mut hash_bytes = Vec::new(); + for i in 0..self.plan.rows.num_rows() { + let row = self.plan.rows.get_row_mut(i).unwrap(); + + let mut tsid_builder = TsidBuilder::new(&mut hash_bytes); + + for (idx, column_id) in &tag_idx_column_ids { + tsid_builder.maybe_write_datum(*column_id, &row[*idx])?; + } + + let tsid = tsid_builder.finish(); + row[idx] = Datum::UInt64(tsid); + } + } + Ok(()) + } +} + +struct TsidBuilder<'a> { + encoder: MemCompactEncoder, + hash_bytes: &'a mut Vec, +} + +impl<'a> TsidBuilder<'a> { + fn new(hash_bytes: &'a mut Vec) -> Self { + // Clear the bytes buffer. + hash_bytes.clear(); + + Self { + encoder: MemCompactEncoder, + hash_bytes, + } + } + + fn maybe_write_datum(&mut self, column_id: ColumnId, datum: &Datum) -> Result<()> { + // Null datum will be ignored, so tsid remains unchanged after adding a null + // column. + if datum.is_null() { + return Ok(()); + } + + // Write column id first. + self.encoder + .encode(self.hash_bytes, &Datum::UInt64(u64::from(column_id))) + .context(EncodeTsid) + .context(Insert)?; + // Write datum. + self.encoder + .encode(self.hash_bytes, datum) + .context(EncodeTsid) + .context(Insert)?; + Ok(()) + } + + fn finish(self) -> u64 { + hash64(self.hash_bytes) + } +} diff --git a/interpreters/src/interpreter.rs b/interpreters/src/interpreter.rs new file mode 100644 index 0000000000..4591eb5df5 --- /dev/null +++ b/interpreters/src/interpreter.rs @@ -0,0 +1,56 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter trait + +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::Snafu; + +// Make the variant closer to actual error code like invalid arguments. 
+#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to execute select, err:{}", source))] + Select { source: crate::select::Error }, + + #[snafu(display("Failed to execute create table, err:{}", source))] + Create { source: crate::create::Error }, + + #[snafu(display("Failed to execute drop table, err:{}", source))] + Drop { source: crate::drop::Error }, + + #[snafu(display("Failed to execute insert, err:{}", source))] + Insert { source: crate::insert::Error }, + + #[snafu(display("Failed to execute describe, err:{}", source))] + Describe { source: crate::describe::Error }, + + #[snafu(display("Failed to execute alter table, err:{}", source))] + AlterTable { source: crate::alter_table::Error }, + + #[snafu(display("Failed to show create table, err:{}", source))] + ShowCreate { source: crate::show_create::Error }, + + #[snafu(display("Failed to execute exists, err:{}", source))] + Exists { source: crate::exists::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Maybe add a stream variant for streaming result +/// The interpreter output +pub enum Output { + /// Affected rows number + AffectedRows(usize), + /// A vec of RecordBatch + Records(RecordBatchVec), +} + +/// Interpreter executes the plan it holds +#[async_trait] +pub trait Interpreter { + async fn execute(self: Box) -> Result; +} + +/// A pointer to Interpreter +pub type InterpreterPtr = Box; diff --git a/interpreters/src/lib.rs b/interpreters/src/lib.rs new file mode 100644 index 0000000000..6f3b888e6e --- /dev/null +++ b/interpreters/src/lib.rs @@ -0,0 +1,23 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreters of query/insert/update/delete commands +//! +//! 
Inspired by fuse-query: and ClickHouse + +#[macro_use] +extern crate common_util; + +pub mod alter_table; +pub mod context; +pub mod create; +pub mod describe; +pub mod drop; +pub mod exists; +pub mod factory; +pub mod insert; +pub mod interpreter; +pub mod select; +pub mod show_create; + +#[cfg(test)] +mod tests; diff --git a/interpreters/src/select.rs b/interpreters/src/select.rs new file mode 100644 index 0000000000..97a0f84a57 --- /dev/null +++ b/interpreters/src/select.rs @@ -0,0 +1,75 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for select statement + +use async_trait::async_trait; +use log::debug; +use query_engine::executor::{Executor, Query}; +use snafu::{ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::Context, + interpreter::{Interpreter, InterpreterPtr, Output, Result as InterpreterResult, Select}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create query context, err:{}", source))] + CreateQueryContext { source: crate::context::Error }, + + #[snafu(display("Failed to execute logical plan, err:{}", source))] + ExecutePlan { + source: query_engine::executor::Error, + }, +} + +define_result!(Error); + +/// Select interpreter +pub struct SelectInterpreter { + ctx: Context, + plan: QueryPlan, + executor: T, +} + +impl SelectInterpreter { + pub fn create(ctx: Context, plan: QueryPlan, executor: T) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + executor, + }) + } +} + +#[async_trait] +impl Interpreter for SelectInterpreter { + async fn execute(self: Box) -> InterpreterResult { + let request_id = self.ctx.request_id(); + debug!( + "Interpreter execute select begin, request_id:{}, plan:{:?}", + request_id, self.plan + ); + + let query_ctx = self + .ctx + .new_query_context() + .context(CreateQueryContext) + .context(Select)?; + let query = Query::new(self.plan); + let record_batches = self + .executor + .execute_logical_plan(query_ctx, query) + 
.await + .context(ExecutePlan) + .context(Select)?; + + debug!( + "Interpreter execute select finish, request_id:{}", + request_id + ); + + Ok(Output::Records(record_batches)) + } +} diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs new file mode 100644 index 0000000000..38d1747ab8 --- /dev/null +++ b/interpreters/src/show_create.rs @@ -0,0 +1,136 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::HashMap, convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::StringArray, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use sql::{ast::ShowCreateObject, plan::ShowCreatePlan}; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Interpreter, InterpreterPtr, Output, Result as InterpreterResult, ShowCreate, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported show create type, type: {:?}, err:{}", + obj_type, + backtrace + ))] + UnsupportedType { + obj_type: ShowCreateObject, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +pub struct ShowCreateInInterpreter { + plan: ShowCreatePlan, +} + +impl ShowCreateInInterpreter { + pub fn create(plan: ShowCreatePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_show_create(self: Box) -> Result { + let ShowCreatePlan { table, obj_type } = self.plan; + + ensure!( + obj_type == ShowCreateObject::Table, + UnsupportedType { obj_type } + ); + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let tables = vec![table_ref.name().to_string()]; + let sqls = vec![Self::render_table_sql(table_ref)]; + + let schema = Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ]); + + 
let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(StringArray::from(tables)), + Arc::new(StringArray::from(sqls)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } + + fn render_table_sql(table_ref: TableRef) -> String { + //TODO(boyan) pretty output + format!( + "CREATE TABLE `{}` ({}) ENGINE={}{}", + table_ref.name(), + Self::render_columns_and_constrains(&table_ref), + table_ref.engine_type(), + Self::render_options(table_ref.options()) + ) + } + + fn render_columns_and_constrains(table_ref: &TableRef) -> String { + let table_schema = table_ref.schema(); + let key_columns = table_schema.key_columns(); + let timestamp_key = table_schema.timestamp_name(); + + let mut res = String::new(); + for col in table_schema.columns() { + res += format!("`{}` {}", col.name, col.data_type).as_str(); + if col.is_tag { + res += " TAG"; + } + if !col.is_nullable { + res += " NOT NULL"; + } + + if !col.comment.is_empty() { + res += format!(" COMMENT '{}'", col.comment).as_str(); + } + res += ", "; + } + let keys: Vec = key_columns.iter().map(|col| col.name.to_string()).collect(); + res += format!("PRIMARY KEY({}), ", keys.join(",")).as_str(); + res += format!("TIMESTAMP KEY({})", timestamp_key).as_str(); + + res + } + + fn render_options(opts: HashMap) -> String { + if !opts.is_empty() { + let mut v: Vec = opts + .into_iter() + .map(|(k, v)| format!("{}='{}'", k, v)) + .collect(); + // sorted by option name + v.sort(); + format!(" WITH({})", v.join(", ")) + } else { + "".to_string() + } + } +} + +#[async_trait] +impl Interpreter for ShowCreateInInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_show_create().await.context(ShowCreate) + } +} diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs new file mode 100644 index 0000000000..4b05a239f8 --- /dev/null +++ b/interpreters/src/tests.rs @@ -0,0 +1,236 @@ +// Copyright 2022 CeresDB 
Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use analytic_engine::tests::util::TestEnv; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use catalog_impls::table_based::TableBasedManager; +use common_types::request_id::RequestId; +use query_engine::executor::ExecutorImpl; +use sql::{ + parser::Parser, plan::Plan, planner::Planner, provider::MetaProvider, tests::MockMetaProvider, +}; +use table_engine::engine::TableEngine; + +use crate::{ + context::Context, + factory::Factory, + interpreter::{Output, Result}, +}; + +async fn build_catalog_manager(analytic: E) -> TableBasedManager +where + E: TableEngine + Clone + Send + Sync + 'static, +{ + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic.clone(), Arc::new(analytic)) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) +} + +fn sql_to_plan(meta_provider: &M, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() +} + +async fn build_factory(env: &Env) -> Factory +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let catalog_manager = build_catalog_manager(env.engine()).await; + Factory::new(ExecutorImpl::new(), catalog_manager, Arc::new(env.engine())) +} + +async fn sql_to_output(env: &Env, sql: &str) -> Result +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let plan = sql_to_plan(&env.meta_provider, sql); + + let ctx = Context::builder(RequestId::next_id()) + .default_catalog_and_schema(DEFAULT_CATALOG.to_string(), DEFAULT_SCHEMA.to_string()) + .build(); + + let factory = build_factory(env).await; + let interpreter = factory.create(ctx, plan); + interpreter.execute().await +} + +async fn test_create_table(env: &Env) +where + E: TableEngine + Clone + 
Send + Sync + 'static, + M: MetaProvider, +{ + let sql="CREATE TABLE IF NOT EXISTS test_table(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_desc_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "desc table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_exists_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "exists table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_insert_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3'),('tagk2', 1638428434000,100, 'hello3');"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 2); + } else { + panic!(); + } +} + +async fn test_select_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "select * from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 2); + } else { + panic!(); + } + + let sql = "select count(*) from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 
1); + } else { + panic!(); + } +} + +async fn test_show_create_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "show create table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 1); + } else { + panic!(); + } +} + +async fn test_alter_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "alter table test_table add column add_col string"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } + + let sql = "alter table test_table modify SETTING ttl='9d'"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_drop_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "drop table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +struct Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + pub engine: E, + pub meta_provider: M, +} + +impl Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + fn engine(&self) -> E { + self.engine.clone() + } +} + +#[tokio::test] +async fn test_interpreters() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + let mock = MockMetaProvider::default(); + let env = Env { + engine: test_ctx.engine(), + meta_provider: mock, + }; + + test_create_table(&env).await; + test_desc_table(&env).await; + test_exists_table(&env).await; + test_insert_table(&env).await; + test_select_table(&env).await; + test_show_create_table(&env).await; + 
test_alter_table(&env).await; + test_drop_table(&env).await; +} diff --git a/meta_client/Cargo.toml b/meta_client/Cargo.toml new file mode 100644 index 0000000000..bd544c5d7f --- /dev/null +++ b/meta_client/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = { path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client/src/lib.rs b/meta_client/src/lib.rs new file mode 100644 index 0000000000..34563a1e71 --- /dev/null +++ b/meta_client/src/lib.rs @@ -0,0 +1,705 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Client to communicate with meta + +use std::{ + collections::HashMap, + convert::TryFrom, + sync::{Arc, RwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + meta::{CommonNodeInfo, NodeType}, + metagrpc::{ + ClusterViewResponse, FetchClusterViewRequest, NameSpace, RegisterNodeRequest, + RegisterNodeResponse, + }, + metagrpc_grpc::CeresmetaRpcServiceClient, +}; +use common_types::{bytes::Bytes, schema::TIMESTAMP_COLUMN}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::TryStreamExt; +use grpcio::{ChannelBuilder, Environment}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::ANALYTIC_ENGINE_TYPE; +use tokio::time; + +use crate::static_client::StaticMetaClient; + +mod load_balance; +mod static_client; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + 
}, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch cluster view, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchClusterViewError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Encountered register node, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + RegisterNodeError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display( + "Invalid node addr of cluster view, node:{}.\nBacktrace:\n{}", + node, + backtrace + ))] + InvalidNodeAddr { node: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid node port of cluster view, node:{}, err:{}.\nBacktrace:\n{}", + node, + source, + backtrace + ))] + InvalidNodePort { + node: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to create schema:{}, catalog:{}, err:{}", + schema, + catalog, + source + ))] + FailOnChangeView { + schema: String, + catalog: String, + source: Box, + }, + + #[snafu(display("Failed to get catalog:{}, err:{}", catalog, source))] + FailGetCatalog { + catalog: String, + source: Box, + }, +} + +define_result!(Error); + +type ShardViewMap = HashMap; + +#[async_trait] +pub trait MetaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> Result<()>; +} + +pub type MetaWatcherPtr = Box; + +/// Meta client abstraction +#[async_trait] +pub trait MetaClient { + /// Start the meta client + async fn start(&self) -> 
Result<()>; + + /// Get current cluster view. + /// + /// The cluster view is updated by background workers periodically + fn get_cluster_view(&self) -> ClusterViewRef; +} + +// TODO(yingwen): Now meta use i32 as shard id, maybe switch to unsigned number +pub type ShardId = i32; + +#[derive(Debug, Clone, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u32, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ShardView { + pub shard_id: ShardId, + pub node: Node, +} + +fn default_engine_type() -> String { + ANALYTIC_ENGINE_TYPE.to_string() +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchemaConfig { + pub auto_create_tables: bool, + pub default_engine_type: String, + pub default_timestamp_column_name: String, +} + +impl Default for SchemaConfig { + fn default() -> Self { + Self { + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + } + } +} + +impl From for SchemaConfig { + fn from(view: SchemaShardView) -> Self { + Self { + auto_create_tables: view.auto_create_tables, + default_engine_type: view.default_engine_type, + default_timestamp_column_name: view.default_timestamp_column_name, + } + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct ClusterView { + pub schema_shards: HashMap, + pub schema_configs: HashMap, +} + +impl TryFrom for ClusterView { + type Error = Error; + + fn try_from(result: ClusterViewResponse) -> Result { + let mut schema_shards = HashMap::with_capacity(result.schema_shards.len()); + let mut schema_configs = HashMap::with_capacity(result.schema_shards.len()); + + for (schema, shard_view) in result.schema_shards { + let mut schema_view = HashMap::with_capacity(shard_view.shard_nodes.len()); + for (shard_id, shard_node) in shard_view.shard_nodes { + let mut addr_port = shard_node.split(':'); + let addr = addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })?; + let port = 
addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })? + .parse() + .context(InvalidNodePort { node: &shard_node })?; + let node = Node { + addr: addr.to_string(), + port, + }; + schema_view.insert(shard_id, ShardView { shard_id, node }); + } + schema_shards.insert(schema.clone(), schema_view); + // TODO(boyan) support config in ClusterViewResponse + schema_configs.insert(schema, SchemaConfig::default()); + } + + Ok(ClusterView { + schema_shards, + schema_configs, + }) + } +} + +pub type ClusterViewRef = Arc; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster: String, + pub meta_addr: String, + pub meta_version: String, + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. + pub port: u16, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta node. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, + /// The static cluster view used by static meta client.
+ pub cluster_view: ClusterViewConfig, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_version: String::from("v1"), + node: String::new(), + port: 8831, + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: false, + cluster_view: ClusterViewConfig { + schema_shards: Vec::new(), + }, + } + } +} + +impl From<&MetaClientConfig> for RegisterNodeRequest { + fn from(meta_config: &MetaClientConfig) -> Self { + let mut req = RegisterNodeRequest::new(); + req.set_node_type(NodeType::Data); + req.set_ns(NameSpace { + cluster: meta_config.cluster.to_string(), + version: meta_config.meta_version.to_string(), + ..Default::default() + }); + req.set_node_info(CommonNodeInfo { + node: format!("{}:{}", meta_config.node, meta_config.port), + lease: meta_config.lease.as_secs() as i32, + ..Default::default() + }); + req + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct SchemaShardView { + schema: String, + auto_create_tables: bool, + pub default_engine_type: String, + default_timestamp_column_name: String, + shard_views: Vec, +} + +impl Default for SchemaShardView { + fn default() -> Self { + Self { + schema: "".to_string(), + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + shard_views: Vec::default(), + } + } +} + +#[inline] +fn default_timestamp_column_name() -> String { + TIMESTAMP_COLUMN.to_string() +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ClusterViewConfig { + schema_shards: Vec, +} + +impl ClusterViewConfig { + pub(crate) fn to_cluster_view(&self) -> ClusterView { + let mut schema_configs = HashMap::with_capacity(self.schema_shards.len()); + let mut schema_shards = HashMap::with_capacity(self.schema_shards.len()); + + for 
schema_shard_view in self.schema_shards.clone() { + let schema = schema_shard_view.schema.clone(); + schema_shards.insert( + schema.clone(), + schema_shard_view + .shard_views + .iter() + .map(|shard| (shard.shard_id, shard.clone())) + .collect(), + ); + schema_configs.insert(schema, SchemaConfig::from(schema_shard_view)); + } + ClusterView { + schema_shards, + schema_configs, + } + } +} + +struct MetaClientImplInner { + meta_grpc_address: RwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + cluster_view: RwLock, + members_url: Url, + watcher: Option, +} + +impl MetaClientImplInner { + fn new(meta_config: MetaClientConfig, watcher: Option) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? + .join(format!("{}/", meta_config.meta_version).as_str()) + .context(JoinUrl { + input: &meta_config.meta_version, + })? + .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + Ok(Self { + meta_grpc_address: RwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + cluster_view: RwLock::new(Arc::new(ClusterView::default())), + members_url, + watcher, + }) + } + + async fn fetch_cluster_view(&self) -> Result<()> { + let client = self.build_rpc_client()?; + let mut req = FetchClusterViewRequest::new(); + req.set_ns(NameSpace { + cluster: self.meta_config.cluster.to_string(), + version: self.meta_config.meta_version.to_string(), + ..Default::default() + }); + let mut receiver = client + .fetch_cluster_view(&req) + .context(FetchClusterViewError)?; + + while let Some(result) = receiver.try_next().await.context(FetchClusterViewError)? 
{ + self.update_cluster_view_by_result(result).await?; + + info!( + "Fetch cluster view from meta, cluster:{}, view:{:#?}", + self.meta_config.cluster, + *self.cluster_view.read().unwrap(), + ); + } + + Ok(()) + } + + async fn update_cluster_view_by_result(&self, view_result: ClusterViewResponse) -> Result<()> { + let view = Arc::new(ClusterView::try_from(view_result)?); + + { + let mut cluster_view = self.cluster_view.write().unwrap(); + *cluster_view = view.clone(); + } + + if let Some(w) = &self.watcher { + w.on_change(view).await?; + } + + Ok(()) + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn register(&self, client: &CeresmetaRpcServiceClient) -> Result { + let req = RegisterNodeRequest::from(&self.meta_config); + client.register_node(&req).context(RegisterNodeError) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let 
addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + // Register node every 2/3 lease + fn register_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 2 / 3) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_fetch_cluster_view(&self) { + loop { + match self.fetch_cluster_view().await { + Ok(()) => { + info!( + "Fetch cluster view finished, cluster:{}", + self.meta_config.cluster + ); + } + Err(e) => { + error!( + "Failed to fetch cluster view from meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + } + } + + time::sleep(self.error_wait_lease()).await; + } + } + + async fn register_loop(&self) -> Result<()> { + let mut interval = time::interval(self.register_interval()); + let rpc_client = self.build_rpc_client()?; + + loop { + let resp = self.register(&rpc_client).await?; + info!( + "Register node successfully, cluster:{}, response:{:#?}", + self.meta_config.cluster, resp + ); + + interval.tick().await; + } + } + + async fn start_register(&self) { + loop { + if let Err(e) = self.register_loop().await { + error!( + "Failed to register node to meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + + time::sleep(self.error_wait_lease()).await; + } + } + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + "Failed to refresh meta addresses from meta, url:{}, error:{}", + self.members_url, e + ); + + time::sleep(self.error_wait_lease()).await + } + } + } + } +} + +/// Default meta client impl, 
#[async_trait]
impl MetaClient for MetaClientImpl {
    /// Start the meta client: fetch meta grpc addresses synchronously, then
    /// spawn the three background tasks (address refresh, node registration,
    /// cluster-view fetching) on `runtime`.
    async fn start(&self) -> Result<()> {
        info!(
            "Meta client is starting, config:{:?}",
            self.inner.meta_config
        );

        // Fail fast if the member list cannot be fetched at startup.
        self.inner.pull_meta_grpc_address().await?;

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_refresh_meta_addresses().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_register().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_fetch_cluster_view().await;
        });

        info!("Meta client has started");

        Ok(())
    }

    /// Return the latest cluster view (cheap Arc clone of the shared state).
    fn get_cluster_view(&self) -> ClusterViewRef {
        self.inner.cluster_view.read().unwrap().clone()
    }
}
Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client/src/static_client.rs b/meta_client/src/static_client.rs new file mode 100644 index 0000000000..8639100f53 --- /dev/null +++ b/meta_client/src/static_client.rs @@ -0,0 +1,86 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Static meta client. + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use log::info; + +use crate::{ + ClusterView, ClusterViewConfig, ClusterViewRef, MetaClient, MetaClientConfig, MetaWatcherPtr, + Node, Result, ShardView, +}; + +/// Static meta client. 
+pub struct StaticMetaClient { + cluster_view: ClusterViewRef, + watcher: Option, +} + +impl StaticMetaClient { + pub fn new(config: MetaClientConfig, watcher: Option) -> Self { + let cluster_view = match new_cluster_view(&config.cluster_view) { + Some(v) => v, + None => cluster_view_without_meta(&config.node, config.port), + }; + + Self { + cluster_view: Arc::new(cluster_view), + watcher, + } + } +} + +#[async_trait] +impl MetaClient for StaticMetaClient { + async fn start(&self) -> Result<()> { + info!( + "File meta client is starting, cluster_view:{:?}", + self.cluster_view + ); + + info!("File meta client invoke watcher"); + + if let Some(w) = &self.watcher { + w.on_change(self.cluster_view.clone()).await?; + } + + info!("File meta client has started"); + + Ok(()) + } + + fn get_cluster_view(&self) -> ClusterViewRef { + self.cluster_view.clone() + } +} + +fn new_cluster_view(config: &ClusterViewConfig) -> Option { + if config.schema_shards.is_empty() { + return None; + } + + Some(config.to_cluster_view()) +} + +fn cluster_view_without_meta(addr: &str, port: u16) -> ClusterView { + let shard_id = 0; + let mut static_shards = HashMap::new(); + static_shards.insert( + shard_id, + ShardView { + shard_id, + node: Node { + addr: addr.to_string(), + port: u32::from(port), + }, + }, + ); + let mut schema_shards = HashMap::new(); + schema_shards.insert(catalog::consts::DEFAULT_SCHEMA.to_string(), static_shards); + ClusterView { + schema_shards, + schema_configs: HashMap::default(), + } +} diff --git a/meta_client_v2/Cargo.toml b/meta_client_v2/Cargo.toml new file mode 100644 index 0000000000..6ca7a6338a --- /dev/null +++ b/meta_client_v2/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client_v2" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = 
{ path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +protobuf = "2.20" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client_v2/src/lib.rs b/meta_client_v2/src/lib.rs new file mode 100644 index 0000000000..4dd4244c12 --- /dev/null +++ b/meta_client_v2/src/lib.rs @@ -0,0 +1,676 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Client to communicate with meta + +use std::{ + sync::{Arc, RwLock as StdRwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocTableIdRequest as PbAllocTableIdRequest, DropTableRequest as PbDropTableRequest, + GetTablesRequest as PbGetTablesRequest, NodeHeartbeatRequest as PbNodeHeartbeatRequest, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, + }, + metagrpcV2_grpc::CeresmetaRpcServiceClient, +}; +use common_types::bytes::Bytes; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{SinkExt, TryStreamExt}; +use grpcio::{ + CallOption, ChannelBuilder, ClientDuplexReceiver, ClientDuplexSender, Environment, WriteFlags, +}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info, warn}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Sender, RwLock}, + time, +}; +pub use types::*; + +mod load_balance; +mod types; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + 
BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch action cmd, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchActionCmdError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display("Failed to get grpc client, grpc client is none, msg:{}", msg))] + FailGetGrpcClient { msg: String }, + + #[snafu(display("Failed to send heartbeat, cluster:{}, err:{}", cluster, source))] + FailSendHeartbeat { + cluster: String, + source: Box, + }, + + #[snafu(display( + "Failed to notify action cmd, action cmd:{:?}, err:{}", + action_cmd, + source + ))] + FailNotifyActionCmd { + 
/// Meta client abstraction
#[async_trait]
pub trait MetaClient {
    /// Start the meta client
    async fn start(&self) -> Result<()>;

    /// Allocate a schema id for the schema named in the request.
    async fn alloc_schema_id(&self, _: AllocSchemaIdRequest) -> Result;

    /// Allocate a table id (and shard assignment) for the named table.
    async fn alloc_table_id(&self, _: AllocTableIdRequest) -> Result;

    /// Drop the named table on the meta side.
    async fn drop_table(&self, _: DropTableRequest) -> Result;

    /// Fetch the tables belonging to the given shards.
    async fn get_tables(&self, _: GetTablesRequest) -> Result;

    /// Report this node's shard info to meta over the heartbeat channel.
    async fn send_heartbeat(&self, _: Vec) -> Result<()>;
}
StdRwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + members_url: Url, + + grpc_client: RwLock>, + + notify_sender: Option>, +} + +impl MetaClientImplInner { + fn new( + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + sender: Option>, + ) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? + .join(format!("{}/", DEFAULT_META_URL_VERSION).as_str()) + .unwrap() + .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + let client = Self { + meta_grpc_address: StdRwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + node_meta_info, + members_url, + grpc_client: RwLock::new(None), + notify_sender: sender, + }; + + Ok(client) + } + + fn request_header(&self) -> RequestHeader { + RequestHeader { + node: self.node_meta_info.node.to_string(), + cluster_name: self.meta_config.cluster_name.clone(), + } + } + + fn node_meta_info(&self) -> NodeMetaInfo { + self.node_meta_info.clone() + } + + fn get_cluster_name(&self) -> &str { + // let a :Option=None; + + self.meta_config.cluster_name.as_str() + } + + fn connect_grpc_client(&self) -> Result { + let client = self.build_rpc_client()?; + let (sender, receiver) = client + .node_heartbeat_opt(CallOption::default()) + .context(FetchActionCmdError)?; + Ok(GrpcClient { + client, + heartbeat_channel: NodeHeartbeatChannel { + heartbeat_sender: sender, + action_cmd_receiver: Some(receiver), + }, + }) + } + + async fn reconnect_heartbeat_channel(&self) { + let grpc_client = &mut *self.grpc_client.write().await; + loop { + match self.connect_grpc_client() { + Ok(client) => { + *grpc_client = Some(client); + return; + } + Err(e) => { 
+ error!("Grpc reconnect failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + 
    /// Drain heartbeat responses from `receiver` until the stream ends,
    /// forwarding each contained action command to `notify_sender`.
    ///
    /// Responses with a failed header are logged and skipped; a broken
    /// stream surfaces as `FetchActionCmdError`.
    async fn fetch_action_cmd(
        &self,
        mut receiver: ClientDuplexReceiver,
    ) -> Result<()> {
        while let Some(resp) = receiver.try_next().await.context(FetchActionCmdError)? {
            info!(
                "Fetch action cmd from meta, cluster:{}, action_cmd:{:?}",
                self.get_cluster_name(),
                resp,
            );
            // Without a notify sender there is nobody to forward commands to,
            // so the response is only logged above.
            if let Some(notify_sender) = &self.notify_sender {
                let resp: NodeHeartbeatResponse = resp.into();
                if let Err(e) = check_response_header(&resp.header) {
                    error!("Fetch action cmd failed, err:{}", e);
                    continue;
                }
                if let Some(action_cmd) = resp.action_cmd {
                    // The clone keeps `action_cmd` available for the error
                    // log below if the channel send fails.
                    if let Err(e) = notify_sender.send(action_cmd.clone()).await {
                        error!(
                            "Notify sender send failed, action cmd:{:?}, err:{}",
                            action_cmd, e
                        );
                    }
                } else {
                    warn!("Fetch action cmd is empty, resp:{:?}", resp)
                }
            }
        }

        Ok(())
    }
    /// Start the background meta-client tasks.
    ///
    /// Synchronously pulls the meta grpc addresses and (re)connects the
    /// heartbeat channel, then spawns two long-running tasks on `runtime`:
    /// one refreshing meta addresses, one consuming action commands.
    async fn start(&self) -> Result<()> {
        info!(
            "Meta client is starting, config:{:?}",
            self.inner.meta_config
        );

        // Fail fast if the member list cannot be fetched at startup;
        // reconnect_heartbeat_channel retries internally until it succeeds.
        self.inner.pull_meta_grpc_address().await?;
        self.inner.reconnect_heartbeat_channel().await;

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_refresh_meta_addresses().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_fetch_action_cmd().await;
        });

        info!("Meta client has started");

        Ok(())
    }
+ .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocSchemaId)?; + let resp: AllocSchemaIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc schema id".to_string(), + } + .fail() + } + } + + async fn alloc_table_id(&self, req: AllocTableIdRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbAllocTableIdRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .alloc_table_id_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)?; + let resp: AllocTableIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc table id".to_string(), + } + .fail() + } + } + + async fn drop_table(&self, req: DropTableRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbDropTableRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .drop_table_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)? 
+ .await + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)?; + let resp: DropTableResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "drop table".to_string(), + } + .fail() + } + } + + async fn get_tables(&self, req: GetTablesRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbGetTablesRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .get_tables_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)?; + let resp: GetTablesResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "get tables".to_string(), + } + .fail() + } + } + + async fn send_heartbeat(&self, shards_info: Vec) -> Result<()> { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + info!( + "Meta client send heartbeat, cluster:{}, shards_info:{:?}", + self.inner.get_cluster_name(), + shards_info + ); + let mut pb_request = PbNodeHeartbeatRequest::new(); + pb_request.set_header(self.inner.request_header().into()); + let node_info = NodeInfo { + node_meta_info: self.inner.node_meta_info(), + shards_info, + }; + pb_request.set_info(node_info.into()); + if let Err(e) = grpc_client + .heartbeat_channel + .heartbeat_sender + .send((pb_request, WriteFlags::default())) + .await + .map_err(|e| Box::new(e) as _) + .context(FailSendHeartbeat { + cluster: self.inner.get_cluster_name(), + }) + { + self.inner.reconnect_heartbeat_channel().await; + return Err(e); + }; + } else { + error!("Grpc_client is none"); + } + + Ok(()) + } +} + +fn check_response_header(header: &ResponseHeader) -> Result<()> { + if header.success { + Ok(()) + } else { + Meta { + header: header.clone(), + } + .fail() + } +} + +/// Create a meta 
client with given `config`. +pub fn build_meta_client( + config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + runtime: Arc, + sender: Option>, +) -> Result> { + let meta_client = MetaClientImpl::new(config, node_meta_info, runtime, sender)?; + Ok(Arc::new(meta_client)) +} diff --git a/meta_client_v2/src/load_balance.rs b/meta_client_v2/src/load_balance.rs new file mode 100644 index 0000000000..707fb08d98 --- /dev/null +++ b/meta_client_v2/src/load_balance.rs @@ -0,0 +1,65 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client_v2/src/types.rs 
b/meta_client_v2/src/types.rs new file mode 100644 index 0000000000..7f558feec7 --- /dev/null +++ b/meta_client_v2/src/types.rs @@ -0,0 +1,458 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::collections::HashMap; + +use ceresdbproto::{ + metaV2::ShardRole as PbShardRole, + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocSchemaIdResponse as PbAllocSchemaIdResponse, + AllocTableIdRequest as PbAllocTableIdRequest, + AllocTableIdResponse as PbAllocTableIdResponse, ChangeRoleCmd as PbChangeRoleCmd, + CloseCmd as PbCloseCmd, DropTableRequest as PbDropTableRequest, + DropTableResponse as PbDropTableResponse, Error as PbError, ErrorType as PbErrorType, + GetTablesRequest as PbGetTablesRequest, GetTablesResponse as PbGetTablesResponse, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, NodeHeartbeatResponse_oneof_cmd, + NodeInfo as PbNodeInfo, NoneCmd as PbNoneCmd, OpenCmd as PbOpenCmd, + RequestHeader as PbRequestHeader, ResponseHeader as PbResponseHeader, + ShardInfo as PbShardInfo, ShardTables as PbShardTables, SplitCmd as PbSplitCmd, + TableInfo as PbTableInfo, + }, +}; +use common_util::config::ReadableDuration; +use serde_derive::Deserialize; + +pub type TableId = u64; +pub type ShardId = u32; +pub type SchemaId = u32; + +#[derive(Debug, Clone)] +pub struct RequestHeader { + pub node: String, + pub cluster_name: String, +} + +#[derive(Debug, Clone)] +pub struct ResponseHeader { + pub success: bool, + pub error: ResponseError, +} + +#[derive(Debug, Clone)] +pub struct ResponseError { + pub error_type: ErrorType, + pub message: String, +} + +#[derive(Debug, Clone)] +pub enum ErrorType { + UNKNOWN, +} + +pub struct AllocSchemaIdRequest { + pub name: String, +} + +pub struct AllocSchemaIdResponse { + pub header: ResponseHeader, + + pub name: String, + pub id: SchemaId, +} + +pub struct AllocTableIdRequest { + pub schema_name: String, + pub name: String, +} + +pub struct AllocTableIdResponse { + pub header: 
ResponseHeader, + + pub schema_name: String, + pub name: String, + pub shard_id: ShardId, + pub schema_id: SchemaId, + pub id: TableId, +} + +pub struct DropTableRequest { + pub schema_name: String, + pub name: String, +} + +pub struct DropTableResponse { + pub header: ResponseHeader, +} + +#[derive(Clone, Debug)] +pub struct GetTablesRequest { + pub shard_ids: Vec, +} + +#[derive(Clone, Debug)] +pub struct GetTablesResponse { + pub header: ResponseHeader, + + pub tables_map: HashMap, +} + +#[derive(Clone, Debug)] +pub struct TableInfo { + pub id: TableId, + pub name: String, + pub schema_id: SchemaId, + pub schema_name: String, +} + +#[derive(Clone, Debug)] +pub struct ShardTables { + pub role: ShardRole, + pub tables: Vec, +} + +#[derive(Debug)] +struct NodeHeartbeatRequest { + info: NodeInfo, +} + +#[derive(Debug, Clone, Default, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u16, +} + +impl ToString for Node { + fn to_string(&self) -> String { + format!("{}:{}", self.addr, self.port) + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct NodeMetaInfo { + pub node: String, + pub zone: String, + pub idc: String, + pub binary_version: String, +} + +#[derive(Debug, Clone)] +pub struct NodeInfo { + pub node_meta_info: NodeMetaInfo, + pub shards_info: Vec, +} + +#[derive(Debug)] +pub struct NodeHeartbeatResponse { + pub header: ResponseHeader, + + pub timestamp: u64, + pub action_cmd: Option, +} + +#[derive(Debug, Clone)] +pub struct ShardInfo { + pub shard_id: ShardId, + pub role: ShardRole, +} + +#[derive(Debug, Copy, Clone)] +pub enum ShardRole { + LEADER, + FOLLOWER, +} + +#[derive(Debug, Clone)] +pub enum ActionCmd { + NoneCmd(NoneCmd), + OpenCmd(OpenCmd), + SplitCmd(SplitCmd), + CloseCmd(CloseCmd), + ChangeRoleCmd(ChangeRoleCmd), +} + +#[derive(Debug, Clone)] +pub struct NoneCmd {} + +#[derive(Debug, Clone)] +pub struct OpenCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct SplitCmd {} + +#[derive(Debug, Clone)] 
+pub struct CloseCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct ChangeRoleCmd {} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster_name: String, + pub meta_addr: String, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta ndoe. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster_name: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: true, + } + } +} + +impl From for PbNodeInfo { + fn from(node_info: NodeInfo) -> Self { + let mut pb_node_info = PbNodeInfo::new(); + pb_node_info.set_node(node_info.node_meta_info.node.to_string()); + pb_node_info.set_zone(node_info.node_meta_info.zone); + pb_node_info.set_binary_version(node_info.node_meta_info.binary_version); + pb_node_info.set_shardsInfo(protobuf::RepeatedField::from_vec( + node_info + .shards_info + .into_iter() + .map(|v| v.into()) + .collect(), + )); + pb_node_info + } +} + +impl From for PbShardInfo { + fn from(shard_info: ShardInfo) -> Self { + let mut pb_shard_info = PbShardInfo::new(); + pb_shard_info.set_shard_id(shard_info.shard_id); + pb_shard_info.set_role(shard_info.role.into()); + pb_shard_info + } +} + +impl From for PbShardRole { + fn from(shard_role: ShardRole) -> Self { + match shard_role { + ShardRole::LEADER => PbShardRole::LEADER, + ShardRole::FOLLOWER => PbShardRole::FOLLOWER, + } + } +} + +impl From for ShardRole { + fn from(pb: PbShardRole) -> Self { + match pb { + PbShardRole::LEADER => ShardRole::LEADER, + 
PbShardRole::FOLLOWER => ShardRole::FOLLOWER, + } + } +} + +impl From for NodeHeartbeatResponse { + fn from(mut pb: PbNodeHeartbeatResponse) -> Self { + let timestamp = pb.get_timestamp(); + NodeHeartbeatResponse { + header: pb.take_header().into(), + timestamp, + action_cmd: pb.cmd.map(|v| v.into()), + } + } +} + +impl From for ActionCmd { + fn from(pb: NodeHeartbeatResponse_oneof_cmd) -> Self { + match pb { + NodeHeartbeatResponse_oneof_cmd::none_cmd(_) => ActionCmd::NoneCmd(NoneCmd {}), + NodeHeartbeatResponse_oneof_cmd::open_cmd(v) => ActionCmd::OpenCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::split_cmd(v) => ActionCmd::SplitCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::close_cmd(v) => ActionCmd::CloseCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::change_role_cmd(v) => { + ActionCmd::ChangeRoleCmd(v.into()) + } + } + } +} + +impl From for NoneCmd { + fn from(_pb: PbNoneCmd) -> Self { + Self {} + } +} + +impl From for OpenCmd { + fn from(mut pb: PbOpenCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for SplitCmd { + fn from(_pb: PbSplitCmd) -> Self { + Self {} + } +} + +impl From for CloseCmd { + fn from(mut pb: PbCloseCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for ChangeRoleCmd { + fn from(_pb: PbChangeRoleCmd) -> Self { + Self {} + } +} + +impl From for PbGetTablesRequest { + fn from(req: GetTablesRequest) -> Self { + let mut pb = PbGetTablesRequest::new(); + pb.set_shard_id(req.shard_ids); + pb + } +} + +impl From for GetTablesResponse { + fn from(mut pb: PbGetTablesResponse) -> Self { + Self { + header: pb.take_header().into(), + tables_map: pb + .take_tables_map() + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + } + } +} + +impl From for ShardTables { + fn from(mut pb: PbShardTables) -> Self { + Self { + role: pb.get_role().into(), + tables: pb.take_tables().into_iter().map(|v| v.into()).collect(), + } + } +} + +impl From for TableInfo { + fn from(mut pb: 
PbTableInfo) -> Self { + TableInfo { + id: pb.get_id(), + name: pb.take_name(), + schema_id: pb.get_schema_id(), + schema_name: pb.take_schema_name(), + } + } +} + +impl From for PbRequestHeader { + fn from(req: RequestHeader) -> Self { + let mut pb = PbRequestHeader::new(); + pb.set_node(req.node); + pb.set_cluster_name(req.cluster_name); + pb + } +} + +impl From for ResponseHeader { + fn from(mut pb: PbResponseHeader) -> Self { + Self { + success: pb.get_success(), + error: pb.take_error().into(), + } + } +} + +impl From for ErrorType { + fn from(pb: PbErrorType) -> Self { + match pb { + PbErrorType::UNKNOWN => ErrorType::UNKNOWN, + } + } +} + +impl From for ResponseError { + fn from(mut pb: PbError) -> Self { + Self { + error_type: pb.get_error_type().into(), + message: pb.take_message(), + } + } +} + +impl From for PbAllocSchemaIdRequest { + fn from(req: AllocSchemaIdRequest) -> Self { + let mut pb = PbAllocSchemaIdRequest::new(); + pb.set_name(req.name); + pb + } +} + +impl From for AllocSchemaIdResponse { + fn from(mut pb: PbAllocSchemaIdResponse) -> Self { + Self { + header: pb.take_header().into(), + name: pb.take_name(), + id: pb.get_id(), + } + } +} + +impl From for PbAllocTableIdRequest { + fn from(req: AllocTableIdRequest) -> Self { + let mut pb = PbAllocTableIdRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for AllocTableIdResponse { + fn from(mut pb: PbAllocTableIdResponse) -> Self { + Self { + header: pb.take_header().into(), + schema_name: pb.take_schema_name(), + name: pb.take_name(), + shard_id: pb.get_shard_id(), + schema_id: pb.get_schema_id(), + id: pb.get_id(), + } + } +} + +impl From for PbDropTableRequest { + fn from(req: DropTableRequest) -> Self { + let mut pb = PbDropTableRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for DropTableResponse { + fn from(mut pb: PbDropTableResponse) -> Self { + Self { + header: 
pb.take_header().into(), + } + } +} diff --git a/proto/.gitignore b/proto/.gitignore new file mode 100644 index 0000000000..5eb2f8833d --- /dev/null +++ b/proto/.gitignore @@ -0,0 +1 @@ +src/protos diff --git a/proto/Cargo.toml b/proto/Cargo.toml new file mode 100644 index 0000000000..609680dd7f --- /dev/null +++ b/proto/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "proto" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +protobuf = "2.20" + +[build-dependencies.protobuf-builder] +git = "https://github.com/CeresDB/protobuf-builder.git" +rev = "745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" diff --git a/proto/build.rs b/proto/build.rs new file mode 100644 index 0000000000..e992a9163c --- /dev/null +++ b/proto/build.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use protobuf_builder::Builder; + +fn generate_pb() { + Builder::new().search_dir_for_protos("protos").generate(); +} + +fn main() { + generate_pb(); +} diff --git a/proto/protos/analytic_common.proto b/proto/protos/analytic_common.proto new file mode 100644 index 0000000000..c418296f99 --- /dev/null +++ b/proto/protos/analytic_common.proto @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common protos of analytic engine +syntax = "proto3"; +package analytic_common; + +// Options of a table that need to persist +message TableOptions { + // Segment duration in ms. + uint64 segment_duration = 1; + bool enable_ttl = 2; + uint64 ttl = 3; + uint32 arena_block_size = 4; + uint64 num_rows_per_row_group = 5; + CompactionStrategy compaction_strategy= 6; + CompactionOptions compaction_options = 7; + UpdateMode update_mode = 8; + uint32 write_buffer_size = 9; + Compression compression = 10; + // If sampling_segment_duration is true, then the segment duration + // is still unknown. 
+ bool sampling_segment_duration = 11; +} + +enum UpdateMode { + Overwrite = 0; + Append = 1; +} + +message CompactionOptions { + // Options for STCS + float bucket_low = 1; + float bucket_high = 2; + uint32 min_sstable_size = 3; + uint32 min_threshold = 4; + uint32 max_threshold = 5; + // Options for TWCS + TimeUnit timestamp_resolution = 6; +} + +enum TimeUnit { + NANOSECONDS = 0; + MICROSECONDS = 1; + MILLISECONDS = 2; + SECONDS = 3; + MINUTES = 4; + HOURS = 5; + DAYS = 6; +} + +enum CompactionStrategy { + DEFAULT = 0; + SIZE_TIERED = 1; + TIME_WINDOW = 2; +} + +enum Compression { + UNCOMPRESSED = 0; + LZ4 = 1; + SNAPPY = 2; + ZSTD = 3; +} diff --git a/proto/protos/common.proto b/proto/protos/common.proto new file mode 100644 index 0000000000..dc917685a7 --- /dev/null +++ b/proto/protos/common.proto @@ -0,0 +1,63 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common types +syntax = "proto3"; +package common; + +// Data type of column +// TODO(yingwen): Do we need a null type? 
+enum DataType { + NULL = 0; + TIMESTAMP = 1; + DOUBLE = 2; + VARBINARY = 3; + STRING = 4; + UINT64 = 5; + FLOAT = 6; + INT64 = 7; + INT32 = 8; + INT16 = 9; + INT8 = 10; + UINT32 = 11; + UINT16 = 12; + UINT8 = 13; + BOOL = 14; +} + +// Column schema +message ColumnSchema { + // Column name + string name = 1; + // Column type + DataType data_type = 2; + // Is the column nullable + bool is_nullable = 3; + // Id of the column + uint32 id = 4; + // Is the column used as tag + bool is_tag = 5; + // Comment of the column + string comment = 6; +} + +// Table Schema +message TableSchema { + // Schema of each column + repeated ColumnSchema columns = 1; + // Version of the schema + uint32 version = 2; + // Key column num + uint32 num_key_columns = 3; + // Timestamp index in columns + uint32 timestamp_index = 4; + // Enable auto generated tsid as primary key + bool enable_tsid_primary_key = 5; +} + +// Time range of [start, end) +message TimeRange { + // inclusive start + int64 start = 1; + // exclusive end + int64 end = 2; +} diff --git a/proto/protos/meta_update.proto b/proto/protos/meta_update.proto new file mode 100644 index 0000000000..64c2b384ad --- /dev/null +++ b/proto/protos/meta_update.proto @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Meta Updates of analytic engine +syntax = "proto3"; +package meta_update; + +import "analytic_common.proto"; +import "common.proto"; + +// Meta update for a new space +message AddSpaceMeta { + uint32 space_id = 1; + string space_name = 2; +} + +// Meta update for a new table +message AddTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; + // Schema of the table + common.TableSchema schema = 4; + // Options of the table + analytic_common.TableOptions options = 5; +} + +// Meta update for dropping a table +message DropTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; +} + +// Meta data of a sst file +message AddFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; + bytes min_key = 3; + bytes max_key = 4; + uint64 max_seq = 5; + common.TimeRange time_range = 6; + common.TableSchema schema = 7; + uint64 size = 8; + uint64 row_num = 9; +} + +// Meta data of the file to delete +message DeleteFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; +} + +// Meta data of version edit to table +message VersionEditMeta { + uint32 space_id = 1; + uint64 table_id = 2; + uint64 flushed_sequence = 3; + repeated AddFileMeta files_to_add = 4; + repeated DeleteFileMeta files_to_delete = 5; +} + +// Meta data of schema update. +message AlterSchemaMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New schema of the table. + common.TableSchema schema = 3; + // Previous schema version. + uint32 pre_schema_version = 4; +} + +// Meta data of schema update. +message AlterOptionsMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New options of the table. + analytic_common.TableOptions options = 3; +} + +// Meta data of manifest snapshot. 
+message SnapshotManifestMeta { + uint64 region_id = 1; + uint64 sequence = 2; +} + +// Meta update data to persist +message MetaUpdate { + oneof meta { + AddSpaceMeta add_space = 1; + AddTableMeta add_table = 2; + VersionEditMeta version_edit = 3; + AlterSchemaMeta alter_schema = 4; + AlterOptionsMeta alter_options = 5; + DropTableMeta drop_table = 6; + SnapshotManifestMeta snapshot_manifest = 7; + } +} diff --git a/proto/protos/sst.proto b/proto/protos/sst.proto new file mode 100644 index 0000000000..a1ab16e9a7 --- /dev/null +++ b/proto/protos/sst.proto @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Sst types +syntax = "proto3"; +package sst; + +import "common.proto"; + +message SstMetaData { + // Min key in the sst + bytes min_key = 1; + // Max key in the sst + bytes max_key = 2; + // Max sequence number in the sst + uint64 max_sequence = 3; + // The time range of the sst + common.TimeRange time_range = 4; + common.TableSchema schema = 5; + uint64 size = 6; + uint64 row_num = 7; +} diff --git a/proto/protos/sys_catalog.proto b/proto/protos/sys_catalog.proto new file mode 100644 index 0000000000..11cce62d06 --- /dev/null +++ b/proto/protos/sys_catalog.proto @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Types for sys catalog +syntax = "proto3"; +package sys_catalog; + +import "common.proto"; + +// Catalog entry +message CatalogEntry { + // Name of catalog + string catalog_name = 1; + // Created time: ms + int64 created_time = 2; +} + +// Schema entry +message SchemaEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Id of the schema + uint32 schema_id = 3; + // Created time: ms + int64 created_time = 4; +} + +// State of the table +enum TableState { + STABLE = 0; + DROPPING = 1; + DROPPED = 2; +} + +// Table entry +// TODO(yingwen): Add PartitionInfo +message TableEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Table id + uint64 table_id = 3; + // Table name + string table_name = 4; + // Table engine type + string engine = 5; + // The state of the table. + TableState state = 6; + // Created time: ms + int64 created_time = 7; + // Modified time: ms + int64 modified_time = 8; +} diff --git a/proto/protos/table_requests.proto b/proto/protos/table_requests.proto new file mode 100644 index 0000000000..a379299ef5 --- /dev/null +++ b/proto/protos/table_requests.proto @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Types for table requests +syntax = "proto3"; +package table_requests; + +import "common.proto"; + +// Write table request +message WriteRequest { + // Version of row encoding method + uint32 version = 1; + // Schema of rows + common.TableSchema schema = 2; + // Rows in bytes + // + // Each row is encoded in the same format as memtable + repeated bytes rows = 3; +} diff --git a/proto/src/lib.rs b/proto/src/lib.rs new file mode 100644 index 0000000000..d9d1e95e10 --- /dev/null +++ b/proto/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Protobuf messages + +// TODO(yingwen): All the protos need review +mod protos { + include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); +} + +pub use protos::*; diff --git a/query_engine/Cargo.toml b/query_engine/Cargo.toml new file mode 100644 index 0000000000..232992401c --- /dev/null +++ b/query_engine/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "query_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } diff --git a/query_engine/src/context.rs b/query_engine/src/context.rs new file mode 100644 index 0000000000..9ebc825f84 --- /dev/null +++ b/query_engine/src/context.rs @@ -0,0 +1,121 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query context + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionConfig, ExecutionContext}, + optimizer::{ + common_subexpr_eliminate::CommonSubexprEliminate, eliminate_limit::EliminateLimit, + filter_push_down::FilterPushDown, limit_push_down::LimitPushDown, optimizer::OptimizerRule, + projection_push_down::ProjectionPushDown, simplify_expressions::SimplifyExpressions, + }, + physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use common_types::request_id::RequestId; + +use crate::{ + df_planner_extension::QueryPlannerAdapter, + logical_optimizer::{ + order_by_primary_key::OrderByPrimaryKeyRule, type_conversion::TypeConversion, + }, + physical_optimizer, +}; + +/// Query context +pub struct Context { + request_id: RequestId, + df_exec_ctx: ExecutionContext, +} + +impl Context { + // For datafusion, internal use only + #[inline] + pub(crate) fn df_exec_ctx(&self) -> &ExecutionContext { + &self.df_exec_ctx + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } + + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + df_exec_config: ExecutionConfig::new(), + } + } +} + +pub type ContextRef = Arc; + +#[must_use] +pub struct Builder { + request_id: RequestId, + df_exec_config: ExecutionConfig, +} + +impl Builder { + /// Set default catalog and schema of this query context + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.df_exec_config = self + .df_exec_config + .with_default_catalog_and_schema(catalog, schema); + + self + } + + pub fn build(self) -> Context { + // Always create default catalog and schema now + let df_exec_config = { + let adapted_physical_optimize_rules = Self::apply_adapters_for_physical_optimize_rules( + &self.df_exec_config.physical_optimizers, + ); + let logical_optimize_rules = Self::logical_optimize_rules(); + self.df_exec_config + .with_query_planner(Arc::new(QueryPlannerAdapter)) + 
.with_optimizer_rules(logical_optimize_rules) + .with_physical_optimizer_rules(adapted_physical_optimize_rules) + }; + + Context { + request_id: self.request_id, + df_exec_ctx: ExecutionContext::with_config(df_exec_config), + } + } + + fn apply_adapters_for_physical_optimize_rules( + default_rules: &[Arc], + ) -> Vec> { + let mut new_rules = Vec::with_capacity(default_rules.len()); + for rule in default_rules { + new_rules.push(physical_optimizer::may_adapt_optimize_rule(rule.clone())) + } + + new_rules + } + + fn logical_optimize_rules() -> Vec> { + let mut optimizers: Vec> = vec![ + Arc::new(TypeConversion), + // These rules are the default settings of the datafusion. + Arc::new(SimplifyExpressions::new()), + Arc::new(CommonSubexprEliminate::new()), + Arc::new(EliminateLimit::new()), + Arc::new(ProjectionPushDown::new()), + Arc::new(FilterPushDown::new()), + Arc::new(LimitPushDown::new()), + // TODO(xikai): restore this rule after the bug of df is fixed. + // Arc::new(SingleDistinctToGroupBy::new()), + ]; + + // FIXME(xikai): use config to control the optimize rule. + if std::env::var("ENABLE_CUSTOM_OPTIMIZE").is_ok() { + optimizers.push(Arc::new(OrderByPrimaryKeyRule)); + } + + optimizers + } +} diff --git a/query_engine/src/df_execution_extension/mod.rs b/query_engine/src/df_execution_extension/mod.rs new file mode 100644 index 0000000000..746499e79a --- /dev/null +++ b/query_engine/src/df_execution_extension/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub mod prom_align; +pub use prom_align::PromAlignExec; diff --git a/query_engine/src/df_execution_extension/prom_align.rs b/query_engine/src/df_execution_extension/prom_align.rs new file mode 100644 index 0000000000..5e41f6e9af --- /dev/null +++ b/query_engine/src/df_execution_extension/prom_align.rs @@ -0,0 +1,931 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + any::Any, + collections::{hash_map, BTreeMap, HashMap, VecDeque}, + fmt, mem, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + array::{ + new_empty_array, Float64Array, StringArray, TimestampMillisecondArray, UInt64Array, + }, + error::ArrowError, + record_batch::RecordBatch, + }, + datafusion::{ + error::{DataFusionError, Result as ArrowResult}, + execution::runtime_env::RuntimeEnv, + physical_plan::{ + repartition::RepartitionExec, ColumnarValue, DisplayFormatType, ExecutionPlan, + Partitioning, PhysicalExpr, RecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{ + schema::{ArrowSchema, ArrowSchemaRef, DataType, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use futures::{Stream, StreamExt}; +use log::debug; +use snafu::{OptionExt, ResultExt, Snafu}; +use sql::promql::{AlignParameter, ColumnNames, Func as PromFunc}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal err, source:{:?}", source))] + Internal { source: DataFusionError }, + + #[snafu(display("Invalid schema, source:{:?}", source))] + InvalidSchema { source: common_types::schema::Error }, + + #[snafu(display("Tsid column is required"))] + TsidRequired, + + #[snafu(display("Invalid column type, required:{:?}", required_type))] + InvalidColumnType { required_type: String }, + + #[snafu(display("{} column type cannot be null", name))] + NullColumn { name: String }, + + #[snafu(display("timestamp out of range"))] + TimestampOutOfRange {}, +} + +define_result!(Error); + +/// Limits Extrapolation range. 
+/// Refer to https://github.com/prometheus/prometheus/pull/1295 +const PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT: f64 = 1.1; + +#[derive(Debug)] +struct ExtractTsidExpr {} + +impl fmt::Display for ExtractTsidExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "(ExtractTsid)") + } +} + +impl PhysicalExpr for ExtractTsidExpr { + fn as_any(&self) -> &dyn Any { + &*self + } + + fn data_type(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(DataType::UInt64) + } + + fn nullable(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(false) + } + + fn evaluate(&self, batch: &RecordBatch) -> ArrowResult { + let tsid_idx = batch + .schema() + .index_of(TSID_COLUMN) + .expect("checked in plan build"); + Ok(ColumnarValue::Array(batch.column(tsid_idx).clone())) + } +} + +/// Note: caller should ensure data[tail_index] is valid +pub(crate) trait AlignFunc: fmt::Debug { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result>; +} + +/// PromAlignExec will group data by tsid and align sample based on align_param +#[derive(Debug)] +pub struct PromAlignExec { + input: Arc, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, +} + +impl PromAlignExec { + pub fn try_new( + input: Arc, + column_name: Arc, + func: PromFunc, + align_param: AlignParameter, + read_parallelism: usize, + ) -> Result { + let extract_tsid: Arc = Arc::new(ExtractTsidExpr {}); + let input = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![extract_tsid], read_parallelism), + ) + .context(Internal)?, + ) as Arc; + let align_func: Arc = match func { + PromFunc::Instant => Arc::new(InstantFunc {}), + PromFunc::Rate => Arc::new(RateFunc {}), + PromFunc::Irate => Arc::new(IrateFunc {}), + PromFunc::Delta => Arc::new(DeltaFunc {}), + PromFunc::Idelta => Arc::new(IdeltaFunc {}), + PromFunc::Increase => Arc::new(IncreaseFunc {}), + }; + Ok(Self { + input, + column_name, 
+ align_func, + align_param, + }) + } +} + +#[async_trait] +impl ExecutionPlan for PromAlignExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + &self, + children: Vec>, + ) -> ArrowResult> { + match children.len() { + 1 => Ok(Arc::new(PromAlignExec { + input: children[0].clone(), + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + })), + _ => Err(DataFusionError::Internal( + "PromAlignExec wrong number of children".to_string(), + )), + } + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> ArrowResult { + debug!("PromAlignExec: partition:{}", partition); + Ok(Box::pin(PromAlignReader { + input: self.input.execute(partition, runtime).await?, + done: false, + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + tsid_to_tags: HashMap::default(), + tsid_to_stepper: HashMap::default(), + record_schema: None, + })) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlignExec: align_param={:?}, func={:?}, partition_count={}", + self.align_param, + self.align_func, + self.output_partitioning().partition_count(), + ) + } + + fn statistics(&self) -> Statistics { + // TODO(chenxiang) + Statistics::default() + } +} + +struct PromAlignReader { + /// The input to read data from + input: DfSendableRecordBatchStream, + /// Have we produced the output yet? 
+ done: bool, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, + tsid_to_tags: HashMap>, + tsid_to_stepper: HashMap>, + record_schema: Option, +} + +impl PromAlignReader { + fn step_helper(&mut self, tsid: u64, samples: Vec) -> Result>> { + let start_timestamp = self.align_param.align_range.inclusive_start(); + let offset = self.align_param.offset; + let stepper = self.tsid_to_stepper.entry(tsid).or_insert_with(|| { + Box::new(FixedStepper::new(start_timestamp)) as Box + }); + let samples = samples + .into_iter() + .map(|Sample { timestamp, value }| { + Ok(Sample { + timestamp: timestamp + .checked_add(offset) + .context(TimestampOutOfRange {})?, + value, + }) + }) + .collect::>>()?; + let sample_range = if samples.is_empty() { + TimeRange::min_to_max() + } else { + TimeRange::new_unchecked( + samples.front().unwrap().timestamp, // we have at least one samples here + samples + .back() + .unwrap() + .timestamp + .checked_add_i64(1) + .context(TimestampOutOfRange {})?, + ) + }; + stepper.step( + samples, + sample_range, + &self.align_param, + self.align_func.clone(), + ) + } + + fn accumulate_record_batch( + &mut self, + record_batch: RecordBatch, + ) -> Result>> { + let schema = record_batch.schema(); + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + + let mut tsid_samples = HashMap::new(); + let tsid_array = record_batch + .column(tsid_idx) + .as_any() + .downcast_ref::() + .expect("checked in build plan"); + if tsid_array.is_empty() { + // empty array means end of data, but maybe there are still pending samples, so + // step one more time + let tsids = self.tsid_to_stepper.keys().cloned().collect::>(); + for tsid in tsids { + if let Some(result) = self.step_helper(tsid, vec![])? 
{ + tsid_samples.insert(tsid, result); + } + } + return Ok(tsid_samples); + } + + let mut previous_tsid = tsid_array.value(0); + let mut duplicated_tsids = vec![(previous_tsid, 0)]; + for row_idx in 1..tsid_array.len() { + let tsid = tsid_array.value(row_idx); + if tsid != previous_tsid { + previous_tsid = tsid; + duplicated_tsids.push((tsid, row_idx)); + } + } + let mut step_helper = |tsid, batch| { + if let hash_map::Entry::Vacant(e) = self.tsid_to_tags.entry(tsid) { + e.insert(Self::build_tags( + &self.column_name.tag_keys, + schema.clone(), + &batch, + )?); + } + if let Some(result) = + self.step_helper(tsid, self.build_sample(field_idx, timestamp_idx, batch)?)? + { + tsid_samples.insert(tsid, result); + } + Ok(()) + }; + if duplicated_tsids.len() == 1 { + // fast path, when there is only one tsid in record_batch + step_helper(duplicated_tsids[0].0, record_batch)?; + } else { + debug!("duplicated_tsids:{:?}", duplicated_tsids); + for i in 0..duplicated_tsids.len() { + let (tsid, offset) = duplicated_tsids[i]; + let length = if i == duplicated_tsids.len() - 1 { + tsid_array.len() - offset + } else { + duplicated_tsids[i + 1].1 - offset + }; + let current_batch = record_batch.slice(offset, length); + step_helper(tsid, current_batch)?; + } + } + + Ok(tsid_samples) + } + + fn build_tags( + tag_keys: &[String], + schema: ArrowSchemaRef, + record_batch: &RecordBatch, + ) -> Result> { + tag_keys + .iter() + .map(|key| { + let v = record_batch + .column(schema.index_of(key).expect("checked in build plan")) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "StringArray", + })? 
+ .value(0); + Ok((key.to_owned(), v.to_string())) + }) + .collect::>>() + } + + fn build_sample( + &self, + field_idx: usize, + timestamp_idx: usize, + record_batch: RecordBatch, + ) -> Result> { + let field_array = record_batch + .column(field_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "Float64Array", + })?; + let timestamp_array = record_batch + .column(timestamp_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "TimestampMillisecondArray", + })?; + field_array + .into_iter() + .zip(timestamp_array.into_iter()) + .map(|(field, timestamp)| { + Ok(Sample { + value: field.context(NullColumn { name: "field" })?, + timestamp: Timestamp::new(timestamp.context(NullColumn { name: "timestamp" })?), + }) + }) + .collect::>>() + } + + fn samples_to_record_batch( + &self, + schema: ArrowSchemaRef, + tsid_samples: HashMap>, + ) -> std::result::Result { + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + let mut batches = Vec::with_capacity(tsid_samples.len()); + for (tsid, samples) in tsid_samples { + let record_batch_len = samples.len(); + let tags = self + .tsid_to_tags + .get(&tsid) + .expect("tags are ensured in accumulated_record_batch"); + let mut arrays = vec![new_empty_array(&DataType::Int32); schema.fields().len()]; + arrays[tsid_idx] = Arc::new(UInt64Array::from(vec![tsid; record_batch_len])); + let mut fields = Vec::with_capacity(record_batch_len); + let mut timestamps = Vec::with_capacity(record_batch_len); + for Sample { + timestamp, + value: field, + } in samples + { + fields.push(field); + timestamps.push(timestamp.as_i64()); + } + arrays[timestamp_idx] = Arc::new(TimestampMillisecondArray::from(timestamps)); + arrays[field_idx] = 
Arc::new(Float64Array::from(fields)); + + for tag_key in &self.column_name.tag_keys { + let tag_idx = schema + .index_of(tag_key.as_str()) + .expect("checked in plan build"); + arrays[tag_idx] = Arc::new(StringArray::from(vec![ + tags.get(tag_key) + .expect("tag_key are ensured in accmulate_record_batch") + .to_string(); + record_batch_len + ])); + } + batches.push(RecordBatch::try_new(schema.clone(), arrays)?); + } + + RecordBatch::concat(&schema, &batches) + } +} + +impl Stream for PromAlignReader { + type Item = std::result::Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.done { + return Poll::Ready(None); + } + + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + let schema = batch.schema(); + if self.record_schema.is_none() { + self.record_schema = Some(schema.clone()); + } + let tsid_samples = self + .accumulate_record_batch(batch) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; // convert all Error enum to SchemaError + if !tsid_samples.is_empty() { + Poll::Ready(Some(self.samples_to_record_batch(schema, tsid_samples))) + } else { + Poll::Ready(Some(Ok(RecordBatch::new_empty(schema)))) + } + } + Poll::Ready(None) => { + self.done = true; + if let Some(schema) = mem::take(&mut self.record_schema) { + let tsid_samples = self + .accumulate_record_batch(RecordBatch::new_empty(schema.clone())) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; + if !tsid_samples.is_empty() { + return Poll::Ready(Some( + self.samples_to_record_batch(schema, tsid_samples), + )); + } + } + Poll::Ready(None) + } + other => other, + } + } +} + +impl RecordBatchStream for PromAlignReader { + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } +} + +#[derive(Debug)] +pub(crate) struct Sample { + timestamp: Timestamp, + value: f64, +} + +/// `Stepper` is used for align samples, specified by [range queries](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries). 
+/// Note: [instant queries](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) are modeled as range queries with a step of 1.
+///
+/// # Diagram
+/// ```plaintext
+/// range
+/// +-------------+
+/// v |
+/// |------|-----|-----|-----|-----|-------->
+/// start step end
+/// ```
+trait Stepper: fmt::Debug {
+ /// Calculate the current sample based on new input samples.
+ /// Samples may be kept since some functions require large time range input,
+ /// such as rate(metric[1d])
+ fn step(
+ &mut self,
+ input: VecDeque,
+ range: TimeRange,
+ param: &AlignParameter,
+ align_func: Arc,
+ ) -> Result>>;
+
+ // Returns the size of samples kept during the query, mainly used for metrics
+ fn pending_column_bytes(&self) -> usize;
+}
+
+/// `FixedStepper` is one implementation of `Stepper`, which will accumulate all
+/// samples within each step before passing control to the next execution node.
+/// This implementation will consume high memory in large range queries, such as
+/// rate(metric[30d])
+
+/// TODO(chenxiang): A streaming implementation is required for those large range
+/// queries.
+#[derive(Debug)] +struct FixedStepper { + /// accumulated samples used for calculate sample for current step + entries: VecDeque, + /// tail index of entries for processing current step, which means + /// [0, tail_index] is used + tail_index: usize, + /// timestamp of current step sample + timestamp: Timestamp, +} + +impl Stepper for FixedStepper { + fn step( + &mut self, + mut column: VecDeque, + column_range: TimeRange, + param: &AlignParameter, + align_func: Arc, + ) -> Result>> { + self.entries.append(&mut column); + debug!( + "column_range:{:?}, param:{:?}, ts:{:?}", + column_range, param, self.timestamp + ); + let curr_range = param.align_range.intersected_range(column_range); + if curr_range.is_none() { + return Ok(None); + } + let curr_range = curr_range.unwrap(); + let mut result = vec![]; + + // self.timestamp = self.timestamp.max(start); + while self.timestamp < curr_range.inclusive_start() { + self.timestamp = self + .timestamp + .checked_add(param.step) + .context(TimestampOutOfRange {})?; + } + + while curr_range.contains(self.timestamp) { + // push `tail_index`. In look ahead (by increasing index by 1) way. + while self.tail_index + 1 < self.entries.len() + && self.entries[self.tail_index + 1].timestamp <= self.timestamp + { + self.tail_index += 1; + } + let mint = self + .timestamp + .checked_sub(param.lookback_delta) + .context(TimestampOutOfRange {})?; + // drop some unneeded entries from begining of `entries` + while let Some(entry) = self.entries.front() { + if entry.timestamp < mint { + self.entries.pop_front(); + if let Some(index) = self.tail_index.checked_sub(1) { + self.tail_index = index + } + } else { + break; + } + } + // [mint, self.timestamp] has no data, skip to next step. 
+ let skip = {
+ if let Some(first_entry) = self.entries.get(0) {
+ first_entry.timestamp > self.timestamp
+ } else {
+ true
+ }
+ };
+ if skip {
+ self.timestamp = self
+ .timestamp
+ .checked_add(param.step)
+ .context(TimestampOutOfRange {})?;
+ continue;
+ }
+
+ // call the range function
+ if let Some(value) =
+ align_func.call(&self.entries, self.tail_index, self.timestamp, param)?
+ {
+ result.push(value);
+ }
+
+ self.timestamp = self
+ .timestamp
+ .checked_add(param.step)
+ .context(TimestampOutOfRange {})?;
+ }
+
+ if !result.is_empty() {
+ Ok(Some(result))
+ } else {
+ Ok(None)
+ }
+ }
+
+ fn pending_column_bytes(&self) -> usize {
+ self.entries.len() * 16 // timestamp + float value
+ }
+}
+
+impl FixedStepper {
+ fn new(start_timestamp: Timestamp) -> FixedStepper {
+ Self {
+ entries: VecDeque::new(),
+ tail_index: 0,
+ timestamp: start_timestamp,
+ }
+ }
+}
+
+/// Helper for Prometheus functions which need extrapolation. [Rate][rate],
+/// [Increase][increase] and [Delta][delta] for now.
+///
+/// Since "range" is not always equal to `data_duration`, extrapolation needs
+/// to be performed to estimate absent data. Extrapolation is named by
+/// Prometheus. This function is ported from [here][prom_extrapolate_code].
+/// "extrapolate" assumes absent data follows the same distribution as
+/// existing data. Thus it simply zooms the result calculated from existing data
+/// to the required extrapolation time range.
+///
+/// [rate]: https://prometheus.io/docs/prometheus/latest/querying/functions/#rate
+/// [increase]: https://prometheus.io/docs/prometheus/latest/querying/functions/#increase
+/// [delta]: https://prometheus.io/docs/prometheus/latest/querying/functions/#delta
+/// [prom_extrapolate_code]: https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L59
+///
+/// This function can be roughly divided into three parts:
+/// - Calculate the result from real data.
+/// - Calculate the time range to extrapolate to.
+/// - Calculate the extrapolated result.
+///
+/// The outputs of the above three steps are `difference`, `extrapolated_duration`
+/// and `extrapolated_result`.
+///
+/// # Diagram
+/// ```plaintext
+/// range_start first_timestamp last_timestamp range_end
+/// └─────────────────────┴────────────────────┴──────────────────┘
+/// range_to_start data_duration range_to_end
+/// ```
+///
+/// Legends:
+/// - `range_end` is the timestamp passed in.
+/// - `range_start` is calculated by `timestamp` - `lookback_delta`.
+/// - "range" here stands for `range_end` - `range_start`, which is equal to
+/// `range_to_start` + `data_duration` + `range_to_end`.
+/// - `first/last_timestamp` is the timestamp of the provided data.
+/// - `data_duration` is the time range covered by data.
+fn extrapolate_fn_helper(
+ data: &VecDeque,
+ tail_index: usize,
+ timestamp: Timestamp,
+ lookback_delta: Timestamp,
+ is_counter: bool,
+ is_rate: bool,
+) -> Result> {
+ // no sense to calculate rate on a single item.
+ if tail_index < 1 {
+ return Ok(None);
+ }
+
+ let first_data = data[0].value;
+
+ // calculate `counter_reset_correction` for counter type.
+ let mut counter_reset_correction = 0.0;
+ if is_counter {
+ let mut last_data = first_data;
+ for Sample { value, ..
} in data.iter().take(tail_index + 1) { + if *value < last_data { + counter_reset_correction += last_data; + } + last_data = *value; + } + } + + let difference = data[tail_index].value - first_data + counter_reset_correction; + + // `average_duration_between_data` assumes all data is distributed evenly. + let first_timestamp = data[0].timestamp; + let last_timestamp = data[tail_index].timestamp; + let data_duration = (last_timestamp + .checked_sub(first_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + let average_duration_between_data = data_duration / tail_index as f64; + + let range_start = timestamp + .checked_sub(lookback_delta) + .context(TimestampOutOfRange {})?; + let range_end = timestamp; + let mut range_to_start = (first_timestamp + .checked_sub(range_start) + .context(TimestampOutOfRange)?) + .as_i64() as f64; + let mut range_to_end = (range_end + .checked_sub(last_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + + // Prometheus shorten forward-extrapolation to zero point. + if is_counter && difference > 0.0 && first_data >= 0.0 { + let range_to_zero_point = data_duration * (first_data / difference); + range_to_start = range_to_start.min(range_to_zero_point); + } + + let extrapolation_threshold = + average_duration_between_data * PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT; + + // if lots of data is absent (`range_to_start` or `range_to_end` is longer than + // `extrapolation_threshold`), Prometheus will not estimate all time range. Use + // half of `average_duration_between_data` instead. + if range_to_start > extrapolation_threshold { + range_to_start = average_duration_between_data / 2.0; + } + if range_to_end > extrapolation_threshold { + range_to_end = average_duration_between_data / 2.0; + } + + // `difference` is the real result calculated by existing data. Prometheus will + // zoom it to `extrapolated_duration` to get extrapolated estimated result. 
+ let extrapolated_duration = data_duration + range_to_start + range_to_end; + let mut extrapolated_result = difference * extrapolated_duration / data_duration; + + if is_rate { + // `lookback_delta` here is in millisecond. + extrapolated_result /= lookback_delta.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + timestamp, + value: extrapolated_result, + })) +} + +/// Implementation of `Rate` function in `Prometheus`. More +/// [details](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) +#[derive(Debug)] +struct RateFunc {} + +impl AlignFunc for RateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + true, + ) + } +} + +#[derive(Debug)] +struct DeltaFunc {} + +impl AlignFunc for DeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + false, + false, + ) + } +} + +#[derive(Debug)] +struct IncreaseFunc {} + +impl AlignFunc for IncreaseFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + false, + ) + } +} + +// Port from https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L159 +fn instant_value( + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + is_rate: bool, +) -> Result> { + if tail_index < 2 { + return Ok(None); + } + + let last_entry = &data[tail_index]; + let previous_entry = &data[tail_index - 1]; + + let mut result = if is_rate && last_entry.value < previous_entry.value { + last_entry.value + } else { + last_entry.value - previous_entry.value + }; + + let 
interval = last_entry + .timestamp + .checked_sub(previous_entry.timestamp) + .context(TimestampOutOfRange {})?; + assert!(interval.as_i64() > 0); + + if is_rate { + // Convert to per-second. + result /= interval.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + value: result, + timestamp, + })) +} + +#[derive(Debug)] +pub struct IdeltaFunc; + +impl AlignFunc for IdeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, false) + } +} + +#[derive(Debug)] +struct IrateFunc; + +impl AlignFunc for IrateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, true) + } +} + +/// This function is not in Promtheus' functions list. +/// +/// It simulates the behavior of `Instant Selector` by finding the newest point +/// from the input. Thus `Instant Selector` can be represented by [PromAlignOp] +/// + [InstantFn]. +#[derive(Debug)] +pub struct InstantFunc; + +impl AlignFunc for InstantFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + Ok(Some(Sample { + timestamp, + value: data[tail_index].value, + })) + } +} diff --git a/query_engine/src/df_planner_extension/mod.rs b/query_engine/src/df_planner_extension/mod.rs new file mode 100644 index 0000000000..336cd128f5 --- /dev/null +++ b/query_engine/src/df_planner_extension/mod.rs @@ -0,0 +1,40 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The query planner adapter provides some planner extensions of datafusion. 
+ +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionContextState, QueryPlanner}, + logical_plan::LogicalPlan, + physical_plan::{ + planner::{DefaultPhysicalPlanner, ExtensionPlanner}, + ExecutionPlan, PhysicalPlanner, + }, +}; + +pub mod prom_align; +pub mod table_scan_by_primary_key; +use async_trait::async_trait; + +/// The adapter for extending the default datafusion planner. +pub struct QueryPlannerAdapter; + +#[async_trait] +impl QueryPlanner for QueryPlannerAdapter { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result> { + let extension_planners: Vec> = vec![ + Arc::new(table_scan_by_primary_key::Planner), + Arc::new(prom_align::PromAlignPlanner), + ]; + + let physical_planner = DefaultPhysicalPlanner::with_extension_planners(extension_planners); + physical_planner + .create_physical_plan(logical_plan, ctx_state) + .await + } +} diff --git a/query_engine/src/df_planner_extension/prom_align.rs b/query_engine/src/df_planner_extension/prom_align.rs new file mode 100644 index 0000000000..f55b7042e4 --- /dev/null +++ b/query_engine/src/df_planner_extension/prom_align.rs @@ -0,0 +1,53 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{LogicalPlan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use snafu::Snafu; +use sql::promql::PromAlignNode; + +use crate::df_execution_extension::prom_align::{Error as ExecError, PromAlignExec}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Build execution failed. 
err:{:?}", source))] + ExecutionError { source: ExecError }, +} + +pub struct PromAlignPlanner; + +impl ExtensionPlanner for PromAlignPlanner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + Ok( + if let Some(node) = node.as_any().downcast_ref::() { + assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs"); + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); + Some(Arc::new( + PromAlignExec::try_new( + physical_inputs[0].clone(), + node.column_name.clone(), + node.func, + node.align_param, + node.read_parallelism, + ) + // DataFusionError is lost when wrapped, use string instead. + .map_err(|e| DataFusionError::Plan(e.to_string()))?, + )) + } else { + None + }, + ) + } +} diff --git a/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs new file mode 100644 index 0000000000..c864270aaa --- /dev/null +++ b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs @@ -0,0 +1,141 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + any::Any, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{self, DFSchemaRef, Expr, LogicalPlan, TableScan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use table_engine::{provider::TableProviderAdapter, table::ReadOrder}; + +/// The extension planner creates physical plan for the +/// [`TableScanByPrimaryKey`] which is a logical plan node. 
+pub struct Planner; + +impl ExtensionPlanner for Planner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + node.as_any() + .downcast_ref::() + .map(|order_by_node| order_by_node.build_scan_table_exec_plan()) + .transpose() + } +} + +/// TableScanInPrimaryKeyOrder is a [`UserDefinedLogicalNode`] of datafusion +/// which normally is generated during logical plan optimization. +/// +/// It differs from the default [`TableScan`] in its corresponding +/// [`ExecutionPlan`] is a special [`ScanTable`] which can controls the scan +/// order. +#[derive(Clone)] +pub struct TableScanByPrimaryKey { + asc: bool, + scan_plan: Arc, +} + +impl Debug for TableScanByPrimaryKey { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.fmt_for_explain(f) + } +} + +impl TableScanByPrimaryKey { + /// Build the node from a [TableScan] node + /// + /// Note it panics if the plan node is not a LogicalPlan::TableScan. + pub fn new_from_scan_plan(asc: bool, scan_plan: Arc) -> Self { + // TODO(xikai): should ensure the scan_plan is a real TableScan. + Self { asc, scan_plan } + } + + /// Build the scan table [ExecutionPlan]. + fn build_scan_table_exec_plan( + &self, + ) -> arrow_deps::datafusion::error::Result> { + match self.scan_plan.as_ref() { + LogicalPlan::TableScan(TableScan { + source, + projection, + filters, + limit, + .. 
+ }) => { + let table_provider = + if let Some(v) = source.as_any().downcast_ref::() { + v + } else { + return Err(DataFusionError::Internal(format!( + "expect table provider adapter, given plan:{:?}", + self.scan_plan, + ))); + }; + + // Remove all qualifiers from the scan as the provider + // doesn't know (nor should care) how the relation was + // referred to in the query + let filters = logical_plan::unnormalize_cols(filters.iter().cloned()); + + table_provider.scan_table( + projection, + &filters, + *limit, + ReadOrder::from_is_asc(Some(self.asc)), + ) + } + _ => Err(DataFusionError::Internal(format!( + "expect scan plan, given plan:{:?}", + self.scan_plan + ))), + } + } +} + +impl UserDefinedLogicalNode for TableScanByPrimaryKey { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + self.scan_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "ScanTableInPrimaryKeyOrder, asc:{}, table_scan:{:?}", + self.asc, self.scan_plan + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + _inputs: &[LogicalPlan], + ) -> Arc { + Arc::new(Self { + asc: self.asc, + scan_plan: self.scan_plan.clone(), + }) + } +} diff --git a/query_engine/src/executor.rs b/query_engine/src/executor.rs new file mode 100644 index 0000000000..99d8a637bf --- /dev/null +++ b/query_engine/src/executor.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query executor + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::record_batch::RecordBatch; +use futures::TryStreamExt; +use log::debug; +use snafu::{ResultExt, Snafu}; +use sql::{plan::QueryPlan, provider::CatalogProviderAdapter}; +use table_engine::stream::SendableRecordBatchStream; + +use crate::{ + context::ContextRef, + logical_optimizer::{LogicalOptimizer, LogicalOptimizerImpl}, + physical_optimizer::{PhysicalOptimizer, PhysicalOptimizerImpl}, + physical_plan::PhysicalPlanPtr, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to do logical optimization, err:{}", source))] + LogicalOptimize { + source: crate::logical_optimizer::Error, + }, + + #[snafu(display("Failed to do physical optimization, err:{}", source))] + PhysicalOptimize { + source: crate::physical_optimizer::Error, + }, + + #[snafu(display("Failed to execute physical plan, err:{}", source))] + ExecutePhysical { source: crate::physical_plan::Error }, + + #[snafu(display("Failed to collect record batch stream, err:{}", source,))] + Collect { source: table_engine::stream::Error }, +} + +define_result!(Error); + +// Use a type alias so that we are able to replace the implementation +pub type RecordBatchVec = Vec; + +/// Query to execute +/// +/// Contains the query plan and other infos +#[derive(Debug)] +pub struct Query { + /// The query plan + plan: QueryPlan, +} + +impl Query { + pub fn new(plan: QueryPlan) -> Self { + Self { plan } + } +} + +/// Query executor +/// +/// Executes the logical plan +#[async_trait] +pub trait Executor: Clone + Send + Sync { + // TODO(yingwen): Maybe return a stream + /// Execute the query, returning the query results as RecordBatchVec + /// + /// REQUIRE: The meta data of tables in query should be found from + /// ContextRef + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result; +} + +#[derive(Clone, Default)] +pub struct ExecutorImpl; + +impl ExecutorImpl { + pub fn new() -> Self { + 
Self::default() + } +} + +#[async_trait] +impl Executor for ExecutorImpl { + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result { + let plan = query.plan; + + // Register catalogs to datafusion execution context. + let catalogs = CatalogProviderAdapter::new_adapters(plan.tables.clone()); + let df_ctx = ctx.df_exec_ctx(); + for (name, catalog) in catalogs { + df_ctx.register_catalog(&name, Arc::new(catalog)); + } + let request_id = ctx.request_id(); + + let physical_plan = optimize_plan(ctx, plan).await?; + + debug!( + "Executor physical optimization finished, request_id:{}, physical_plan: {:?}", + request_id, physical_plan + ); + + let stream = physical_plan.execute().await.context(ExecutePhysical)?; + + // Collect all records in the pool, as the stream may perform some costly + // calculation + let record_batches = collect(stream).await?; + + debug!( + "Executor executed plan, request_id:{}, plan_and_metrics: {}", + request_id, + physical_plan.metrics_to_string() + ); + + Ok(record_batches) + } +} + +async fn optimize_plan(ctx: ContextRef, plan: QueryPlan) -> Result { + let mut logical_optimizer = LogicalOptimizerImpl::with_context(ctx.clone()); + let plan = logical_optimizer.optimize(plan).context(LogicalOptimize)?; + + debug!( + "Executor logical optimization finished, request_id:{}, plan: {:#?}", + ctx.request_id(), + plan + ); + + let mut physical_optimizer = PhysicalOptimizerImpl::with_context(ctx); + physical_optimizer + .optimize(plan) + .await + .context(PhysicalOptimize) +} + +async fn collect(stream: SendableRecordBatchStream) -> Result { + stream.try_collect().await.context(Collect) +} diff --git a/query_engine/src/lib.rs b/query_engine/src/lib.rs new file mode 100644 index 0000000000..36440dbb11 --- /dev/null +++ b/query_engine/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query engine +//! +//! 
Optimizes and executes logical plan + +// TODO(yingwen): Maybe renamed to query_executor or query_backend? +// TODO(yingwen): Use datafusion or fuse-query as query backend + +#[macro_use] +extern crate common_util; + +pub mod context; +pub mod df_execution_extension; +pub mod df_planner_extension; +pub mod executor; +pub mod logical_optimizer; +pub mod physical_optimizer; +pub mod physical_plan; diff --git a/query_engine/src/logical_optimizer/mod.rs b/query_engine/src/logical_optimizer/mod.rs new file mode 100644 index 0000000000..2bcad7955f --- /dev/null +++ b/query_engine/src/logical_optimizer/mod.rs @@ -0,0 +1,61 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Logical optimizer + +pub mod order_by_primary_key; +#[cfg(test)] +pub mod tests; +pub mod type_conversion; + +use arrow_deps::datafusion::error::DataFusionError; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::context::ContextRef; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize logical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// LogicalOptimizer transform the QueryPlan into a potentially more efficient +/// plan +pub trait LogicalOptimizer { + // TODO(yingwen): Maybe support other plans + fn optimize(&mut self, plan: QueryPlan) -> Result; +} + +pub struct LogicalOptimizerImpl { + ctx: ContextRef, +} + +impl LogicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +impl LogicalOptimizer for LogicalOptimizerImpl { + fn optimize(&mut self, plan: QueryPlan) -> Result { + // TODO(yingwen): Avoid clone the plan multiple times during optimization + let QueryPlan { + mut df_plan, + tables, + } = plan; + let exec_ctx = self.ctx.df_exec_ctx(); + df_plan = exec_ctx.optimize(&df_plan).context(DataFusionOptimize)?; + + Ok(QueryPlan { df_plan, tables }) + } +} diff --git a/query_engine/src/logical_optimizer/order_by_primary_key.rs b/query_engine/src/logical_optimizer/order_by_primary_key.rs new file mode 100644 index 0000000000..ef7942bbd9 --- /dev/null +++ b/query_engine/src/logical_optimizer/order_by_primary_key.rs @@ -0,0 +1,413 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryFrom, sync::Arc}; + +use arrow_deps::datafusion::{ + execution::context::ExecutionProps, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, + }, + optimizer::optimizer::OptimizerRule, +}; +use common_types::schema::Schema; +use log::info; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +/// The optimizer rule applies to the example plan: +/// Limit: 1 +/// Sort: #test.id ASC NULLS FIRST, #test.t ASC NULLS FIRST +/// Projection: #test.tsid, #test.t, #test.id, #test.tag1, #test.tag2 +/// TableScan: test projection=None +pub struct OrderByPrimaryKeyRule; + +impl OrderByPrimaryKeyRule { + /// Optimize the plan if it is the pattern: + /// Limit: + /// Sort: + /// Project: + /// (Filter): (Filer node is allowed to be not exist) + /// TableScan + fn do_optimize( + &self, + plan: &LogicalPlan, + ) -> arrow_deps::datafusion::error::Result> { + if let LogicalPlan::Limit(Limit { + n, + input: sort_plan, + }) = plan + { + if let LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: projection_plan, + }) = sort_plan.as_ref() + { + if let LogicalPlan::Projection(Projection { + expr: projection_exprs, + input: scan_or_filter_plan, + schema: projection_schema, + alias, + }) = projection_plan.as_ref() + { + let (scan_plan, filter_predicate) = if let LogicalPlan::Filter(Filter { + predicate, + input: scan_plan, + }) = scan_or_filter_plan.as_ref() + { + (scan_plan, Some(predicate)) + } else { + (scan_or_filter_plan, None) + }; + + if let LogicalPlan::TableScan(TableScan { + table_name, source, .. 
+ }) = scan_plan.as_ref() + { + let schema = Schema::try_from(source.schema()).map_err(|e| { + let err_msg = format!( + "fail to convert arrow schema to schema, table:{}, err:{:?}", + table_name, e + ); + arrow_deps::datafusion::error::DataFusionError::Plan(err_msg) + })?; + if let Some(sort_in_asc_order) = + Self::detect_primary_key_order(&schema, sort_exprs.as_slice()) + { + let new_plan = Self::rewrite_plan(RewriteContext { + projection: projection_exprs.clone(), + filter_predicate: filter_predicate.cloned(), + schema: projection_schema.clone(), + alias: alias.clone(), + scan_plan: scan_plan.clone(), + sort_exprs: sort_exprs.clone(), + sort_in_asc_order, + limit: *n, + }); + return Ok(Some(new_plan)); + } + } + } + } + } + + Ok(None) + } + + /// Check: + /// - Whether `timestamp` is the first column in the primary key. + /// - Whether `sort_exprs` is equals the any prefix of primary key. + /// - Whether `sort_exprs` is in the same order. + /// + /// Returns: Some(sort_order) if the two rules above are true. + fn detect_primary_key_order(schema: &Schema, sort_exprs: &[Expr]) -> Option { + if schema.timestamp_index() != 0 { + return None; + } + + let key_cols = schema.key_columns(); + if sort_exprs.len() > key_cols.len() { + return None; + } + let sub_key_cols = &key_cols[..sort_exprs.len()]; + + let mut in_asc_order = None; + for (sort_expr, key_col) in sort_exprs.iter().zip(sub_key_cols.iter()) { + if let Expr::Sort { expr, asc, .. 
} = sort_expr { + if let Some(in_asc_order) = in_asc_order.as_mut() { + if in_asc_order != asc { + return None; + } + } + in_asc_order = Some(*asc); + + if let Expr::Column(col) = expr.as_ref() { + if col.name == key_col.name { + continue; + } + } + } + return None; + } + + in_asc_order + } + + // TODO(xikai): The topmost limit and sort plan node of the rewritten plan is + // not necessary now because now the rewrite requires the timestamp key is + // the first column in the primary key and that means the output of + // TableScanByPrimaryKey is in the correct order. And topmost two + // plan nodes is used to optimize the normal cases where the timestamp key is + // any column. + /// Rewrite the plan: + /// Limit: + /// Sort: + /// Project: + /// Filter: + /// TableScan + /// + /// Rewritten plan: + /// Limit: + /// Sort: + /// Limit: + /// Project: + /// Filter: + /// TableScanByPrimaryKey + fn rewrite_plan(rewrite_ctx: RewriteContext) -> LogicalPlan { + let order_by_primary_key_scan = Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(TableScanByPrimaryKey::new_from_scan_plan( + rewrite_ctx.sort_in_asc_order, + rewrite_ctx.scan_plan, + )), + })); + + let filter_plan = if let Some(predicate) = rewrite_ctx.filter_predicate { + Arc::new(LogicalPlan::Filter(Filter { + predicate, + input: order_by_primary_key_scan, + })) + } else { + order_by_primary_key_scan + }; + + let new_project_plan = Arc::new(LogicalPlan::Projection(Projection { + expr: rewrite_ctx.projection, + input: filter_plan, + schema: rewrite_ctx.schema, + alias: rewrite_ctx.alias, + })); + + let new_limit_plan = Arc::new(LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_project_plan, + })); + + let new_sort_plan = Arc::new(LogicalPlan::Sort(Sort { + expr: rewrite_ctx.sort_exprs, + input: new_limit_plan, + })); + LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_sort_plan, + }) + } +} + +impl OptimizerRule for OrderByPrimaryKeyRule { + fn optimize( + &self, + plan: 
&LogicalPlan, + _execution_props: &ExecutionProps, + ) -> arrow_deps::datafusion::error::Result { + match self.do_optimize(plan)? { + Some(new_plan) => { + info!( + "optimize plan by OrderByPrimaryKeyRule, original plan:\n{:?}\n optimized plan:\n{:?}", + plan, new_plan + ); + Ok(new_plan) + } + None => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "order_by_primary_key" + } +} + +struct RewriteContext { + projection: Vec, + filter_predicate: Option, + schema: DFSchemaRef, + alias: Option, + scan_plan: Arc, + sort_exprs: Vec, + sort_in_asc_order: bool, + limit: usize, +} + +#[cfg(test)] +mod tests { + use arrow_deps::datafusion::{logical_plan::Column, scalar::ScalarValue}; + use common_types::{column_schema, datum::DatumKind, schema}; + + use super::*; + use crate::logical_optimizer::tests::LogicalPlanNodeBuilder; + + const TEST_TABLE_NAME: &str = "order_by_primary_key_test_table"; + + fn build_no_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) + .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) 
+ .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_sort_expr(sort_col: &str, asc: bool) -> Expr { + let col_expr = Expr::Column(Column::from(sort_col)); + Expr::Sort { + expr: Box::new(col_expr), + asc, + nulls_first: false, + } + } + + fn build_primary_key_sort_exprs(schema: &Schema, asc: bool) -> Vec { + schema + .key_columns() + .iter() + .map(|col| build_sort_expr(&col.name, asc)) + .collect() + } + + fn check_optimization_works( + schema: Schema, + sort_exprs: Vec, + filter_expr: Option, + asc: bool, + ) { + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + + let plan = { + let mut builder = builder.clone().table_scan(); + if let Some(filter) = &filter_expr { + builder = builder.filter(filter.clone()); + } + builder + .projection(vec![]) + .sort(sort_exprs.clone()) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule + .do_optimize(&*plan) + .expect("Optimize plan") + .expect("Succeed to optimize plan"); + let expected_plan = { + let mut builder = builder.table_scan().table_scan_in_primary_key_order(asc); + if let Some(filter) = filter_expr { + builder = builder.filter(filter); + } + builder + .projection(vec![]) + .limit(10) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + crate::logical_optimizer::tests::assert_logical_plan_eq( + &optimized_plan, + expected_plan.as_ref(), + ); + } + + #[test] + fn test_optimize_applied_with_no_filter() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + check_optimization_works(schema, sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_prefix_sort_exprs() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + let prefix_sort_exprs = 
sort_exprs[..1].to_vec(); + check_optimization_works(schema, prefix_sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_filter() { + let schema = build_optimized_schema(); + let filter_expr = Expr::Literal(ScalarValue::Int8(None)); + let sort_in_asc_order = false; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + + check_optimization_works(schema, sort_exprs, Some(filter_expr), sort_in_asc_order); + } + + #[test] + fn test_optimize_fail_with_wrong_schema() { + let plan = { + let schema = build_no_optimized_schema(); + let sort_exprs = build_primary_key_sort_exprs(&schema, true); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } + + #[test] + fn test_optimize_with_wrong_plan() { + let plan = { + let schema = build_optimized_schema(); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } +} diff --git a/query_engine/src/logical_optimizer/tests.rs b/query_engine/src/logical_optimizer/tests.rs new file mode 100644 index 0000000000..7febd2283e --- /dev/null +++ b/query_engine/src/logical_optimizer/tests.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
test utils for logical optimizer + +use std::{any::Any, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::TableProvider, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, ToDFSchema, + }, + physical_plan::ExecutionPlan, + }, +}; +use async_trait::async_trait; +use common_types::schema::Schema; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +#[derive(Clone, Debug)] +#[must_use] +pub struct LogicalPlanNodeBuilder { + pub schema: Schema, + pub table_name: String, + pub plan: Option>, +} + +pub struct MockTableProvider { + schema: Schema, +} + +#[async_trait] +impl TableProvider for MockTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.to_arrow_schema_ref() + } + + async fn scan( + &self, + _projection: &Option>, + _filters: &[Expr], + _limit: Option, + ) -> arrow_deps::datafusion::error::Result> { + unimplemented!("not support") + } +} + +impl LogicalPlanNodeBuilder { + pub fn new(table_name: String, schema: Schema) -> Self { + Self { + schema, + table_name, + plan: None, + } + } + + // caller should ensure the sub plan exists. 
+ pub fn take_plan(&mut self) -> Arc { + self.plan.take().unwrap() + } + + pub fn df_schema_ref(&self) -> DFSchemaRef { + self.schema + .to_arrow_schema_ref() + .to_dfschema_ref() + .expect("Build dfschema") + } + + pub fn filter(mut self, predicate: Expr) -> Self { + let plan = LogicalPlan::Filter(Filter { + predicate, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn projection(mut self, proj_exprs: Vec) -> Self { + let plan = LogicalPlan::Projection(Projection { + expr: proj_exprs, + input: self.take_plan(), + schema: self.df_schema_ref(), + alias: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn limit(mut self, n: usize) -> Self { + let plan = LogicalPlan::Limit(Limit { + n, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn sort(mut self, sort_exprs: Vec) -> Self { + let plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan(mut self) -> Self { + let provider = MockTableProvider { + schema: self.schema.clone(), + }; + let projected_schema = self.df_schema_ref(); + + let plan = LogicalPlan::TableScan(TableScan { + table_name: self.table_name.clone(), + source: Arc::new(provider), + projection: None, + projected_schema, + filters: vec![], + limit: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan_in_primary_key_order(mut self, asc: bool) -> Self { + let sub_plan = self.take_plan(); + let node = TableScanByPrimaryKey::new_from_scan_plan(asc, sub_plan); + let plan = LogicalPlan::Extension(Extension { + node: Arc::new(node), + }); + self.plan = Some(Arc::new(plan)); + + self + } +} + +/// Check whether the logical plans are equal. 
+pub fn assert_logical_plan_eq(left: &LogicalPlan, right: &LogicalPlan) { + let left_plan_str = format!("{:#?}", left); + let right_plan_str = format!("{:#?}", right); + assert_eq!(left_plan_str, right_plan_str) +} diff --git a/query_engine/src/logical_optimizer/type_conversion.rs b/query_engine/src/logical_optimizer/type_conversion.rs new file mode 100644 index 0000000000..ef6aaf6d12 --- /dev/null +++ b/query_engine/src/logical_optimizer/type_conversion.rs @@ -0,0 +1,506 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{mem, sync::Arc}; + +use arrow_deps::{ + arrow::{compute, compute::kernels::cast_utils::string_to_timestamp_nanos}, + datafusion::{ + arrow::datatypes::DataType, + error::{DataFusionError, Result}, + execution::context::ExecutionProps, + logical_plan::{ + plan::Filter, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, Operator, TableScan, + }, + optimizer::{optimizer::OptimizerRule, utils}, + scalar::ScalarValue, + }, +}; +use log::debug; + +/// Optimizer that cast literal value to target column's type +/// +/// Example transformations that are applied: +/// * `expr > '5'` to `expr > 5` when `expr` is of numeric type +/// * `expr > '2021-12-02 15:00:34'` to `expr > 1638428434000(ms)` when `expr` +/// is of timestamp type +/// * `expr > 10` to `expr > '10'` when `expr` is of string type +/// * `expr = 'true'` to `expr = true` when `expr` is of boolean type +pub struct TypeConversion; + +impl OptimizerRule for TypeConversion { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { + let mut rewriter = TypeRewriter { + schemas: plan.all_schemas(), + }; + + match plan { + LogicalPlan::Filter(Filter { predicate, input }) => Ok(LogicalPlan::Filter(Filter { + predicate: predicate.clone().rewrite(&mut rewriter)?, + input: Arc::new(self.optimize(input, execution_props)?), + })), + LogicalPlan::TableScan(TableScan { + table_name, + source, + projection, + projected_schema, + filters, 
+ limit, + }) => { + let rewrite_filters = filters + .clone() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + Ok(LogicalPlan::TableScan(TableScan { + table_name: table_name.clone(), + source: source.clone(), + projection: projection.clone(), + projected_schema: projected_schema.clone(), + filters: rewrite_filters, + limit: *limit, + })) + } + LogicalPlan::Projection { .. } + | LogicalPlan::Window { .. } + | LogicalPlan::Aggregate { .. } + | LogicalPlan::Repartition { .. } + | LogicalPlan::CreateExternalTable { .. } + | LogicalPlan::Extension { .. } + | LogicalPlan::Sort { .. } + | LogicalPlan::Explain { .. } + | LogicalPlan::Limit { .. } + | LogicalPlan::Union { .. } + | LogicalPlan::Join { .. } + | LogicalPlan::CrossJoin { .. } + | LogicalPlan::CreateMemoryTable { .. } + | LogicalPlan::DropTable { .. } + | LogicalPlan::Values { .. } + | LogicalPlan::Analyze { .. } => { + let inputs = plan.inputs(); + let new_inputs = inputs + .iter() + .map(|plan| self.optimize(plan, execution_props)) + .collect::>>()?; + + let expr = plan + .expressions() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + + utils::from_plan(plan, &expr, &new_inputs) + } + LogicalPlan::EmptyRelation { .. 
} => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "type_conversion" + } +} + +struct TypeRewriter<'a> { + /// input schemas + schemas: Vec<&'a DFSchemaRef>, +} + +impl<'a> TypeRewriter<'a> { + fn column_data_type(&self, expr: &Expr) -> Option { + if let Expr::Column(_) = expr { + for schema in &self.schemas { + if let Ok(v) = expr.get_type(schema) { + return Some(v); + } + } + } + + None + } + + fn convert_type<'b>(&self, mut left: &'b Expr, mut right: &'b Expr) -> Result<(Expr, Expr)> { + let left_type = self.column_data_type(left); + let right_type = self.column_data_type(right); + + let mut reverse = false; + let left_type = match (&left_type, &right_type) { + (Some(v), None) => v, + (None, Some(v)) => { + reverse = true; + mem::swap(&mut left, &mut right); + v + } + _ => return Ok((left.clone(), right.clone())), + }; + + match (left, right) { + (Expr::Column(col), Expr::Literal(value)) => { + let casted_right = Self::cast_scalar_value(value, left_type)?; + debug!( + "TypeRewriter convert type, origin_left:{:?}, type:{}, right:{:?}, casted_right:{:?}", + col, left_type, value, casted_right + ); + if casted_right.is_null() { + return Err(DataFusionError::Plan(format!( + "column:{:?} value:{:?} is invalid", + col, value + ))); + } + if reverse { + Ok((Expr::Literal(casted_right), left.clone())) + } else { + Ok((left.clone(), Expr::Literal(casted_right))) + } + } + _ => Ok((left.clone(), right.clone())), + } + } + + fn cast_scalar_value(value: &ScalarValue, data_type: &DataType) -> Result { + if let DataType::Timestamp(_, _) = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return string_to_timestamp_ms(v); + } + } + + if let DataType::Boolean = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return match v.to_lowercase().as_str() { + "true" => Ok(ScalarValue::Boolean(Some(true))), + "false" => Ok(ScalarValue::Boolean(Some(false))), + _ => Ok(ScalarValue::Boolean(None)), + }; + } + } + + let array = value.to_array(); + 
ScalarValue::try_from_array( + &compute::cast(&array, data_type).map_err(DataFusionError::ArrowError)?, + // index: Converts a value in `array` at `index` into a ScalarValue + 0, + ) + } +} + +impl<'a> ExprRewriter for TypeRewriter<'a> { + fn mutate(&mut self, expr: Expr) -> Result { + let new_expr = match expr { + Expr::BinaryExpr { left, op, right } => match op { + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq => { + let (left, right) = self.convert_type(&left, &right)?; + Expr::BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + } + } + _ => Expr::BinaryExpr { left, op, right }, + }, + Expr::Between { + expr, + negated, + low, + high, + } => { + let (expr, low) = self.convert_type(&expr, &low)?; + let (expr, high) = self.convert_type(&expr, &high)?; + Expr::Between { + expr: Box::new(expr), + negated, + low: Box::new(low), + high: Box::new(high), + } + } + Expr::InList { + expr, + list, + negated, + } => { + let mut list_expr = Vec::with_capacity(list.len()); + for e in list { + let (_, expr_conversion) = self.convert_type(&expr, &e)?; + list_expr.push(expr_conversion); + } + Expr::InList { + expr, + list: list_expr, + negated, + } + } + Expr::Literal(value) => match value { + ScalarValue::TimestampSecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Second, i) + } + ScalarValue::TimestampMicrosecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Microsecond, i) + } + ScalarValue::TimestampNanosecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Nanosecond, i) + } + _ => Expr::Literal(value), + }, + expr => { + // no rewrite possible + expr + } + }; + Ok(new_expr) + } +} + +fn string_to_timestamp_ms(string: &str) -> Result { + Ok(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(string) + .map(|t| t / 1_000_000) + .map_err(DataFusionError::from)?, + ), + None, + )) +} + +#[allow(dead_code)] +enum TimestampType 
{ + Second, + Millisecond, + Microsecond, + Nanosecond, +} + +fn timestamp_to_timestamp_ms_expr(typ: TimestampType, timestamp: i64) -> Expr { + let timestamp = match typ { + TimestampType::Second => timestamp * 1_000, + TimestampType::Millisecond => timestamp, + TimestampType::Microsecond => timestamp / 1_000, + TimestampType::Nanosecond => timestamp / 1_000 / 1_000, + }; + + Expr::Literal(ScalarValue::TimestampMillisecond(Some(timestamp), None)) +} + +#[cfg(test)] +mod tests { + use arrow_deps::{ + arrow::datatypes::TimeUnit, + datafusion::{ + logical_plan::{DFField, DFSchema}, + prelude::col, + }, + }; + + use super::*; + + fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::new(vec![ + DFField::new(None, "c1", DataType::Utf8, true), + DFField::new(None, "c2", DataType::Int64, true), + DFField::new(None, "c3", DataType::Float64, true), + DFField::new(None, "c4", DataType::Float32, true), + DFField::new(None, "c5", DataType::Boolean, true), + DFField::new( + None, + "c6", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + ]) + .unwrap(), + ) + } + + #[test] + fn test_type_conversion_int64() { + let int_value = 100; + let int_str = int_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Int64 c2 > "100" success + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(int_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c2").gt(Expr::Literal(ScalarValue::Int64(Some(int_value)),)) + ); + + // Int64 "100" > c2 success + let exp = Expr::Literal(ScalarValue::Utf8(Some(int_str))).gt(col("c2")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::Int64(Some(int_value))).gt(col("c2")) + ); + + // Int64 c2 > "100ss" fail + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + 
assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_float() { + let double_value = 100.1; + let double_str = double_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Float64 c3 > "100" success + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c3").gt(Expr::Literal(ScalarValue::Float64(Some(double_value)),)) + ); + + // Float64 c3 > "100ss" fail + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str.clone())))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Float32 c4 > "100" success + let exp = col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c4").gt(Expr::Literal(ScalarValue::Float32(Some( + double_value as f32 + )),)) + ); + + // Float32 c4 > "100ss" fail + let exp = col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_boolean() { + let bool_value = true; + let bool_str = bool_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Boolean c5 > "100ss" fail + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Boolean c5 > "true" success + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(bool_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + + // Boolean c5 > true success + let exp = 
col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + } + + #[test] + fn test_type_conversion_timestamp() { + let date_string = "2021-09-07 16:00:00".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Timestamp c6 > "2021-09-07 16:00:00" + let exp = col("c6").gt(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + ); + + // "2021-09-07 16:00:00" > Timestamp c6 + let exp = Expr::Literal(ScalarValue::Utf8(Some(date_string.clone()))).gt(col("c6")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),) + .gt(col("c6")) + ); + + // Timestamp c6 > 1642141472 + let timestamp_int = 1642141472; + let exp = col("c6").gt(Expr::Literal(ScalarValue::TimestampSecond( + Some(timestamp_int), + None, + ))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some(timestamp_int * 1000), + None + ))) + ); + + // Timestamp c6 between "2021-09-07 16:00:00" and "2021-09-07 17:00:00" + let date_string2 = "2021-09-07 17:00:00".to_string(); + let exp = Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))), + high: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string2.clone())))), + }; + let rewrite_exp = 
exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)), + high: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string2) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + } + ); + } +} diff --git a/query_engine/src/physical_optimizer/coalesce_batches.rs b/query_engine/src/physical_optimizer/coalesce_batches.rs new file mode 100644 index 0000000000..36645aa633 --- /dev/null +++ b/query_engine/src/physical_optimizer/coalesce_batches.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{coalesce_batches::CoalesceBatches, optimizer::PhysicalOptimizerRule}, + physical_plan::{limit::GlobalLimitExec, ExecutionPlan}, + prelude::ExecutionConfig, +}; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct CoalesceBatchesAdapter { + original_rule: CoalesceBatches, +} + +impl Default for CoalesceBatchesAdapter { + fn default() -> Self { + Self { + original_rule: CoalesceBatches::new(), + } + } +} + +impl Adapter for CoalesceBatchesAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == CoalesceBatches::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl CoalesceBatchesAdapter { + /// Detect the plan contains any limit plan with a small limit(smaller than + /// `batch_size`). 
+ fn detect_small_limit_plan(plan: &dyn ExecutionPlan, batch_size: usize) -> bool { + if let Some(limit_plan) = plan.as_any().downcast_ref::() { + return limit_plan.limit() < batch_size; + } + + for child_plan in plan.children() { + if Self::detect_small_limit_plan(&*child_plan, batch_size) { + return true; + } + } + + // No small limit plan is found. + false + } +} + +impl PhysicalOptimizerRule for CoalesceBatchesAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + if Self::detect_small_limit_plan(&*plan, config.runtime.batch_size) { + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom_coalesce_batches" + } +} diff --git a/query_engine/src/physical_optimizer/mod.rs b/query_engine/src/physical_optimizer/mod.rs new file mode 100644 index 0000000000..98571d2d6f --- /dev/null +++ b/query_engine/src/physical_optimizer/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Physical query optimizer + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::ContextRef, + physical_optimizer::{ + coalesce_batches::CoalesceBatchesAdapter, repartition::RepartitionAdapter, + }, + physical_plan::{DataFusionPhysicalPlan, PhysicalPlanPtr}, +}; + +pub mod coalesce_batches; +pub mod repartition; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize physical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Physical query optimizer that converts a logical plan to a +/// physical plan suitable for execution +#[async_trait] +pub trait PhysicalOptimizer { + /// Create a physical plan from a logical plan + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result; +} + +pub struct PhysicalOptimizerImpl { + ctx: ContextRef, +} + +impl PhysicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +#[async_trait] +impl PhysicalOptimizer for PhysicalOptimizerImpl { + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result { + let exec_ctx = self.ctx.df_exec_ctx(); + let exec_plan = exec_ctx + .create_physical_plan(&logical_plan.df_plan) + .await + .context(DataFusionOptimize)?; + let physical_plan = DataFusionPhysicalPlan::with_plan(exec_ctx.clone(), exec_plan); + + Ok(Box::new(physical_plan)) + } +} + +pub type OptimizeRuleRef = Arc; + +/// The default optimize rules of the datafusion is not all suitable for our +/// cases so the adapters may change the default rules(normally just decide +/// whether to apply the rule according to the specific plan). +pub trait Adapter { + /// May change the original rule into the custom one. + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef; +} + +pub fn may_adapt_optimize_rule( + original_rule: Arc, +) -> Arc { + CoalesceBatchesAdapter::may_adapt(RepartitionAdapter::may_adapt(original_rule)) +} diff --git a/query_engine/src/physical_optimizer/repartition.rs b/query_engine/src/physical_optimizer/repartition.rs new file mode 100644 index 0000000000..e45d2c939b --- /dev/null +++ b/query_engine/src/physical_optimizer/repartition.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Adapter for the original datafusion repartiton optimization rule. 
+ +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{optimizer::PhysicalOptimizerRule, repartition::Repartition}, + physical_plan::ExecutionPlan, + prelude::ExecutionConfig, +}; +use log::debug; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct RepartitionAdapter { + original_rule: Repartition, +} + +impl Default for RepartitionAdapter { + fn default() -> Self { + Self { + original_rule: Repartition::new(), + } + } +} + +impl Adapter for RepartitionAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == Repartition::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl PhysicalOptimizerRule for RepartitionAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + // the underlying plan maybe requires the order of the output. + if plan.output_partitioning().partition_count() == 1 { + debug!( + "RepartitionAdapter avoid repartion optimization for plan:{:?}", + plan + ); + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom-repartition" + } +} diff --git a/query_engine/src/physical_plan.rs b/query_engine/src/physical_plan.rs new file mode 100644 index 0000000000..28e344b839 --- /dev/null +++ b/query_engine/src/physical_plan.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Physical execution plan + +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + physical_plan::{ + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, + ExecutionPlan, + }, + prelude::ExecutionContext, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::stream::{FromDfStream, SendableRecordBatchStream}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to execute plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DataFusionExec { + partition_count: usize, + source: DataFusionError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert datafusion stream, err:{}", source))] + ConvertStream { source: table_engine::stream::Error }, +} + +define_result!(Error); + +#[async_trait] +pub trait PhysicalPlan: std::fmt::Debug { + /// execute this plan and returns the result + async fn execute(&self) -> Result; + + /// Convert internal metrics to string. + fn metrics_to_string(&self) -> String; +} + +pub type PhysicalPlanPtr = Box; + +pub struct DataFusionPhysicalPlan { + ctx: ExecutionContext, + plan: Arc, +} + +impl DataFusionPhysicalPlan { + pub fn with_plan(ctx: ExecutionContext, plan: Arc) -> Self { + Self { ctx, plan } + } +} + +impl Debug for DataFusionPhysicalPlan { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DataFusionPhysicalPlan") + .field("plan", &self.plan) + .finish() + } +} + +#[async_trait] +impl PhysicalPlan for DataFusionPhysicalPlan { + async fn execute(&self) -> Result { + let runtime = self.ctx.state.lock().unwrap().runtime_env.clone(); + let partition_count = self.plan.output_partitioning().partition_count(); + let df_stream = if partition_count <= 1 { + self.plan + .execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? 
+ } else { + // merge into a single partition + let plan = CoalescePartitionsExec::new(self.plan.clone()); + // MergeExec must produce a single partition + assert_eq!(1, plan.output_partitioning().partition_count()); + plan.execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? + }; + + let stream = FromDfStream::new(df_stream).context(ConvertStream)?; + + Ok(Box::pin(stream)) + } + + fn metrics_to_string(&self) -> String { + DisplayableExecutionPlan::with_metrics(&*self.plan) + .indent() + .to_string() + } +} diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000000..58d0130e05 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +nightly-2022-01-06 diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..61594ccda0 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md + +# Break comments to fit on the line +wrap_comments = true +# Merge multiple imports into a single nested import. +imports_granularity = "Crate" +# Format code snippet included in doc comments. +format_code_in_doc_comments = true +# Reorder impl items. type and const are put first, then macros and methods. 
+reorder_impl_items = true +# Discard existing import groups, and create three groups for std, external crates, crates +group_imports = "StdExternalCrate" + +license_template_path = "etc/license.template" \ No newline at end of file diff --git a/server/Cargo.toml b/server/Cargo.toml new file mode 100644 index 0000000000..5f7b349704 --- /dev/null +++ b/server/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "server" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +avro-rs = "0.13" +catalog = { path = "../catalog" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +grpcio = { path = "../grpcio" } +http = "0.2" +interpreters = { path = "../interpreters" } +lazy_static = "1.4.0" +log = "0.4" +logger = { path = "../components/logger" } +meta_client = { path = "../meta_client" } +profile = { path = "../components/profile" } +protobuf = "2.20" +query_engine = { path = "../query_engine" } +prometheus = "0.12" +prometheus-static-metric = "0.5" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["full"] } +twox-hash = "1.6" +udf = { path = "../udf" } +warp = "0.3" + +[dev-dependencies] +sql = { path = "../sql" , features=["test"]} diff --git a/server/src/avro_util.rs b/server/src/avro_util.rs new file mode 100644 index 0000000000..69ab049ca3 --- /dev/null +++ b/server/src/avro_util.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Avro utility + +use std::collections::HashMap; + +use avro_rs::{ + schema::{Name, RecordField, RecordFieldOrder}, + types::{Record, Value}, +}; +use common_types::{ + bytes::ByteVec, + column::ColumnBlock, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + schema::RecordSchema, +}; +use common_util::define_result; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to write avro record, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + WriteAvroRecord { + source: avro_rs::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create [avro_rs::Schema] with given `name` from [RecordSchema] +pub fn to_avro_schema(name: &str, schema: &RecordSchema) -> avro_rs::Schema { + let columns = schema.columns(); + let mut lookup = HashMap::with_capacity(columns.len()); + let mut avro_fields = Vec::with_capacity(columns.len()); + + for (pos, column) in columns.iter().enumerate() { + // Create avro record field + let default = if column.is_nullable { + Some(serde_json::value::Value::Null) + } else { + None + }; + + let field_schema = if column.is_nullable { + // We want to declare a schema which may be either a null or non-null value, + // for example: ["null", "string"]. + // + // However, `avro_rs` does not provide an accessible API to build a `Union`. + // We can't find a better way to do this than using JSON. 
+ let field_schema_str = format!( + r#"["null", {}]"#, + data_type_to_schema(&column.data_type).canonical_form() + ); + avro_rs::Schema::parse_str(&field_schema_str).unwrap() + } else { + data_type_to_schema(&column.data_type) + }; + + let record_field = RecordField { + name: column.name.clone(), + doc: None, + default, + schema: field_schema, + order: RecordFieldOrder::Ignore, + position: pos, + }; + + avro_fields.push(record_field); + lookup.insert(column.name.clone(), pos); + } + + avro_rs::Schema::Record { + name: Name::new(name), + doc: None, + fields: avro_fields, + lookup, + } +} + +fn data_type_to_schema(data_type: &DatumKind) -> avro_rs::Schema { + match data_type { + DatumKind::Null => avro_rs::Schema::Null, + DatumKind::Timestamp => avro_rs::Schema::TimestampMillis, + DatumKind::Double => avro_rs::Schema::Double, + DatumKind::Float => avro_rs::Schema::Float, + DatumKind::Varbinary => avro_rs::Schema::Bytes, + DatumKind::String => avro_rs::Schema::String, + DatumKind::UInt32 | DatumKind::Int64 | DatumKind::UInt64 => avro_rs::Schema::Long, + DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 => avro_rs::Schema::Int, + DatumKind::Boolean => avro_rs::Schema::Boolean, + } +} + +/// Convert record batch to avro format +pub fn record_batch_to_avro( + record_batch: &RecordBatch, + schema: &avro_rs::Schema, + rows: &mut Vec, +) -> Result<()> { + let record_batch_schema = record_batch.schema(); + assert_eq!( + record_batch_schema.num_columns(), + record_batch.num_columns() + ); + + rows.reserve(record_batch.num_rows()); + + let column_schemas = record_batch_schema.columns(); + for row_idx in 0..record_batch.num_rows() { + let mut record = Record::new(schema).unwrap(); + for (col_idx, column_schema) in column_schemas.iter().enumerate() { + let column = record_batch.column(col_idx); + let value = column_to_value(column, row_idx, column_schema.is_nullable); + + record.put(&column_schema.name, value); + } + + let 
row_bytes = avro_rs::to_avro_datum(schema, record).context(WriteAvroRecord)?; + + rows.push(row_bytes); + } + + Ok(()) +} + +/// Panic if row_idx is out of bound. +fn column_to_value(array: &ColumnBlock, row_idx: usize, is_nullable: bool) -> Value { + let datum = array.datum(row_idx); + match datum { + Datum::Null => may_union(Value::Null, is_nullable), + Datum::Timestamp(v) => may_union(Value::TimestampMillis(v.as_i64()), is_nullable), + Datum::Double(v) => may_union(Value::Double(v), is_nullable), + Datum::Float(v) => may_union(Value::Float(v), is_nullable), + Datum::Varbinary(v) => may_union(Value::Bytes(v.to_vec()), is_nullable), + Datum::String(v) => may_union(Value::String(v.to_string()), is_nullable), + // TODO(yingwen): Should we return error if overflow? Avro does not support uint64. + Datum::UInt64(v) => may_union(Value::Long(v as i64), is_nullable), + Datum::Int64(v) => may_union(Value::Long(v), is_nullable), + Datum::UInt32(v) => may_union(Value::Long(i64::from(v)), is_nullable), + Datum::UInt16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::UInt8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int32(v) => may_union(Value::Int(v), is_nullable), + Datum::Int16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Boolean(v) => may_union(Value::Boolean(v), is_nullable), + } +} + +#[inline] +fn may_union(val: Value, is_nullable: bool) -> Value { + if is_nullable { + Value::Union(Box::new(val)) + } else { + val + } +} diff --git a/server/src/config.rs b/server/src/config.rs new file mode 100644 index 0000000000..3a62758a0d --- /dev/null +++ b/server/src/config.rs @@ -0,0 +1,88 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server configs + +use analytic_engine; +use meta_client::MetaClientConfig; +use serde_derive::Deserialize; + +use crate::router::RuleList; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct RuntimeConfig { + // Runtime for reading data + pub read_thread_num: usize, + // Runtime for writing data + pub write_thread_num: usize, + // Runtime for background tasks + pub background_thread_num: usize, +} + +// TODO(yingwen): Split config into several sub configs. +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct Config { + /// The address to listen. + pub bind_addr: String, + pub http_port: u16, + pub grpc_port: u16, + pub grpc_server_cq_count: usize, + + // Engine related configs: + pub runtime: RuntimeConfig, + + // Log related configs: + pub log_level: String, + pub enable_async_log: bool, + pub async_log_channel_len: i32, + + // Tracing related configs: + pub tracing_log_dir: String, + pub tracing_log_name: String, + pub tracing_level: String, + + // Meta client related configs: + pub meta_client: MetaClientConfig, + // Config of router. 
+ pub route_rules: RuleList, + + // Analytic engine configs: + pub analytic: analytic_engine::Config, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self { + read_thread_num: 8, + write_thread_num: 8, + background_thread_num: 8, + } + } +} + +impl Default for Config { + fn default() -> Self { + let grpc_port = 8831; + Self { + bind_addr: String::from("127.0.0.1"), + http_port: 5000, + grpc_port, + grpc_server_cq_count: 20, + runtime: RuntimeConfig::default(), + log_level: "debug".to_string(), + enable_async_log: true, + async_log_channel_len: 102400, + tracing_log_dir: String::from("/tmp/ceresdbx"), + tracing_log_name: String::from("tracing"), + tracing_level: String::from("info"), + meta_client: MetaClientConfig { + node: String::from("127.0.0.1"), + port: grpc_port, + ..Default::default() + }, + route_rules: RuleList::default(), + analytic: analytic_engine::Config::default(), + } + } +} diff --git a/server/src/consts.rs b/server/src/consts.rs new file mode 100644 index 0000000000..bbaa5c1c98 --- /dev/null +++ b/server/src/consts.rs @@ -0,0 +1,8 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common constants + +/// Header of catalog name +pub const CATALOG_HEADER: &str = "x-ceresdbx-catalog"; +/// Header of tenant name +pub const TENANT_HEADER: &str = "x-ceresdbx-access-tenant"; diff --git a/server/src/context.rs b/server/src/context.rs new file mode 100644 index 0000000000..119c3ec984 --- /dev/null +++ b/server/src/context.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server context + +use std::sync::Arc; + +use common_util::runtime::Runtime; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing catalog.\nBacktrace:\n{}", backtrace))] + MissingCatalog { backtrace: Backtrace }, + + #[snafu(display("Missing tenant.\nBacktrace:\n{}", backtrace))] + MissingTenant { backtrace: Backtrace }, + + #[snafu(display("Missing runtime.\nBacktrace:\n{}", backtrace))] + MissingRuntime { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Server request context +/// +/// Context for request, may contains +/// 1. Request context and options +/// 2. Info from http headers +pub struct RequestContext { + /// Catalog of the request + pub catalog: String, + /// Tenant of request + pub tenant: String, + /// Runtime of this request + pub runtime: Arc, +} + +impl RequestContext { + pub fn builder() -> Builder { + Builder::default() + } +} + +#[derive(Default)] +pub struct Builder { + catalog: String, + tenant: String, + runtime: Option>, +} + +impl Builder { + pub fn catalog(mut self, catalog: String) -> Self { + self.catalog = catalog; + self + } + + pub fn tenant(mut self, tenant: String) -> Self { + self.tenant = tenant; + self + } + + pub fn runtime(mut self, runtime: Arc) -> Self { + self.runtime = Some(runtime); + self + } + + pub fn build(self) -> Result { + ensure!(!self.catalog.is_empty(), MissingCatalog); + // We use tenant as schema, so we use default schema if tenant is not specific + ensure!(!self.tenant.is_empty(), MissingTenant); + + let runtime = self.runtime.context(MissingRuntime)?; + + Ok(RequestContext { + catalog: self.catalog, + tenant: self.tenant, + runtime, + }) + } +} diff --git a/server/src/error.rs b/server/src/error.rs new file mode 100644 index 0000000000..47006fde7e --- /dev/null +++ b/server/src/error.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server error + +use common_util::define_result; +use snafu::Snafu; + +/// Server status code +#[derive(Debug, Clone, Copy)] +pub enum StatusCode { + Ok = 200, + InvalidArgument = 400, + NotFound = 404, + TooManyRequests = 429, + InternalError = 500, +} + +impl StatusCode { + pub fn as_u32(&self) -> u32 { + *self as u32 + } +} + +define_result!(ServerError); + +#[derive(Snafu, Debug)] +#[snafu(visibility(pub(crate)))] +pub enum ServerError { + #[snafu(display("Rpc error, code:{}, message:{}", code.as_u32(), msg))] + ErrNoCause { code: StatusCode, msg: String }, + + #[snafu(display("Rpc error, code:{}, message:{}, cause:{}", code.as_u32(), msg, source))] + ErrWithCause { + code: StatusCode, + msg: String, + source: Box, + }, +} + +impl ServerError { + pub fn code(&self) -> StatusCode { + match *self { + ServerError::ErrNoCause { code, .. } => code, + ServerError::ErrWithCause { code, .. } => code, + } + } + + /// Get the error message returned to the user. + pub fn error_message(&self) -> String { + match self { + ServerError::ErrNoCause { msg, .. } => msg.clone(), + + ServerError::ErrWithCause { msg, source, .. } => { + let err_string = source.to_string(); + let first_line = first_line_in_error(&err_string); + format!("{}. Caused by: {}", msg, first_line) + } + } + } +} + +/// Returns first line in error message, now we use this hack to exclude +/// backtrace from error message that returned to user. +// TODO(yingwen): Consider a better way to get the error message. +pub(crate) fn first_line_in_error(err_string: &str) -> &str { + err_string.split('\n').next().unwrap_or(err_string) +} diff --git a/server/src/grpc/metrics.rs b/server/src/grpc/metrics.rs new file mode 100644 index 0000000000..aec9f7acdc --- /dev/null +++ b/server/src/grpc/metrics.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Grpc server metrics + +use lazy_static::lazy_static; +use prometheus::{exponential_buckets, register_histogram_vec, HistogramVec}; +use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; + +// Register auto flush static metrics. +make_auto_flush_static_metric! { + pub label_enum GrpcTypeKind { + handle_route, + handle_write, + handle_query, + handle_stream_write, + handle_stream_query, + } + + pub struct GrpcHandlerDurationHistogramVec: LocalHistogram { + "type" => GrpcTypeKind, + } +} + +// Register global metrics. +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL: HistogramVec = + register_histogram_vec!( + "grpc_handler_duration", + "Bucketed histogram of grpc server handler", + &["type"], + exponential_buckets(0.0005, 2.0, 20).unwrap() + ) + .unwrap(); +} + +// Register thread local metrics with default flush interval (1s). +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC: GrpcHandlerDurationHistogramVec = auto_flush_from!( + GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL, + GrpcHandlerDurationHistogramVec + ); +} diff --git a/server/src/grpc/mod.rs b/server/src/grpc/mod.rs new file mode 100644 index 0000000000..521400ab72 --- /dev/null +++ b/server/src/grpc/mod.rs @@ -0,0 +1,1034 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Grpc services + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Instant, +}; + +use async_trait::async_trait; +use catalog::{consts as catalogConst, manager::Manager as CatalogManager}; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{PrometheusQueryRequest, PrometheusQueryResponse}, + storage::{ + QueryRequest, QueryResponse, RouteRequest, RouteResponse, Value_oneof_value, WriteMetric, + WriteRequest, WriteResponse, + }, + storage_grpc::{self, StorageService}, +}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::DatumKind, + schema::{Builder as SchemaBuilder, Error as SchemaError, Schema, TSID_COLUMN}, +}; +use common_util::{define_result, time::InstantExt}; +use futures::{stream::StreamExt, FutureExt, SinkExt, TryFutureExt}; +use grpcio::{ + ClientStreamingSink, Environment, Metadata, RequestStream, RpcContext, Server, ServerBuilder, + ServerStreamingSink, UnarySink, WriteFlags, +}; +use log::{error, info}; +use meta_client::{ + ClusterViewRef, FailGetCatalog, FailOnChangeView, MetaClient, MetaClientConfig, MetaWatcher, + SchemaConfig, +}; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot; + +use crate::{ + consts, + error::{ErrNoCause, ErrWithCause, Result as ServerResult, ServerError, StatusCode}, + grpc::metrics::GRPC_HANDLER_DURATION_HISTOGRAM_VEC, + instance::InstanceRef, + router::{Router, RouterRef, RuleBasedRouter, RuleList}, +}; + +mod metrics; +mod prom_query; +mod query; +mod route; +mod write; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to build rpc server, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRpcServer { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build meta client, err:{}", source))] + BuildMetaClient { source: meta_client::Error }, + + 
#[snafu(display("Failed to start meta client, err:{}", source))] + StartMetaClient { source: meta_client::Error }, + + #[snafu(display("Missing meta client config.\nBacktrace:\n{}", backtrace))] + MissingMetaClientConfig { backtrace: Backtrace }, + + #[snafu(display("Missing grpc environment.\nBacktrace:\n{}", backtrace))] + MissingEnv { backtrace: Backtrace }, + + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing instance.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display("Catalog name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseCatalogName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Schema name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseSchemaName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to build table schema for metric: {}, err:{}", metric, source))] + BuildTableSchema { metric: String, source: SchemaError }, + + #[snafu(display( + "Fail to build column schema from column: {}, err:{}", + column_name, + source + ))] + BuildColumnSchema { + column_name: String, + source: column_schema::Error, + }, + #[snafu(display("Invalid column: {} schema, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + #[snafu(display("Invalid argument: {}", msg))] + InvalidArgument { msg: String }, + + #[snafu(display( + "Failed to send response to grpc sink, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + GrpcSink { + source: grpcio::Error, + backtrace: Backtrace, + }, +} + +const STREAM_QUERY_CHANNEL_LEN: usize = 20; + +define_result!(Error); + +/// Rpc request header +#[derive(Debug, Default)] +pub struct RequestHeader { + metas: HashMap>, +} + +impl From<&Metadata> for RequestHeader { + fn from(meta: &Metadata) -> Self { + let metas = meta + .iter() + .map(|(k, v)| 
(k.to_string(), v.to_vec())) + .collect(); + + Self { metas } + } +} + +impl RequestHeader { + pub fn get(&self, key: &str) -> Option<&[u8]> { + self.metas.get(key).map(|v| v.as_slice()) + } +} + +pub struct HandlerContext<'a, C, Q> { + #[allow(dead_code)] + header: RequestHeader, + router: RouterRef, + instance: InstanceRef, + catalog: String, + schema: String, + schema_config: Option<&'a SchemaConfig>, +} + +impl<'a, C: CatalogManager, Q> HandlerContext<'a, C, Q> { + fn new( + header: RequestHeader, + router: Arc, + instance: InstanceRef, + cluster_view: &'a ClusterViewRef, + ) -> Result { + let default_catalog = instance.catalog_manager.default_catalog_name(); + let default_schema = instance.catalog_manager.default_schema_name(); + + let catalog = header + .get(consts::CATALOG_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseCatalogName)? + .unwrap_or_else(|| default_catalog.to_string()); + + let schema = header + .get(consts::TENANT_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseSchemaName)? + .unwrap_or_else(|| default_schema.to_string()); + + let schema_config = cluster_view.schema_configs.get(&schema); + + Ok(Self { + header, + router, + instance, + catalog, + schema, + schema_config, + }) + } + + #[inline] + fn catalog(&self) -> &str { + &self.catalog + } + + #[inline] + fn tenant(&self) -> &str { + &self.schema + } +} + +/// Rpc services manages all grpc services of the server. 
+pub struct RpcServices { + /// The grpc server + rpc_server: Server, + /// Meta client + meta_client: Arc, +} + +impl RpcServices { + /// Start the rpc services + pub async fn start(&mut self) -> Result<()> { + self.meta_client.start().await.context(StartMetaClient)?; + + self.rpc_server.start(); + for (host, port) in self.rpc_server.bind_addrs() { + info!("Grpc server listening on {}:{}", host, port); + } + + Ok(()) + } + + pub fn shutdown(&mut self) { + self.rpc_server.shutdown(); + } +} + +pub struct Builder { + bind_addr: String, + port: u16, + meta_client_config: Option, + env: Option>, + runtimes: Option>, + instance: Option>, + route_rules: RuleList, +} + +impl Builder { + pub fn new() -> Self { + Self { + bind_addr: String::from("0.0.0.0"), + port: 38081, + meta_client_config: None, + env: None, + runtimes: None, + instance: None, + route_rules: RuleList::default(), + } + } + + pub fn bind_addr(mut self, addr: String) -> Self { + self.bind_addr = addr; + self + } + + pub fn port(mut self, port: u16) -> Self { + self.port = port; + self + } + + pub fn meta_client_config(mut self, config: MetaClientConfig) -> Self { + self.meta_client_config = Some(config); + self + } + + pub fn env(mut self, env: Arc) -> Self { + self.env = Some(env); + self + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } + + pub fn route_rules(mut self, route_rules: RuleList) -> Self { + self.route_rules = route_rules; + self + } +} + +impl Builder { + pub fn build(self) -> Result { + let meta_client_config = self.meta_client_config.context(MissingMetaClientConfig)?; + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + + let watcher = Box::new(SchemaWatcher { + catalog_manager: instance.catalog_manager.clone(), + }); + + let meta_client = 
meta_client::build_meta_client( + meta_client_config, + runtimes.bg_runtime.clone(), + Some(watcher), + ) + .context(BuildMetaClient)?; + let router = Arc::new(RuleBasedRouter::new(meta_client.clone(), self.route_rules)); + let storage_service = StorageServiceImpl { + router, + instance, + runtimes, + meta_client: meta_client.clone(), + }; + let rpc_service = storage_grpc::create_storage_service(storage_service); + + let env = self.env.context(MissingEnv)?; + + let rpc_server = ServerBuilder::new(env) + .register_service(rpc_service) + .bind(self.bind_addr, self.port) + .build() + .context(BuildRpcServer)?; + + Ok(RpcServices { + rpc_server, + meta_client, + }) + } +} + +struct SchemaWatcher { + catalog_manager: C, +} + +#[async_trait] +impl MetaWatcher for SchemaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> meta_client::Result<()> { + for schema in view.schema_shards.keys() { + let default_catalog = catalogConst::DEFAULT_CATALOG; + if let Some(catalog) = self + .catalog_manager + .catalog_by_name(default_catalog) + .map_err(|e| Box::new(e) as _) + .context(FailGetCatalog { + catalog: default_catalog, + })? 
+ { + catalog + .create_schema(schema) + .await + .map_err(|e| Box::new(e) as _) + .context(FailOnChangeView { + schema, + catalog: default_catalog, + })?; + } + } + Ok(()) + } +} + +fn build_err_header(err: ServerError) -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(err.code().as_u32()); + header.set_error(err.error_message()); + + header +} + +fn build_ok_header() -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(StatusCode::Ok.as_u32()); + + header +} + +struct StorageServiceImpl { + router: Arc, + instance: InstanceRef, + runtimes: Arc, + meta_client: Arc, +} + +impl Clone for StorageServiceImpl { + fn clone(&self) -> Self { + Self { + router: self.router.clone(), + instance: self.instance.clone(), + runtimes: self.runtimes.clone(), + meta_client: self.meta_client.clone(), + } + } +} + +macro_rules! handle_request { + ($mod_name: ident, $handle_fn: ident, $req_ty: ident, $resp_ty: ident) => { + fn $mod_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { + let begin_instant = Instant::now(); + + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let (tx, rx) = oneshot::channel(); + + // The future spawned by tokio cannot be executed by other executor/runtime, so + + let runtime = match stringify!($mod_name) { + "query" => &self.runtimes.read_runtime, + "write" => &self.runtimes.write_runtime, + _ => &self.runtimes.bg_runtime, + }; + + let cluster_view = self.meta_client.get_cluster_view(); + // we need to pass the result via channel + runtime.spawn( + async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + $mod_name::$handle_fn(&handler_ctx, req).await.map_err(|e| { + error!( + "Failed to handle request, mod:{}, handler:{}, 
err:{}", + stringify!($mod_name), + stringify!($handle_fn), + e + ); + e + }) + } + .then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!( + "Failed to send handler result, mod:{}, handler:{}", + stringify!($mod_name), + stringify!($handle_fn), + ) + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = $resp_ty::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + let ret = sink.success(resp).await.context(GrpcSink); + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .$handle_fn + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + ret?; + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:{}, handler:{}, err:{:?}", + stringify!($mod_name), + stringify!($handle_fn), + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + }; +} + +impl StorageService + for StorageServiceImpl +{ + handle_request!(route, handle_route, RouteRequest, RouteResponse); + + handle_request!(write, handle_write, WriteRequest, WriteResponse); + + handle_request!(query, handle_query, QueryRequest, QueryResponse); + + handle_request!( + prom_query, + handle_query, + PrometheusQueryRequest, + PrometheusQueryResponse + ); + + fn stream_write( + &mut self, + ctx: RpcContext<'_>, + mut stream_req: RequestStream, + sink: ClientStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + + let (tx, rx) = oneshot::channel(); + self.runtimes.write_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) 
as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let mut total_success = 0; + let mut resp = WriteResponse::new(); + let mut has_err = false; + while let Some(req) = stream_req.next().await { + let write_result = write::handle_write( + &handler_ctx, + req.map_err(|e| Box::new(e) as _).context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to fetch request", + })?, + ) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_write, handler:handle_stream_write, err:{}", e); + e + }); + + match write_result { + Ok(write_resp) => total_success += write_resp.success, + Err(e) => { + resp.set_header(build_err_header(e)); + has_err = true; + break; + } + } + } + if !has_err { + resp.set_header(build_ok_header()); + resp.set_success(total_success as u32); + } + + ServerResult::Ok(resp) + }.then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!("Failed to send handler result, mod:stream_write, handler:handle_stream_write"); + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = WriteResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.success(resp).await.context(GrpcSink)?; + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_write + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_write, handler:handle_stream_write, err:{}", + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + + fn stream_query( + &mut self, + ctx: RpcContext<'_>, + req: QueryRequest, + mut sink: ServerStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + 
let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + let (tx, mut rx) = tokio::sync::mpsc::channel(STREAM_QUERY_CHANNEL_LEN); + self.runtimes.read_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let output = query::fetch_query_output(&handler_ctx, &req) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_query, handler:handle_stream_query, err:{}", e); + e + })?; + if let Some(batch) = query::get_record_batch(&output) { + for i in 0..batch.len() { + let resp = query::convert_records(&batch[i..i + 1]); + if tx.send(resp).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + break; + } + } + } else { + let mut resp = QueryResponse::new(); + resp.set_header(build_ok_header()); + + if tx.send(ServerResult::Ok(resp)).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + } + } + ServerResult::Ok(()) + }); + + let mut has_err = false; + let task = async move { + while let Some(result) = rx.recv().await { + let resp = match result { + Ok(resp) => resp, + Err(e) => { + has_err = true; + let mut resp = QueryResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.send((resp, WriteFlags::default())) + .await + .context(GrpcSink)?; + if has_err { + break; + } + } + sink.flush().await.context(GrpcSink)?; + sink.close().await.context(GrpcSink)?; + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_query + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_query, handler:handle_stream_query, err:{}", + e + ); + }) + .map(|_| ()); + + 
ctx.spawn(task); + } +} + +/// Create CreateTablePlan from a write metric. +// The caller must ENSURE that the HandlerContext's schema_config is not None. +pub fn write_metric_to_create_table_plan< + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +>( + ctx: &HandlerContext, + write_metric: &WriteMetric, +) -> Result { + let schema_config = ctx.schema_config.unwrap(); + Ok(CreateTablePlan { + engine: schema_config.default_engine_type.clone(), + if_not_exists: true, + table: write_metric.get_metric().to_string(), + table_schema: build_schema_from_metric(schema_config, write_metric)?, + options: HashMap::default(), + }) +} + +fn build_column_schema( + column_name: &str, + data_type: DatumKind, + is_tag: bool, +) -> Result { + let builder = column_schema::Builder::new(column_name.to_string(), data_type) + .is_nullable(true) + .is_tag(is_tag); + + builder.build().context(BuildColumnSchema { column_name }) +} + +fn build_schema_from_metric(schema_config: &SchemaConfig, metric: &WriteMetric) -> Result { + let field_names = metric.get_field_names(); + let tag_names = metric.get_tag_names(); + let table_name = metric.get_metric(); + + let mut schema_builder = + SchemaBuilder::with_capacity(field_names.len()).auto_increment_column_id(true); + + let write_entries = metric.get_entries(); + + ensure!( + !write_entries.is_empty(), + InvalidArgument { + msg: format!("Emtpy write entires to write table:{}", table_name,), + } + ); + + let mut name_column_map: BTreeMap<_, ColumnSchema> = BTreeMap::new(); + for write_entry in write_entries { + // parse tags + for tag in write_entry.get_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + InvalidArgument { + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + + let tag_value = tag + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: 
format!("Tag value is needed, tag_name:{} ", tag_name), + })?; + + let data_type = try_get_data_type_from_value(tag_value)?; + + if let Some(column_schema) = name_column_map.get(tag_name) { + ensure_data_type_compatible(table_name, tag_name, true, data_type, column_schema)?; + } + let column_schema = build_column_schema(tag_name, data_type, true)?; + name_column_map.insert(tag_name, column_schema); + } + + // parse fields + for field_group in write_entry.get_field_groups().iter() { + for field in field_group.get_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = &field_names[field.name_index as usize]; + let field_value = + field + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: format!( + "Field: {} value is needed, table:{}", + field_name, table_name + ), + })?; + + let data_type = try_get_data_type_from_value(field_value)?; + + if let Some(column_schema) = name_column_map.get(field_name) { + ensure_data_type_compatible( + table_name, + field_name, + false, + data_type, + column_schema, + )?; + } + + let column_schema = build_column_schema(field_name, data_type, false)?; + name_column_map.insert(field_name, column_schema); + } + } + } + } + + // Timestamp column will be the last column + let timestamp_column_schema = column_schema::Builder::new( + schema_config.default_timestamp_column_name.clone(), + DatumKind::Timestamp, + ) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + // Use (timestamp, tsid) as primary key. + let tsid_column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })? 
+ .add_key_column(tsid_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })?; + + for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .with_context(|| BuildTableSchema { metric: table_name })?; + } + + schema_builder.build().with_context(|| BuildTableSchema { + metric: metric.get_metric(), + }) +} + +fn ensure_data_type_compatible( + table_name: &str, + column_name: &str, + is_tag: bool, + data_type: DatumKind, + column_schema: &ColumnSchema, +) -> Result<()> { + ensure!( + column_schema.is_tag == is_tag, + InvalidArgument { + msg: format!( + "Duplicated column: {} in fields and tags for table: {}", + column_name, table_name, + ), + } + ); + ensure!( + column_schema.data_type == data_type, + InvalidArgument { + msg: format!( + "Column: {} in table: {} data type is not same, expected: {}, actual: {}", + column_name, table_name, column_schema.data_type, data_type, + ), + } + ); + Ok(()) +} + +fn try_get_data_type_from_value(value: &Value_oneof_value) -> Result { + match value { + Value_oneof_value::float64_value(_) => Ok(DatumKind::Double), + Value_oneof_value::string_value(_) => Ok(DatumKind::String), + Value_oneof_value::int64_value(_) => Ok(DatumKind::Int64), + Value_oneof_value::float32_value(_) => Ok(DatumKind::Float), + Value_oneof_value::int32_value(_) => Ok(DatumKind::Int32), + Value_oneof_value::int16_value(_) => Ok(DatumKind::Int16), + Value_oneof_value::int8_value(_) => Ok(DatumKind::Int8), + Value_oneof_value::bool_value(_) => Ok(DatumKind::Boolean), + Value_oneof_value::uint64_value(_) => Ok(DatumKind::UInt64), + Value_oneof_value::uint32_value(_) => Ok(DatumKind::UInt32), + Value_oneof_value::uint16_value(_) => Ok(DatumKind::UInt16), + Value_oneof_value::uint8_value(_) => Ok(DatumKind::UInt8), + Value_oneof_value::timestamp_value(_) => Ok(DatumKind::Timestamp), + Value_oneof_value::varbinary_value(_) => Ok(DatumKind::Varbinary), + } +} + +#[cfg(test)] +mod tests { + use 
ceresdbproto::storage::{Field, FieldGroup, Tag, Value, WriteEntry, WriteMetric}; + use common_types::datum::DatumKind; + use meta_client::SchemaConfig; + + use super::*; + + const TAG1: &str = "host"; + const TAG2: &str = "idc"; + const FIELD1: &str = "cpu"; + const FIELD2: &str = "memory"; + const FIELD3: &str = "log"; + const FIELD4: &str = "ping_ok"; + const METRIC: &str = "pod_system_metric"; + const TIMESTAMP_COLUMN: &str = "custom_timestamp"; + + fn generate_write_metric() -> WriteMetric { + let mut write_metric = WriteMetric::default(); + write_metric.set_metric(METRIC.to_string()); + + let tag_names = vec![TAG1.to_string(), TAG2.to_string()]; + let field_names = vec![ + FIELD1.to_string(), + FIELD2.to_string(), + FIELD3.to_string(), + FIELD4.to_string(), + ]; + + write_metric.set_field_names(field_names.into()); + write_metric.set_tag_names(tag_names.into()); + + //tags + let mut tag1 = Tag::new(); + tag1.set_name_index(0); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value("test.host".to_string()); + tag1.set_value(tag_val1); + let mut tag2 = Tag::new(); + tag2.set_name_index(1); + let mut tag_val2 = Value::new(); + tag_val2.set_string_value("test.idc".to_string()); + tag2.set_value(tag_val2); + let tags = vec![tag1, tag2]; + + //fields + let mut field1 = Field::new(); + field1.set_name_index(0); + let mut field_val1 = Value::new(); + field_val1.set_float64_value(100.0); + field1.set_value(field_val1); + let mut field2 = Field::new(); + field2.set_name_index(1); + let mut field_val2 = Value::new(); + field_val2.set_float64_value(1024.0); + field2.set_value(field_val2); + let mut field3 = Field::new(); + field3.set_name_index(2); + let mut field_val3 = Value::new(); + field_val3.set_string_value("test log".to_string()); + field3.set_value(field_val3); + let mut field4 = Field::new(); + field4.set_name_index(3); + let mut field_val4 = Value::new(); + field_val4.set_bool_value(true); + field4.set_value(field_val4); + + let mut field_group1 = 
FieldGroup::new(); + field_group1.set_timestamp(1000); + field_group1.set_fields(vec![field1.clone(), field4].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(2000); + field_group2.set_fields(vec![field1.clone(), field2.clone()].into()); + + let mut field_group3 = FieldGroup::new(); + field_group3.set_timestamp(3000); + field_group3.set_fields(vec![field3].into()); + + let mut write_entry = WriteEntry::new(); + write_entry.set_tags(tags.into()); + write_entry.set_field_groups(vec![field_group1, field_group2, field_group3].into()); + + write_metric.set_entries(vec![write_entry].into()); + + write_metric + } + + #[test] + fn test_build_schema_from_metric() { + let schema_config = SchemaConfig { + auto_create_tables: true, + default_timestamp_column_name: TIMESTAMP_COLUMN.to_string(), + ..SchemaConfig::default() + }; + let write_metric = generate_write_metric(); + + let schema = build_schema_from_metric(&schema_config, &write_metric); + assert!(schema.is_ok()); + + let schema = schema.unwrap(); + + assert_eq!(8, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(TIMESTAMP_COLUMN, schema.timestamp_name()); + let tsid = schema.tsid_column(); + assert!(tsid.is_some()); + + let key_columns = schema.key_columns(); + assert_eq!(2, key_columns.len()); + assert_eq!(TIMESTAMP_COLUMN, key_columns[0].name); + assert_eq!("tsid", key_columns[1].name); + + let columns = schema.normal_columns(); + assert_eq!(6, columns.len()); + + // sorted by column names because of btree + assert_eq!(FIELD1, columns[0].name); + assert!(!columns[0].is_tag); + assert_eq!(DatumKind::Double, columns[0].data_type); + assert_eq!(TAG1, columns[1].name); + assert!(columns[1].is_tag); + assert_eq!(DatumKind::String, columns[1].data_type); + assert_eq!(TAG2, columns[2].name); + assert!(columns[2].is_tag); + assert_eq!(DatumKind::String, columns[2].data_type); + assert_eq!(FIELD3, columns[3].name); + assert!(!columns[3].is_tag); + 
assert_eq!(DatumKind::String, columns[3].data_type); + assert_eq!(FIELD2, columns[4].name); + assert!(!columns[4].is_tag); + assert_eq!(DatumKind::Double, columns[4].data_type); + assert_eq!(FIELD4, columns[5].name); + assert!(!columns[5].is_tag); + assert_eq!(DatumKind::Boolean, columns[5].data_type); + + for column in columns { + assert!(column.is_nullable); + } + } +} diff --git a/server/src/grpc/prom_query.rs b/server/src/grpc/prom_query.rs new file mode 100644 index 0000000000..44916788ff --- /dev/null +++ b/server/src/grpc/prom_query.rs @@ -0,0 +1,467 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{Label, PrometheusQueryRequest, PrometheusQueryResponse, Sample, TimeSeries}, +}; +use common_types::{ + datum::DatumKind, + record_batch::RecordBatch, + request_id::RequestId, + schema::{RecordSchema, TSID_COLUMN}, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + promql::ColumnNames, + provider::CatalogMetaProvider, +}; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, ServerError, StatusCode}, + grpc::HandlerContext, +}; + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: PrometheusQueryRequest, +) -> Result +where + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +{ + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of 
other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + let expr = frontend + .parse_promql(&mut sql_ctx, req) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid request", + })?; + + let (plan, column_name) = frontend + .promql_expr_to_plan(&mut sql_ctx, expr) + .map_err(|e| { + // TODO(chenxiang): improve error match + let code = if e.to_string().contains("Table not found") { + StatusCode::NotFound + } else { + StatusCode::InternalError + }; + ServerError::ErrWithCause { + code, + msg: "Failed to create plan".to_string(), + source: Box::new(e), + } + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })?; + + let resp = convert_output(output, column_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert output", + })?; + + Ok(resp) +} + +fn 
convert_output( + output: Output, + column_name: Arc, +) -> Result { + match output { + Output::Records(records) => convert_records(records, column_name), + _ => unreachable!(), + } +} + +fn convert_records( + records: RecordBatchVec, + column_name: Arc, +) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut tsid_to_tags = HashMap::new(); + let mut tsid_to_samples = HashMap::new(); + + // TODO(chenxiang): benchmark iterator by columns + for record_batch in records { + let converter = RecordConverter::try_new(&column_name, record_batch.schema())?; + + for (tsid, samples) in converter.convert_to_samples(record_batch, &mut tsid_to_tags) { + tsid_to_samples + .entry(tsid) + .or_insert_with(Vec::new) + .extend(samples) + } + } + + let series_set = tsid_to_samples + .into_iter() + .map(|(tsid, samples)| { + let tags = tsid_to_tags + .get(&tsid) + .expect("ensured in convert_to_samples"); + let mut timeseries = TimeSeries::new(); + timeseries.set_labels( + tags.iter() + .map(|(k, v)| { + let mut label = Label::new(); + label.set_name(k.clone()); + label.set_value(v.clone()); + label + }) + .collect::>() + .into(), + ); + timeseries.set_samples(samples.into()); + timeseries + }) + .collect::>(); + + resp.set_timeseries(series_set.into()); + Ok(resp) +} + +fn empty_ok_resp() -> PrometheusQueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = PrometheusQueryResponse::new(); + resp.set_header(header); + + resp +} + +/// RecordConverter convert RecordBatch to time series format required by PromQL +struct RecordConverter { + tsid_idx: usize, + timestamp_idx: usize, + tags_idx: BTreeMap, // tag_key -> column_index + field_idx: usize, +} + +impl RecordConverter { + fn try_new(column_name: &ColumnNames, record_schema: &RecordSchema) -> Result { + let tsid_idx = record_schema + .index_of(TSID_COLUMN) + .with_context(|| ErrNoCause { + code: 
StatusCode::InvalidArgument, + msg: "Failed to find Tsid column".to_string(), + })?; + let timestamp_idx = record_schema + .index_of(&column_name.timestamp) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Failed to find Timestamp column".to_string(), + })?; + ensure!( + record_schema.column(timestamp_idx).data_type == DatumKind::Timestamp, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Timestamp column should be timestamp type" + } + ); + let field_idx = record_schema + .index_of(&column_name.field) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Failed to find {} column", column_name.field), + })?; + let field_type = record_schema.column(field_idx).data_type; + ensure!( + field_type.is_f64_castable(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Field type must be f64-compatibile type, current:{}", + field_type + ) + } + ); + + let tags_idx: BTreeMap<_, _> = column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + record_schema + .index_of(tag_key) + .map(|idx| (tag_key.to_string(), idx)) + }) + .collect(); + + Ok(Self { + tsid_idx, + timestamp_idx, + tags_idx, + field_idx, + }) + } + + fn convert_to_samples( + &self, + record_batch: RecordBatch, + tsid_to_tags: &mut HashMap>, + ) -> HashMap> { + let mut tsid_to_samples = HashMap::new(); + + let tsid_cols = record_batch.column(self.tsid_idx); + let timestamp_cols = record_batch.column(self.timestamp_idx); + let field_cols = record_batch.column(self.field_idx); + for row_idx in 0..record_batch.num_rows() { + let timestamp = timestamp_cols + .datum(row_idx) + .as_timestamp() + .expect("checked in try_new") + .as_i64(); + let field = field_cols + .datum(row_idx) + .as_f64() + .expect("checked in try_new"); + let tsid = tsid_cols + .datum(row_idx) + .as_u64() + .expect("checked in try_new"); + + tsid_to_tags.entry(tsid).or_insert_with(|| { + self.tags_idx + .iter() + .filter_map(|(tag_key, col_idx)| { + // 
TODO(chenxiang): avoid clone? + record_batch + .column(*col_idx) + .datum(row_idx) + .as_str() + .and_then(|tag_value| { + // filter empty tag value out, since Prometheus don't allow it. + if tag_value.is_empty() { + None + } else { + Some((tag_key.clone(), tag_value.to_string())) + } + }) + }) + .collect::>() + }); + + let samples = tsid_to_samples.entry(tsid).or_insert_with(Vec::new); + let mut sample = Sample::new(); + sample.set_value(field); + sample.set_timestamp(timestamp); + samples.push(sample); + } + + tsid_to_samples + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + column_schema, + datum::Datum, + row::Row, + schema, + string::StringBytes, + time::Timestamp, + }; + + use super::*; + + fn build_schema() -> schema::Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag1".to_string(), DatumKind::String) + .is_tag(true) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_column_block() -> Vec { + let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str| -> Row { + let datums = vec![ + Datum::Timestamp(Timestamp::new(ts)), + Datum::UInt64(tsid), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) + }; + + let rows = vec![ + build_row(1000001, 1, 10.0, "v5"), + build_row(1000002, 1, 11.0, "v5"), + build_row(1000000, 2, 10.0, "v4"), + build_row(1000000, 3, 10.0, "v3"), + ]; + + let mut builder = 
ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2); + for row in &rows { + builder.append(row[0].clone()).unwrap(); + } + let timestamp_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2); + for row in &rows { + builder.append(row[1].clone()).unwrap(); + } + let tsid_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2); + for row in &rows { + builder.append(row[2].clone()).unwrap(); + } + let field_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2); + for row in &rows { + builder.append(row[3].clone()).unwrap(); + } + let tag_block = builder.build(); + + vec![timestamp_block, tsid_block, field_block, tag_block] + } + + fn make_sample(timestamp: i64, value: f64) -> Sample { + let mut sample = Sample::new(); + sample.set_value(value); + sample.set_timestamp(timestamp); + sample + } + + fn make_tags(tags: Vec<(String, String)>) -> BTreeMap { + tags.into_iter().collect::>() + } + + #[test] + fn test_record_convert() { + let schema = build_schema(); + let record_schema = schema.to_record_schema(); + let column_blocks = build_column_block(); + let record_batch = RecordBatch::new(record_schema, column_blocks).unwrap(); + + let column_name = ColumnNames { + timestamp: "timestamp".to_string(), + tag_keys: vec!["tag1".to_string()], + field: "field1".to_string(), + }; + let converter = RecordConverter::try_new(&column_name, &schema.to_record_schema()).unwrap(); + let mut tsid_to_tags = HashMap::new(); + let tsid_to_samples = converter.convert_to_samples(record_batch, &mut tsid_to_tags); + + assert_eq!( + tsid_to_samples.get(&1).unwrap().clone(), + vec![make_sample(1000001, 10.0), make_sample(1000002, 11.0)] + ); + assert_eq!( + tsid_to_samples.get(&2).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + assert_eq!( + tsid_to_samples.get(&3).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + 
assert_eq!( + tsid_to_tags.get(&1).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v5".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&2).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v4".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&3).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v3".to_string())]) + ); + } +} diff --git a/server/src/grpc/query.rs b/server/src/grpc/query.rs new file mode 100644 index 0000000000..9c36a196c4 --- /dev/null +++ b/server/src/grpc/query.rs @@ -0,0 +1,224 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query handler + +use std::time::Instant; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + storage::{QueryRequest, QueryResponse, QueryResponse_SchemaType}, +}; +use common_types::{record_batch::RecordBatch, request_id::RequestId}; +use common_util::time::InstantExt; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::{ + avro_util, + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::HandlerContext, +}; + +/// Schema name of the record +const RECORD_NAME: &str = "Result"; + +fn empty_ok_resp() -> QueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = QueryResponse::new(); + resp.set_header(header); + + resp +} + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: QueryRequest, +) -> Result { + let output_result = fetch_query_output(ctx, &req).await?; + if let Some(output) = output_result { + convert_output(&output) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to 
convert output, query:{}", &req.ql), + }) + } else { + Ok(empty_ok_resp()) + } +} + +pub async fn fetch_query_output( + ctx: &HandlerContext<'_, C, Q>, + req: &QueryRequest, +) -> Result> { + let request_id = RequestId::next_id(); + let begin_instant = Instant::now(); + + info!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &req.ql) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Failed to parse sql", + })?; + + if stmts.is_empty() { + return Ok(None); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Only support execute one statement now, current num:{}, query:{}", + stmts.len(), + req.ql + ), + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + // TODO(yingwen): Check error, some error may indicate that the sql is invalid. 
Now we + // return internal server error in those cases + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to create plan, query:{}", req.ql), + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to execute interpreter, query:{}", req.ql), + })?; + + info!( + "Grpc handle query success, catalog:{}, tenant:{}, request_id:{}, cost:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + begin_instant.saturating_elapsed().as_millis(), + req, + ); + + Ok(Some(output)) +} + +fn convert_output(output: &Output) -> Result { + match output { + Output::Records(records) => convert_records(records), + _ => unreachable!(), + } +} + +pub fn get_record_batch(op: &Option) -> Option<&RecordBatchVec> { + if let Some(output) = op { + match output { + Output::Records(records) => Some(records), + _ => unreachable!(), + } + } else { + None + } +} + +/// REQUIRE: records have same schema +pub fn convert_records(records: &[RecordBatch]) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut avro_schema_opt = None; + + let total_row = 
records.iter().map(|v| v.num_rows()).sum(); + let mut rows = Vec::with_capacity(total_row); + for record_batch in records { + let avro_schema = match avro_schema_opt.as_ref() { + Some(schema) => schema, + None => { + let avro_schema = avro_util::to_avro_schema(RECORD_NAME, record_batch.schema()); + + // We only set schema_json once, so all record batches need to have same schema + resp.schema_type = QueryResponse_SchemaType::AVRO; + resp.schema_content = avro_schema.canonical_form(); + + avro_schema_opt = Some(avro_schema); + + avro_schema_opt.as_ref().unwrap() + } + }; + + avro_util::record_batch_to_avro(record_batch, avro_schema, &mut rows) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert record batch", + })?; + } + + resp.set_rows(rows.into()); + + Ok(resp) +} diff --git a/server/src/grpc/route.rs b/server/src/grpc/route.rs new file mode 100644 index 0000000000..ec0f354637 --- /dev/null +++ b/server/src/grpc/route.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Route handler + +use std::sync::Arc; + +use catalog::manager::Manager; +use ceresdbproto::storage::{RouteRequest, RouteResponse}; + +use crate::{ + error::Result, + grpc::{self, HandlerContext}, + router::Router, +}; + +pub async fn handle_route( + ctx: &HandlerContext<'_, C, Q>, + req: RouteRequest, +) -> Result { + handle_route_sync(ctx.router.clone(), req, ctx.tenant()) +} + +fn handle_route_sync( + router: Arc, + req: RouteRequest, + schema: &str, +) -> Result { + let route_vec = router.route(schema, req)?; + + let mut resp = RouteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_routes(route_vec.into()); + + Ok(resp) +} diff --git a/server/src/grpc/write.rs b/server/src/grpc/write.rs new file mode 100644 index 0000000000..55f1880d57 --- /dev/null +++ b/server/src/grpc/write.rs @@ -0,0 +1,586 @@ +// Copyright 2022 CeresDB Project Authors. 
Licensed under Apache-2.0. + +//! Write handler + +use std::collections::HashMap; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::storage::{ + Value_oneof_value, WriteEntry, WriteMetric, WriteRequest, WriteResponse, +}; +use common_types::{ + bytes::Bytes, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{Row, RowGroupBuilder}, + schema::Schema, + time::Timestamp, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::plan::{InsertPlan, Plan}; +use table_engine::table::TableRef; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::{self, HandlerContext}, +}; + +pub(crate) async fn handle_write( + ctx: &HandlerContext<'_, C, Q>, + req: WriteRequest, +) -> Result { + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle write begin, catalog:{}, tenant:{}, request_id:{}, first_table:{:?}, num_tables:{}", + ctx.catalog(), + ctx.tenant(), + request_id, + req.get_metrics() + .first() + .map(|m| (m.get_metric(), m.get_tag_names(), m.get_field_names())), + req.get_metrics().len(), + ); + + let instance = &ctx.instance; + let plan_vec = write_request_to_insert_plan(ctx, req, request_id).await?; + + let mut success = 0; + for insert_plan in plan_vec { + debug!( + "Grpc handle write table begin, table:{}, row_num:{}", + insert_plan.table.name(), + insert_plan.rows.num_rows() + ); + let plan = Plan::Insert(insert_plan); + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Insert limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let 
interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let row_num = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? { + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + success += row_num; + } + + let mut resp = WriteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_success(success as u32); + + debug!( + "Grpc handle write finished, catalog:{}, tenant:{}, resp:{:?}", + ctx.catalog(), + ctx.tenant(), + resp + ); + + Ok(resp) +} + +async fn write_request_to_insert_plan( + ctx: &HandlerContext<'_, C, Q>, + mut write_request: WriteRequest, + request_id: RequestId, +) -> Result> { + let mut plan_vec = Vec::with_capacity(write_request.get_metrics().len()); + + for write_metric in write_request.take_metrics() { + let table_name = write_metric.get_metric(); + let mut table = try_get_table(ctx, table_name)?; + + if table.is_none() { + if let Some(config) = ctx.schema_config { + if config.auto_create_tables { + create_table(ctx, &write_metric, request_id).await?; + // try to get table again + table = try_get_table(ctx, table_name)?; + } + } + } + + match table { + Some(table) => { + let plan = write_metric_to_insert_plan(table, write_metric)?; + plan_vec.push(plan); + } + None => { + return ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Table not found, table:{}", write_metric.get_metric()), + } + .fail(); + } + } + } + + Ok(plan_vec) +} + +fn try_get_table( + ctx: &HandlerContext<'_, C, Q>, + table_name: &str, +) -> Result> { + ctx.instance + .catalog_manager + .catalog_by_name(ctx.catalog()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find 
catalog, catalog_name:{}", ctx.catalog()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Catalog not found, catalog_name:{}", ctx.catalog()), + })? + .schema_by_name(ctx.tenant()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find tenant, tenant_name:{}", ctx.tenant()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Tenant not found, tenant_name:{}", ctx.tenant()), + })? + .table_by_name(table_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find table, table:{}", table_name), + }) +} + +async fn create_table( + ctx: &HandlerContext<'_, C, Q>, + write_metric: &WriteMetric, + request_id: RequestId, +) -> Result<()> { + let create_table_plan = grpc::write_metric_to_create_table_plan(ctx, write_metric) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!( + "Failed to build creating table plan from metric, table:{}", + write_metric.get_metric() + ), + })?; + + debug!( + "Grpc handle create table begin, table:{}, schema: {:?}", + create_table_plan.table, create_table_plan.table_schema, + ); + let plan = Plan::Create(create_table_plan); + + let instance = &ctx.instance; + + if instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Create table limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = 
interpreter_factory.create(interpreter_ctx, plan); + + let _ = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? { + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + Ok(()) +} + +fn write_metric_to_insert_plan( + table: TableRef, + mut write_metric: WriteMetric, +) -> Result { + let schema = table.schema(); + + let mut rows_total = Vec::new(); + for write_entry in write_metric.take_entries() { + let mut rows = write_entry_to_rows( + write_metric.get_metric(), + &schema, + write_metric.get_tag_names(), + write_metric.get_field_names(), + write_entry, + )?; + rows_total.append(&mut rows); + } + // The row group builder will checks nullable. + let row_group = RowGroupBuilder::with_rows(schema, rows_total) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to build row group, table:{}", table.name()), + })? + .build(); + Ok(InsertPlan { + table, + rows: row_group, + }) +} + +fn write_entry_to_rows( + table_name: &str, + schema: &Schema, + tag_names: &[String], + field_names: &[String], + mut write_entry: WriteEntry, +) -> Result> { + // Init all columns by null. + let mut rows = vec![ + Row::from_datums(vec![Datum::Null; schema.num_columns()]); + write_entry.get_field_groups().len() + ]; + + // Fill tsid by default value. + if let Some(tsid_idx) = schema.index_of_tsid() { + let kind = &schema.tsid_column().unwrap().data_type; + let default_datum = Datum::empty(kind); + for row in &mut rows { + row[tsid_idx] = default_datum.clone(); + } + } + + // Fill tags. 
+ for mut tag in write_entry.take_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + let tag_index_in_schema = schema.index_of(tag_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find tag in schema, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + + let column_schema = schema.column(tag_index_in_schema); + ensure!( + column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "column {} is a field rather than a tag, table:{}", + tag_name, table_name + ), + } + ); + + let tag_value = tag.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Tag value is needed, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + for row in &mut rows { + row[tag_index_in_schema] = convert_proto_value_to_datum( + table_name, + tag_name, + tag_value.clone(), + column_schema.data_type, + )?; + } + } + + // Fill fields. 
+ let mut field_name_index: HashMap = HashMap::new(); + for (i, mut field_group) in write_entry.take_field_groups().into_iter().enumerate() { + // timestamp + let timestamp_index_in_schema = schema.timestamp_index(); + rows[i][timestamp_index_in_schema] = + Datum::Timestamp(Timestamp::new(field_group.get_timestamp())); + + for mut field in field_group.take_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = &field_names[field.name_index as usize]; + let index_in_schema = if field_name_index.contains_key(field_name) { + field_name_index.get(field_name).unwrap().to_owned() + } else { + let index_in_schema = + schema.index_of(field_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find field in schema, table:{}, field_name:{}", + table_name, field_name + ), + })?; + field_name_index.insert(field_name.to_string(), index_in_schema); + index_in_schema + }; + let column_schema = schema.column(index_in_schema); + ensure!( + !column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Column {} is a tag rather than a field, table:{}", + field_name, table_name + ) + } + ); + let field_value = field.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Field is needed, table:{}", table_name), + })?; + + rows[i][index_in_schema] = convert_proto_value_to_datum( + table_name, + field_name, + field_value, + column_schema.data_type, + )?; + } + } + } + + Ok(rows) +} + +/// Convert the `Value_oneof_value` defined in protos into the datum. 
+fn convert_proto_value_to_datum( + table_name: &str, + name: &str, + value: Value_oneof_value, + data_type: DatumKind, +) -> Result { + match (value, data_type) { + (Value_oneof_value::float64_value(v), DatumKind::Double) => Ok(Datum::Double(v)), + (Value_oneof_value::string_value(v), DatumKind::String) => Ok(Datum::String(v.into())), + (Value_oneof_value::int64_value(v), DatumKind::Int64) => Ok(Datum::Int64(v)), + (Value_oneof_value::float32_value(v), DatumKind::Float) => Ok(Datum::Float(v)), + (Value_oneof_value::int32_value(v), DatumKind::Int32) => Ok(Datum::Int32(v)), + (Value_oneof_value::int16_value(v), DatumKind::Int16) => Ok(Datum::Int16(v as i16)), + (Value_oneof_value::int8_value(v), DatumKind::Int8) => Ok(Datum::Int8(v as i8)), + (Value_oneof_value::bool_value(v), DatumKind::Boolean) => Ok(Datum::Boolean(v)), + (Value_oneof_value::uint64_value(v), DatumKind::UInt64) => Ok(Datum::UInt64(v)), + (Value_oneof_value::uint32_value(v), DatumKind::UInt32) => Ok(Datum::UInt32(v)), + (Value_oneof_value::uint16_value(v), DatumKind::UInt16) => Ok(Datum::UInt16(v as u16)), + (Value_oneof_value::uint8_value(v), DatumKind::UInt8) => Ok(Datum::UInt8(v as u8)), + (Value_oneof_value::timestamp_value(v), DatumKind::Timestamp) => Ok(Datum::Timestamp(Timestamp::new(v))), + (Value_oneof_value::varbinary_value(v), DatumKind::Varbinary) => Ok(Datum::Varbinary(Bytes::from(v))), + (v, _) => ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Value type is not same, table:{}, value_name:{}, schema_type:{:?}, actual_value:{:?}", + table_name, + name, + data_type, + v + ), + } + .fail(), + } +} + +#[cfg(test)] +mod test { + use ceresdbproto::storage::{Field, FieldGroup, Tag, Value}; + use common_types::{ + column_schema::{self, ColumnSchema}, + schema::Builder, + }; + use system_catalog::sys_catalog_table::TIMESTAMP_COLUMN_NAME; + + use super::*; + + const TAG_K: &str = "tagk"; + const TAG_V: &str = "tagv"; + const TAG_K1: &str = "tagk1"; + const TAG_V1: &str = 
"tagv1"; + const FIELD_NAME: &str = "field"; + const FIELD_NAME1: &str = "field1"; + const FIELD_VALUE_STRING: &str = "stringValue"; + + // tag_names field_names write_entry + fn generate_write_entry() -> (Schema, Vec, Vec, WriteEntry) { + let tag_names = vec![TAG_K.to_string(), TAG_K1.to_string()]; + let field_names = vec![FIELD_NAME.to_string(), FIELD_NAME1.to_string()]; + + let mut tag = Tag::new(); + tag.set_name_index(0); + let mut tag_val = Value::new(); + tag_val.set_string_value(TAG_V.to_string()); + tag.set_value(tag_val); + + let mut tag1 = Tag::new(); + tag1.set_name_index(1); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value(TAG_V1.to_string()); + tag1.set_value(tag_val1); + let tags = vec![tag, tag1]; + + let mut field = Field::new(); + field.set_name_index(0); + let mut field_val = Value::new(); + field_val.set_float64_value(100.0); + field.set_value(field_val); + let mut field1 = Field::new(); + field1.set_name_index(1); + let mut field_val1 = Value::new(); + field_val1.set_string_value(FIELD_VALUE_STRING.to_string()); + field1.set_value(field_val1); + let mut field_group = FieldGroup::new(); + field_group.set_timestamp(1000); + field_group.set_fields(vec![field].into()); + + let mut field_group1 = FieldGroup::new(); + field_group1.set_timestamp(2000); + field_group1.set_fields(vec![field1.clone()].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(3000); + field_group2.set_fields(vec![field1].into()); + + let mut write_entry = WriteEntry::new(); + + write_entry.set_tags(tags.into()); + + write_entry.set_field_groups(vec![field_group, field_group1, field_group2].into()); + + let schema_builder = Builder::new(); + let schema = schema_builder + .auto_increment_column_id(true) + .add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TIMESTAMP_COLUMN_NAME.to_string(), + data_type: DatumKind::Timestamp, + is_nullable: false, + is_tag: false, + comment: String::new(), + }) + .unwrap() + 
.add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K1.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME.to_string(), + data_type: DatumKind::Double, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME1.to_string(), + data_type: DatumKind::String, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .build() + .unwrap(); + (schema, tag_names, field_names, write_entry) + } + + #[test] + fn test_write_entry_to_row_group() { + let (schema, tag_names, field_names, write_entry) = generate_write_entry(); + let rows = + write_entry_to_rows("test_table", &schema, &tag_names, &field_names, write_entry) + .unwrap(); + let row0 = vec![ + Datum::Timestamp(Timestamp::new(1000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Double(100.0), + Datum::Null, + ]; + let row1 = vec![ + Datum::Timestamp(Timestamp::new(2000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + let row2 = vec![ + Datum::Timestamp(Timestamp::new(3000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + + let expect_rows = vec![ + Row::from_datums(row0), + Row::from_datums(row1), + Row::from_datums(row2), + ]; + assert_eq!(rows, expect_rows); + } +} diff --git a/server/src/handlers/admin.rs b/server/src/handlers/admin.rs new file mode 100644 index 0000000000..1779e917c6 
--- /dev/null +++ b/server/src/handlers/admin.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::collections::BTreeSet; + +use crate::handlers::prelude::*; + +#[derive(Debug, Deserialize)] +pub enum Operation { + Add, + Set, + Remove, +} + +#[derive(Debug, Deserialize)] +pub struct RejectRequest { + operation: Operation, + write_reject_list: Vec, + read_reject_list: Vec, +} + +#[derive(Serialize)] +pub struct RejectResponse { + write_reject_list: BTreeSet, + read_reject_list: BTreeSet, +} + +pub async fn handle_reject( + _ctx: RequestContext, + instance: InstanceRef, + request: RejectRequest, +) -> Result { + match request.operation { + Operation::Add => { + instance + .limiter + .add_write_reject_list(request.write_reject_list); + instance + .limiter + .add_read_reject_list(request.read_reject_list); + } + Operation::Set => { + instance + .limiter + .set_write_reject_list(request.write_reject_list); + instance + .limiter + .set_read_reject_list(request.read_reject_list); + } + Operation::Remove => { + instance + .limiter + .remove_write_reject_list(request.write_reject_list); + instance + .limiter + .remove_read_reject_list(request.read_reject_list); + } + } + + Ok(RejectResponse { + write_reject_list: instance + .limiter + .get_write_reject_list() + .into_iter() + .collect::>(), + read_reject_list: instance + .limiter + .get_read_reject_list() + .into_iter() + .collect::>(), + }) +} diff --git a/server/src/handlers/error.rs b/server/src/handlers/error.rs new file mode 100644 index 0000000000..0d781f2560 --- /dev/null +++ b/server/src/handlers/error.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Error of handlers + +use snafu::{Backtrace, Snafu}; + +// TODO(yingwen): Avoid printing huge sql string +// TODO(yingwen): Maybe add an error type to sql sub mod +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to parse sql, err:{}", source))] + ParseSql { source: sql::frontend::Error }, + + #[snafu(display("Failed to create plan, query:{}, err:{}", query, source))] + CreatePlan { + query: String, + source: sql::frontend::Error, + }, + + #[snafu(display( + "Only support execute one statement now, current num:{}, query:{}.\nBacktrace:\n{}", + len, + query, + backtrace, + ))] + TooMuchStmt { + len: usize, + query: String, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to execute interpreter, query:{}, err:{}", query, source))] + InterpreterExec { + query: String, + source: interpreters::interpreter::Error, + }, + + #[snafu(display( + "Failed to convert arrow to string, query:{}, err:{}.\nBacktrace:\n{}", + query, + source, + backtrace + ))] + ArrowToString { + query: String, + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, +} + +define_result!(Error); diff --git a/server/src/handlers/mod.rs b/server/src/handlers/mod.rs new file mode 100644 index 0000000000..e695b3b610 --- /dev/null +++ b/server/src/handlers/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Request handlers + +pub mod admin; +pub mod error; +pub mod sql; + +mod prelude { + pub use catalog::manager::Manager as CatalogManager; + pub use query_engine::executor::Executor as QueryExecutor; + pub use serde_derive::{Deserialize, Serialize}; + pub use snafu::ResultExt; + pub use warp::Filter; + + pub use crate::{ + context::RequestContext, + handlers::error::{Error, Result}, + instance::InstanceRef, + }; +} diff --git a/server/src/handlers/sql.rs b/server/src/handlers/sql.rs new file mode 100644 index 0000000000..1fa96b1d54 --- /dev/null +++ b/server/src/handlers/sql.rs @@ -0,0 +1,148 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL request handler + +use std::collections::HashMap; + +use arrow_deps::arrow::error::Result as ArrowResult; +use common_types::{datum::Datum, request_id::RequestId}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::RecordBatchVec; +use serde_derive::Serialize; +use snafu::ensure; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::handlers::{ + error::{ArrowToString, CreatePlan, InterpreterExec, ParseSql, TooMuchStmt}, + prelude::*, +}; + +#[derive(Debug, Deserialize)] +pub struct Request { + query: String, +} + +// TODO(yingwen): Improve serialize performance +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub enum Response { + AffectedRows(usize), + Rows(Vec>), +} + +pub async fn handle_sql( + ctx: RequestContext, + instance: InstanceRef, + request: Request, +) -> Result { + let request_id = RequestId::next_id(); + + info!( + "sql handler try to process request, request_id:{}, request:{:?}", + request_id, request + ); + + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: 
&instance.catalog_manager, + default_catalog: &ctx.catalog, + default_schema: &ctx.tenant, + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &request.query) + .context(ParseSql)?; + + if stmts.is_empty() { + return Ok(Response::AffectedRows(0)); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + TooMuchStmt { + len: stmts.len(), + query: request.query, + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .context(CreatePlan { + query: &request.query, + })?; + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog, ctx.tenant) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter.execute().await.context(InterpreterExec { + query: &request.query, + })?; + + // Convert output to json + let resp = convert_output(output).context(ArrowToString { + query: &request.query, + })?; + + info!( + "sql handler finished processing request, request:{:?}", + request + ); + + Ok(resp) +} + +fn convert_output(output: Output) -> ArrowResult { + match output { + Output::AffectedRows(n) => Ok(Response::AffectedRows(n)), + Output::Records(records) => convert_records(records), + } +} + +fn 
convert_records(records: RecordBatchVec) -> ArrowResult { + let total_rows = records.iter().map(|v| v.num_rows()).sum(); + let mut resp = Vec::with_capacity(total_rows); + for record_batch in records { + let num_cols = record_batch.num_columns(); + let num_rows = record_batch.num_rows(); + let schema = record_batch.schema(); + + for row_idx in 0..num_rows { + let mut row = HashMap::with_capacity(num_cols); + for col_idx in 0..num_cols { + let column = record_batch.column(col_idx); + let column = column.datum(row_idx); + + let column_name = schema.column(col_idx).name.clone(); + row.insert(column_name, column); + } + + resp.push(row); + } + } + + Ok(Response::Rows(resp)) +} diff --git a/server/src/http.rs b/server/src/http.rs new file mode 100644 index 0000000000..7318d60433 --- /dev/null +++ b/server/src/http.rs @@ -0,0 +1,341 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Http service + +use std::{convert::Infallible, net::IpAddr, sync::Arc}; + +use catalog::manager::Manager as CatalogManager; +use log::error; +use profile::Profiler; +use query_engine::executor::Executor as QueryExecutor; +use serde_derive::Serialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot::{self, Sender}; +use warp::{ + header, + http::StatusCode, + reject, + reply::{self, Reply}, + Filter, +}; + +use crate::{consts, context::RequestContext, error, handlers, instance::InstanceRef, metrics}; + +#[derive(Debug)] +pub struct Config { + pub ip: String, + pub port: u16, +} + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create request context, err:{}", source))] + CreateContext { source: crate::context::Error }, + + #[snafu(display("Failed to handle request, err:{}", source))] + HandleRequest { + source: crate::handlers::error::Error, + }, + + #[snafu(display("Missing runtimes to build service.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: 
Backtrace }, + + #[snafu(display("Missing instance to build service.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display( + "Fail to do heap profiling, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ProfileHeap { + source: profile::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to join async task, err:{}.", source))] + JoinAsyncTask { source: common_util::runtime::Error }, + + #[snafu(display( + "Failed to parse ip addr, ip:{}, err:{}.\nBacktrace:\n{}", + ip, + source, + backtrace + ))] + ParseIpAddr { + ip: String, + source: std::net::AddrParseError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +impl reject::Reject for Error {} + +/// Http service +/// +/// Note that the service does not owns the runtime +pub struct Service { + runtimes: Arc, + instance: InstanceRef, + profiler: Arc, + tx: Sender<()>, +} + +impl Service { + // TODO(yingwen): Maybe log error or return error + pub fn stop(self) { + let _ = self.tx.send(()); + } +} + +// TODO(yingwen): How to support non json response? 
+impl Service { + fn routes(&self) -> impl Filter + Clone { + self.home() + .or(self.metrics()) + .or(self.sql()) + .or(self.heap_profile()) + .or(self.admin_reject()) + } + + fn home(&self) -> impl Filter + Clone { + warp::path::end().and(warp::get()).map(|| { + use std::collections::HashMap; + let mut resp = HashMap::new(); + resp.insert("status", "ok"); + reply::json(&resp) + }) + } + + // TODO(yingwen): Avoid boilterplate code if there are more handlers + fn sql(&self) -> impl Filter + Clone { + warp::path!("sql") + .and(warp::post()) + // TODO(yingwen): content length limit + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + // TODO(yingwen): Wrap common logic such as metrics, trace and error log + let result = handlers::sql::handle_sql(ctx, instance, req) + .await + .map_err(|e| { + // TODO(yingwen): Maybe truncate and print the sql + error!("Http service Failed to handle sql, err:{}", e); + e + }) + .context(HandleRequest); + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } + + fn metrics(&self) -> impl Filter + Clone { + warp::path!("metrics").and(warp::get()).map(metrics::dump) + } + + fn heap_profile( + &self, + ) -> impl Filter + Clone { + warp::path!("debug" / "heap_profile" / ..) 
+ .and(warp::path::param::()) + .and(warp::get()) + .and(self.with_context()) + .and(self.with_profiler()) + .and_then( + |duration_sec: u64, ctx: RequestContext, profiler: Arc| async move { + let handle = ctx.runtime.spawn_blocking(move || { + profiler.dump_mem_prof(duration_sec).context(ProfileHeap) + }); + let result = handle.await.context(JoinAsyncTask); + match result { + Ok(Ok(prof_data)) => Ok(prof_data.into_response()), + Ok(Err(e)) => Err(reject::custom(e)), + Err(e) => Err(reject::custom(e)), + } + }, + ) + } + + fn with_context( + &self, + ) -> impl Filter + Clone { + let default_catalog = self + .instance + .catalog_manager + .default_catalog_name() + .to_string(); + let default_schema = self + .instance + .catalog_manager + .default_schema_name() + .to_string(); + //TODO(boyan) use read/write runtime by sql type. + let runtime = self.runtimes.bg_runtime.clone(); + + header::optional::(consts::CATALOG_HEADER) + .and(header::optional::(consts::TENANT_HEADER)) + .and_then(move |catalog: Option<_>, tenant: Option<_>| { + // Clone the captured variables + let default_catalog = default_catalog.clone(); + let default_schema = default_schema.clone(); + let runtime = runtime.clone(); + async { + RequestContext::builder() + .catalog(catalog.unwrap_or(default_catalog)) + .tenant(tenant.unwrap_or(default_schema)) + .runtime(runtime) + .build() + .context(CreateContext) + .map_err(reject::custom) + } + }) + } + + fn with_profiler(&self) -> impl Filter,), Error = Infallible> + Clone { + let profiler = self.profiler.clone(); + warp::any().map(move || profiler.clone()) + } + + fn with_instance( + &self, + ) -> impl Filter,), Error = Infallible> + Clone { + let instance = self.instance.clone(); + warp::any().map(move || instance.clone()) + } + + fn admin_reject( + &self, + ) -> impl Filter + Clone { + warp::path!("reject") + .and(warp::post()) + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + 
let result = handlers::admin::handle_reject(ctx, instance, req) + .await + .map_err(|e| { + error!("Http service failed to handle admin reject, err:{}", e); + e + }) + .context(HandleRequest); + + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } +} + +/// Service builder +pub struct Builder { + config: Config, + runtimes: Option>, + instance: Option>, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + instance: None, + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } +} + +impl Builder { + /// Build and start the service + pub fn build(self) -> Result> { + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + let (tx, rx) = oneshot::channel(); + + let service = Service { + runtimes: runtimes.clone(), + instance, + profiler: Arc::new(Profiler::default()), + tx, + }; + + let ip_addr: IpAddr = self + .config + .ip + .parse() + .context(ParseIpAddr { ip: self.config.ip })?; + + // Register filters to warp and rejection handler + let routes = service.routes().recover(handle_rejection); + let (_addr, server) = + warp::serve(routes).bind_with_graceful_shutdown((ip_addr, self.config.port), async { + rx.await.ok(); + }); + // Run the service + runtimes.bg_runtime.spawn(server); + + Ok(service) + } +} + +#[derive(Debug, Serialize)] +struct ErrorResponse { + code: u16, + message: String, +} + +fn error_to_status_code(err: &Error) -> StatusCode { + match err { + Error::CreateContext { .. } => StatusCode::BAD_REQUEST, + // TODO(yingwen): Map handle request error to more accurate status code + Error::HandleRequest { .. } + | Error::MissingRuntimes { .. } + | Error::MissingInstance { .. } + | Error::ParseIpAddr { .. } + | Error::ProfileHeap { .. 
} + | Error::JoinAsyncTask { .. } => StatusCode::INTERNAL_SERVER_ERROR, + } +} + +async fn handle_rejection( + rejection: warp::Rejection, +) -> std::result::Result { + let code; + let message; + + if rejection.is_not_found() { + code = StatusCode::NOT_FOUND; + message = String::from("NOT_FOUND"); + } else if let Some(err) = rejection.find() { + code = error_to_status_code(err); + let err_string = err.to_string(); + message = error::first_line_in_error(&err_string).to_string(); + } else { + error!("handle error: {:?}", rejection); + code = StatusCode::INTERNAL_SERVER_ERROR; + message = format!("UNKNOWN_ERROR: {:?}", rejection); + } + + let json = reply::json(&ErrorResponse { + code: code.as_u16(), + message, + }); + + Ok(reply::with_status(json, code)) +} diff --git a/server/src/instance.rs b/server/src/instance.rs new file mode 100644 index 0000000000..64d3ada775 --- /dev/null +++ b/server/src/instance.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Instance contains shared states of service + +use std::sync::Arc; + +use table_engine::engine::TableEngineRef; +use udf::registry::FunctionRegistryRef; + +use crate::limiter::Limiter; + +/// A cluster instance. Usually there is only one instance per cluster +/// +/// C: catalog::manager::Manager +/// Q: query_engine::executor::Executor +pub struct Instance { + pub catalog_manager: C, + pub query_executor: Q, + pub table_engine: TableEngineRef, + // User defined functions registry. + pub function_registry: FunctionRegistryRef, + pub limiter: Limiter, +} + +/// A reference counted instance pointer +pub type InstanceRef = Arc>; diff --git a/server/src/lib.rs b/server/src/lib.rs new file mode 100644 index 0000000000..122735a07f --- /dev/null +++ b/server/src/lib.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Rpc server + +// TODO(yingwen): +// Borrow some ideas from tikv: https://github.com/tikv/tikv/blob/dc8ce2cf6a8904cb3dad556f71b11bac3531689b/src/server/service/kv.rs#L51 + +#[macro_use] +extern crate common_util; + +mod avro_util; +pub mod config; +mod consts; +mod context; +mod error; +mod grpc; +mod handlers; +mod http; +mod instance; +pub mod limiter; +pub mod logger; +mod metrics; +mod router; +pub mod server; +pub mod table_engine; diff --git a/server/src/limiter.rs b/server/src/limiter.rs new file mode 100644 index 0000000000..f594b2028b --- /dev/null +++ b/server/src/limiter.rs @@ -0,0 +1,194 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::HashSet, sync::RwLock}; + +use arrow_deps::datafusion::catalog::TableReference; +use sql::plan::Plan; + +pub struct Limiter { + write_reject_list: RwLock>, + read_reject_list: RwLock>, +} + +impl Default for Limiter { + fn default() -> Self { + Self { + write_reject_list: RwLock::new(HashSet::new()), + read_reject_list: RwLock::new(HashSet::new()), + } + } +} + +impl Limiter { + pub fn should_limit(&self, plan: &Plan) -> bool { + match plan { + Plan::Query(query) => { + let read_reject_list = self.read_reject_list.read().unwrap().clone(); + for table in read_reject_list { + if query + .tables + .get(TableReference::from(table.as_str())) + .is_some() + { + return true; + } + } + false + } + Plan::Insert(insert) => self + .write_reject_list + .read() + .unwrap() + .contains(insert.table.name()), + _ => false, + } + } + + pub fn add_write_reject_list(&self, reject_list: Vec) { + self.write_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn add_read_reject_list(&self, reject_list: Vec) { + self.read_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn set_write_reject_list(&self, reject_list: Vec) { + *self.write_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn 
set_read_reject_list(&self, reject_list: Vec) { + *self.read_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn get_write_reject_list(&self) -> HashSet { + self.write_reject_list.read().unwrap().clone() + } + + pub fn get_read_reject_list(&self) -> HashSet { + self.read_reject_list.read().unwrap().clone() + } + + pub fn remove_write_reject_list(&self, reject_list: Vec) { + let mut write_reject_list = self.write_reject_list.write().unwrap(); + for value in reject_list { + write_reject_list.remove(&value); + } + } + + pub fn remove_read_reject_list(&self, reject_list: Vec) { + let mut read_reject_list = self.read_reject_list.write().unwrap(); + for value in reject_list { + read_reject_list.remove(&value); + } + } +} + +#[cfg(test)] +mod tests { + use common_types::request_id::RequestId; + use sql::{parser::Parser, plan::Plan, planner::Planner, tests::MockMetaProvider}; + + use crate::limiter::Limiter; + + fn sql_to_plan(meta_provider: &MockMetaProvider, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() + } + + fn prepare() -> (MockMetaProvider, Limiter) { + let mock = MockMetaProvider::default(); + + let reject_list = vec!["test_table".to_string()]; + let limiter = Limiter::default(); + limiter.set_read_reject_list(reject_list.clone()); + limiter.set_write_reject_list(reject_list); + (mock, limiter) + } + + #[test] + fn test_limiter() { + let (mock, limiter) = prepare(); + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_remove() 
{ + let (mock, limiter) = prepare(); + let test_data = vec!["test_table".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + limiter.remove_write_reject_list(test_data.clone()); + limiter.remove_read_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_add() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table2"; + let query_plan = sql_to_plan(&mock, query); + assert!(!limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(!limiter.should_limit(&insert_plan)); + + limiter.add_write_reject_list(test_data.clone()); + limiter.add_read_reject_list(test_data); + assert!(limiter.should_limit(&query_plan)); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_set() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let query2 = "select * from test_table2"; + let query_plan2 = sql_to_plan(&mock, query2); + assert!(!limiter.should_limit(&query_plan2)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + let insert2="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 
1638428434000,100, 'hello3')"; + let insert_plan2 = sql_to_plan(&mock, insert2); + assert!(!limiter.should_limit(&insert_plan2)); + + limiter.set_read_reject_list(test_data.clone()); + limiter.set_write_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + assert!(limiter.should_limit(&query_plan2)); + assert!(limiter.should_limit(&insert_plan2)); + } +} diff --git a/server/src/logger.rs b/server/src/logger.rs new file mode 100644 index 0000000000..a05ecd44ec --- /dev/null +++ b/server/src/logger.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::str::FromStr; + +use log::SetLoggerError; +use logger::{Level, LogDispatcher, RuntimeLevel}; + +use crate::config::Config; + +pub fn init_log(config: &Config) -> Result { + let level = match Level::from_str(&config.log_level) { + Ok(v) => v, + Err(e) => { + panic!( + "Parse log level failed, level: {}, err: {:?}", + &config.log_level, e + ); + } + }; + + let term_drain = logger::term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async and init stdlog + logger::init_log( + drain, + level, + config.enable_async_log, + config.async_log_channel_len, + true, + ) +} diff --git a/server/src/metrics.rs b/server/src/metrics.rs new file mode 100644 index 0000000000..89dd08fdbd --- /dev/null +++ b/server/src/metrics.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics util for server. + +use log::warn; +use prometheus::{Encoder, TextEncoder}; + +/// Gather and dump prometheus to string. 
+pub fn dump() -> String { + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + let metric_families = prometheus::gather(); + for mf in metric_families { + if let Err(e) = encoder.encode(&[mf], &mut buffer) { + warn!("prometheus encoding error, err:{}", e); + } + } + String::from_utf8(buffer).unwrap() +} diff --git a/server/src/router.rs b/server/src/router.rs new file mode 100644 index 0000000000..aa687c714b --- /dev/null +++ b/server/src/router.rs @@ -0,0 +1,196 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::HashMap, + hash::{Hash, Hasher}, + sync::Arc, +}; + +use ceresdbproto::storage::{Endpoint, Route, RouteRequest}; +use log::info; +use meta_client::{MetaClient, ShardId}; +use serde_derive::Deserialize; +use twox_hash::XxHash64; + +use crate::error::{ErrNoCause, Result, StatusCode}; + +/// Hash seed to build hasher. Modify the seed will result in different route +/// result! +const HASH_SEED: u64 = 0; + +pub type RouterRef = Arc; + +pub trait Router { + fn route(&self, schema: &str, req: RouteRequest) -> Result>; +} + +#[derive(Debug, Deserialize)] +pub struct PrefixRule { + /// Schema name of the prefix. + pub schema: String, + /// Prefix of the table name. + pub prefix: String, + /// The shard of matched tables. + pub shard: ShardId, +} + +#[derive(Debug, Deserialize)] +pub struct HashRule { + /// Schema name of the prefix. + pub schema: String, + /// The shard list for hash rule. 
+ pub shards: Vec, +} + +#[derive(Debug, Default, Deserialize)] +pub struct RuleList { + pub prefix_rules: Vec, + pub hash_rules: Vec, +} + +impl RuleList { + pub fn split_by_schema(self) -> SchemaRules { + let mut schema_rules = HashMap::new(); + + for rule in self.prefix_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.prefix_rules.push(rule); + } + + for rule in self.hash_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.hash_rules.push(rule); + } + + schema_rules + } +} + +// Schema -> Rule list of the schema. +type SchemaRules = HashMap; + +pub struct RuleBasedRouter { + meta_client: Arc, + schema_rules: SchemaRules, +} + +impl RuleBasedRouter { + pub fn new(meta_client: Arc, rules: RuleList) -> Self { + let schema_rules = rules.split_by_schema(); + + info!("RuleBasedRouter init with rules, rules:{:?}", schema_rules); + + Self { + meta_client, + schema_rules, + } + } + + fn maybe_route_by_rule(metric: &str, rule_list: &RuleList) -> Option { + for prefix_rule in &rule_list.prefix_rules { + if metric.starts_with(&prefix_rule.prefix) { + return Some(prefix_rule.shard); + } + } + + if let Some(hash_rule) = rule_list.hash_rules.get(0) { + let total_shards = hash_rule.shards.len(); + let hash_value = hash_metric(metric); + let index = hash_value as usize % total_shards; + + return Some(hash_rule.shards[index]); + } + + None + } + + #[inline] + fn route_by_hash(metric: &str, total_shards: usize) -> ShardId { + let hash_value = hash_metric(metric); + (hash_value as usize % total_shards) as ShardId + } + + fn route_metric( + metric: &str, + rule_list_opt: Option<&RuleList>, + total_shards: usize, + ) -> ShardId { + if let Some(rule_list) = rule_list_opt { + if let Some(shard_id) = 
Self::maybe_route_by_rule(metric, rule_list) { + return shard_id; + } + } + + // Fallback to hash route rule. + Self::route_by_hash(metric, total_shards) + } +} + +impl Router for RuleBasedRouter { + fn route(&self, schema: &str, req: RouteRequest) -> Result> { + let cluster_view = self.meta_client.get_cluster_view(); + if let Some(shard_view_map) = cluster_view.schema_shards.get(schema) { + if shard_view_map.is_empty() { + return ErrNoCause { + code: StatusCode::NotFound, + msg: "shards from meta is empty", + } + .fail(); + } + + // Get rule list of this schema. + let rule_list_opt = self.schema_rules.get(schema); + + // TODO(yingwen): Better way to get total shard number + let total_shards = shard_view_map.len(); + let mut route_vec = Vec::with_capacity(req.metrics.len()); + for metric in req.metrics { + let mut route = Route::new(); + route.set_metric(metric); + + let shard_id = Self::route_metric(route.get_metric(), rule_list_opt, total_shards); + + let mut endpoint = Endpoint::new(); + if let Some(shard_view) = shard_view_map.get(&shard_id) { + let node = &shard_view.node; + endpoint.set_ip(node.addr.clone()); + endpoint.set_port(node.port); + } else { + return ErrNoCause { + code: StatusCode::NotFound, + msg: format!( + "Shard not found, metric:{}, shard_id:{}", + route.get_metric(), + shard_id + ), + } + .fail(); + } + + route.set_endpoint(endpoint); + route_vec.push(route); + } + return Ok(route_vec); + } + + Ok(Vec::new()) + } +} + +fn hash_metric(metric: &str) -> u64 { + let mut hasher = XxHash64::with_seed(HASH_SEED); + metric.hash(&mut hasher); + hasher.finish() +} diff --git a/server/src/server.rs b/server/src/server.rs new file mode 100644 index 0000000000..90e5a999b9 --- /dev/null +++ b/server/src/server.rs @@ -0,0 +1,180 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server + +use std::sync::Arc; + +use catalog::manager::Manager as CatalogManager; +use grpcio::Environment; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::{EngineRuntimes, TableEngineRef}; +use udf::registry::FunctionRegistryRef; + +use crate::{ + config::Config, + grpc::{self, RpcServices}, + http::{self, Service}, + instance::{Instance, InstanceRef}, + limiter::Limiter, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing catalog manager.\nBacktrace:\n{}", backtrace))] + MissingCatalogManager { backtrace: Backtrace }, + + #[snafu(display("Missing query executor.\nBacktrace:\n{}", backtrace))] + MissingQueryExecutor { backtrace: Backtrace }, + + #[snafu(display("Missing table engine.\nBacktrace:\n{}", backtrace))] + MissingTableEngine { backtrace: Backtrace }, + + #[snafu(display("Missing function registry.\nBacktrace:\n{}", backtrace))] + MissingFunctionRegistry { backtrace: Backtrace }, + + #[snafu(display("Missing limiter.\nBacktrace:\n{}", backtrace))] + MissingLimiter { backtrace: Backtrace }, + + #[snafu(display("Failed to start http service, err:{}", source))] + StartHttpService { source: crate::http::Error }, + + #[snafu(display("Failed to register system catalog, err:{}", source))] + RegisterSystemCatalog { source: catalog::manager::Error }, + + #[snafu(display("Failed to build grpc service, err:{}", source))] + BuildGrpcService { source: crate::grpc::Error }, + + #[snafu(display("Failed to start grpc service, err:{}", source))] + StartGrpcService { source: crate::grpc::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Consider a config manager +/// Server +pub struct Server { + http_service: Service, + rpc_services: RpcServices, +} + +impl Server { + pub fn stop(mut self) { + self.rpc_services.shutdown(); + 
self.http_service.stop(); + } + + pub async fn start(&mut self) -> Result<()> { + self.rpc_services.start().await.context(StartGrpcService) + } +} + +#[must_use] +pub struct Builder { + config: Config, + runtimes: Option>, + catalog_manager: Option, + query_executor: Option, + table_engine: Option, + function_registry: Option, + limiter: Limiter, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + catalog_manager: None, + query_executor: None, + table_engine: None, + function_registry: None, + limiter: Limiter::default(), + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn catalog_manager(mut self, val: C) -> Self { + self.catalog_manager = Some(val); + self + } + + pub fn query_executor(mut self, val: Q) -> Self { + self.query_executor = Some(val); + self + } + + pub fn table_engine(mut self, val: TableEngineRef) -> Self { + self.table_engine = Some(val); + self + } + + pub fn function_registry(mut self, val: FunctionRegistryRef) -> Self { + self.function_registry = Some(val); + self + } + + pub fn limiter(mut self, val: Limiter) -> Self { + self.limiter = val; + self + } + + /// Build and run the server + pub fn build(self) -> Result> { + // Build runtimes + let runtimes = self.runtimes.context(MissingRuntimes)?; + + // Build instance + let catalog_manager = self.catalog_manager.context(MissingCatalogManager)?; + let query_executor = self.query_executor.context(MissingQueryExecutor)?; + let table_engine = self.table_engine.context(MissingTableEngine)?; + let function_registry = self.function_registry.context(MissingFunctionRegistry)?; + let instance = Instance { + catalog_manager, + query_executor, + table_engine, + function_registry, + limiter: self.limiter, + }; + let instance = InstanceRef::new(instance); + + // Create http config + let http_config = http::Config { + ip: self.config.bind_addr.clone(), + port: self.config.http_port, + }; + + // 
Start http service + let http_service = http::Builder::new(http_config) + .runtimes(runtimes.clone()) + .instance(instance.clone()) + .build() + .context(StartHttpService)?; + + let meta_client_config = self.config.meta_client; + let env = Arc::new(Environment::new(self.config.grpc_server_cq_count)); + let rpc_services = grpc::Builder::new() + .bind_addr(self.config.bind_addr) + .port(self.config.grpc_port) + .meta_client_config(meta_client_config) + .env(env) + .runtimes(runtimes) + .instance(instance) + .route_rules(self.config.route_rules) + .build() + .context(BuildGrpcService)?; + + let server = Server { + http_service, + rpc_services, + }; + Ok(server) + } +} diff --git a/server/src/table_engine.rs b/server/src/table_engine.rs new file mode 100644 index 0000000000..7f7083b91c --- /dev/null +++ b/server/src/table_engine.rs @@ -0,0 +1,97 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table engine implementation + +use std::sync::Arc; + +use analytic_engine::AnalyticTableEngine; +use async_trait::async_trait; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine, + UnknownEngineType, + }, + memory::MemoryTable, + table::TableRef, + ANALYTIC_ENGINE_TYPE, MEMORY_ENGINE_TYPE, +}; + +/// Memory table engine implementation +// Mainly for test purpose now +pub struct MemoryTableEngine; + +#[async_trait] +impl TableEngine for MemoryTableEngine { + fn engine_type(&self) -> &str { + MEMORY_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + Ok(Arc::new(MemoryTable::new( + request.table_name, + request.table_id, + request.table_schema, + MEMORY_ENGINE_TYPE.to_string(), + ))) + } + + async fn drop_table(&self, _request: DropTableRequest) -> Result { + Ok(true) + } + + async fn open_table(&self, _request: OpenTableRequest) -> Result> { + Ok(None) + } +} + +/// Route [CreateTableRequest] to 
the correct engine by its engine type +pub struct TableEngineProxy { + /// Memory table engine + pub memory: MemoryTableEngine, + /// Analytic table engine + pub analytic: AnalyticTableEngine, +} + +#[async_trait] +impl TableEngine for TableEngineProxy { + fn engine_type(&self) -> &str { + "TableEngineProxy" + } + + async fn close(&self) -> Result<()> { + self.memory.close().await?; + self.analytic.close().await?; + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + // TODO(yingwen): Use a map + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.create_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.create_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.drop_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.drop_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + /// Open table, return error if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result> { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.open_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.open_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } +} diff --git a/sql/Cargo.toml b/sql/Cargo.toml new file mode 100644 index 0000000000..3056272218 --- /dev/null +++ b/sql/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "sql" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +catalog = { path = "../catalog" } +common_types = { path = "../common_types"} +common_util = { path = "../common_util" 
} +log = "0.4" +paste = "1.0" +snafu = { version ="0.6.10", features = ["backtraces"]} +sqlparser = "0.13.0" +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +regex = "1" + +[dev-dependencies] +common_types = { path = "../common_types", features = ["test"] } +tokio = { version = "1.0", features = ["full"] } diff --git a/sql/src/ast.rs b/sql/src/ast.rs new file mode 100644 index 0000000000..e68e486e43 --- /dev/null +++ b/sql/src/ast.rs @@ -0,0 +1,80 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL statement + +use sqlparser::ast::{ + ColumnDef, ObjectName, SqlOption, Statement as SqlStatement, TableConstraint, +}; + +/// Statement representations +#[derive(Debug, PartialEq)] +pub enum Statement { + /// ANSI SQL AST node + Standard(Box), + // Other extensions + /// CREATE TABLE + Create(CreateTable), + /// Drop TABLE + Drop(DropTable), + Describe(DescribeTable), + AlterModifySetting(AlterModifySetting), + AlterAddColumn(AlterAddColumn), + /// SHOW CREATE TABLE + ShowCreate(ShowCreate), + Exists(ExistsTable), +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum ShowCreateObject { + Table, +} + +#[derive(Debug, PartialEq)] +pub struct CreateTable { + /// Create if not exists + pub if_not_exists: bool, + /// Table name + pub name: ObjectName, + pub columns: Vec, + pub engine: String, + pub constraints: Vec, + /// Table options in `WITH`. 
+ pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct DropTable { + /// Table name + pub name: ObjectName, + pub if_exists: bool, + pub engine: String, +} + +#[derive(Debug, PartialEq)] +pub struct DescribeTable { + pub table_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct AlterModifySetting { + pub table_name: ObjectName, + pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct AlterAddColumn { + pub table_name: ObjectName, + pub columns: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct ShowCreate { + pub obj_type: ShowCreateObject, + pub obj_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct ExistsTable { + pub table_name: ObjectName, +} diff --git a/sql/src/container.rs b/sql/src/container.rs new file mode 100644 index 0000000000..eac30eb737 --- /dev/null +++ b/sql/src/container.rs @@ -0,0 +1,175 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table container + +use std::{collections::HashMap, sync::Arc}; + +pub use arrow_deps::datafusion::catalog::{ResolvedTableReference, TableReference}; +use table_engine::provider::TableProviderAdapter; + +// Rust has poor support of using tuple as map key, so we use a 3 level +// map to store catalog -> schema -> table mapping +type CatalogMap = HashMap; +type SchemaMap = HashMap; +type TableMap = HashMap>; + +/// Container to hold table adapters +/// +/// Optimized for default catalog and schema +#[derive(Default)] +pub struct TableContainer { + default_catalog: String, + default_schema: String, + default_tables: HashMap>, + other_tables: CatalogMap, +} + +impl TableContainer { + pub fn new(default_catalog: String, default_schema: String) -> Self { + Self { + default_catalog, + default_schema, + default_tables: HashMap::new(), + other_tables: CatalogMap::new(), + } + } + + /// Catalog num + pub fn num_catalogs(&self) -> usize { + if self.other_tables.is_empty() { + 1 + } else { + self.other_tables.len() + 1 + } + } + + pub fn get(&self, 
name: TableReference) -> Option> { + match name { + TableReference::Bare { table } => self.get_default(table), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(&self.default_catalog, schema, table) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(catalog, schema, table) + } + } + } + } + + fn get_default(&self, table: &str) -> Option> { + self.default_tables.get(table).cloned() + } + + fn get_other( + &self, + catalog: &str, + schema: &str, + table: &str, + ) -> Option> { + self.other_tables + .get(catalog) + .and_then(|schemas| schemas.get(schema)) + .and_then(|tables| tables.get(table)) + .cloned() + } + + pub fn insert(&mut self, name: TableReference, table_adapter: Arc) { + match name { + TableReference::Bare { table } => self.insert_default(table, table_adapter), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + self.default_catalog.clone(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + catalog.to_string(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + } + } + + fn insert_default(&mut self, table: &str, table_adapter: Arc) { + self.default_tables.insert(table.to_string(), table_adapter); + } + + fn insert_other( + &mut self, + catalog: String, + schema: String, + table: String, + table_adapter: Arc, + ) { + self.other_tables + .entry(catalog) + .or_insert_with(HashMap::new) + .entry(schema) + .or_insert_with(HashMap::new) + .insert(table, 
table_adapter); + } + + /// Visit all tables + /// + /// If f returns error, stop iteration and return the error + pub fn visit(&self, mut f: F) -> Result<(), E> + where + F: FnMut(ResolvedTableReference, &Arc) -> Result<(), E>, + { + // Visit default tables first + for (table, adapter) in &self.default_tables { + // default_catalog/default_schema can be empty string, but that's + // ok since we have table under them + let table_ref = ResolvedTableReference { + catalog: &self.default_catalog, + schema: &self.default_schema, + table, + }; + f(table_ref, adapter)?; + } + + // Visit other tables + for (catalog, schemas) in &self.other_tables { + for (schema, tables) in schemas { + for (table, adapter) in tables { + let table_ref = ResolvedTableReference { + catalog, + schema, + table, + }; + f(table_ref, adapter)?; + } + } + } + + Ok(()) + } +} diff --git a/sql/src/frontend.rs b/sql/src/frontend.rs new file mode 100644 index 0000000000..f45e6def4d --- /dev/null +++ b/sql/src/frontend.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Frontend + +use std::{convert::TryInto, sync::Arc}; + +use ceresdbproto::prometheus::PrometheusQueryRequest; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::table; + +use crate::{ + ast::Statement, + parser::Parser, + plan::Plan, + planner::Planner, + promql::{ColumnNames, Expr}, + provider::MetaProvider, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + // Invalid sql is quite common, so we don't provide a backtrace now. + #[snafu(display("Invalid sql, sql:{}, err:{}", sql, source))] + InvalidSql { + sql: String, + source: sqlparser::parser::ParserError, + }, + + // TODO(yingwen): Should we store stmt here? 
+ #[snafu(display("Failed to create plan, err:{}", source))] + CreatePlan { source: crate::planner::Error }, + + #[snafu(display("Invalid prom request, err:{}", source))] + InvalidPromRequest { source: crate::promql::Error }, +} + +define_result!(Error); + +pub type StatementVec = Vec; + +/// Context used by Frontend +/// +/// We can collect metrics and trace info in it instead of using global +/// metrics or trace collector. +pub struct Context { + /// Id of the query request. + pub request_id: RequestId, + /// Parallelism to read table. + pub read_parallelism: usize, +} + +impl Context { + pub fn new(request_id: RequestId) -> Self { + Self { + request_id, + read_parallelism: table::DEFAULT_READ_PARALLELISM, + } + } +} + +/// SQL frontend implementation +/// +/// Thought the parser supports using multiple statements in a sql, but +/// this frontend only support planning one statement at a time now +#[derive(Debug)] +pub struct Frontend