From 947bb61ade3b7bb6237ec22172847dfd9d1de9a5 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Tue, 16 Jun 2026 04:22:58 -0700 Subject: [PATCH 01/12] feat(cli): add mosaic inspector CLI (schema/meta/cat/pages) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mosaic previously shipped no viewer tooling — inspecting a file meant writing Rust against the library API. Add a `mosaic` binary (a new `cli` workspace crate) mirroring parquet-cli: - schema: column names, Arrow types, nullability, bucket assignment - meta: row groups, rows, per-column stats (null_count/min/max) - cat: first N rows as a table, with -n and --columns projection - pages: per-column encoding (plain/const/dict/all_null) + slot size All commands support --json. The reader is driven over a new file-backed InputFile (pread). Core gains three small read-only accessors used by `pages`: BucketReader::encodings(), ColumnPageReader::encoding(), and MosaicReader::page_infos(). No format/behavior change; 199 core tests pass. Co-Authored-By: Claude Opus 4.8 --- Cargo.toml | 2 +- cli/Cargo.toml | 33 ++++++ cli/examples/gen.rs | 54 ++++++++++ cli/src/fmt.rs | 197 +++++++++++++++++++++++++++++++++++ cli/src/input.rs | 66 ++++++++++++ cli/src/main.rs | 214 ++++++++++++++++++++++++++++++++++++++ core/src/bucket_reader.rs | 10 ++ core/src/reader.rs | 71 +++++++++++++ 8 files changed, 646 insertions(+), 1 deletion(-) create mode 100644 cli/Cargo.toml create mode 100644 cli/examples/gen.rs create mode 100644 cli/src/fmt.rs create mode 100644 cli/src/input.rs create mode 100644 cli/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 240d079..00133bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [workspace] -members = ["core", "ffi", "jni"] +members = ["core", "ffi", "jni", "cli"] resolver = "2" [profile.release] diff --git a/cli/Cargo.toml b/cli/Cargo.toml new file mode 100644 index 0000000..97485ab --- /dev/null +++ b/cli/Cargo.toml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "paimon-mosaic-cli" +version = "0.2.0" +edition = "2021" +description = "Mosaic file format — command line inspector (cat/meta/schema)" +license = "Apache-2.0" + +[[bin]] +name = "mosaic" +path = "src/main.rs" + +[dependencies] +paimon-mosaic-core = { path = "../core" } +arrow-array = "58" +arrow-schema = "58" +clap = { version = "4", features = ["derive"] } diff --git a/cli/examples/gen.rs b/cli/examples/gen.rs new file mode 100644 index 0000000..b117018 --- /dev/null +++ b/cli/examples/gen.rs @@ -0,0 +1,54 @@ +// Generates a tiny mosaic file for CLI verification. +use std::fs::File; +use std::io::Write; +use std::sync::Arc; + +use arrow_array::{Int32Array, RecordBatch, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions}; + +struct FileOut { + f: File, + pos: u64, +} +impl OutputFile for FileOut { + fn write(&mut self, d: &[u8]) -> std::io::Result<()> { + self.f.write_all(d)?; + self.pos += d.len() as u64; + Ok(()) + } + fn flush(&mut self) -> std::io::Result<()> { + self.f.flush() + } + fn pos(&self) -> u64 { + self.pos + } +} + +fn main() { + let path = std::env::args().nth(1).unwrap_or_else(|| "/tmp/sample.mosaic".into()); + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + Field::new("age", DataType::Int32, true), + ]); + let out = FileOut { f: File::create(&path).unwrap(), pos: 0 }; + let opts = WriterOptions { + num_buckets: 2, + stats_columns: vec!["id".into(), "name".into(), "age".into()], + ..Default::default() + }; + let mut w = MosaicWriter::new(out, &schema, opts).unwrap(); + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), + Arc::new(StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dan"), Some("eve")])), + Arc::new(Int32Array::from(vec![Some(30), Some(25), Some(40), None, Some(28)])), + ], + ) + .unwrap(); + w.write_batch(&batch).unwrap(); + w.close().unwrap(); + println!("wrote {path}"); +} diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs new file mode 100644 index 0000000..cb85ce5 --- /dev/null +++ b/cli/src/fmt.rs @@ -0,0 +1,197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, RecordBatch}; +use paimon_mosaic_core::values::Value; + +/// Render a stats min/max [`Value`] to a short, human-readable string. +pub fn render_value(v: &Value) -> String { + match v { + Value::Null => "null".to_string(), + Value::Boolean(b) => b.to_string(), + Value::TinyInt(x) => x.to_string(), + Value::SmallInt(x) => x.to_string(), + Value::Integer(x) => x.to_string(), + Value::BigInt(x) => x.to_string(), + Value::Float(x) => x.to_string(), + Value::Double(x) => x.to_string(), + Value::Date(x) => format!("{} (epoch-day)", x), + Value::Time(x) => format!("{} (ms)", x), + Value::String(b) => String::from_utf8_lossy(b).into_owned(), + Value::Bytes(b) | Value::DecimalLarge(b) => format!("0x{}", hex(b)), + Value::DecimalCompact(x) => x.to_string(), + Value::TimestampMillis(x) => format!("{} (ms)", x), + Value::TimestampMicros(x) => format!("{} (us)", x), + Value::TimestampNanos { millis, nanos_of_milli } => { + format!("{}ms+{}ns", millis, nanos_of_milli) + } + } +} + +fn hex(b: &[u8]) -> String { + b.iter().map(|x| format!("{:02x}", x)).collect() +} + +/// Human-readable encoding name for a `spec::ENCODING_*` id. +pub fn encoding_name(e: u8) -> &'static str { + use paimon_mosaic_core::spec::*; + match e { + ENCODING_PLAIN => "plain", + ENCODING_CONST => "const", + ENCODING_DICT => "dict", + ENCODING_ALL_NULL => "all_null", + _ => "?", + } +} + +/// Escape a string as a JSON string literal (quotes included). +pub fn json_str(s: &str) -> String { + let mut o = String::with_capacity(s.len() + 2); + o.push('"'); + for c in s.chars() { + match c { + '"' => o.push_str("\\\""), + '\\' => o.push_str("\\\\"), + '\n' => o.push_str("\\n"), + '\r' => o.push_str("\\r"), + '\t' => o.push_str("\\t"), + c if (c as u32) < 0x20 => o.push_str(&format!("\\u{:04x}", c as u32)), + c => o.push(c), + } + } + o.push('"'); + o +} + +/// Pretty-print a slice of record batches as an aligned ASCII table. +pub fn pretty_table(batches: &[RecordBatch], max_rows: usize) -> String { + let schema = batches[0].schema(); + let headers: Vec = schema.fields().iter().map(|f| f.name().clone()).collect(); + let ncols = headers.len(); + + let mut rows: Vec> = Vec::new(); + 'outer: for batch in batches { + for r in 0..batch.num_rows() { + if rows.len() >= max_rows { + break 'outer; + } + let mut row = Vec::with_capacity(ncols); + for c in 0..ncols { + row.push(cell(batch.column(c).as_ref(), r)); + } + rows.push(row); + } + } + + let mut widths: Vec = headers.iter().map(|h| h.chars().count()).collect(); + for row in &rows { + for (i, v) in row.iter().enumerate() { + widths[i] = widths[i].max(v.chars().count()); + } + } + + let sep = |out: &mut String| { + out.push('+'); + for w in &widths { + out.push_str(&"-".repeat(w + 2)); + out.push('+'); + } + out.push('\n'); + }; + let line = |out: &mut String, cells: &[String]| { + out.push('|'); + for (i, c) in cells.iter().enumerate() { + out.push_str(&format!(" {: String { + let schema = batches[0].schema(); + let names: Vec = schema.fields().iter().map(|f| f.name().clone()).collect(); + let mut out = String::new(); + let mut got = 0usize; + 'outer: for batch in batches { + for r in 0..batch.num_rows() { + if got >= max_rows { + break 'outer; + } + out.push('{'); + for (c, name) in names.iter().enumerate() { + if c > 0 { + out.push(','); + } + out.push_str(&json_str(name)); + out.push(':'); + out.push_str(&cell_json(batch.column(c).as_ref(), r)); + } + out.push_str("}\n"); + got += 1; + } + } + out +} + +/// Render one Arrow cell as a JSON value (numbers bare, strings quoted, null). +fn cell_json(arr: &dyn Array, row: usize) -> String { + use arrow_schema::DataType::*; + if arr.is_null(row) { + return "null".to_string(); + } + match arr.data_type() { + Utf8 | Date32 => json_str(&cell(arr, row)), + _ => cell(arr, row), + } +} + +/// Render one Arrow cell to a string by downcasting on the column type. +fn cell(arr: &dyn Array, row: usize) -> String { + use arrow_array::*; + use arrow_schema::DataType::*; + if arr.is_null(row) { + return "".to_string(); + } + macro_rules! d { + ($ty:ty) => { + arr.as_any().downcast_ref::<$ty>().unwrap().value(row).to_string() + }; + } + match arr.data_type() { + Boolean => d!(BooleanArray), + Int8 => d!(Int8Array), + Int16 => d!(Int16Array), + Int32 => d!(Int32Array), + Int64 => d!(Int64Array), + Float32 => d!(Float32Array), + Float64 => d!(Float64Array), + Date32 => d!(Date32Array), + Utf8 => arr.as_any().downcast_ref::().unwrap().value(row).to_string(), + _ => "?".to_string(), + } +} diff --git a/cli/src/input.rs b/cli/src/input.rs new file mode 100644 index 0000000..b6e3722 --- /dev/null +++ b/cli/src/input.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fs::File; +use std::io; +use std::path::Path; + +use paimon_mosaic_core::reader::InputFile; + +/// A read-only [`InputFile`] backed by a real file using positional reads. +/// +/// `read_exact_at` does not move a shared cursor, so concurrent calls from the +/// reader's coalescing threads are safe — satisfying the `Sync` bound. +pub struct FileInput { + file: File, + len: u64, +} + +impl FileInput { + pub fn open(path: &Path) -> io::Result { + let file = File::open(path)?; + let len = file.metadata()?.len(); + Ok(Self { file, len }) + } + + pub fn len(&self) -> u64 { + self.len + } +} + +impl InputFile for FileInput { + fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result<()> { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + self.file.read_exact_at(buf, offset) + } + #[cfg(windows)] + { + use std::os::windows::fs::FileExt; + let mut read = 0; + while read < buf.len() { + let n = self.file.seek_read(&mut buf[read..], offset + read as u64)?; + if n == 0 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read past end")); + } + read += n; + } + Ok(()) + } + } +} diff --git a/cli/src/main.rs b/cli/src/main.rs new file mode 100644 index 0000000..74b6918 --- /dev/null +++ b/cli/src/main.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod fmt; +mod input; + +use std::path::PathBuf; +use std::process::ExitCode; + +use arrow_array::RecordBatch; +use clap::{Parser, Subcommand}; +use paimon_mosaic_core::reader::{MosaicReader, ReaderAccess}; + +use crate::input::FileInput; + +/// Mosaic file inspector — the cat/meta/schema/pages toolkit (cf. parquet-cli). +#[derive(Parser)] +#[command(name = "mosaic", version, about)] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Subcommand)] +enum Cmd { + /// Print the column names, types, nullability and bucket assignment. + Schema { + file: PathBuf, + #[arg(long)] + json: bool, + }, + /// Print row-group / bucket / stats metadata. + Meta { + file: PathBuf, + #[arg(long)] + json: bool, + }, + /// Print per-column encoding and slot size for each row group. + Pages { + file: PathBuf, + #[arg(long)] + json: bool, + }, + /// Print the first N rows as a table. + Cat { + file: PathBuf, + /// Number of rows to print. + #[arg(short = 'n', long, default_value_t = 10)] + num: usize, + /// Comma-separated columns to project. + #[arg(short, long)] + columns: Option, + #[arg(long)] + json: bool, + }, +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + let res = match cli.cmd { + Cmd::Schema { file, json } => schema(&file, json), + Cmd::Meta { file, json } => meta(&file, json), + Cmd::Pages { file, json } => pages(&file, json), + Cmd::Cat { file, num, columns, json } => cat(&file, num, columns, json), + }; + match res { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + eprintln!("error: {e}"); + ExitCode::FAILURE + } + } +} + +fn open(file: &PathBuf) -> std::io::Result> { + let input = FileInput::open(file)?; + let len = input.len(); + MosaicReader::new(input, len) +} + +/// Columns in original (write) order rather than the name-sorted layout. +fn original_order(s: &paimon_mosaic_core::schema::MosaicSchema) -> Vec { + let mut by_sorted = vec![0usize; s.columns.len()]; + for (orig, &sorted) in s.original_order.iter().enumerate() { + by_sorted[sorted] = orig; + } + let mut cols: Vec = (0..s.columns.len()).collect(); + cols.sort_by_key(|&i| by_sorted[i]); + cols +} + +fn schema(file: &PathBuf, json: bool) -> std::io::Result<()> { + let reader = open(file)?; + let s = reader.schema(); + let cols = original_order(s); + if json { + let items: Vec = cols.iter().map(|&i| { + let c = &s.columns[i]; + format!("{{\"name\":{},\"type\":{},\"nullable\":{},\"bucket\":{}}}", + fmt::json_str(&c.name), fmt::json_str(&format!("{:?}", c.data_type)), c.nullable, c.bucket_id) + }).collect(); + println!("{{\"columns\":{},\"buckets\":{},\"fields\":[{}]}}", s.columns.len(), s.num_buckets, items.join(",")); + return Ok(()); + } + println!("{} columns, {} buckets", s.columns.len(), s.num_buckets); + for i in cols { + let c = &s.columns[i]; + let null = if c.nullable { "" } else { " not null" }; + println!(" {}: {:?}{} [bucket {}]", c.name, c.data_type, null, c.bucket_id); + } + Ok(()) +} + +fn meta(file: &PathBuf, json: bool) -> std::io::Result<()> { + let reader = open(file)?; + let s = reader.schema(); + let nrg = reader.num_row_groups(); + let total: usize = (0..nrg).map(|i| reader.row_group_num_rows(i).unwrap_or(0)).sum(); + if json { + let mut rgs = Vec::new(); + for rg in 0..nrg { + let st: Vec = reader.row_group_stats(rg)?.iter().map(|x| { + let mm = match (&x.min, &x.max) { + (Some(lo), Some(hi)) => format!(",\"min\":{},\"max\":{}", fmt::json_str(&fmt::render_value(lo)), fmt::json_str(&fmt::render_value(hi))), + _ => String::new(), + }; + format!("{{\"column\":{},\"nulls\":{}{}}}", fmt::json_str(&s.columns[x.column_index].name), x.null_count, mm) + }).collect(); + rgs.push(format!("{{\"rows\":{},\"stats\":[{}]}}", reader.row_group_num_rows(rg)?, st.join(","))); + } + println!("{{\"rows\":{},\"columns\":{},\"buckets\":{},\"row_groups\":[{}]}}", total, s.columns.len(), s.num_buckets, rgs.join(",")); + return Ok(()); + } + println!("file: {} rows, {} columns, {} buckets, {} row groups", total, s.columns.len(), s.num_buckets, nrg); + for rg in 0..nrg { + println!("row group {rg}: {} rows", reader.row_group_num_rows(rg)?); + for st in reader.row_group_stats(rg)? { + let mm = match (&st.min, &st.max) { + (Some(lo), Some(hi)) => format!("min={} max={}", fmt::render_value(lo), fmt::render_value(hi)), + _ => "no min/max".to_string(), + }; + println!(" {}: nulls={} {}", s.columns[st.column_index].name, st.null_count, mm); + } + } + Ok(()) +} + +fn pages(file: &PathBuf, json: bool) -> std::io::Result<()> { + let reader = open(file)?; + let s = reader.schema(); + let nrg = reader.num_row_groups(); + if json { + let mut rgs = Vec::new(); + for rg in 0..nrg { + let items: Vec = reader.page_infos(rg)?.iter().map(|p| { + format!("{{\"column\":{},\"bucket\":{},\"encoding\":{},\"slot_size\":{}}}", + fmt::json_str(&s.columns[p.column_index].name), p.bucket, fmt::json_str(fmt::encoding_name(p.encoding)), p.slot_size) + }).collect(); + rgs.push(format!("[{}]", items.join(","))); + } + println!("{{\"row_groups\":[{}]}}", rgs.join(",")); + return Ok(()); + } + for rg in 0..nrg { + println!("row group {rg}:"); + for p in reader.page_infos(rg)? { + let c = &s.columns[p.column_index]; + println!(" {}: bucket {} encoding={} slot={}B", c.name, p.bucket, fmt::encoding_name(p.encoding), p.slot_size); + } + } + Ok(()) +} + +fn cat(file: &PathBuf, num: usize, columns: Option, json: bool) -> std::io::Result<()> { + let mut reader = open(file)?; + if let Some(list) = &columns { + let names: Vec<&str> = list.split(',').map(|x| x.trim()).filter(|x| !x.is_empty()).collect(); + reader.project(&names)?; + } + let mut batches: Vec = Vec::new(); + let mut got = 0usize; + for rg in 0..reader.num_row_groups() { + if got >= num { + break; + } + let batch = reader.row_group_reader(rg)?.read_columns()?; + got += batch.num_rows(); + batches.push(batch); + } + if batches.is_empty() { + if !json { + println!("(no rows)"); + } + } else if json { + print!("{}", fmt::ndjson(&batches, num)); + } else { + print!("{}", fmt::pretty_table(&batches, num)); + } + Ok(()) +} diff --git a/core/src/bucket_reader.rs b/core/src/bucket_reader.rs index 115bae4..3675d5b 100644 --- a/core/src/bucket_reader.rs +++ b/core/src/bucket_reader.rs @@ -507,6 +507,11 @@ impl BucketReader { self.num_rows - null_count } + /// Per-column encoding ids (in this bucket's column order). See `spec::ENCODING_*`. + pub fn encodings(&self) -> &[u8] { + &self.encodings + } + pub fn read_all_columns(&self) -> io::Result> { let num_rows = self.num_rows; let mut result = Vec::with_capacity(self.num_columns); @@ -768,6 +773,11 @@ impl ColumnPageReader { } } + /// Encoding id of this column page. See `spec::ENCODING_*`. + pub fn encoding(&self) -> u8 { + self.encoding + } + pub fn read_all(&self) -> io::Result { let num_rows = self.num_rows; let variant = data_variant_for_type(&self.col_type); diff --git a/core/src/reader.rs b/core/src/reader.rs index e111b9d..40692d2 100644 --- a/core/src/reader.rs +++ b/core/src/reader.rs @@ -179,6 +179,16 @@ fn read_merged_ranges( Ok((merged, fetched)) } +/// Physical placement of one column within one row group. +pub struct PageInfo { + pub column_index: usize, + pub bucket: usize, + /// `spec::ENCODING_*`: plain / const / dict / all_null. + pub encoding: u8, + /// Paged-bucket on-disk slot size in bytes; 0 for monolithic/empty buckets. + pub slot_size: usize, +} + pub struct RowGroupMeta { pub num_rows: usize, pub bucket_offsets: Vec, @@ -447,6 +457,67 @@ impl MosaicReader { &self.input } + /// Per-column physical layout for a row group: bucket, encoding and on-disk + /// slot size. Reads and decompresses each non-empty bucket; used by tooling + /// (the `pages` command). Columns are reported in global (name-sorted) order. + pub fn page_infos(&self, rg_index: usize) -> io::Result> { + if rg_index >= self.row_group_metas.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "row group index out of range", + )); + } + let meta = &self.row_group_metas[rg_index]; + let mut out = Vec::with_capacity(self.schema.columns.len()); + for b in 0..self.num_buckets { + let globals = &self.schema.bucket_to_global[b]; + match meta.bucket_layouts[b] { + BucketLayout::Empty => { + for &gi in globals { + out.push(PageInfo { column_index: gi, bucket: b, encoding: ENCODING_ALL_NULL, slot_size: 0 }); + } + } + BucketLayout::Monolithic { compressed_size, uncompressed_size } => { + let buf = read_range(&self.input, meta.bucket_offsets[b], compressed_size)?; + let data = match self.compression { + COMPRESSION_NONE => buf, + COMPRESSION_ZSTD => zstd::bulk::decompress(&buf, uncompressed_size) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?, + _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "unsupported compression")), + }; + let col_types: Vec = globals.iter().map(|&gi| self.schema.columns[gi].data_type.clone()).collect(); + let reader = BucketReader::new(col_types, data, meta.num_rows)?; + for (local, &gi) in globals.iter().enumerate() { + out.push(PageInfo { column_index: gi, bucket: b, encoding: reader.encodings()[local], slot_size: 0 }); + } + } + BucketLayout::Paged { total_size } => { + let dir_size = globals.len() * 4; + let dir = read_range(&self.input, meta.bucket_offsets[b], dir_size)?; + let mut sizes = Vec::with_capacity(globals.len()); + for i in 0..globals.len() { + sizes.push(u32::from_le_bytes(dir[i * 4..i * 4 + 4].try_into().unwrap()) as usize); + } + let mut foff = meta.bucket_offsets[b] + dir_size as u64; + for (local, &gi) in globals.iter().enumerate() { + let enc = if sizes[local] == 0 { + ENCODING_ALL_NULL + } else { + let slot = read_range(&self.input, foff, sizes[local])?; + let ct = self.schema.columns[gi].data_type.clone(); + Self::parse_column_slot(&slot, &ct, meta.num_rows)?.encoding() + }; + out.push(PageInfo { column_index: gi, bucket: b, encoding: enc, slot_size: sizes[local] }); + foff += sizes[local] as u64; + } + let _ = total_size; + } + } + } + out.sort_by_key(|p| p.column_index); + Ok(out) + } + fn parse_column_slot( slot_data: &[u8], col_type: &DataType, From b173d18f4ab00221f21fc8169fff7bfd9b88804d Mon Sep 17 00:00:00 2001 From: mingfeng Date: Tue, 16 Jun 2026 04:42:48 -0700 Subject: [PATCH 02/12] test: cover page_infos encodings and CLI formatters Add a core regression test for MosaicReader::page_infos asserting plain/dict/const detection on a paged-bucket file, and CLI unit tests for the fmt helpers (json escaping, value/encoding rendering, ndjson null handling, table truncation). --- cli/src/fmt.rs | 57 ++++++++++++++++++++++++++++++++++++++++ core/src/reader_tests.rs | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs index cb85ce5..a6e8f9e 100644 --- a/cli/src/fmt.rs +++ b/cli/src/fmt.rs @@ -195,3 +195,60 @@ fn cell(arr: &dyn Array, row: usize) -> String { _ => "?".to_string(), } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn sample() -> RecordBatch { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ]); + RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec![Some("ann"), None])), + ], + ) + .unwrap() + } + + #[test] + fn json_str_escapes() { + assert_eq!(json_str("a\"b\n"), "\"a\\\"b\\n\""); + assert_eq!(json_str("x"), "\"x\""); + } + + #[test] + fn render_value_types() { + assert_eq!(render_value(&Value::Integer(5)), "5"); + assert_eq!(render_value(&Value::String(b"hi".to_vec())), "hi"); + assert_eq!(render_value(&Value::Null), "null"); + } + + #[test] + fn encoding_names() { + assert_eq!(encoding_name(0), "plain"); + assert_eq!(encoding_name(2), "dict"); + assert_eq!(encoding_name(3), "all_null"); + } + + #[test] + fn ndjson_renders_null_and_quotes() { + let out = ndjson(&[sample()], 10); + assert_eq!(out, "{\"id\":1,\"name\":\"ann\"}\n{\"id\":2,\"name\":null}\n"); + } + + #[test] + fn pretty_table_truncates_and_aligns() { + let t = pretty_table(&[sample()], 1); + assert!(t.contains("| id ")); + assert!(t.contains("| 1 ")); + assert!(!t.contains("| 2 ")); + } +} diff --git a/core/src/reader_tests.rs b/core/src/reader_tests.rs index 55eecee..fde6b55 100644 --- a/core/src/reader_tests.rs +++ b/core/src/reader_tests.rs @@ -4180,3 +4180,54 @@ fn test_row_group_num_rows_out_of_range() { assert!(reader.row_group_num_rows(1).is_err()); assert!(reader.row_group_num_rows(999).is_err()); } + +#[test] +fn test_page_infos_encodings() { + use crate::spec::{ENCODING_CONST, ENCODING_DICT, ENCODING_PLAIN}; + let columns = vec![ + ("id".to_string(), DataType::Int32, false), // unique -> plain + ("kind".to_string(), DataType::Utf8, true), // low cardinality -> dict + ("flag".to_string(), DataType::Int32, true), // constant -> const + ]; + let out = MemOutputFile::new(); + let mut writer = MosaicWriter::new( + out, + &columns_to_arrow_schema(&columns), + WriterOptions { + num_buckets: 3, + page_size_threshold: 1, // force paged buckets + ..Default::default() + }, + ) + .unwrap(); + + let rows: Vec> = (0..200) + .map(|i| { + vec![ + Value::Integer(i), + Value::String(["a", "b", "c"][(i % 3) as usize].as_bytes().to_vec()), + Value::Integer(7), + ] + }) + .collect(); + write_values(&mut writer, &columns, &rows); + writer.close().unwrap(); + let data = writer.output().buf.clone(); + let len = data.len() as u64; + let reader = MosaicReader::new(ByteArrayInputFile::new(data), len).unwrap(); + + let infos = reader.page_infos(0).unwrap(); + assert_eq!(infos.len(), 3); + // page_infos is sorted by column_index; name-sorted order: flag, id, kind + let by_name = |n: &str| { + infos + .iter() + .find(|p| reader.schema().columns[p.column_index].name == n) + .unwrap() + }; + assert_eq!(by_name("id").encoding, ENCODING_PLAIN); + assert_eq!(by_name("kind").encoding, ENCODING_DICT); + assert_eq!(by_name("flag").encoding, ENCODING_CONST); + assert!(by_name("id").slot_size > 0); + assert!(reader.page_infos(999).is_err()); +} From 963db7565a086859fc7f10b7709e21cd280c1172 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Tue, 16 Jun 2026 04:56:41 -0700 Subject: [PATCH 03/12] test: add zero-dep end-to-end tests for mosaic CLI Drive the mosaic binary against a fixture file (via CARGO_BIN_EXE) and assert stdout for schema/meta/pages/cat, --json output, projection, row truncation and missing-file failure. No external dev-deps. --- cli/tests/e2e.rs | 143 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 cli/tests/e2e.rs diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs new file mode 100644 index 0000000..3f19e21 --- /dev/null +++ b/cli/tests/e2e.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! End-to-end tests: drive the `mosaic` binary against a fixture file and +//! assert stdout. Zero external dev-deps — uses CARGO_BIN_EXE and std only. + +use std::fs::File; +use std::io::Write; +use std::process::Command; +use std::sync::Arc; + +use arrow_array::{Int32Array, RecordBatch, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions}; + +struct FileOut { + f: File, + pos: u64, +} +impl OutputFile for FileOut { + fn write(&mut self, d: &[u8]) -> std::io::Result<()> { + self.f.write_all(d)?; + self.pos += d.len() as u64; + Ok(()) + } + fn flush(&mut self) -> std::io::Result<()> { + self.f.flush() + } + fn pos(&self) -> u64 { + self.pos + } +} + +/// Write a small fixture and return its path under the test temp dir. +fn fixture(name: &str) -> String { + let path = format!("{}/mosaic_e2e_{}.mosaic", std::env::temp_dir().display(), name); + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("kind", DataType::Utf8, true), + Field::new("flag", DataType::Int32, true), + ]); + let out = FileOut { f: File::create(&path).unwrap(), pos: 0 }; + let opts = WriterOptions { + num_buckets: 3, + page_size_threshold: 1, + stats_columns: vec!["id".into()], + ..Default::default() + }; + let mut w = MosaicWriter::new(out, &schema, opts).unwrap(); + let n = 200; + let ids: Vec = (0..n).collect(); + let kinds: Vec<&str> = (0..n).map(|i| ["a", "b", "c"][(i % 3) as usize]).collect(); + let flags = vec![7; n as usize]; + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(StringArray::from(kinds)), + Arc::new(Int32Array::from(flags)), + ], + ) + .unwrap(); + w.write_batch(&batch).unwrap(); + w.close().unwrap(); + path +} + +fn run(args: &[&str]) -> (String, String, bool) { + let out = Command::new(env!("CARGO_BIN_EXE_mosaic")).args(args).output().unwrap(); + ( + String::from_utf8(out.stdout).unwrap(), + String::from_utf8(out.stderr).unwrap(), + out.status.success(), + ) +} + +#[test] +fn schema_lists_columns() { + let f = fixture("schema"); + let (out, _, ok) = run(&["schema", &f]); + assert!(ok); + assert!(out.contains("3 columns, 3 buckets")); + assert!(out.contains("id: Int32 not null")); + assert!(out.contains("kind: Utf8")); +} + +#[test] +fn meta_shows_stats() { + let f = fixture("meta"); + let (out, _, ok) = run(&["meta", &f]); + assert!(ok); + assert!(out.contains("200 rows")); + assert!(out.contains("id: nulls=0 min=0 max=199")); +} + +#[test] +fn pages_shows_encodings() { + let f = fixture("pages"); + let (out, _, ok) = run(&["pages", &f]); + assert!(ok); + assert!(out.contains("flag: bucket 0 encoding=const")); + assert!(out.contains("kind: bucket 2 encoding=dict")); +} + +#[test] +fn cat_truncates_and_projects() { + let f = fixture("cat"); + let (out, _, ok) = run(&["cat", &f, "-n", "2"]); + assert!(ok); + assert!(out.contains("| id | kind | flag |")); + assert_eq!(out.matches('\n').count(), 6); // 3 borders + header + 2 rows + let (proj, _, _) = run(&["cat", &f, "-c", "kind,id", "-n", "1"]); + assert!(proj.contains("| kind | id |")); +} + +#[test] +fn cat_json_is_ndjson() { + let f = fixture("json"); + let (out, _, ok) = run(&["cat", &f, "-n", "2", "--json"]); + assert!(ok); + assert_eq!(out, "{\"id\":0,\"kind\":\"a\",\"flag\":7}\n{\"id\":1,\"kind\":\"b\",\"flag\":7}\n"); +} + +#[test] +fn missing_file_fails() { + let (_, err, ok) = run(&["schema", "/no/such/file.mosaic"]); + assert!(!ok); + assert!(err.contains("error:")); +} From d4c3a71e99676fc9d470dd465dd03e91ae8c1946 Mon Sep 17 00:00:00 2001 From: jianguotian Date: Thu, 18 Jun 2026 00:43:03 -0700 Subject: [PATCH 04/12] docs: add CLI page + parquet-cli comparison Adds docs/cli.html documenting the mosaic inspector (schema/meta/pages/cat, text + JSON) with a parquet-cli command mapping and design-difference table, addressing the review asks on #66. Adds CLI to the nav across doc pages. --- docs/cli.html | 164 ++++++++++++++++++++++++ docs/cpp-api.html | 1 + docs/creating-a-release.html | 1 + docs/design.html | 1 + docs/index.html | 1 + docs/java-api.html | 1 + docs/python-api.html | 1 + docs/releases.html | 1 + docs/verifying-a-release-candidate.html | 1 + 9 files changed, 172 insertions(+) create mode 100644 docs/cli.html diff --git a/docs/cli.html b/docs/cli.html new file mode 100644 index 0000000..043ebee --- /dev/null +++ b/docs/cli.html @@ -0,0 +1,164 @@ + + + + + + + + CLI - Paimon Mosaic + + + + + +
+ + + +
+
+

CLI

+

A mosaic binary for inspecting files from the shell — schema, metadata, contents, and physical encoding, in text or JSON.

+ +

Install

+

+ The CLI is the cli/ workspace crate. Build the binary and put it on your + PATH: +

+
cargo build --release -p paimon-mosaic-cli
+cp target/release/mosaic /usr/local/bin/
+

Every command takes a file path and prints to stdout; add --json for machine-readable output.

+
# schema, metadata, encoding, and first rows
+mosaic schema file.mosaic
+mosaic meta   file.mosaic
+mosaic pages  file.mosaic
+mosaic cat    file.mosaic -n 20
+ +

schema

+

Column names, Arrow types, nullability, and bucket assignment. Columns are listed in original (write) order.

+
$ mosaic schema file.mosaic
+3 columns, 2 buckets
+  id: Int32 not null [bucket 0]
+  name: Utf8 [bucket 1]
+  age: Int32 [bucket 0]
+
$ mosaic schema file.mosaic --json
+{"columns":3,"buckets":2,"fields":[{"name":"id","type":"Int32","nullable":false,"bucket":0}, ...]}
+ +

meta

+

Row counts, row groups, and per-column statistics (null count and min/max for stats columns).

+
$ mosaic meta file.mosaic
+file: 5 rows, 3 columns, 2 buckets, 1 row groups
+row group 0: 5 rows
+    age: nulls=1 min=25 max=40
+    id: nulls=0 min=1 max=5
+    name: nulls=1 min=alice max=eve
+ +

pages

+

Per-column physical encoding and slot size, per row group. Encodings are const, plain, dict, or all_null.

+
$ mosaic pages file.mosaic
+row group 0:
+    age: bucket 0 encoding=plain slot=0B
+    id: bucket 0 encoding=plain slot=0B
+    name: bucket 1 encoding=plain slot=0B
+ +

cat

+

First N rows as a table. Project with -c/--columns; output NDJSON with --json.

+
$ mosaic cat file.mosaic -n 3
++----+-------+-----+
+| id | name  | age |
++----+-------+-----+
+| 1  | alice | 30  |
+| 2  | bob   | 25  |
+| 3  |       | 40  |
++----+-------+-----+
+
+$ mosaic cat file.mosaic -c name,id -n 1 --json
+{"name":"alice","id":1}
+ +

Options

+ + + + + + + + + +
CommandFlagDescription
all--jsonMachine-readable output (objects/NDJSON)
cat-n, --numRows to print (default 10)
cat-c, --columnsComma-separated columns to project and reorder
+ +

Comparison with parquet-cli

+

+ The command surface mirrors Apache Parquet's parquet-cli so the muscle memory carries over, + but the design is deliberately leaner. The mapping: +

+ + + + + + + + + + +
mosaicparquet-cliPurpose
schemaschemaColumns, types, nullability (+ bucket assignment)
metametaRow groups + per-column stats
catcat / headRows; -n caps like head
pagespages / column-indexPer-column encoding & slot size
+

Design differences worth noting:

+ + + + + + + + + + + +
Aspectparquet-climosaic
RuntimeJVM jarSingle static native binary, no runtime
JSON outputPer-command, uneven--json on every command
Physical unitColumn chunk per row groupBucket (the Mosaic column-group unit)
Column orderAs storedOriginal write order (disk is name-sorted; CLI un-sorts)
StatsAlways presentOnly opt-in stats columns carry min/max
+ +
+ JSON for pipelines + --json emits objects (schema/meta/pages) and NDJSON (cat), so you can pipe + straight into jq: mosaic meta f.mosaic --json | jq .row_groups[0].rows. +
+
+
+ + diff --git a/docs/cpp-api.html b/docs/cpp-api.html index 9b3fa4c..5ad8b23 100644 --- a/docs/cpp-api.html +++ b/docs/cpp-api.html @@ -42,6 +42,7 @@

Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/creating-a-release.html b/docs/creating-a-release.html index 8b69a4d..91750df 100644 --- a/docs/creating-a-release.html +++ b/docs/creating-a-release.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/design.html b/docs/design.html index 84c5e4f..4f142ec 100644 --- a/docs/design.html +++ b/docs/design.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/index.html b/docs/index.html index 55e9ec8..0c3ff7b 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/java-api.html b/docs/java-api.html index f654b99..b413bf8 100644 --- a/docs/java-api.html +++ b/docs/java-api.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/python-api.html b/docs/python-api.html index 7d50afe..e70b372 100644 --- a/docs/python-api.html +++ b/docs/python-api.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/releases.html b/docs/releases.html index 2e6b35b..b5dd083 100644 --- a/docs/releases.html +++ b/docs/releases.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • diff --git a/docs/verifying-a-release-candidate.html b/docs/verifying-a-release-candidate.html index 6f620de..b9c3ead 100644 --- a/docs/verifying-a-release-candidate.html +++ b/docs/verifying-a-release-candidate.html @@ -42,6 +42,7 @@

    Paimon Mosaic

  • Java API
  • Python API
  • C++ API
  • +
  • CLI
  • Releases
  • From 963504251a2d772780e4194fdf13f1b8023dae1c Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 03:26:28 -0700 Subject: [PATCH 05/12] docs: expand CLI page with meta example and parquet-cli design table --- docs/cli.html | 144 +++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 79 deletions(-) diff --git a/docs/cli.html b/docs/cli.html index 043ebee..e677af6 100644 --- a/docs/cli.html +++ b/docs/cli.html @@ -39,10 +39,10 @@

    Paimon Mosaic

    @@ -54,109 +54,95 @@

    Paimon Mosaic

    CLI

    -

    A mosaic binary for inspecting files from the shell — schema, metadata, contents, and physical encoding, in text or JSON.

    +

    Inspect Mosaic files from the terminal with the mosaic binary — schema, meta, cat and pages. A native, JVM-free toolkit modeled on parquet-cli.

    Install

    - The CLI is the cli/ workspace crate. Build the binary and put it on your - PATH: + The CLI lives in the cli/ directory and drives the read-only + MosaicReader API. Build and run from source, or install the binary:

    -
    cargo build --release -p paimon-mosaic-cli
    -cp target/release/mosaic /usr/local/bin/
    -

    Every command takes a file path and prints to stdout; add --json for machine-readable output.

    -
    # schema, metadata, encoding, and first rows
    -mosaic schema file.mosaic
    -mosaic meta   file.mosaic
    -mosaic pages  file.mosaic
    -mosaic cat    file.mosaic -n 20
    +
    # run from source
    +cargo run -p paimon-mosaic-cli -- schema data.mosaic
    +
    +# install the `mosaic` binary
    +cargo install --path cli
    +mosaic schema data.mosaic
    + +

    Commands

    + + + + + + + + + + +
    CommandShowsReads
    schemacolumn names, Arrow types, nullability, bucketfooter only
    metarow groups, rows, per-column stats (null/min/max)footer + index
    pagesper-column encoding + on-disk slot sizebucket data
    catfirst N rows as a tablecolumn data
    +

    All four accept --json. cat also takes -n <N> and -c a,b (projection).

    schema

    -

    Column names, Arrow types, nullability, and bucket assignment. Columns are listed in original (write) order.

    -
    $ mosaic schema file.mosaic
    -3 columns, 2 buckets
    -  id: Int32 not null [bucket 0]
    -  name: Utf8 [bucket 1]
    -  age: Int32 [bucket 0]
    -
    $ mosaic schema file.mosaic --json
    -{"columns":3,"buckets":2,"fields":[{"name":"id","type":"Int32","nullable":false,"bucket":0}, ...]}
    +
    $ mosaic schema data.mosaic
    +3 columns, 3 buckets
    +  id: Int32 not null [bucket 1]
    +  kind: Utf8 [bucket 2]
    +  flag: Int32 [bucket 0]

    meta

    -

    Row counts, row groups, and per-column statistics (null count and min/max for stats columns).

    -
    $ mosaic meta file.mosaic
    -file: 5 rows, 3 columns, 2 buckets, 1 row groups
    -row group 0: 5 rows
    -    age: nulls=1 min=25 max=40
    -    id: nulls=0 min=1 max=5
    -    name: nulls=1 min=alice max=eve
    +
    $ mosaic meta data.mosaic
    +file: 200 rows, 3 columns, 3 buckets, 1 row groups
    +row group 0: 200 rows
    +    id: nulls=0 min=0 max=199

    pages

    -

    Per-column physical encoding and slot size, per row group. Encodings are const, plain, dict, or all_null.

    -
    $ mosaic pages file.mosaic
    +            

    + Per-column physical encoding within each row group: plain, + const, dict or all_null, plus the on-disk slot size. +

    +
    $ mosaic pages data.mosaic
     row group 0:
    -    age: bucket 0 encoding=plain slot=0B
    -    id: bucket 0 encoding=plain slot=0B
    -    name: bucket 1 encoding=plain slot=0B
    + flag: bucket 0 encoding=const slot=0B + id: bucket 1 encoding=plain slot=8684B + kind: bucket 2 encoding=dict slot=31B

    cat

    -

    First N rows as a table. Project with -c/--columns; output NDJSON with --json.

    -
    $ mosaic cat file.mosaic -n 3
    -+----+-------+-----+
    -| id | name  | age |
    -+----+-------+-----+
    -| 1  | alice | 30  |
    -| 2  | bob   | 25  |
    -| 3  |       | 40  |
    -+----+-------+-----+
    +
    $ mosaic cat data.mosaic -n 2
    ++----+------+------+
    +| id | kind | flag |
    ++----+------+------+
    +| 0  | a    | 7    |
    +| 1  | b    | 7    |
    ++----+------+------+
     
    -$ mosaic cat file.mosaic -c name,id -n 1 --json
    -{"name":"alice","id":1}
    +$ mosaic cat data.mosaic -n 2 --json +{"id":0,"kind":"a","flag":7} +{"id":1,"kind":"b","flag":7}
    -

    Options

    - - - - - - - - - -
    CommandFlagDescription
    all--jsonMachine-readable output (objects/NDJSON)
    cat-n, --numRows to print (default 10)
    cat-c, --columnsComma-separated columns to project and reorder
    - -

    Comparison with parquet-cli

    +

    Design vs parquet-cli

    - The command surface mirrors Apache Parquet's parquet-cli so the muscle memory carries over, - but the design is deliberately leaner. The mapping: + Both follow the same shape — thin subcommands over a shared reader — but Mosaic + is a single Rust implementation of a private format, so the tool is intentionally + smaller and read-only.

    - - - - - - - - -
    mosaicparquet-cliPurpose
    schemaschemaColumns, types, nullability (+ bucket assignment)
    metametaRow groups + per-column stats
    catcat / headRows; -n caps like head
    pagespages / column-indexPer-column encoding & slot size
    -

    Design differences worth noting:

    - - - + - - - - - + + + + + +
    Aspectparquet-climosaic
    parquet-cli (Java)mosaic CLI
    RuntimeJVM jarSingle static native binary, no runtime
    JSON outputPer-command, uneven--json on every command
    Physical unitColumn chunk per row groupBucket (the Mosaic column-group unit)
    Column orderAs storedOriginal write order (disk is name-sorted; CLI un-sorts)
    StatsAlways presentOnly opt-in stats columns carry min/max
    Parserairline annotationsclap derive
    BaseBaseCommand + Hadoop FSMosaicReader + FileInput (pread)
    IOlocal / HDFS / S3local file
    Schemaprinted as Avroprinted as Arrow
    RuntimeJVM (hadoop jar)native, no JVM
    Scope22 cmds (view + convert + rewrite)4 view cmds
    - JSON for pipelines - --json emits objects (schema/meta/pages) and NDJSON (cat), so you can pipe - straight into jq: mosaic meta f.mosaic --json | jq .row_groups[0].rows. + Embedding instead + For C/C++ or Java callers, embed the format directly via the ffi + (mosaic.h) or jni crates rather than shelling out to this CLI.
    From c76ed0af1062f32cdd3461abb4ba499b6d5dd5a5 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 05:19:29 -0700 Subject: [PATCH 06/12] feat(cli): add head, footer, column-size, dictionary commands Align the viewer command set with parquet-cli/arrow-rs: head (alias of cat), footer (magic/version/buckets/compression), column-size (on-disk bytes per column), dictionary (dump dict-encoded entries). Core gains compression()/dict_values()/dictionary() read-only accessors. e2e tests cover the new commands. --- cli/src/main.rs | 83 +++++++++++++++++++++++++++++++++++++++ cli/tests/e2e.rs | 28 +++++++++++++ core/src/bucket_reader.rs | 10 +++++ core/src/reader.rs | 28 +++++++++++++ 4 files changed, 149 insertions(+) diff --git a/cli/src/main.rs b/cli/src/main.rs index 74b6918..895ffdd 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -67,6 +67,30 @@ enum Cmd { #[arg(long)] json: bool, }, + /// Print the first N rows (alias of cat). + Head { + file: PathBuf, + #[arg(short = 'n', long, default_value_t = 10)] + num: usize, + #[arg(short, long)] + columns: Option, + #[arg(long)] + json: bool, + }, + /// Print the file footer: version, buckets, compression, offsets. + Footer { + file: PathBuf, + #[arg(long)] + json: bool, + }, + /// Print on-disk bytes per column (summed over row groups). + ColumnSize { + file: PathBuf, + #[arg(long)] + json: bool, + }, + /// Print the dictionary of a dict-encoded column. + Dictionary { file: PathBuf, column: String }, } fn main() -> ExitCode { @@ -76,6 +100,10 @@ fn main() -> ExitCode { Cmd::Meta { file, json } => meta(&file, json), Cmd::Pages { file, json } => pages(&file, json), Cmd::Cat { file, num, columns, json } => cat(&file, num, columns, json), + Cmd::Head { file, num, columns, json } => cat(&file, num, columns, json), + Cmd::Footer { file, json } => footer(&file, json), + Cmd::ColumnSize { file, json } => column_size(&file, json), + Cmd::Dictionary { file, column } => dictionary(&file, &column), }; match res { Ok(()) => ExitCode::SUCCESS, @@ -212,3 +240,58 @@ fn cat(file: &PathBuf, num: usize, columns: Option, json: bool) -> std:: } Ok(()) } + +fn footer(file: &PathBuf, json: bool) -> std::io::Result<()> { + use paimon_mosaic_core::spec::{COMPRESSION_ZSTD, MAGIC, VERSION}; + let reader = open(file)?; + let s = reader.schema(); + let comp = if reader.compression() == COMPRESSION_ZSTD { "zstd" } else { "none" }; + let magic = std::str::from_utf8(&MAGIC).unwrap_or("MOSA"); + if json { + println!("{{\"magic\":{},\"version\":{},\"buckets\":{},\"row_groups\":{},\"compression\":{}}}", + fmt::json_str(magic), VERSION, s.num_buckets, reader.num_row_groups(), fmt::json_str(comp)); + } else { + println!("magic={} version={} buckets={} row_groups={} compression={}", + magic, VERSION, s.num_buckets, reader.num_row_groups(), comp); + } + Ok(()) +} + +fn column_size(file: &PathBuf, json: bool) -> std::io::Result<()> { + let reader = open(file)?; + let s = reader.schema(); + let mut bytes = vec![0usize; s.columns.len()]; + for rg in 0..reader.num_row_groups() { + for p in reader.page_infos(rg)? { + bytes[p.column_index] += p.slot_size; + } + } + let cols = original_order(s); + if json { + let items: Vec = cols.iter().map(|&i| format!("{{\"column\":{},\"bytes\":{}}}", fmt::json_str(&s.columns[i].name), bytes[i])).collect(); + println!("[{}]", items.join(",")); + } else { + for i in cols { + println!(" {}: {} B", s.columns[i].name, bytes[i]); + } + } + Ok(()) +} + +fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> { + let reader = open(file)?; + let col = reader.schema().columns.iter().position(|c| c.name == column) + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("column '{column}' not found")))?; + for rg in 0..reader.num_row_groups() { + match reader.dictionary(rg, col)? { + Some(vals) => { + println!("row group {rg}: {} entries", vals.len()); + for (i, v) in vals.iter().enumerate() { + println!(" {i}: {}", fmt::render_value(v)); + } + } + None => println!("row group {rg}: not dict-encoded"), + } + } + Ok(()) +} diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs index 3f19e21..5736ac4 100644 --- a/cli/tests/e2e.rs +++ b/cli/tests/e2e.rs @@ -141,3 +141,31 @@ fn missing_file_fails() { assert!(!ok); assert!(err.contains("error:")); } + +#[test] +fn footer_shows_format() { + let f = fixture("footer"); + let (out, _, ok) = run(&["footer", &f]); + assert!(ok); + assert!(out.contains("magic=MOSA")); + assert!(out.contains("buckets=3")); + assert!(out.contains("compression=zstd")); +} + +#[test] +fn dictionary_dumps_entries() { + let f = fixture("dict"); + let (out, _, ok) = run(&["dictionary", &f, "kind"]); + assert!(ok); + assert!(out.contains("3 entries")); + assert!(out.contains("a") && out.contains("b") && out.contains("c")); +} + +#[test] +fn column_size_sums_bytes() { + let f = fixture("size"); + let (out, _, ok) = run(&["column-size", &f]); + assert!(ok); + assert!(out.contains("id:") && out.contains("kind:")); + assert!(out.contains("flag: 0 B")); // const column has no slot +} diff --git a/core/src/bucket_reader.rs b/core/src/bucket_reader.rs index 3675d5b..2655ebb 100644 --- a/core/src/bucket_reader.rs +++ b/core/src/bucket_reader.rs @@ -512,6 +512,11 @@ impl BucketReader { &self.encodings } + /// Dictionary entries for one column (empty if it is not dict-encoded). + pub fn dict_values(&self, col: usize) -> &[Value] { + &self.dict_values[col] + } + pub fn read_all_columns(&self) -> io::Result> { let num_rows = self.num_rows; let mut result = Vec::with_capacity(self.num_columns); @@ -778,6 +783,11 @@ impl ColumnPageReader { self.encoding } + /// Dictionary entries for a dict-encoded page (empty for other encodings). + pub fn dict_values(&self) -> &[Value] { + &self.dict_values + } + pub fn read_all(&self) -> io::Result { let num_rows = self.num_rows; let variant = data_variant_for_type(&self.col_type); diff --git a/core/src/reader.rs b/core/src/reader.rs index 40692d2..fe9e014 100644 --- a/core/src/reader.rs +++ b/core/src/reader.rs @@ -457,6 +457,18 @@ impl MosaicReader { &self.input } + /// Footer compression code (`spec::COMPRESSION_*`). + pub fn compression(&self) -> u8 { + self.compression + } + + /// Dictionary entries for one column in one row group, or `None` if that + /// column is not dict-encoded there. Used by the `dictionary` command. + pub fn dictionary(&self, rg_index: usize, col: usize) -> io::Result>> { + let rg = self.row_group_reader_projected(rg_index, &[col])?; + Ok(rg.take_dictionary(col)) + } + /// Per-column physical layout for a row group: bucket, encoding and on-disk /// slot size. Reads and decompresses each non-empty bucket; used by tooling /// (the `pages` command). Columns are reported in global (name-sorted) order. @@ -1036,6 +1048,22 @@ impl RowGroupReader { } } + /// Dictionary entries for a projected column, or `None` if not dict-encoded. + pub fn take_dictionary(&self, global_col: usize) -> Option> { + let bucket = self.schema.columns[global_col].bucket_id; + let local = self.bucket_to_global[bucket].iter().position(|&g| g == global_col)?; + match self.bucket_states[bucket].as_ref()? { + BucketState::Paged { column_readers } => { + let d = column_readers[local].as_ref()?.dict_values(); + if d.is_empty() { None } else { Some(d.to_vec()) } + } + BucketState::Monolithic { reader } => { + let d = reader.dict_values(local); + if d.is_empty() { None } else { Some(d.to_vec()) } + } + } + } + pub fn num_rows(&self) -> usize { self.num_rows } From 20716c65590a84c3cb58761c327c0f3af647f22d Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 05:26:29 -0700 Subject: [PATCH 07/12] feat(cli): add buckets command for mosaic bucket layout Mosaic's column-bucket grouping has no parquet equivalent. Add a buckets command printing, per row group, each bucket's kind (empty/monolithic/paged), on-disk size and member columns. Core gains MosaicReader::bucket_infos(). e2e covered. --- cli/src/main.rs | 34 ++++++++++++++++++++++++++++++++++ cli/tests/e2e.rs | 10 ++++++++++ core/src/reader.rs | 29 +++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/cli/src/main.rs b/cli/src/main.rs index 895ffdd..3032fa5 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -91,6 +91,12 @@ enum Cmd { }, /// Print the dictionary of a dict-encoded column. Dictionary { file: PathBuf, column: String }, + /// Print bucket layout per row group (Mosaic's column grouping). + Buckets { + file: PathBuf, + #[arg(long)] + json: bool, + }, } fn main() -> ExitCode { @@ -104,6 +110,7 @@ fn main() -> ExitCode { Cmd::Footer { file, json } => footer(&file, json), Cmd::ColumnSize { file, json } => column_size(&file, json), Cmd::Dictionary { file, column } => dictionary(&file, &column), + Cmd::Buckets { file, json } => buckets(&file, json), }; match res { Ok(()) => ExitCode::SUCCESS, @@ -295,3 +302,30 @@ fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> { } Ok(()) } + +fn buckets(file: &PathBuf, json: bool) -> std::io::Result<()> { + let reader = open(file)?; + let s = reader.schema(); + let name = |i: usize| s.columns[i].name.clone(); + let mut rgs = Vec::new(); + for rg in 0..reader.num_row_groups() { + let infos = reader.bucket_infos(rg)?; + if json { + let items: Vec = infos.iter().map(|b| { + let cols: Vec = b.columns.iter().map(|&i| fmt::json_str(&name(i))).collect(); + format!("{{\"bucket\":{},\"kind\":{},\"size\":{},\"columns\":[{}]}}", b.bucket, fmt::json_str(b.kind), b.size, cols.join(",")) + }).collect(); + rgs.push(format!("[{}]", items.join(","))); + } else { + println!("row group {rg}:"); + for b in &infos { + let cols: Vec = b.columns.iter().map(|&i| name(i)).collect(); + println!(" bucket {}: {} {}B [{}]", b.bucket, b.kind, b.size, cols.join(", ")); + } + } + } + if json { + println!("{{\"row_groups\":[{}]}}", rgs.join(",")); + } + Ok(()) +} diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs index 5736ac4..5581843 100644 --- a/cli/tests/e2e.rs +++ b/cli/tests/e2e.rs @@ -169,3 +169,13 @@ fn column_size_sums_bytes() { assert!(out.contains("id:") && out.contains("kind:")); assert!(out.contains("flag: 0 B")); // const column has no slot } + +#[test] +fn buckets_show_layout() { + let f = fixture("buckets"); + let (out, _, ok) = run(&["buckets", &f]); + assert!(ok); + assert!(out.contains("row group 0:")); + assert!(out.contains("[flag]") && out.contains("[id]") && out.contains("[kind]")); + assert!(out.contains("monolithic") || out.contains("paged")); +} diff --git a/core/src/reader.rs b/core/src/reader.rs index fe9e014..ac89dfa 100644 --- a/core/src/reader.rs +++ b/core/src/reader.rs @@ -189,6 +189,17 @@ pub struct PageInfo { pub slot_size: usize, } +/// Layout of one bucket within one row group. +pub struct BucketInfo { + pub bucket: usize, + /// "empty" | "monolithic" | "paged". + pub kind: &'static str, + /// On-disk compressed size in bytes (0 for empty buckets). + pub size: usize, + /// Member column indices (global, name-sorted order). + pub columns: Vec, +} + pub struct RowGroupMeta { pub num_rows: usize, pub bucket_offsets: Vec, @@ -462,6 +473,24 @@ impl MosaicReader { self.compression } + /// Per-bucket layout for a row group: kind, on-disk size and member columns + /// (global indices). The bucket is Mosaic's defining structure — exposed for + /// the `buckets` command. + pub fn bucket_infos(&self, rg_index: usize) -> io::Result> { + if rg_index >= self.row_group_metas.len() { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "row group index out of range")); + } + let meta = &self.row_group_metas[rg_index]; + Ok((0..self.num_buckets).map(|b| { + let (kind, size) = match meta.bucket_layouts[b] { + BucketLayout::Empty => ("empty", 0), + BucketLayout::Monolithic { compressed_size, .. } => ("monolithic", compressed_size), + BucketLayout::Paged { total_size } => ("paged", total_size), + }; + BucketInfo { bucket: b, kind, size, columns: self.schema.bucket_to_global[b].clone() } + }).collect()) + } + /// Dictionary entries for one column in one row group, or `None` if that /// column is not dict-encoded there. Used by the `dictionary` command. pub fn dictionary(&self, rg_index: usize, col: usize) -> io::Result>> { From 0a64296815cd2f537541234fca267d3925211c67 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 05:43:27 -0700 Subject: [PATCH 08/12] feat(cli): dictionary takes -c/--column to match parquet-cli Align dictionary column selection with parquet-cli's -c flag instead of a positional argument; update e2e. --- cli/src/main.rs | 7 ++++++- cli/tests/e2e.rs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 3032fa5..a7980b0 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -90,7 +90,12 @@ enum Cmd { json: bool, }, /// Print the dictionary of a dict-encoded column. - Dictionary { file: PathBuf, column: String }, + Dictionary { + file: PathBuf, + /// Column name to dump. + #[arg(short = 'c', long)] + column: String, + }, /// Print bucket layout per row group (Mosaic's column grouping). Buckets { file: PathBuf, diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs index 5581843..c417a8b 100644 --- a/cli/tests/e2e.rs +++ b/cli/tests/e2e.rs @@ -155,7 +155,7 @@ fn footer_shows_format() { #[test] fn dictionary_dumps_entries() { let f = fixture("dict"); - let (out, _, ok) = run(&["dictionary", &f, "kind"]); + let (out, _, ok) = run(&["dictionary", &f, "-c", "kind"]); assert!(ok); assert!(out.contains("3 entries")); assert!(out.contains("a") && out.contains("b") && out.contains("c")); From 550d59674f42d07835f7a42c8c32223279989b61 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 05:55:10 -0700 Subject: [PATCH 09/12] feat(cli): add --json to dictionary Completes JSON output across all 9 commands; dict columns emit an array, non-dict row groups emit null. e2e extended. --- cli/src/main.rs | 20 ++++++++++++++++++-- cli/tests/e2e.rs | 3 +++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index a7980b0..5806386 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -95,6 +95,8 @@ enum Cmd { /// Column name to dump. #[arg(short = 'c', long)] column: String, + #[arg(long)] + json: bool, }, /// Print bucket layout per row group (Mosaic's column grouping). Buckets { @@ -114,7 +116,7 @@ fn main() -> ExitCode { Cmd::Head { file, num, columns, json } => cat(&file, num, columns, json), Cmd::Footer { file, json } => footer(&file, json), Cmd::ColumnSize { file, json } => column_size(&file, json), - Cmd::Dictionary { file, column } => dictionary(&file, &column), + Cmd::Dictionary { file, column, json } => dictionary(&file, &column, json), Cmd::Buckets { file, json } => buckets(&file, json), }; match res { @@ -290,10 +292,24 @@ fn column_size(file: &PathBuf, json: bool) -> std::io::Result<()> { Ok(()) } -fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> { +fn dictionary(file: &PathBuf, column: &str, json: bool) -> std::io::Result<()> { let reader = open(file)?; let col = reader.schema().columns.iter().position(|c| c.name == column) .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("column '{column}' not found")))?; + if json { + let mut rgs = Vec::new(); + for rg in 0..reader.num_row_groups() { + match reader.dictionary(rg, col)? { + Some(vals) => { + let e: Vec = vals.iter().map(|v| fmt::json_str(&fmt::render_value(v))).collect(); + rgs.push(format!("[{}]", e.join(","))); + } + None => rgs.push("null".to_string()), + } + } + println!("{{\"column\":{},\"row_groups\":[{}]}}", fmt::json_str(column), rgs.join(",")); + return Ok(()); + } for rg in 0..reader.num_row_groups() { match reader.dictionary(rg, col)? { Some(vals) => { diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs index c417a8b..3a078e3 100644 --- a/cli/tests/e2e.rs +++ b/cli/tests/e2e.rs @@ -159,6 +159,9 @@ fn dictionary_dumps_entries() { assert!(ok); assert!(out.contains("3 entries")); assert!(out.contains("a") && out.contains("b") && out.contains("c")); + let (j, _, ok) = run(&["dictionary", &f, "-c", "kind", "--json"]); + assert!(ok); + assert_eq!(j, "{\"column\":\"kind\",\"row_groups\":[[\"a\",\"b\",\"c\"]]}\n"); } #[test] From 04839b69562bf4ad2591f98bb792aab1e9914521 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 05:59:28 -0700 Subject: [PATCH 10/12] docs: document all 9 commands; remove parquet comparison Expand docs/cli.html and cli/README.md to cover every command (schema/meta/footer/buckets/pages/dictionary/column-size/cat/head) with usage and example output. Drop all comparison content per maintainer preference. --- cli/README.md | 72 +++++++++++++++++++++++++++++ docs/cli.html | 126 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 147 insertions(+), 51 deletions(-) create mode 100644 cli/README.md diff --git a/cli/README.md b/cli/README.md new file mode 100644 index 0000000..fc38491 --- /dev/null +++ b/cli/README.md @@ -0,0 +1,72 @@ + + +# mosaic CLI + +A native command-line inspector for Mosaic files. It drives the read-only +`MosaicReader` API, so it needs no JVM and ships as a single native binary. + +## Build & run + +```bash +cargo run -p paimon-mosaic-cli -- # from source +cargo install --path cli # install `mosaic` +mosaic +``` + +## Commands + +| Command | Shows | Reads | +|---------|-------|-------| +| `schema` | column names, Arrow types, nullability, bucket | footer only | +| `meta` | row groups, rows, per-column stats (null/min/max) | footer + index | +| `footer` | magic, version, buckets, compression | footer only | +| `buckets`| per-bucket layout and member columns | footer + index | +| `pages` | per-column encoding + on-disk slot size | bucket data | +| `dictionary` | dictionary entries of a dict column (`-c`) | bucket data | +| `column-size` | on-disk bytes per column | footer + index | +| `cat` / `head` | first N rows as a table | column data | + +Every command accepts `--json`. `cat`/`head` take `-n ` and `-c a,b` +(projection); `dictionary` takes `-c `. + +```text +$ mosaic schema data.mosaic +5 columns, 4 buckets + id: Int32 not null [bucket 0] + name: Utf8 [bucket 2] + kind: Utf8 [bucket 1] + +$ mosaic buckets data.mosaic +row group 0: + bucket 0: paged 373B [flag, id] + bucket 1: paged 32B [kind] + +$ mosaic pages data.mosaic +row group 0: + flag: bucket 0 encoding=const slot=16B + kind: bucket 1 encoding=dict slot=28B + +$ mosaic cat data.mosaic -n 2 --json +{"id":0,"name":"user_0","kind":"a","score":0,"flag":7} +{"id":1,"name":"user_1","kind":"b","score":1.5,"flag":7} +``` + +For C/C++ or Java callers, embed the format directly via the `ffi` +(`mosaic.h`) or `jni` crates rather than shelling out to this CLI. diff --git a/docs/cli.html b/docs/cli.html index e677af6..6e7e7dd 100644 --- a/docs/cli.html +++ b/docs/cli.html @@ -54,13 +54,9 @@

    Paimon Mosaic

    CLI

    -

    Inspect Mosaic files from the terminal with the mosaic binary — schema, meta, cat and pages. A native, JVM-free toolkit modeled on parquet-cli.

    +

    Inspect Mosaic files from the terminal with the mosaic binary. A native, JVM-free toolkit driving the read-only MosaicReader API.

    Install

    -

    - The CLI lives in the cli/ directory and drives the read-only - MosaicReader API. Build and run from source, or install the binary: -

    # run from source
     cargo run -p paimon-mosaic-cli -- schema data.mosaic
     
    @@ -75,69 +71,97 @@ 

    Commands

    schemacolumn names, Arrow types, nullability, bucketfooter only - metarow groups, rows, per-column stats (null/min/max)footer + index - pagesper-column encoding + on-disk slot sizebucket data - catfirst N rows as a tablecolumn data + metarow groups, rows, per-column statsfooter + index + footermagic, version, buckets, compressionfooter only + bucketsper-bucket layout and member columnsfooter + index + pagesper-column encoding + slot sizebucket data + dictionarydictionary entries of a dict columnbucket data + column-sizeon-disk bytes per columnfooter + index + cat / headfirst N rows as a tablecolumn data -

    All four accept --json. cat also takes -n <N> and -c a,b (projection).

    +

    Every command accepts --json. cat/head take -n <N> and -c a,b (projection); dictionary takes -c <col>.

    schema

    +

    Columns, Arrow types, nullability and bucket assignment, in original input order. Footer only.

    $ mosaic schema data.mosaic
    -3 columns, 3 buckets
    -  id: Int32 not null [bucket 1]
    -  kind: Utf8 [bucket 2]
    -  flag: Int32 [bucket 0]
    +5 columns, 4 buckets + id: Int32 not null [bucket 0] + name: Utf8 [bucket 2] + kind: Utf8 [bucket 1] + score: Float64 [bucket 3] + flag: Int32 [bucket 0] + +$ mosaic schema data.mosaic --json +{"columns":5,"buckets":4,"fields":[{"name":"id","type":"Int32","nullable":false,"bucket":0}, ...]}

    meta

    +

    Total rows, row groups, and per-column stats (null count / min / max) for columns configured with stats.

    $ mosaic meta data.mosaic
    -file: 200 rows, 3 columns, 3 buckets, 1 row groups
    +file: 200 rows, 5 columns, 4 buckets, 1 row groups
     row group 0: 200 rows
    -    id: nulls=0 min=0 max=199
    + id: nulls=0 min=0 max=199 + score: nulls=0 min=0 max=298.5 + +

    footer

    +

    The 32-byte file footer: magic, format version, bucket count, row groups and compression.

    +
    $ mosaic footer data.mosaic
    +magic=MOSA version=1 buckets=4 row_groups=1 compression=zstd
    + +

    buckets

    +

    Per row group, each bucket's layout (empty / monolithic / paged), on-disk size and member columns. Mosaic groups columns into buckets by name order.

    +
    $ mosaic buckets data.mosaic
    +row group 0:
    +    bucket 0: paged 373B [flag, id]
    +    bucket 1: paged 32B [kind]
    +    bucket 2: paged 220B [name]
    +    bucket 3: paged 542B [score]

    pages

    -

    - Per-column physical encoding within each row group: plain, - const, dict or all_null, plus the on-disk slot size. -

    +

    Per-column physical encoding (plain / const / dict / all_null) and on-disk slot size.

    $ mosaic pages data.mosaic
     row group 0:
    -    flag: bucket 0 encoding=const slot=0B
    -    id: bucket 1 encoding=plain slot=8684B
    -    kind: bucket 2 encoding=dict slot=31B
    + flag: bucket 0 encoding=const slot=16B + id: bucket 0 encoding=plain slot=349B + kind: bucket 1 encoding=dict slot=28B + name: bucket 2 encoding=plain slot=216B + score: bucket 3 encoding=plain slot=538B + +

    dictionary

    +

    Dump the dictionary of a dict-encoded column. Non-dict columns report as such.

    +
    $ mosaic dictionary data.mosaic -c kind
    +row group 0: 3 entries
    +    0: a
    +    1: b
    +    2: c
    +
    +$ mosaic dictionary data.mosaic -c kind --json
    +{"column":"kind","row_groups":[["a","b","c"]]}
    + +

    column-size

    +

    On-disk bytes per column, summed across row groups.

    +
    $ mosaic column-size data.mosaic
    +  id: 349 B
    +  name: 216 B
    +  kind: 28 B
    +  score: 538 B
    +  flag: 16 B
    -

    cat

    +

    cat / head

    +

    Read the first N rows as a table. -n sets the count, -c projects columns, --json emits newline-delimited JSON. head is an alias of cat.

    $ mosaic cat data.mosaic -n 2
    -+----+------+------+
    -| id | kind | flag |
    -+----+------+------+
    -| 0  | a    | 7    |
    -| 1  | b    | 7    |
    -+----+------+------+
    ++----+--------+------+-------+------+
    +| id | name   | kind | score | flag |
    ++----+--------+------+-------+------+
    +| 0  | user_0 | a    | 0     | 7    |
    +| 1  | user_1 | b    | 1.5   | 7    |
    ++----+--------+------+-------+------+
    +
    +$ mosaic cat data.mosaic -n 2 -c name,score   # projection
     
     $ mosaic cat data.mosaic -n 2 --json
    -{"id":0,"kind":"a","flag":7}
    -{"id":1,"kind":"b","flag":7}
    - -

    Design vs parquet-cli

    -

    - Both follow the same shape — thin subcommands over a shared reader — but Mosaic - is a single Rust implementation of a private format, so the tool is intentionally - smaller and read-only. -

    - - - - - - - - - - - - -
    parquet-cli (Java)mosaic CLI
    Parserairline annotationsclap derive
    BaseBaseCommand + Hadoop FSMosaicReader + FileInput (pread)
    IOlocal / HDFS / S3local file
    Schemaprinted as Avroprinted as Arrow
    RuntimeJVM (hadoop jar)native, no JVM
    Scope22 cmds (view + convert + rewrite)4 view cmds
    +{"id":0,"name":"user_0","kind":"a","score":0,"flag":7} +{"id":1,"name":"user_1","kind":"b","score":1.5,"flag":7}
    Embedding instead From bb0a0fc88c1f53fd0a74dd831fa236a3c43c2e72 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 06:08:36 -0700 Subject: [PATCH 11/12] test: drop trivial encoding_names unit; assert footer/buckets --json Remove the near-trivial encoding_names mapping test; extend footer and buckets e2e to cover their --json output, improving CLI feature coverage. --- cli/src/fmt.rs | 7 ------- cli/tests/e2e.rs | 6 ++++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs index a6e8f9e..0eb914e 100644 --- a/cli/src/fmt.rs +++ b/cli/src/fmt.rs @@ -231,13 +231,6 @@ mod tests { assert_eq!(render_value(&Value::Null), "null"); } - #[test] - fn encoding_names() { - assert_eq!(encoding_name(0), "plain"); - assert_eq!(encoding_name(2), "dict"); - assert_eq!(encoding_name(3), "all_null"); - } - #[test] fn ndjson_renders_null_and_quotes() { let out = ndjson(&[sample()], 10); diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs index 3a078e3..f630503 100644 --- a/cli/tests/e2e.rs +++ b/cli/tests/e2e.rs @@ -150,6 +150,9 @@ fn footer_shows_format() { assert!(out.contains("magic=MOSA")); assert!(out.contains("buckets=3")); assert!(out.contains("compression=zstd")); + let (j, _, ok) = run(&["footer", &f, "--json"]); + assert!(ok); + assert!(j.contains("\"magic\":\"MOSA\"") && j.contains("\"compression\":\"zstd\"")); } #[test] @@ -181,4 +184,7 @@ fn buckets_show_layout() { assert!(out.contains("row group 0:")); assert!(out.contains("[flag]") && out.contains("[id]") && out.contains("[kind]")); assert!(out.contains("monolithic") || out.contains("paged")); + let (j, _, ok) = run(&["buckets", &f, "--json"]); + assert!(ok); + assert!(j.contains("\"bucket\":0") && j.contains("\"columns\":")); } From 924ac4b7b771b84037e918a8fa837924589ec066 Mon Sep 17 00:00:00 2001 From: mingfeng Date: Mon, 22 Jun 2026 06:11:34 -0700 Subject: [PATCH 12/12] chore(cli): drop redundant gen.rs example The e2e tests carry their own fixture writer; the standalone gen.rs example duplicated it and was unreferenced. --- cli/examples/gen.rs | 54 --------------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 cli/examples/gen.rs diff --git a/cli/examples/gen.rs b/cli/examples/gen.rs deleted file mode 100644 index b117018..0000000 --- a/cli/examples/gen.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Generates a tiny mosaic file for CLI verification. -use std::fs::File; -use std::io::Write; -use std::sync::Arc; - -use arrow_array::{Int32Array, RecordBatch, StringArray}; -use arrow_schema::{DataType, Field, Schema}; -use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions}; - -struct FileOut { - f: File, - pos: u64, -} -impl OutputFile for FileOut { - fn write(&mut self, d: &[u8]) -> std::io::Result<()> { - self.f.write_all(d)?; - self.pos += d.len() as u64; - Ok(()) - } - fn flush(&mut self) -> std::io::Result<()> { - self.f.flush() - } - fn pos(&self) -> u64 { - self.pos - } -} - -fn main() { - let path = std::env::args().nth(1).unwrap_or_else(|| "/tmp/sample.mosaic".into()); - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("age", DataType::Int32, true), - ]); - let out = FileOut { f: File::create(&path).unwrap(), pos: 0 }; - let opts = WriterOptions { - num_buckets: 2, - stats_columns: vec!["id".into(), "name".into(), "age".into()], - ..Default::default() - }; - let mut w = MosaicWriter::new(out, &schema, opts).unwrap(); - let batch = RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), - Arc::new(StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dan"), Some("eve")])), - Arc::new(Int32Array::from(vec![Some(30), Some(25), Some(40), None, Some(28)])), - ], - ) - .unwrap(); - w.write_batch(&batch).unwrap(); - w.close().unwrap(); - println!("wrote {path}"); -}