From 947bb61ade3b7bb6237ec22172847dfd9d1de9a5 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Tue, 16 Jun 2026 04:22:58 -0700
Subject: [PATCH 01/12] feat(cli): add mosaic inspector CLI
 (schema/meta/cat/pages)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mosaic previously shipped no viewer tooling — inspecting a file meant
writing Rust against the library API. Add a `mosaic` binary (a new `cli`
workspace crate) mirroring parquet-cli:

- schema: column names, Arrow types, nullability, bucket assignment
- meta:   row groups, rows, per-column stats (null_count/min/max)
- cat:    first N rows as a table, with -n and --columns projection
- pages:  per-column encoding (plain/const/dict/all_null) + slot size

All commands support --json. The reader is driven over a new file-backed
InputFile (pread). Core gains three small read-only accessors used by
`pages`: BucketReader::encodings(), ColumnPageReader::encoding(), and
MosaicReader::page_infos(). No format/behavior change; 199 core tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 Cargo.toml                |   2 +-
 cli/Cargo.toml            |  33 ++++++
 cli/examples/gen.rs       |  54 ++++++++++
 cli/src/fmt.rs            | 197 +++++++++++++++++++++++++++++++++++
 cli/src/input.rs          |  66 ++++++++++++
 cli/src/main.rs           | 214 ++++++++++++++++++++++++++++++++++++++
 core/src/bucket_reader.rs |  10 ++
 core/src/reader.rs        |  71 +++++++++++++
 8 files changed, 646 insertions(+), 1 deletion(-)
 create mode 100644 cli/Cargo.toml
 create mode 100644 cli/examples/gen.rs
 create mode 100644 cli/src/fmt.rs
 create mode 100644 cli/src/input.rs
 create mode 100644 cli/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 240d079..00133bb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@
 # under the License.
 
 [workspace]
-members = ["core", "ffi", "jni"]
+members = ["core", "ffi", "jni", "cli"]
 resolver = "2"
 
 [profile.release]
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
new file mode 100644
index 0000000..97485ab
--- /dev/null
+++ b/cli/Cargo.toml
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "paimon-mosaic-cli"
+version = "0.2.0"
+edition = "2021"
+description = "Mosaic file format — command line inspector (cat/meta/schema)"
+license = "Apache-2.0"
+
+[[bin]]
+name = "mosaic"
+path = "src/main.rs"
+
+[dependencies]
+paimon-mosaic-core = { path = "../core" }
+arrow-array = "58"
+arrow-schema = "58"
+clap = { version = "4", features = ["derive"] }
diff --git a/cli/examples/gen.rs b/cli/examples/gen.rs
new file mode 100644
index 0000000..b117018
--- /dev/null
+++ b/cli/examples/gen.rs
@@ -0,0 +1,54 @@
+// Generates a tiny mosaic file for CLI verification.
+use std::fs::File;
+use std::io::Write;
+use std::sync::Arc;
+
+use arrow_array::{Int32Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions};
+
+struct FileOut {
+    f: File,
+    pos: u64,
+}
+impl OutputFile for FileOut {
+    fn write(&mut self, d: &[u8]) -> std::io::Result<()> {
+        self.f.write_all(d)?;
+        self.pos += d.len() as u64;
+        Ok(())
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.f.flush()
+    }
+    fn pos(&self) -> u64 {
+        self.pos
+    }
+}
+
+fn main() {
+    let path = std::env::args().nth(1).unwrap_or_else(|| "/tmp/sample.mosaic".into());
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, true),
+        Field::new("age", DataType::Int32, true),
+    ]);
+    let out = FileOut { f: File::create(&path).unwrap(), pos: 0 };
+    let opts = WriterOptions {
+        num_buckets: 2,
+        stats_columns: vec!["id".into(), "name".into(), "age".into()],
+        ..Default::default()
+    };
+    let mut w = MosaicWriter::new(out, &schema, opts).unwrap();
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
+            Arc::new(StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dan"), Some("eve")])),
+            Arc::new(Int32Array::from(vec![Some(30), Some(25), Some(40), None, Some(28)])),
+        ],
+    )
+    .unwrap();
+    w.write_batch(&batch).unwrap();
+    w.close().unwrap();
+    println!("wrote {path}");
+}
diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs
new file mode 100644
index 0000000..cb85ce5
--- /dev/null
+++ b/cli/src/fmt.rs
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::{Array, RecordBatch};
+use paimon_mosaic_core::values::Value;
+
+/// Render a stats min/max [`Value`] to a short, human-readable string.
+pub fn render_value(v: &Value) -> String {
+    match v {
+        Value::Null => "null".to_string(),
+        Value::Boolean(b) => b.to_string(),
+        Value::TinyInt(x) => x.to_string(),
+        Value::SmallInt(x) => x.to_string(),
+        Value::Integer(x) => x.to_string(),
+        Value::BigInt(x) => x.to_string(),
+        Value::Float(x) => x.to_string(),
+        Value::Double(x) => x.to_string(),
+        Value::Date(x) => format!("{} (epoch-day)", x),
+        Value::Time(x) => format!("{} (ms)", x),
+        Value::String(b) => String::from_utf8_lossy(b).into_owned(),
+        Value::Bytes(b) | Value::DecimalLarge(b) => format!("0x{}", hex(b)),
+        Value::DecimalCompact(x) => x.to_string(),
+        Value::TimestampMillis(x) => format!("{} (ms)", x),
+        Value::TimestampMicros(x) => format!("{} (us)", x),
+        Value::TimestampNanos { millis, nanos_of_milli } => {
+            format!("{}ms+{}ns", millis, nanos_of_milli)
+        }
+    }
+}
+
+fn hex(b: &[u8]) -> String {
+    b.iter().map(|x| format!("{:02x}", x)).collect()
+}
+
+/// Human-readable encoding name for a `spec::ENCODING_*` id.
+pub fn encoding_name(e: u8) -> &'static str {
+    use paimon_mosaic_core::spec::*;
+    match e {
+        ENCODING_PLAIN => "plain",
+        ENCODING_CONST => "const",
+        ENCODING_DICT => "dict",
+        ENCODING_ALL_NULL => "all_null",
+        _ => "?",
+    }
+}
+
+/// Escape a string as a JSON string literal (quotes included).
+pub fn json_str(s: &str) -> String {
+    let mut o = String::with_capacity(s.len() + 2);
+    o.push('"');
+    for c in s.chars() {
+        match c {
+            '"' => o.push_str("\\\""),
+            '\\' => o.push_str("\\\\"),
+            '\n' => o.push_str("\\n"),
+            '\r' => o.push_str("\\r"),
+            '\t' => o.push_str("\\t"),
+            c if (c as u32) < 0x20 => o.push_str(&format!("\\u{:04x}", c as u32)),
+            c => o.push(c),
+        }
+    }
+    o.push('"');
+    o
+}
+
+/// Pretty-print a slice of record batches as an aligned ASCII table.
+pub fn pretty_table(batches: &[RecordBatch], max_rows: usize) -> String {
+    let schema = batches[0].schema();
+    let headers: Vec<String> = schema.fields().iter().map(|f| f.name().clone()).collect();
+    let ncols = headers.len();
+
+    let mut rows: Vec<Vec<String>> = Vec::new();
+    'outer: for batch in batches {
+        for r in 0..batch.num_rows() {
+            if rows.len() >= max_rows {
+                break 'outer;
+            }
+            let mut row = Vec::with_capacity(ncols);
+            for c in 0..ncols {
+                row.push(cell(batch.column(c).as_ref(), r));
+            }
+            rows.push(row);
+        }
+    }
+
+    let mut widths: Vec<usize> = headers.iter().map(|h| h.chars().count()).collect();
+    for row in &rows {
+        for (i, v) in row.iter().enumerate() {
+            widths[i] = widths[i].max(v.chars().count());
+        }
+    }
+
+    let sep = |out: &mut String| {
+        out.push('+');
+        for w in &widths {
+            out.push_str(&"-".repeat(w + 2));
+            out.push('+');
+        }
+        out.push('\n');
+    };
+    let line = |out: &mut String, cells: &[String]| {
+        out.push('|');
+        for (i, c) in cells.iter().enumerate() {
+            out.push_str(&format!(" {:<w$} |", c, w = widths[i]));
+        }
+        out.push('\n');
+    };
+
+    let mut out = String::new();
+    sep(&mut out);
+    line(&mut out, &headers);
+    sep(&mut out);
+    for row in &rows {
+        line(&mut out, row);
+    }
+    sep(&mut out);
+    out
+}
+
+/// Render up to `max_rows` as newline-delimited JSON objects.
+pub fn ndjson(batches: &[RecordBatch], max_rows: usize) -> String {
+    let schema = batches[0].schema();
+    let names: Vec<String> = schema.fields().iter().map(|f| f.name().clone()).collect();
+    let mut out = String::new();
+    let mut got = 0usize;
+    'outer: for batch in batches {
+        for r in 0..batch.num_rows() {
+            if got >= max_rows {
+                break 'outer;
+            }
+            out.push('{');
+            for (c, name) in names.iter().enumerate() {
+                if c > 0 {
+                    out.push(',');
+                }
+                out.push_str(&json_str(name));
+                out.push(':');
+                out.push_str(&cell_json(batch.column(c).as_ref(), r));
+            }
+            out.push_str("}\n");
+            got += 1;
+        }
+    }
+    out
+}
+
+/// Render one Arrow cell as a JSON value (numbers bare, strings quoted, null).
+fn cell_json(arr: &dyn Array, row: usize) -> String {
+    use arrow_schema::DataType::*;
+    if arr.is_null(row) {
+        return "null".to_string();
+    }
+    match arr.data_type() {
+        Utf8 | Date32 => json_str(&cell(arr, row)),
+        _ => cell(arr, row),
+    }
+}
+
+/// Render one Arrow cell to a string by downcasting on the column type.
+fn cell(arr: &dyn Array, row: usize) -> String {
+    use arrow_array::*;
+    use arrow_schema::DataType::*;
+    if arr.is_null(row) {
+        return "".to_string();
+    }
+    macro_rules! d {
+        ($ty:ty) => {
+            arr.as_any().downcast_ref::<$ty>().unwrap().value(row).to_string()
+        };
+    }
+    match arr.data_type() {
+        Boolean => d!(BooleanArray),
+        Int8 => d!(Int8Array),
+        Int16 => d!(Int16Array),
+        Int32 => d!(Int32Array),
+        Int64 => d!(Int64Array),
+        Float32 => d!(Float32Array),
+        Float64 => d!(Float64Array),
+        Date32 => d!(Date32Array),
+        Utf8 => arr.as_any().downcast_ref::<StringArray>().unwrap().value(row).to_string(),
+        _ => "?".to_string(),
+    }
+}
diff --git a/cli/src/input.rs b/cli/src/input.rs
new file mode 100644
index 0000000..b6e3722
--- /dev/null
+++ b/cli/src/input.rs
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fs::File;
+use std::io;
+use std::path::Path;
+
+use paimon_mosaic_core::reader::InputFile;
+
+/// A read-only [`InputFile`] backed by a real file using positional reads.
+///
+/// `read_exact_at` does not move a shared cursor, so concurrent calls from the
+/// reader's coalescing threads are safe — satisfying the `Sync` bound.
+pub struct FileInput {
+    file: File,
+    len: u64,
+}
+
+impl FileInput {
+    pub fn open(path: &Path) -> io::Result<Self> {
+        let file = File::open(path)?;
+        let len = file.metadata()?.len();
+        Ok(Self { file, len })
+    }
+
+    pub fn len(&self) -> u64 {
+        self.len
+    }
+}
+
+impl InputFile for FileInput {
+    fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result<()> {
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::FileExt;
+            self.file.read_exact_at(buf, offset)
+        }
+        #[cfg(windows)]
+        {
+            use std::os::windows::fs::FileExt;
+            let mut read = 0;
+            while read < buf.len() {
+                let n = self.file.seek_read(&mut buf[read..], offset + read as u64)?;
+                if n == 0 {
+                    return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read past end"));
+                }
+                read += n;
+            }
+            Ok(())
+        }
+    }
+}
diff --git a/cli/src/main.rs b/cli/src/main.rs
new file mode 100644
index 0000000..74b6918
--- /dev/null
+++ b/cli/src/main.rs
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod fmt;
+mod input;
+
+use std::path::PathBuf;
+use std::process::ExitCode;
+
+use arrow_array::RecordBatch;
+use clap::{Parser, Subcommand};
+use paimon_mosaic_core::reader::{MosaicReader, ReaderAccess};
+
+use crate::input::FileInput;
+
+/// Mosaic file inspector — the cat/meta/schema/pages toolkit (cf. parquet-cli).
+#[derive(Parser)]
+#[command(name = "mosaic", version, about)]
+struct Cli {
+    #[command(subcommand)]
+    cmd: Cmd,
+}
+
+#[derive(Subcommand)]
+enum Cmd {
+    /// Print the column names, types, nullability and bucket assignment.
+    Schema {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print row-group / bucket / stats metadata.
+    Meta {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print per-column encoding and slot size for each row group.
+    Pages {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print the first N rows as a table.
+    Cat {
+        file: PathBuf,
+        /// Number of rows to print.
+        #[arg(short = 'n', long, default_value_t = 10)]
+        num: usize,
+        /// Comma-separated columns to project.
+        #[arg(short, long)]
+        columns: Option<String>,
+        #[arg(long)]
+        json: bool,
+    },
+}
+
+fn main() -> ExitCode {
+    let cli = Cli::parse();
+    let res = match cli.cmd {
+        Cmd::Schema { file, json } => schema(&file, json),
+        Cmd::Meta { file, json } => meta(&file, json),
+        Cmd::Pages { file, json } => pages(&file, json),
+        Cmd::Cat { file, num, columns, json } => cat(&file, num, columns, json),
+    };
+    match res {
+        Ok(()) => ExitCode::SUCCESS,
+        Err(e) => {
+            eprintln!("error: {e}");
+            ExitCode::FAILURE
+        }
+    }
+}
+
+fn open(file: &PathBuf) -> std::io::Result<MosaicReader<FileInput>> {
+    let input = FileInput::open(file)?;
+    let len = input.len();
+    MosaicReader::new(input, len)
+}
+
+/// Columns in original (write) order rather than the name-sorted layout.
+fn original_order(s: &paimon_mosaic_core::schema::MosaicSchema) -> Vec<usize> {
+    let mut by_sorted = vec![0usize; s.columns.len()];
+    for (orig, &sorted) in s.original_order.iter().enumerate() {
+        by_sorted[sorted] = orig;
+    }
+    let mut cols: Vec<usize> = (0..s.columns.len()).collect();
+    cols.sort_by_key(|&i| by_sorted[i]);
+    cols
+}
+
+fn schema(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let s = reader.schema();
+    let cols = original_order(s);
+    if json {
+        let items: Vec<String> = cols.iter().map(|&i| {
+            let c = &s.columns[i];
+            format!("{{\"name\":{},\"type\":{},\"nullable\":{},\"bucket\":{}}}",
+                fmt::json_str(&c.name), fmt::json_str(&format!("{:?}", c.data_type)), c.nullable, c.bucket_id)
+        }).collect();
+        println!("{{\"columns\":{},\"buckets\":{},\"fields\":[{}]}}", s.columns.len(), s.num_buckets, items.join(","));
+        return Ok(());
+    }
+    println!("{} columns, {} buckets", s.columns.len(), s.num_buckets);
+    for i in cols {
+        let c = &s.columns[i];
+        let null = if c.nullable { "" } else { " not null" };
+        println!("  {}: {:?}{} [bucket {}]", c.name, c.data_type, null, c.bucket_id);
+    }
+    Ok(())
+}
+
+fn meta(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let s = reader.schema();
+    let nrg = reader.num_row_groups();
+    let total: usize = (0..nrg).map(|i| reader.row_group_num_rows(i).unwrap_or(0)).sum();
+    if json {
+        let mut rgs = Vec::new();
+        for rg in 0..nrg {
+            let st: Vec<String> = reader.row_group_stats(rg)?.iter().map(|x| {
+                let mm = match (&x.min, &x.max) {
+                    (Some(lo), Some(hi)) => format!(",\"min\":{},\"max\":{}", fmt::json_str(&fmt::render_value(lo)), fmt::json_str(&fmt::render_value(hi))),
+                    _ => String::new(),
+                };
+                format!("{{\"column\":{},\"nulls\":{}{}}}", fmt::json_str(&s.columns[x.column_index].name), x.null_count, mm)
+            }).collect();
+            rgs.push(format!("{{\"rows\":{},\"stats\":[{}]}}", reader.row_group_num_rows(rg)?, st.join(",")));
+        }
+        println!("{{\"rows\":{},\"columns\":{},\"buckets\":{},\"row_groups\":[{}]}}", total, s.columns.len(), s.num_buckets, rgs.join(","));
+        return Ok(());
+    }
+    println!("file: {} rows, {} columns, {} buckets, {} row groups", total, s.columns.len(), s.num_buckets, nrg);
+    for rg in 0..nrg {
+        println!("row group {rg}: {} rows", reader.row_group_num_rows(rg)?);
+        for st in reader.row_group_stats(rg)? {
+            let mm = match (&st.min, &st.max) {
+                (Some(lo), Some(hi)) => format!("min={} max={}", fmt::render_value(lo), fmt::render_value(hi)),
+                _ => "no min/max".to_string(),
+            };
+            println!("    {}: nulls={} {}", s.columns[st.column_index].name, st.null_count, mm);
+        }
+    }
+    Ok(())
+}
+
+fn pages(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let s = reader.schema();
+    let nrg = reader.num_row_groups();
+    if json {
+        let mut rgs = Vec::new();
+        for rg in 0..nrg {
+            let items: Vec<String> = reader.page_infos(rg)?.iter().map(|p| {
+                format!("{{\"column\":{},\"bucket\":{},\"encoding\":{},\"slot_size\":{}}}",
+                    fmt::json_str(&s.columns[p.column_index].name), p.bucket, fmt::json_str(fmt::encoding_name(p.encoding)), p.slot_size)
+            }).collect();
+            rgs.push(format!("[{}]", items.join(",")));
+        }
+        println!("{{\"row_groups\":[{}]}}", rgs.join(","));
+        return Ok(());
+    }
+    for rg in 0..nrg {
+        println!("row group {rg}:");
+        for p in reader.page_infos(rg)? {
+            let c = &s.columns[p.column_index];
+            println!("    {}: bucket {} encoding={} slot={}B", c.name, p.bucket, fmt::encoding_name(p.encoding), p.slot_size);
+        }
+    }
+    Ok(())
+}
+
+fn cat(file: &PathBuf, num: usize, columns: Option<String>, json: bool) -> std::io::Result<()> {
+    let mut reader = open(file)?;
+    if let Some(list) = &columns {
+        let names: Vec<&str> = list.split(',').map(|x| x.trim()).filter(|x| !x.is_empty()).collect();
+        reader.project(&names)?;
+    }
+    let mut batches: Vec<RecordBatch> = Vec::new();
+    let mut got = 0usize;
+    for rg in 0..reader.num_row_groups() {
+        if got >= num {
+            break;
+        }
+        let batch = reader.row_group_reader(rg)?.read_columns()?;
+        got += batch.num_rows();
+        batches.push(batch);
+    }
+    if batches.is_empty() {
+        if !json {
+            println!("(no rows)");
+        }
+    } else if json {
+        print!("{}", fmt::ndjson(&batches, num));
+    } else {
+        print!("{}", fmt::pretty_table(&batches, num));
+    }
+    Ok(())
+}
diff --git a/core/src/bucket_reader.rs b/core/src/bucket_reader.rs
index 115bae4..3675d5b 100644
--- a/core/src/bucket_reader.rs
+++ b/core/src/bucket_reader.rs
@@ -507,6 +507,11 @@ impl BucketReader {
         self.num_rows - null_count
     }
 
+    /// Per-column encoding ids (in this bucket's column order). See `spec::ENCODING_*`.
+    pub fn encodings(&self) -> &[u8] {
+        &self.encodings
+    }
+
     pub fn read_all_columns(&self) -> io::Result<Vec<ArrayRef>> {
         let num_rows = self.num_rows;
         let mut result = Vec::with_capacity(self.num_columns);
@@ -768,6 +773,11 @@ impl ColumnPageReader {
         }
     }
 
+    /// Encoding id of this column page. See `spec::ENCODING_*`.
+    pub fn encoding(&self) -> u8 {
+        self.encoding
+    }
+
     pub fn read_all(&self) -> io::Result<ArrayRef> {
         let num_rows = self.num_rows;
         let variant = data_variant_for_type(&self.col_type);
diff --git a/core/src/reader.rs b/core/src/reader.rs
index e111b9d..40692d2 100644
--- a/core/src/reader.rs
+++ b/core/src/reader.rs
@@ -179,6 +179,16 @@ fn read_merged_ranges<I: InputFile + ?Sized>(
     Ok((merged, fetched))
 }
 
+/// Physical placement of one column within one row group.
+pub struct PageInfo {
+    pub column_index: usize,
+    pub bucket: usize,
+    /// `spec::ENCODING_*`: plain / const / dict / all_null.
+    pub encoding: u8,
+    /// Paged-bucket on-disk slot size in bytes; 0 for monolithic/empty buckets.
+    pub slot_size: usize,
+}
+
 pub struct RowGroupMeta {
     pub num_rows: usize,
     pub bucket_offsets: Vec<u64>,
@@ -447,6 +457,67 @@ impl<I: InputFile> MosaicReader<I> {
         &self.input
     }
 
+    /// Per-column physical layout for a row group: bucket, encoding and on-disk
+    /// slot size. Reads and decompresses each non-empty bucket; used by tooling
+    /// (the `pages` command). Columns are reported in global (name-sorted) order.
+    pub fn page_infos(&self, rg_index: usize) -> io::Result<Vec<PageInfo>> {
+        if rg_index >= self.row_group_metas.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "row group index out of range",
+            ));
+        }
+        let meta = &self.row_group_metas[rg_index];
+        let mut out = Vec::with_capacity(self.schema.columns.len());
+        for b in 0..self.num_buckets {
+            let globals = &self.schema.bucket_to_global[b];
+            match meta.bucket_layouts[b] {
+                BucketLayout::Empty => {
+                    for &gi in globals {
+                        out.push(PageInfo { column_index: gi, bucket: b, encoding: ENCODING_ALL_NULL, slot_size: 0 });
+                    }
+                }
+                BucketLayout::Monolithic { compressed_size, uncompressed_size } => {
+                    let buf = read_range(&self.input, meta.bucket_offsets[b], compressed_size)?;
+                    let data = match self.compression {
+                        COMPRESSION_NONE => buf,
+                        COMPRESSION_ZSTD => zstd::bulk::decompress(&buf, uncompressed_size)
+                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?,
+                        _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "unsupported compression")),
+                    };
+                    let col_types: Vec<DataType> = globals.iter().map(|&gi| self.schema.columns[gi].data_type.clone()).collect();
+                    let reader = BucketReader::new(col_types, data, meta.num_rows)?;
+                    for (local, &gi) in globals.iter().enumerate() {
+                        out.push(PageInfo { column_index: gi, bucket: b, encoding: reader.encodings()[local], slot_size: 0 });
+                    }
+                }
+                BucketLayout::Paged { total_size } => {
+                    let dir_size = globals.len() * 4;
+                    let dir = read_range(&self.input, meta.bucket_offsets[b], dir_size)?;
+                    let mut sizes = Vec::with_capacity(globals.len());
+                    for i in 0..globals.len() {
+                        sizes.push(u32::from_le_bytes(dir[i * 4..i * 4 + 4].try_into().unwrap()) as usize);
+                    }
+                    let mut foff = meta.bucket_offsets[b] + dir_size as u64;
+                    for (local, &gi) in globals.iter().enumerate() {
+                        let enc = if sizes[local] == 0 {
+                            ENCODING_ALL_NULL
+                        } else {
+                            let slot = read_range(&self.input, foff, sizes[local])?;
+                            let ct = self.schema.columns[gi].data_type.clone();
+                            Self::parse_column_slot(&slot, &ct, meta.num_rows)?.encoding()
+                        };
+                        out.push(PageInfo { column_index: gi, bucket: b, encoding: enc, slot_size: sizes[local] });
+                        foff += sizes[local] as u64;
+                    }
+                    let _ = total_size;
+                }
+            }
+        }
+        out.sort_by_key(|p| p.column_index);
+        Ok(out)
+    }
+
     fn parse_column_slot(
         slot_data: &[u8],
         col_type: &DataType,

From b173d18f4ab00221f21fc8169fff7bfd9b88804d Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Tue, 16 Jun 2026 04:42:48 -0700
Subject: [PATCH 02/12] test: cover page_infos encodings and CLI formatters

Add a core regression test for MosaicReader::page_infos asserting
plain/dict/const detection on a paged-bucket file, and CLI unit tests
for the fmt helpers (json escaping, value/encoding rendering, ndjson
null handling, table truncation).
---
 cli/src/fmt.rs           | 57 ++++++++++++++++++++++++++++++++++++++++
 core/src/reader_tests.rs | 51 +++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs
index cb85ce5..a6e8f9e 100644
--- a/cli/src/fmt.rs
+++ b/cli/src/fmt.rs
@@ -195,3 +195,60 @@ fn cell(arr: &dyn Array, row: usize) -> String {
         _ => "?".to_string(),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_array::{Int32Array, StringArray};
+    use arrow_schema::{DataType, Field, Schema};
+    use std::sync::Arc;
+
+    fn sample() -> RecordBatch {
+        let schema = Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("name", DataType::Utf8, true),
+        ]);
+        RecordBatch::try_new(
+            Arc::new(schema),
+            vec![
+                Arc::new(Int32Array::from(vec![1, 2])),
+                Arc::new(StringArray::from(vec![Some("ann"), None])),
+            ],
+        )
+        .unwrap()
+    }
+
+    #[test]
+    fn json_str_escapes() {
+        assert_eq!(json_str("a\"b\n"), "\"a\\\"b\\n\"");
+        assert_eq!(json_str("x"), "\"x\"");
+    }
+
+    #[test]
+    fn render_value_types() {
+        assert_eq!(render_value(&Value::Integer(5)), "5");
+        assert_eq!(render_value(&Value::String(b"hi".to_vec())), "hi");
+        assert_eq!(render_value(&Value::Null), "null");
+    }
+
+    #[test]
+    fn encoding_names() {
+        assert_eq!(encoding_name(0), "plain");
+        assert_eq!(encoding_name(2), "dict");
+        assert_eq!(encoding_name(3), "all_null");
+    }
+
+    #[test]
+    fn ndjson_renders_null_and_quotes() {
+        let out = ndjson(&[sample()], 10);
+        assert_eq!(out, "{\"id\":1,\"name\":\"ann\"}\n{\"id\":2,\"name\":null}\n");
+    }
+
+    #[test]
+    fn pretty_table_truncates_and_aligns() {
+        let t = pretty_table(&[sample()], 1);
+        assert!(t.contains("| id "));
+        assert!(t.contains("| 1  "));
+        assert!(!t.contains("| 2 "));
+    }
+}
diff --git a/core/src/reader_tests.rs b/core/src/reader_tests.rs
index 55eecee..fde6b55 100644
--- a/core/src/reader_tests.rs
+++ b/core/src/reader_tests.rs
@@ -4180,3 +4180,54 @@ fn test_row_group_num_rows_out_of_range() {
     assert!(reader.row_group_num_rows(1).is_err());
     assert!(reader.row_group_num_rows(999).is_err());
 }
+
+#[test]
+fn test_page_infos_encodings() {
+    use crate::spec::{ENCODING_CONST, ENCODING_DICT, ENCODING_PLAIN};
+    let columns = vec![
+        ("id".to_string(), DataType::Int32, false),    // unique -> plain
+        ("kind".to_string(), DataType::Utf8, true),    // low cardinality -> dict
+        ("flag".to_string(), DataType::Int32, true),   // constant -> const
+    ];
+    let out = MemOutputFile::new();
+    let mut writer = MosaicWriter::new(
+        out,
+        &columns_to_arrow_schema(&columns),
+        WriterOptions {
+            num_buckets: 3,
+            page_size_threshold: 1, // force paged buckets
+            ..Default::default()
+        },
+    )
+    .unwrap();
+
+    let rows: Vec<Vec<Value>> = (0..200)
+        .map(|i| {
+            vec![
+                Value::Integer(i),
+                Value::String(["a", "b", "c"][(i % 3) as usize].as_bytes().to_vec()),
+                Value::Integer(7),
+            ]
+        })
+        .collect();
+    write_values(&mut writer, &columns, &rows);
+    writer.close().unwrap();
+    let data = writer.output().buf.clone();
+    let len = data.len() as u64;
+    let reader = MosaicReader::new(ByteArrayInputFile::new(data), len).unwrap();
+
+    let infos = reader.page_infos(0).unwrap();
+    assert_eq!(infos.len(), 3);
+    // page_infos is sorted by column_index; name-sorted order: flag, id, kind
+    let by_name = |n: &str| {
+        infos
+            .iter()
+            .find(|p| reader.schema().columns[p.column_index].name == n)
+            .unwrap()
+    };
+    assert_eq!(by_name("id").encoding, ENCODING_PLAIN);
+    assert_eq!(by_name("kind").encoding, ENCODING_DICT);
+    assert_eq!(by_name("flag").encoding, ENCODING_CONST);
+    assert!(by_name("id").slot_size > 0);
+    assert!(reader.page_infos(999).is_err());
+}

From 963db7565a086859fc7f10b7709e21cd280c1172 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Tue, 16 Jun 2026 04:56:41 -0700
Subject: [PATCH 03/12] test: add zero-dep end-to-end tests for mosaic CLI

Drive the mosaic binary against a fixture file (via CARGO_BIN_EXE) and
assert stdout for schema/meta/pages/cat, --json output, projection,
row truncation and missing-file failure. No external dev-deps.
---
 cli/tests/e2e.rs | 143 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 cli/tests/e2e.rs

diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
new file mode 100644
index 0000000..3f19e21
--- /dev/null
+++ b/cli/tests/e2e.rs
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! End-to-end tests: drive the `mosaic` binary against a fixture file and
+//! assert stdout. Zero external dev-deps — uses CARGO_BIN_EXE and std only.
+
+use std::fs::File;
+use std::io::Write;
+use std::process::Command;
+use std::sync::Arc;
+
+use arrow_array::{Int32Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions};
+
+struct FileOut {
+    f: File,
+    pos: u64,
+}
+impl OutputFile for FileOut {
+    fn write(&mut self, d: &[u8]) -> std::io::Result<()> {
+        self.f.write_all(d)?;
+        self.pos += d.len() as u64;
+        Ok(())
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.f.flush()
+    }
+    fn pos(&self) -> u64 {
+        self.pos
+    }
+}
+
+/// Write a small fixture and return its path under the test temp dir.
+fn fixture(name: &str) -> String {
+    let path = format!("{}/mosaic_e2e_{}.mosaic", std::env::temp_dir().display(), name);
+    let schema = Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("kind", DataType::Utf8, true),
+        Field::new("flag", DataType::Int32, true),
+    ]);
+    let out = FileOut { f: File::create(&path).unwrap(), pos: 0 };
+    let opts = WriterOptions {
+        num_buckets: 3,
+        page_size_threshold: 1,
+        stats_columns: vec!["id".into()],
+        ..Default::default()
+    };
+    let mut w = MosaicWriter::new(out, &schema, opts).unwrap();
+    let n = 200;
+    let ids: Vec<i32> = (0..n).collect();
+    let kinds: Vec<&str> = (0..n).map(|i| ["a", "b", "c"][(i % 3) as usize]).collect();
+    let flags = vec![7; n as usize];
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Int32Array::from(ids)),
+            Arc::new(StringArray::from(kinds)),
+            Arc::new(Int32Array::from(flags)),
+        ],
+    )
+    .unwrap();
+    w.write_batch(&batch).unwrap();
+    w.close().unwrap();
+    path
+}
+
+fn run(args: &[&str]) -> (String, String, bool) {
+    let out = Command::new(env!("CARGO_BIN_EXE_mosaic")).args(args).output().unwrap();
+    (
+        String::from_utf8(out.stdout).unwrap(),
+        String::from_utf8(out.stderr).unwrap(),
+        out.status.success(),
+    )
+}
+
+#[test]
+fn schema_lists_columns() {
+    let f = fixture("schema");
+    let (out, _, ok) = run(&["schema", &f]);
+    assert!(ok);
+    assert!(out.contains("3 columns, 3 buckets"));
+    assert!(out.contains("id: Int32 not null"));
+    assert!(out.contains("kind: Utf8"));
+}
+
+#[test]
+fn meta_shows_stats() {
+    let f = fixture("meta");
+    let (out, _, ok) = run(&["meta", &f]);
+    assert!(ok);
+    assert!(out.contains("200 rows"));
+    assert!(out.contains("id: nulls=0 min=0 max=199"));
+}
+
+#[test]
+fn pages_shows_encodings() {
+    let f = fixture("pages");
+    let (out, _, ok) = run(&["pages", &f]);
+    assert!(ok);
+    assert!(out.contains("flag: bucket 0 encoding=const"));
+    assert!(out.contains("kind: bucket 2 encoding=dict"));
+}
+
+#[test]
+fn cat_truncates_and_projects() {
+    let f = fixture("cat");
+    let (out, _, ok) = run(&["cat", &f, "-n", "2"]);
+    assert!(ok);
+    assert!(out.contains("| id | kind | flag |"));
+    assert_eq!(out.matches('\n').count(), 6); // 3 borders + header + 2 rows
+    let (proj, _, _) = run(&["cat", &f, "-c", "kind,id", "-n", "1"]);
+    assert!(proj.contains("| kind | id |"));
+}
+
+#[test]
+fn cat_json_is_ndjson() {
+    let f = fixture("json");
+    let (out, _, ok) = run(&["cat", &f, "-n", "2", "--json"]);
+    assert!(ok);
+    assert_eq!(out, "{\"id\":0,\"kind\":\"a\",\"flag\":7}\n{\"id\":1,\"kind\":\"b\",\"flag\":7}\n");
+}
+
+#[test]
+fn missing_file_fails() {
+    let (_, err, ok) = run(&["schema", "/no/such/file.mosaic"]);
+    assert!(!ok);
+    assert!(err.contains("error:"));
+}

From d4c3a71e99676fc9d470dd465dd03e91ae8c1946 Mon Sep 17 00:00:00 2001
From: jianguotian <jianguotian2010@gmail.com>
Date: Thu, 18 Jun 2026 00:43:03 -0700
Subject: [PATCH 04/12] docs: add CLI page + parquet-cli comparison

Adds docs/cli.html documenting the mosaic inspector (schema/meta/pages/cat,
text + JSON) with a parquet-cli command mapping and design-difference table,
addressing the review asks on #66. Adds CLI to the nav across doc pages.
---
 docs/cli.html                           | 164 ++++++++++++++++++++++++
 docs/cpp-api.html                       |   1 +
 docs/creating-a-release.html            |   1 +
 docs/design.html                        |   1 +
 docs/index.html                         |   1 +
 docs/java-api.html                      |   1 +
 docs/python-api.html                    |   1 +
 docs/releases.html                      |   1 +
 docs/verifying-a-release-candidate.html |   1 +
 9 files changed, 172 insertions(+)
 create mode 100644 docs/cli.html

diff --git a/docs/cli.html b/docs/cli.html
new file mode 100644
index 0000000..043ebee
--- /dev/null
+++ b/docs/cli.html
@@ -0,0 +1,164 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>CLI - Paimon Mosaic</title>
+    <link rel="stylesheet" href="css/style.css">
+    <script src="js/main.js"></script>
+</head>
+<body>
+    <button class="menu-toggle" aria-label="Menu">&#9776;</button>
+    <div class="overlay"></div>
+
+    <aside class="sidebar">
+        <div class="sidebar-header">
+            <h2>Paimon Mosaic</h2>
+            <p>Columnar-bucket hybrid format</p>
+        </div>
+        <nav>
+            <ul>
+                <li><a href="index.html">Home</a></li>
+                <li><a href="design.html">Design</a></li>
+                <li><a href="java-api.html">Java API</a></li>
+                <li><a href="python-api.html">Python API</a></li>
+                <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html" class="active">CLI</a></li>
+                <li><a href="releases.html">Releases</a></li>
+            </ul>
+        </nav>
+        <div class="sidebar-footer">
+            <button class="theme-toggle">Dark Mode</button>
+        </div>
+    </aside>
+
+    <main class="main">
+        <div class="content">
+            <h1>CLI</h1>
+            <p class="subtitle">A <code>mosaic</code> binary for inspecting files from the shell &mdash; schema, metadata, contents, and physical encoding, in text or JSON.</p>
+
+            <h2>Install</h2>
+            <p>
+                The CLI is the <code>cli/</code> workspace crate. Build the binary and put it on your
+                <code>PATH</code>:
+            </p>
+<pre><code>cargo build --release -p paimon-mosaic-cli
+cp target/release/mosaic /usr/local/bin/</code></pre>
+            <p>Every command takes a file path and prints to stdout; add <code>--json</code> for machine-readable output.</p>
+<pre><code><span class="cmt"># schema, metadata, encoding, and first rows</span>
+mosaic schema file.mosaic
+mosaic meta   file.mosaic
+mosaic pages  file.mosaic
+mosaic cat    file.mosaic -n 20</code></pre>
+
+            <h2>schema</h2>
+            <p>Column names, Arrow types, nullability, and bucket assignment. Columns are listed in original (write) order.</p>
+<pre><code>$ mosaic schema file.mosaic
+3 columns, 2 buckets
+  id: Int32 not null [bucket 0]
+  name: Utf8 [bucket 1]
+  age: Int32 [bucket 0]</code></pre>
+<pre><code>$ mosaic schema file.mosaic --json
+{<span class="str">"columns"</span>:3,<span class="str">"buckets"</span>:2,<span class="str">"fields"</span>:[{<span class="str">"name"</span>:<span class="str">"id"</span>,<span class="str">"type"</span>:<span class="str">"Int32"</span>,<span class="str">"nullable"</span>:false,<span class="str">"bucket"</span>:0}, ...]}</code></pre>
+
+            <h2>meta</h2>
+            <p>Row counts, row groups, and per-column statistics (null count and min/max for stats columns).</p>
+<pre><code>$ mosaic meta file.mosaic
+file: 5 rows, 3 columns, 2 buckets, 1 row groups
+row group 0: 5 rows
+    age: nulls=1 min=25 max=40
+    id: nulls=0 min=1 max=5
+    name: nulls=1 min=alice max=eve</code></pre>
+
+            <h2>pages</h2>
+            <p>Per-column physical encoding and slot size, per row group. Encodings are <code>const</code>, <code>plain</code>, <code>dict</code>, or <code>all_null</code>.</p>
+<pre><code>$ mosaic pages file.mosaic
+row group 0:
+    age: bucket 0 encoding=plain slot=0B
+    id: bucket 0 encoding=plain slot=0B
+    name: bucket 1 encoding=plain slot=0B</code></pre>
+
+            <h2>cat</h2>
+            <p>First <code>N</code> rows as a table. Project with <code>-c</code>/<code>--columns</code>; output NDJSON with <code>--json</code>.</p>
+<pre><code>$ mosaic cat file.mosaic -n 3
++----+-------+-----+
+| id | name  | age |
++----+-------+-----+
+| 1  | alice | 30  |
+| 2  | bob   | 25  |
+| 3  |       | 40  |
++----+-------+-----+
+
+$ mosaic cat file.mosaic -c name,id -n 1 --json
+{<span class="str">"name"</span>:<span class="str">"alice"</span>,<span class="str">"id"</span>:1}</code></pre>
+
+            <h3>Options</h3>
+            <table>
+                <thead>
+                    <tr><th>Command</th><th>Flag</th><th>Description</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>all</td><td><code>--json</code></td><td>Machine-readable output (objects/NDJSON)</td></tr>
+                    <tr><td><code>cat</code></td><td><code>-n, --num</code></td><td>Rows to print (default 10)</td></tr>
+                    <tr><td><code>cat</code></td><td><code>-c, --columns</code></td><td>Comma-separated columns to project and reorder</td></tr>
+                </tbody>
+            </table>
+
+            <h2>Comparison with parquet-cli</h2>
+            <p>
+                The command surface mirrors Apache Parquet's <code>parquet-cli</code> so the muscle memory carries over,
+                but the design is deliberately leaner. The mapping:
+            </p>
+            <table>
+                <thead>
+                    <tr><th>mosaic</th><th>parquet-cli</th><th>Purpose</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td><code>schema</code></td><td><code>schema</code></td><td>Columns, types, nullability (+ bucket assignment)</td></tr>
+                    <tr><td><code>meta</code></td><td><code>meta</code></td><td>Row groups + per-column stats</td></tr>
+                    <tr><td><code>cat</code></td><td><code>cat</code> / <code>head</code></td><td>Rows; <code>-n</code> caps like <code>head</code></td></tr>
+                    <tr><td><code>pages</code></td><td><code>pages</code> / <code>column-index</code></td><td>Per-column encoding &amp; slot size</td></tr>
+                </tbody>
+            </table>
+            <p>Design differences worth noting:</p>
+            <table>
+                <thead>
+                    <tr><th>Aspect</th><th>parquet-cli</th><th>mosaic</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>Runtime</td><td>JVM jar</td><td>Single static native binary, no runtime</td></tr>
+                    <tr><td>JSON output</td><td>Per-command, uneven</td><td><code>--json</code> on every command</td></tr>
+                    <tr><td>Physical unit</td><td>Column chunk per row group</td><td>Bucket (the Mosaic column-group unit)</td></tr>
+                    <tr><td>Column order</td><td>As stored</td><td>Original write order (disk is name-sorted; CLI un-sorts)</td></tr>
+                    <tr><td>Stats</td><td>Always present</td><td>Only opt-in stats columns carry min/max</td></tr>
+                </tbody>
+            </table>
+
+            <div class="tip">
+                <strong>JSON for pipelines</strong>
+                <code>--json</code> emits objects (schema/meta/pages) and NDJSON (cat), so you can pipe
+                straight into <code>jq</code>: <code>mosaic meta f.mosaic --json | jq .row_groups[0].rows</code>.
+            </div>
+        </div>
+    </main>
+</body>
+</html>
diff --git a/docs/cpp-api.html b/docs/cpp-api.html
index 9b3fa4c..5ad8b23 100644
--- a/docs/cpp-api.html
+++ b/docs/cpp-api.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/creating-a-release.html b/docs/creating-a-release.html
index 8b69a4d..91750df 100644
--- a/docs/creating-a-release.html
+++ b/docs/creating-a-release.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/design.html b/docs/design.html
index 84c5e4f..4f142ec 100644
--- a/docs/design.html
+++ b/docs/design.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/index.html b/docs/index.html
index 55e9ec8..0c3ff7b 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/java-api.html b/docs/java-api.html
index f654b99..b413bf8 100644
--- a/docs/java-api.html
+++ b/docs/java-api.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html" class="active">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/python-api.html b/docs/python-api.html
index 7d50afe..e70b372 100644
--- a/docs/python-api.html
+++ b/docs/python-api.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html" class="active">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/releases.html b/docs/releases.html
index 2e6b35b..b5dd083 100644
--- a/docs/releases.html
+++ b/docs/releases.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html" class="active">Releases</a></li>
             </ul>
         </nav>
diff --git a/docs/verifying-a-release-candidate.html b/docs/verifying-a-release-candidate.html
index 6f620de..b9c3ead 100644
--- a/docs/verifying-a-release-candidate.html
+++ b/docs/verifying-a-release-candidate.html
@@ -42,6 +42,7 @@ <h2>Paimon Mosaic</h2>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
+                <li><a href="cli.html">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>

From 963504251a2d772780e4194fdf13f1b8023dae1c Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 03:26:28 -0700
Subject: [PATCH 05/12] docs: expand CLI page with meta example and parquet-cli
 design table

---
 docs/cli.html | 144 +++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 79 deletions(-)

diff --git a/docs/cli.html b/docs/cli.html
index 043ebee..e677af6 100644
--- a/docs/cli.html
+++ b/docs/cli.html
@@ -39,10 +39,10 @@ <h2>Paimon Mosaic</h2>
             <ul>
                 <li><a href="index.html">Home</a></li>
                 <li><a href="design.html">Design</a></li>
+                <li><a href="cli.html" class="active">CLI</a></li>
                 <li><a href="java-api.html">Java API</a></li>
                 <li><a href="python-api.html">Python API</a></li>
                 <li><a href="cpp-api.html">C++ API</a></li>
-                <li><a href="cli.html" class="active">CLI</a></li>
                 <li><a href="releases.html">Releases</a></li>
             </ul>
         </nav>
@@ -54,109 +54,95 @@ <h2>Paimon Mosaic</h2>
     <main class="main">
         <div class="content">
             <h1>CLI</h1>
-            <p class="subtitle">A <code>mosaic</code> binary for inspecting files from the shell &mdash; schema, metadata, contents, and physical encoding, in text or JSON.</p>
+            <p class="subtitle">Inspect Mosaic files from the terminal with the <code>mosaic</code> binary — schema, meta, cat and pages. A native, JVM-free toolkit modeled on parquet-cli.</p>
 
             <h2>Install</h2>
             <p>
-                The CLI is the <code>cli/</code> workspace crate. Build the binary and put it on your
-                <code>PATH</code>:
+                The CLI lives in the <code>cli/</code> directory and drives the read-only
+                <code>MosaicReader</code> API. Build and run from source, or install the binary:
             </p>
-<pre><code>cargo build --release -p paimon-mosaic-cli
-cp target/release/mosaic /usr/local/bin/</code></pre>
-            <p>Every command takes a file path and prints to stdout; add <code>--json</code> for machine-readable output.</p>
-<pre><code><span class="cmt"># schema, metadata, encoding, and first rows</span>
-mosaic schema file.mosaic
-mosaic meta   file.mosaic
-mosaic pages  file.mosaic
-mosaic cat    file.mosaic -n 20</code></pre>
+<pre><code><span class="cmt"># run from source</span>
+cargo run -p paimon-mosaic-cli -- schema data.mosaic
+
+<span class="cmt"># install the `mosaic` binary</span>
+cargo install --path cli
+mosaic schema data.mosaic</code></pre>
+
+            <h2>Commands</h2>
+            <table>
+                <thead>
+                    <tr><th>Command</th><th>Shows</th><th>Reads</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td><code>schema</code></td><td>column names, Arrow types, nullability, bucket</td><td>footer only</td></tr>
+                    <tr><td><code>meta</code></td><td>row groups, rows, per-column stats (null/min/max)</td><td>footer + index</td></tr>
+                    <tr><td><code>pages</code></td><td>per-column encoding + on-disk slot size</td><td>bucket data</td></tr>
+                    <tr><td><code>cat</code></td><td>first N rows as a table</td><td>column data</td></tr>
+                </tbody>
+            </table>
+            <p>All four accept <code>--json</code>. <code>cat</code> also takes <code>-n &lt;N&gt;</code> and <code>-c a,b</code> (projection).</p>
 
             <h2>schema</h2>
-            <p>Column names, Arrow types, nullability, and bucket assignment. Columns are listed in original (write) order.</p>
-<pre><code>$ mosaic schema file.mosaic
-3 columns, 2 buckets
-  id: Int32 not null [bucket 0]
-  name: Utf8 [bucket 1]
-  age: Int32 [bucket 0]</code></pre>
-<pre><code>$ mosaic schema file.mosaic --json
-{<span class="str">"columns"</span>:3,<span class="str">"buckets"</span>:2,<span class="str">"fields"</span>:[{<span class="str">"name"</span>:<span class="str">"id"</span>,<span class="str">"type"</span>:<span class="str">"Int32"</span>,<span class="str">"nullable"</span>:false,<span class="str">"bucket"</span>:0}, ...]}</code></pre>
+<pre><code><span class="cmt">$ mosaic schema data.mosaic</span>
+3 columns, 3 buckets
+  id: Int32 not null [bucket 1]
+  kind: Utf8 [bucket 2]
+  flag: Int32 [bucket 0]</code></pre>
 
             <h2>meta</h2>
-            <p>Row counts, row groups, and per-column statistics (null count and min/max for stats columns).</p>
-<pre><code>$ mosaic meta file.mosaic
-file: 5 rows, 3 columns, 2 buckets, 1 row groups
-row group 0: 5 rows
-    age: nulls=1 min=25 max=40
-    id: nulls=0 min=1 max=5
-    name: nulls=1 min=alice max=eve</code></pre>
+<pre><code><span class="cmt">$ mosaic meta data.mosaic</span>
+file: 200 rows, 3 columns, 3 buckets, 1 row groups
+row group 0: 200 rows
+    id: nulls=0 min=0 max=199</code></pre>
 
             <h2>pages</h2>
-            <p>Per-column physical encoding and slot size, per row group. Encodings are <code>const</code>, <code>plain</code>, <code>dict</code>, or <code>all_null</code>.</p>
-<pre><code>$ mosaic pages file.mosaic
+            <p>
+                Per-column physical encoding within each row group: <code>plain</code>,
+                <code>const</code>, <code>dict</code> or <code>all_null</code>, plus the on-disk slot size.
+            </p>
+<pre><code><span class="cmt">$ mosaic pages data.mosaic</span>
 row group 0:
-    age: bucket 0 encoding=plain slot=0B
-    id: bucket 0 encoding=plain slot=0B
-    name: bucket 1 encoding=plain slot=0B</code></pre>
+    flag: bucket 0 encoding=const slot=0B
+    id: bucket 1 encoding=plain slot=8684B
+    kind: bucket 2 encoding=dict slot=31B</code></pre>
 
             <h2>cat</h2>
-            <p>First <code>N</code> rows as a table. Project with <code>-c</code>/<code>--columns</code>; output NDJSON with <code>--json</code>.</p>
-<pre><code>$ mosaic cat file.mosaic -n 3
-+----+-------+-----+
-| id | name  | age |
-+----+-------+-----+
-| 1  | alice | 30  |
-| 2  | bob   | 25  |
-| 3  |       | 40  |
-+----+-------+-----+
+<pre><code><span class="cmt">$ mosaic cat data.mosaic -n 2</span>
++----+------+------+
+| id | kind | flag |
++----+------+------+
+| 0  | a    | 7    |
+| 1  | b    | 7    |
++----+------+------+
 
-$ mosaic cat file.mosaic -c name,id -n 1 --json
-{<span class="str">"name"</span>:<span class="str">"alice"</span>,<span class="str">"id"</span>:1}</code></pre>
+<span class="cmt">$ mosaic cat data.mosaic -n 2 --json</span>
+{"id":0,"kind":"a","flag":7}
+{"id":1,"kind":"b","flag":7}</code></pre>
 
-            <h3>Options</h3>
-            <table>
-                <thead>
-                    <tr><th>Command</th><th>Flag</th><th>Description</th></tr>
-                </thead>
-                <tbody>
-                    <tr><td>all</td><td><code>--json</code></td><td>Machine-readable output (objects/NDJSON)</td></tr>
-                    <tr><td><code>cat</code></td><td><code>-n, --num</code></td><td>Rows to print (default 10)</td></tr>
-                    <tr><td><code>cat</code></td><td><code>-c, --columns</code></td><td>Comma-separated columns to project and reorder</td></tr>
-                </tbody>
-            </table>
-
-            <h2>Comparison with parquet-cli</h2>
+            <h2>Design vs parquet-cli</h2>
             <p>
-                The command surface mirrors Apache Parquet's <code>parquet-cli</code> so the muscle memory carries over,
-                but the design is deliberately leaner. The mapping:
+                Both follow the same shape — thin subcommands over a shared reader — but Mosaic
+                is a single Rust implementation of a private format, so the tool is intentionally
+                smaller and read-only.
             </p>
             <table>
                 <thead>
-                    <tr><th>mosaic</th><th>parquet-cli</th><th>Purpose</th></tr>
-                </thead>
-                <tbody>
-                    <tr><td><code>schema</code></td><td><code>schema</code></td><td>Columns, types, nullability (+ bucket assignment)</td></tr>
-                    <tr><td><code>meta</code></td><td><code>meta</code></td><td>Row groups + per-column stats</td></tr>
-                    <tr><td><code>cat</code></td><td><code>cat</code> / <code>head</code></td><td>Rows; <code>-n</code> caps like <code>head</code></td></tr>
-                    <tr><td><code>pages</code></td><td><code>pages</code> / <code>column-index</code></td><td>Per-column encoding &amp; slot size</td></tr>
-                </tbody>
-            </table>
-            <p>Design differences worth noting:</p>
-            <table>
-                <thead>
-                    <tr><th>Aspect</th><th>parquet-cli</th><th>mosaic</th></tr>
+                    <tr><th></th><th>parquet-cli (Java)</th><th>mosaic CLI</th></tr>
                 </thead>
                 <tbody>
-                    <tr><td>Runtime</td><td>JVM jar</td><td>Single static native binary, no runtime</td></tr>
-                    <tr><td>JSON output</td><td>Per-command, uneven</td><td><code>--json</code> on every command</td></tr>
-                    <tr><td>Physical unit</td><td>Column chunk per row group</td><td>Bucket (the Mosaic column-group unit)</td></tr>
-                    <tr><td>Column order</td><td>As stored</td><td>Original write order (disk is name-sorted; CLI un-sorts)</td></tr>
-                    <tr><td>Stats</td><td>Always present</td><td>Only opt-in stats columns carry min/max</td></tr>
+                    <tr><td>Parser</td><td>airline annotations</td><td>clap derive</td></tr>
+                    <tr><td>Base</td><td>BaseCommand + Hadoop FS</td><td>MosaicReader + FileInput (pread)</td></tr>
+                    <tr><td>IO</td><td>local / HDFS / S3</td><td>local file</td></tr>
+                    <tr><td>Schema</td><td>printed as Avro</td><td>printed as Arrow</td></tr>
+                    <tr><td>Runtime</td><td>JVM (hadoop jar)</td><td>native, no JVM</td></tr>
+                    <tr><td>Scope</td><td>22 cmds (view + convert + rewrite)</td><td>4 view cmds</td></tr>
                 </tbody>
             </table>
 
             <div class="tip">
-                <strong>JSON for pipelines</strong>
-                <code>--json</code> emits objects (schema/meta/pages) and NDJSON (cat), so you can pipe
-                straight into <code>jq</code>: <code>mosaic meta f.mosaic --json | jq .row_groups[0].rows</code>.
+                <strong>Embedding instead</strong>
+                For C/C++ or Java callers, embed the format directly via the <code>ffi</code>
+                (<code>mosaic.h</code>) or <code>jni</code> crates rather than shelling out to this CLI.
             </div>
         </div>
     </main>

From c76ed0af1062f32cdd3461abb4ba499b6d5dd5a5 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 05:19:29 -0700
Subject: [PATCH 06/12] feat(cli): add head, footer, column-size, dictionary
 commands

Align the viewer command set with parquet-cli/arrow-rs: head (alias of
cat), footer (magic/version/buckets/compression), column-size (on-disk
bytes per column), dictionary (dump dict-encoded entries). Core gains
compression()/dict_values()/dictionary() read-only accessors. e2e tests
cover the new commands.
---
 cli/src/main.rs           | 83 +++++++++++++++++++++++++++++++++++++++
 cli/tests/e2e.rs          | 28 +++++++++++++
 core/src/bucket_reader.rs | 10 +++++
 core/src/reader.rs        | 28 +++++++++++++
 4 files changed, 149 insertions(+)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index 74b6918..895ffdd 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -67,6 +67,30 @@ enum Cmd {
         #[arg(long)]
         json: bool,
     },
+    /// Print the first N rows (alias of cat).
+    Head {
+        file: PathBuf,
+        #[arg(short = 'n', long, default_value_t = 10)]
+        num: usize,
+        #[arg(short, long)]
+        columns: Option<String>,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print the file footer: version, buckets, compression, offsets.
+    Footer {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print on-disk bytes per column (summed over row groups).
+    ColumnSize {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
+    /// Print the dictionary of a dict-encoded column.
+    Dictionary { file: PathBuf, column: String },
 }
 
 fn main() -> ExitCode {
@@ -76,6 +100,10 @@ fn main() -> ExitCode {
         Cmd::Meta { file, json } => meta(&file, json),
         Cmd::Pages { file, json } => pages(&file, json),
         Cmd::Cat { file, num, columns, json } => cat(&file, num, columns, json),
+        Cmd::Head { file, num, columns, json } => cat(&file, num, columns, json),
+        Cmd::Footer { file, json } => footer(&file, json),
+        Cmd::ColumnSize { file, json } => column_size(&file, json),
+        Cmd::Dictionary { file, column } => dictionary(&file, &column),
     };
     match res {
         Ok(()) => ExitCode::SUCCESS,
@@ -212,3 +240,58 @@ fn cat(file: &PathBuf, num: usize, columns: Option<String>, json: bool) -> std::
     }
     Ok(())
 }
+
+fn footer(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    use paimon_mosaic_core::spec::{COMPRESSION_ZSTD, MAGIC, VERSION};
+    let reader = open(file)?;
+    let s = reader.schema();
+    let comp = if reader.compression() == COMPRESSION_ZSTD { "zstd" } else { "none" };
+    let magic = std::str::from_utf8(&MAGIC).unwrap_or("MOSA");
+    if json {
+        println!("{{\"magic\":{},\"version\":{},\"buckets\":{},\"row_groups\":{},\"compression\":{}}}",
+            fmt::json_str(magic), VERSION, s.num_buckets, reader.num_row_groups(), fmt::json_str(comp));
+    } else {
+        println!("magic={} version={} buckets={} row_groups={} compression={}",
+            magic, VERSION, s.num_buckets, reader.num_row_groups(), comp);
+    }
+    Ok(())
+}
+
+fn column_size(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let s = reader.schema();
+    let mut bytes = vec![0usize; s.columns.len()];
+    for rg in 0..reader.num_row_groups() {
+        for p in reader.page_infos(rg)? {
+            bytes[p.column_index] += p.slot_size;
+        }
+    }
+    let cols = original_order(s);
+    if json {
+        let items: Vec<String> = cols.iter().map(|&i| format!("{{\"column\":{},\"bytes\":{}}}", fmt::json_str(&s.columns[i].name), bytes[i])).collect();
+        println!("[{}]", items.join(","));
+    } else {
+        for i in cols {
+            println!("  {}: {} B", s.columns[i].name, bytes[i]);
+        }
+    }
+    Ok(())
+}
+
+fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let col = reader.schema().columns.iter().position(|c| c.name == column)
+        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("column '{column}' not found")))?;
+    for rg in 0..reader.num_row_groups() {
+        match reader.dictionary(rg, col)? {
+            Some(vals) => {
+                println!("row group {rg}: {} entries", vals.len());
+                for (i, v) in vals.iter().enumerate() {
+                    println!("    {i}: {}", fmt::render_value(v));
+                }
+            }
+            None => println!("row group {rg}: not dict-encoded"),
+        }
+    }
+    Ok(())
+}
diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
index 3f19e21..5736ac4 100644
--- a/cli/tests/e2e.rs
+++ b/cli/tests/e2e.rs
@@ -141,3 +141,31 @@ fn missing_file_fails() {
     assert!(!ok);
     assert!(err.contains("error:"));
 }
+
+#[test]
+fn footer_shows_format() {
+    let f = fixture("footer");
+    let (out, _, ok) = run(&["footer", &f]);
+    assert!(ok);
+    assert!(out.contains("magic=MOSA"));
+    assert!(out.contains("buckets=3"));
+    assert!(out.contains("compression=zstd"));
+}
+
+#[test]
+fn dictionary_dumps_entries() {
+    let f = fixture("dict");
+    let (out, _, ok) = run(&["dictionary", &f, "kind"]);
+    assert!(ok);
+    assert!(out.contains("3 entries"));
+    assert!(out.contains("a") && out.contains("b") && out.contains("c"));
+}
+
+#[test]
+fn column_size_sums_bytes() {
+    let f = fixture("size");
+    let (out, _, ok) = run(&["column-size", &f]);
+    assert!(ok);
+    assert!(out.contains("id:") && out.contains("kind:"));
+    assert!(out.contains("flag: 0 B")); // const column has no slot
+}
diff --git a/core/src/bucket_reader.rs b/core/src/bucket_reader.rs
index 3675d5b..2655ebb 100644
--- a/core/src/bucket_reader.rs
+++ b/core/src/bucket_reader.rs
@@ -512,6 +512,11 @@ impl BucketReader {
         &self.encodings
     }
 
+    /// Dictionary entries for one column (empty if it is not dict-encoded).
+    pub fn dict_values(&self, col: usize) -> &[Value] {
+        &self.dict_values[col]
+    }
+
     pub fn read_all_columns(&self) -> io::Result<Vec<ArrayRef>> {
         let num_rows = self.num_rows;
         let mut result = Vec::with_capacity(self.num_columns);
@@ -778,6 +783,11 @@ impl ColumnPageReader {
         self.encoding
     }
 
+    /// Dictionary entries for a dict-encoded page (empty for other encodings).
+    pub fn dict_values(&self) -> &[Value] {
+        &self.dict_values
+    }
+
     pub fn read_all(&self) -> io::Result<ArrayRef> {
         let num_rows = self.num_rows;
         let variant = data_variant_for_type(&self.col_type);
diff --git a/core/src/reader.rs b/core/src/reader.rs
index 40692d2..fe9e014 100644
--- a/core/src/reader.rs
+++ b/core/src/reader.rs
@@ -457,6 +457,18 @@ impl<I: InputFile> MosaicReader<I> {
         &self.input
     }
 
+    /// Footer compression code (`spec::COMPRESSION_*`).
+    pub fn compression(&self) -> u8 {
+        self.compression
+    }
+
+    /// Dictionary entries for one column in one row group, or `None` if that
+    /// column is not dict-encoded there. Used by the `dictionary` command.
+    pub fn dictionary(&self, rg_index: usize, col: usize) -> io::Result<Option<Vec<Value>>> {
+        let rg = self.row_group_reader_projected(rg_index, &[col])?;
+        Ok(rg.take_dictionary(col))
+    }
+
     /// Per-column physical layout for a row group: bucket, encoding and on-disk
     /// slot size. Reads and decompresses each non-empty bucket; used by tooling
     /// (the `pages` command). Columns are reported in global (name-sorted) order.
@@ -1036,6 +1048,22 @@ impl RowGroupReader {
         }
     }
 
+    /// Dictionary entries for a projected column, or `None` if not dict-encoded.
+    pub fn take_dictionary(&self, global_col: usize) -> Option<Vec<Value>> {
+        let bucket = self.schema.columns[global_col].bucket_id;
+        let local = self.bucket_to_global[bucket].iter().position(|&g| g == global_col)?;
+        match self.bucket_states[bucket].as_ref()? {
+            BucketState::Paged { column_readers } => {
+                let d = column_readers[local].as_ref()?.dict_values();
+                if d.is_empty() { None } else { Some(d.to_vec()) }
+            }
+            BucketState::Monolithic { reader } => {
+                let d = reader.dict_values(local);
+                if d.is_empty() { None } else { Some(d.to_vec()) }
+            }
+        }
+    }
+
     pub fn num_rows(&self) -> usize {
         self.num_rows
     }

From 20716c65590a84c3cb58761c327c0f3af647f22d Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 05:26:29 -0700
Subject: [PATCH 07/12] feat(cli): add buckets command for mosaic bucket layout

Mosaic's column-bucket grouping has no parquet equivalent. Add a
buckets command printing, per row group, each bucket's kind
(empty/monolithic/paged), on-disk size and member columns. Core gains
MosaicReader::bucket_infos(). e2e covered.
---
 cli/src/main.rs    | 34 ++++++++++++++++++++++++++++++++++
 cli/tests/e2e.rs   | 10 ++++++++++
 core/src/reader.rs | 29 +++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index 895ffdd..3032fa5 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -91,6 +91,12 @@ enum Cmd {
     },
     /// Print the dictionary of a dict-encoded column.
     Dictionary { file: PathBuf, column: String },
+    /// Print bucket layout per row group (Mosaic's column grouping).
+    Buckets {
+        file: PathBuf,
+        #[arg(long)]
+        json: bool,
+    },
 }
 
 fn main() -> ExitCode {
@@ -104,6 +110,7 @@ fn main() -> ExitCode {
         Cmd::Footer { file, json } => footer(&file, json),
         Cmd::ColumnSize { file, json } => column_size(&file, json),
         Cmd::Dictionary { file, column } => dictionary(&file, &column),
+        Cmd::Buckets { file, json } => buckets(&file, json),
     };
     match res {
         Ok(()) => ExitCode::SUCCESS,
@@ -295,3 +302,30 @@ fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> {
     }
     Ok(())
 }
+
+fn buckets(file: &PathBuf, json: bool) -> std::io::Result<()> {
+    let reader = open(file)?;
+    let s = reader.schema();
+    let name = |i: usize| s.columns[i].name.clone();
+    let mut rgs = Vec::new();
+    for rg in 0..reader.num_row_groups() {
+        let infos = reader.bucket_infos(rg)?;
+        if json {
+            let items: Vec<String> = infos.iter().map(|b| {
+                let cols: Vec<String> = b.columns.iter().map(|&i| fmt::json_str(&name(i))).collect();
+                format!("{{\"bucket\":{},\"kind\":{},\"size\":{},\"columns\":[{}]}}", b.bucket, fmt::json_str(b.kind), b.size, cols.join(","))
+            }).collect();
+            rgs.push(format!("[{}]", items.join(",")));
+        } else {
+            println!("row group {rg}:");
+            for b in &infos {
+                let cols: Vec<String> = b.columns.iter().map(|&i| name(i)).collect();
+                println!("    bucket {}: {} {}B [{}]", b.bucket, b.kind, b.size, cols.join(", "));
+            }
+        }
+    }
+    if json {
+        println!("{{\"row_groups\":[{}]}}", rgs.join(","));
+    }
+    Ok(())
+}
diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
index 5736ac4..5581843 100644
--- a/cli/tests/e2e.rs
+++ b/cli/tests/e2e.rs
@@ -169,3 +169,13 @@ fn column_size_sums_bytes() {
     assert!(out.contains("id:") && out.contains("kind:"));
     assert!(out.contains("flag: 0 B")); // const column has no slot
 }
+
+#[test]
+fn buckets_show_layout() {
+    let f = fixture("buckets");
+    let (out, _, ok) = run(&["buckets", &f]);
+    assert!(ok);
+    assert!(out.contains("row group 0:"));
+    assert!(out.contains("[flag]") && out.contains("[id]") && out.contains("[kind]"));
+    assert!(out.contains("monolithic") || out.contains("paged"));
+}
diff --git a/core/src/reader.rs b/core/src/reader.rs
index fe9e014..ac89dfa 100644
--- a/core/src/reader.rs
+++ b/core/src/reader.rs
@@ -189,6 +189,17 @@ pub struct PageInfo {
     pub slot_size: usize,
 }
 
+/// Layout of one bucket within one row group.
+pub struct BucketInfo {
+    pub bucket: usize,
+    /// "empty" | "monolithic" | "paged".
+    pub kind: &'static str,
+    /// On-disk compressed size in bytes (0 for empty buckets).
+    pub size: usize,
+    /// Member column indices (global, name-sorted order).
+    pub columns: Vec<usize>,
+}
+
 pub struct RowGroupMeta {
     pub num_rows: usize,
     pub bucket_offsets: Vec<u64>,
@@ -462,6 +473,24 @@ impl<I: InputFile> MosaicReader<I> {
         self.compression
     }
 
+    /// Per-bucket layout for a row group: kind, on-disk size and member columns
+    /// (global indices). The bucket is Mosaic's defining structure — exposed for
+    /// the `buckets` command.
+    pub fn bucket_infos(&self, rg_index: usize) -> io::Result<Vec<BucketInfo>> {
+        if rg_index >= self.row_group_metas.len() {
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, "row group index out of range"));
+        }
+        let meta = &self.row_group_metas[rg_index];
+        Ok((0..self.num_buckets).map(|b| {
+            let (kind, size) = match meta.bucket_layouts[b] {
+                BucketLayout::Empty => ("empty", 0),
+                BucketLayout::Monolithic { compressed_size, .. } => ("monolithic", compressed_size),
+                BucketLayout::Paged { total_size } => ("paged", total_size),
+            };
+            BucketInfo { bucket: b, kind, size, columns: self.schema.bucket_to_global[b].clone() }
+        }).collect())
+    }
+
     /// Dictionary entries for one column in one row group, or `None` if that
     /// column is not dict-encoded there. Used by the `dictionary` command.
     pub fn dictionary(&self, rg_index: usize, col: usize) -> io::Result<Option<Vec<Value>>> {

From 0a64296815cd2f537541234fca267d3925211c67 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 05:43:27 -0700
Subject: [PATCH 08/12] feat(cli): dictionary takes -c/--column to match
 parquet-cli

Align dictionary column selection with parquet-cli's -c flag instead of
a positional argument; update e2e.
---
 cli/src/main.rs  | 7 ++++++-
 cli/tests/e2e.rs | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index 3032fa5..a7980b0 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -90,7 +90,12 @@ enum Cmd {
         json: bool,
     },
     /// Print the dictionary of a dict-encoded column.
-    Dictionary { file: PathBuf, column: String },
+    Dictionary {
+        file: PathBuf,
+        /// Column name to dump.
+        #[arg(short = 'c', long)]
+        column: String,
+    },
     /// Print bucket layout per row group (Mosaic's column grouping).
     Buckets {
         file: PathBuf,
diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
index 5581843..c417a8b 100644
--- a/cli/tests/e2e.rs
+++ b/cli/tests/e2e.rs
@@ -155,7 +155,7 @@ fn footer_shows_format() {
 #[test]
 fn dictionary_dumps_entries() {
     let f = fixture("dict");
-    let (out, _, ok) = run(&["dictionary", &f, "kind"]);
+    let (out, _, ok) = run(&["dictionary", &f, "-c", "kind"]);
     assert!(ok);
     assert!(out.contains("3 entries"));
     assert!(out.contains("a") && out.contains("b") && out.contains("c"));

From 550d59674f42d07835f7a42c8c32223279989b61 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 05:55:10 -0700
Subject: [PATCH 09/12] feat(cli): add --json to dictionary

Completes JSON output across all 9 commands; dict columns emit an array,
non-dict row groups emit null. e2e extended.
---
 cli/src/main.rs  | 20 ++++++++++++++++++--
 cli/tests/e2e.rs |  3 +++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index a7980b0..5806386 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -95,6 +95,8 @@ enum Cmd {
         /// Column name to dump.
         #[arg(short = 'c', long)]
         column: String,
+        #[arg(long)]
+        json: bool,
     },
     /// Print bucket layout per row group (Mosaic's column grouping).
     Buckets {
@@ -114,7 +116,7 @@ fn main() -> ExitCode {
         Cmd::Head { file, num, columns, json } => cat(&file, num, columns, json),
         Cmd::Footer { file, json } => footer(&file, json),
         Cmd::ColumnSize { file, json } => column_size(&file, json),
-        Cmd::Dictionary { file, column } => dictionary(&file, &column),
+        Cmd::Dictionary { file, column, json } => dictionary(&file, &column, json),
         Cmd::Buckets { file, json } => buckets(&file, json),
     };
     match res {
@@ -290,10 +292,24 @@ fn column_size(file: &PathBuf, json: bool) -> std::io::Result<()> {
     Ok(())
 }
 
-fn dictionary(file: &PathBuf, column: &str) -> std::io::Result<()> {
+fn dictionary(file: &PathBuf, column: &str, json: bool) -> std::io::Result<()> {
     let reader = open(file)?;
     let col = reader.schema().columns.iter().position(|c| c.name == column)
         .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, format!("column '{column}' not found")))?;
+    if json {
+        let mut rgs = Vec::new();
+        for rg in 0..reader.num_row_groups() {
+            match reader.dictionary(rg, col)? {
+                Some(vals) => {
+                    let e: Vec<String> = vals.iter().map(|v| fmt::json_str(&fmt::render_value(v))).collect();
+                    rgs.push(format!("[{}]", e.join(",")));
+                }
+                None => rgs.push("null".to_string()),
+            }
+        }
+        println!("{{\"column\":{},\"row_groups\":[{}]}}", fmt::json_str(column), rgs.join(","));
+        return Ok(());
+    }
     for rg in 0..reader.num_row_groups() {
         match reader.dictionary(rg, col)? {
             Some(vals) => {
diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
index c417a8b..3a078e3 100644
--- a/cli/tests/e2e.rs
+++ b/cli/tests/e2e.rs
@@ -159,6 +159,9 @@ fn dictionary_dumps_entries() {
     assert!(ok);
     assert!(out.contains("3 entries"));
     assert!(out.contains("a") && out.contains("b") && out.contains("c"));
+    let (j, _, ok) = run(&["dictionary", &f, "-c", "kind", "--json"]);
+    assert!(ok);
+    assert_eq!(j, "{\"column\":\"kind\",\"row_groups\":[[\"a\",\"b\",\"c\"]]}\n");
 }
 
 #[test]

From 04839b69562bf4ad2591f98bb792aab1e9914521 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 05:59:28 -0700
Subject: [PATCH 10/12] docs: document all 9 commands; remove parquet
 comparison

Expand docs/cli.html and cli/README.md to cover every command
(schema/meta/footer/buckets/pages/dictionary/column-size/cat/head) with
usage and example output. Drop all comparison content per maintainer
preference.
---
 cli/README.md |  72 +++++++++++++++++++++++++++++
 docs/cli.html | 126 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 147 insertions(+), 51 deletions(-)
 create mode 100644 cli/README.md

diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 0000000..fc38491
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,72 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# mosaic CLI
+
+A native command-line inspector for Mosaic files. It drives the read-only
+`MosaicReader` API, so it needs no JVM and ships as a single native binary.
+
+## Build & run
+
+```bash
+cargo run -p paimon-mosaic-cli -- <command> <file>   # from source
+cargo install --path cli                             # install `mosaic`
+mosaic <command> <file>
+```
+
+## Commands
+
+| Command | Shows | Reads |
+|---------|-------|-------|
+| `schema` | column names, Arrow types, nullability, bucket | footer only |
+| `meta`   | row groups, rows, per-column stats (null/min/max) | footer + index |
+| `footer` | magic, version, buckets, compression | footer only |
+| `buckets`| per-bucket layout and member columns | footer + index |
+| `pages`  | per-column encoding + on-disk slot size | bucket data |
+| `dictionary` | dictionary entries of a dict column (`-c`) | bucket data |
+| `column-size` | on-disk bytes per column | footer + index |
+| `cat` / `head` | first N rows as a table | column data |
+
+Every command accepts `--json`. `cat`/`head` take `-n <N>` and `-c a,b`
+(projection); `dictionary` takes `-c <col>`.
+
+```text
+$ mosaic schema data.mosaic
+5 columns, 4 buckets
+  id: Int32 not null [bucket 0]
+  name: Utf8 [bucket 2]
+  kind: Utf8 [bucket 1]
+
+$ mosaic buckets data.mosaic
+row group 0:
+    bucket 0: paged 373B [flag, id]
+    bucket 1: paged 32B [kind]
+
+$ mosaic pages data.mosaic
+row group 0:
+    flag: bucket 0 encoding=const slot=16B
+    kind: bucket 1 encoding=dict slot=28B
+
+$ mosaic cat data.mosaic -n 2 --json
+{"id":0,"name":"user_0","kind":"a","score":0,"flag":7}
+{"id":1,"name":"user_1","kind":"b","score":1.5,"flag":7}
+```
+
+For C/C++ or Java callers, embed the format directly via the `ffi`
+(`mosaic.h`) or `jni` crates rather than shelling out to this CLI.
diff --git a/docs/cli.html b/docs/cli.html
index e677af6..6e7e7dd 100644
--- a/docs/cli.html
+++ b/docs/cli.html
@@ -54,13 +54,9 @@ <h2>Paimon Mosaic</h2>
     <main class="main">
         <div class="content">
             <h1>CLI</h1>
-            <p class="subtitle">Inspect Mosaic files from the terminal with the <code>mosaic</code> binary — schema, meta, cat and pages. A native, JVM-free toolkit modeled on parquet-cli.</p>
+            <p class="subtitle">Inspect Mosaic files from the terminal with the <code>mosaic</code> binary. A native, JVM-free toolkit driving the read-only <code>MosaicReader</code> API.</p>
 
             <h2>Install</h2>
-            <p>
-                The CLI lives in the <code>cli/</code> directory and drives the read-only
-                <code>MosaicReader</code> API. Build and run from source, or install the binary:
-            </p>
 <pre><code><span class="cmt"># run from source</span>
 cargo run -p paimon-mosaic-cli -- schema data.mosaic
 
@@ -75,69 +71,97 @@ <h2>Commands</h2>
                 </thead>
                 <tbody>
                     <tr><td><code>schema</code></td><td>column names, Arrow types, nullability, bucket</td><td>footer only</td></tr>
-                    <tr><td><code>meta</code></td><td>row groups, rows, per-column stats (null/min/max)</td><td>footer + index</td></tr>
-                    <tr><td><code>pages</code></td><td>per-column encoding + on-disk slot size</td><td>bucket data</td></tr>
-                    <tr><td><code>cat</code></td><td>first N rows as a table</td><td>column data</td></tr>
+                    <tr><td><code>meta</code></td><td>row groups, rows, per-column stats</td><td>footer + index</td></tr>
+                    <tr><td><code>footer</code></td><td>magic, version, buckets, compression</td><td>footer only</td></tr>
+                    <tr><td><code>buckets</code></td><td>per-bucket layout and member columns</td><td>footer + index</td></tr>
+                    <tr><td><code>pages</code></td><td>per-column encoding + slot size</td><td>bucket data</td></tr>
+                    <tr><td><code>dictionary</code></td><td>dictionary entries of a dict column</td><td>bucket data</td></tr>
+                    <tr><td><code>column-size</code></td><td>on-disk bytes per column</td><td>footer + index</td></tr>
+                    <tr><td><code>cat</code> / <code>head</code></td><td>first N rows as a table</td><td>column data</td></tr>
                 </tbody>
             </table>
-            <p>All four accept <code>--json</code>. <code>cat</code> also takes <code>-n &lt;N&gt;</code> and <code>-c a,b</code> (projection).</p>
+            <p>Every command accepts <code>--json</code>. <code>cat</code>/<code>head</code> take <code>-n &lt;N&gt;</code> and <code>-c a,b</code> (projection); <code>dictionary</code> takes <code>-c &lt;col&gt;</code>.</p>
 
             <h2>schema</h2>
+            <p>Columns, Arrow types, nullability and bucket assignment, in original input order. Footer only.</p>
 <pre><code><span class="cmt">$ mosaic schema data.mosaic</span>
-3 columns, 3 buckets
-  id: Int32 not null [bucket 1]
-  kind: Utf8 [bucket 2]
-  flag: Int32 [bucket 0]</code></pre>
+5 columns, 4 buckets
+  id: Int32 not null [bucket 0]
+  name: Utf8 [bucket 2]
+  kind: Utf8 [bucket 1]
+  score: Float64 [bucket 3]
+  flag: Int32 [bucket 0]
+
+<span class="cmt">$ mosaic schema data.mosaic --json</span>
+{"columns":5,"buckets":4,"fields":[{"name":"id","type":"Int32","nullable":false,"bucket":0}, ...]}</code></pre>
 
             <h2>meta</h2>
+            <p>Total rows, row groups, and per-column stats (null count / min / max) for columns configured with stats.</p>
 <pre><code><span class="cmt">$ mosaic meta data.mosaic</span>
-file: 200 rows, 3 columns, 3 buckets, 1 row groups
+file: 200 rows, 5 columns, 4 buckets, 1 row groups
 row group 0: 200 rows
-    id: nulls=0 min=0 max=199</code></pre>
+    id: nulls=0 min=0 max=199
+    score: nulls=0 min=0 max=298.5</code></pre>
+
+            <h2>footer</h2>
+            <p>The 32-byte file footer: magic, format version, bucket count, row groups and compression.</p>
+<pre><code><span class="cmt">$ mosaic footer data.mosaic</span>
+magic=MOSA version=1 buckets=4 row_groups=1 compression=zstd</code></pre>
+
+            <h2>buckets</h2>
+            <p>Per row group, each bucket's layout (empty / monolithic / paged), on-disk size and member columns. Mosaic groups columns into buckets by name order.</p>
+<pre><code><span class="cmt">$ mosaic buckets data.mosaic</span>
+row group 0:
+    bucket 0: paged 373B [flag, id]
+    bucket 1: paged 32B [kind]
+    bucket 2: paged 220B [name]
+    bucket 3: paged 542B [score]</code></pre>
 
             <h2>pages</h2>
-            <p>
-                Per-column physical encoding within each row group: <code>plain</code>,
-                <code>const</code>, <code>dict</code> or <code>all_null</code>, plus the on-disk slot size.
-            </p>
+            <p>Per-column physical encoding (<code>plain</code> / <code>const</code> / <code>dict</code> / <code>all_null</code>) and on-disk slot size.</p>
 <pre><code><span class="cmt">$ mosaic pages data.mosaic</span>
 row group 0:
-    flag: bucket 0 encoding=const slot=0B
-    id: bucket 1 encoding=plain slot=8684B
-    kind: bucket 2 encoding=dict slot=31B</code></pre>
+    flag: bucket 0 encoding=const slot=16B
+    id: bucket 0 encoding=plain slot=349B
+    kind: bucket 1 encoding=dict slot=28B
+    name: bucket 2 encoding=plain slot=216B
+    score: bucket 3 encoding=plain slot=538B</code></pre>
+
+            <h2>dictionary</h2>
+            <p>Dump the dictionary of a dict-encoded column. Non-dict columns report as such.</p>
+<pre><code><span class="cmt">$ mosaic dictionary data.mosaic -c kind</span>
+row group 0: 3 entries
+    0: a
+    1: b
+    2: c
+
+<span class="cmt">$ mosaic dictionary data.mosaic -c kind --json</span>
+{"column":"kind","row_groups":[["a","b","c"]]}</code></pre>
+
+            <h2>column-size</h2>
+            <p>On-disk bytes per column, summed across row groups.</p>
+<pre><code><span class="cmt">$ mosaic column-size data.mosaic</span>
+  id: 349 B
+  name: 216 B
+  kind: 28 B
+  score: 538 B
+  flag: 16 B</code></pre>
 
-            <h2>cat</h2>
+            <h2>cat / head</h2>
+            <p>Read the first N rows as a table. <code>-n</code> sets the count, <code>-c</code> projects columns, <code>--json</code> emits newline-delimited JSON. <code>head</code> is an alias of <code>cat</code>.</p>
 <pre><code><span class="cmt">$ mosaic cat data.mosaic -n 2</span>
-+----+------+------+
-| id | kind | flag |
-+----+------+------+
-| 0  | a    | 7    |
-| 1  | b    | 7    |
-+----+------+------+
++----+--------+------+-------+------+
+| id | name   | kind | score | flag |
++----+--------+------+-------+------+
+| 0  | user_0 | a    | 0     | 7    |
+| 1  | user_1 | b    | 1.5   | 7    |
++----+--------+------+-------+------+
+
+<span class="cmt">$ mosaic cat data.mosaic -n 2 -c name,score</span>   <span class="cmt"># projection</span>
 
 <span class="cmt">$ mosaic cat data.mosaic -n 2 --json</span>
-{"id":0,"kind":"a","flag":7}
-{"id":1,"kind":"b","flag":7}</code></pre>
-
-            <h2>Design vs parquet-cli</h2>
-            <p>
-                Both follow the same shape — thin subcommands over a shared reader — but Mosaic
-                is a single Rust implementation of a private format, so the tool is intentionally
-                smaller and read-only.
-            </p>
-            <table>
-                <thead>
-                    <tr><th></th><th>parquet-cli (Java)</th><th>mosaic CLI</th></tr>
-                </thead>
-                <tbody>
-                    <tr><td>Parser</td><td>airline annotations</td><td>clap derive</td></tr>
-                    <tr><td>Base</td><td>BaseCommand + Hadoop FS</td><td>MosaicReader + FileInput (pread)</td></tr>
-                    <tr><td>IO</td><td>local / HDFS / S3</td><td>local file</td></tr>
-                    <tr><td>Schema</td><td>printed as Avro</td><td>printed as Arrow</td></tr>
-                    <tr><td>Runtime</td><td>JVM (hadoop jar)</td><td>native, no JVM</td></tr>
-                    <tr><td>Scope</td><td>22 cmds (view + convert + rewrite)</td><td>4 view cmds</td></tr>
-                </tbody>
-            </table>
+{"id":0,"name":"user_0","kind":"a","score":0,"flag":7}
+{"id":1,"name":"user_1","kind":"b","score":1.5,"flag":7}</code></pre>
 
             <div class="tip">
                 <strong>Embedding instead</strong>

From bb0a0fc88c1f53fd0a74dd831fa236a3c43c2e72 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 06:08:36 -0700
Subject: [PATCH 11/12] test: drop trivial encoding_names unit; assert
 footer/buckets --json

Remove the near-trivial encoding_names mapping test; extend footer and
buckets e2e to cover their --json output, improving CLI feature coverage.
---
 cli/src/fmt.rs   | 7 -------
 cli/tests/e2e.rs | 6 ++++++
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cli/src/fmt.rs b/cli/src/fmt.rs
index a6e8f9e..0eb914e 100644
--- a/cli/src/fmt.rs
+++ b/cli/src/fmt.rs
@@ -231,13 +231,6 @@ mod tests {
         assert_eq!(render_value(&Value::Null), "null");
     }
 
-    #[test]
-    fn encoding_names() {
-        assert_eq!(encoding_name(0), "plain");
-        assert_eq!(encoding_name(2), "dict");
-        assert_eq!(encoding_name(3), "all_null");
-    }
-
     #[test]
     fn ndjson_renders_null_and_quotes() {
         let out = ndjson(&[sample()], 10);
diff --git a/cli/tests/e2e.rs b/cli/tests/e2e.rs
index 3a078e3..f630503 100644
--- a/cli/tests/e2e.rs
+++ b/cli/tests/e2e.rs
@@ -150,6 +150,9 @@ fn footer_shows_format() {
     assert!(out.contains("magic=MOSA"));
     assert!(out.contains("buckets=3"));
     assert!(out.contains("compression=zstd"));
+    let (j, _, ok) = run(&["footer", &f, "--json"]);
+    assert!(ok);
+    assert!(j.contains("\"magic\":\"MOSA\"") && j.contains("\"compression\":\"zstd\""));
 }
 
 #[test]
@@ -181,4 +184,7 @@ fn buckets_show_layout() {
     assert!(out.contains("row group 0:"));
     assert!(out.contains("[flag]") && out.contains("[id]") && out.contains("[kind]"));
     assert!(out.contains("monolithic") || out.contains("paged"));
+    let (j, _, ok) = run(&["buckets", &f, "--json"]);
+    assert!(ok);
+    assert!(j.contains("\"bucket\":0") && j.contains("\"columns\":"));
 }

From 924ac4b7b771b84037e918a8fa837924589ec066 Mon Sep 17 00:00:00 2001
From: mingfeng <tjg281622@antgroup.com>
Date: Mon, 22 Jun 2026 06:11:34 -0700
Subject: [PATCH 12/12] chore(cli): drop redundant gen.rs example

The e2e tests carry their own fixture writer; the standalone gen.rs
example duplicated it and was unreferenced.
---
 cli/examples/gen.rs | 54 ---------------------------------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 cli/examples/gen.rs

diff --git a/cli/examples/gen.rs b/cli/examples/gen.rs
deleted file mode 100644
index b117018..0000000
--- a/cli/examples/gen.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-// Generates a tiny mosaic file for CLI verification.
-use std::fs::File;
-use std::io::Write;
-use std::sync::Arc;
-
-use arrow_array::{Int32Array, RecordBatch, StringArray};
-use arrow_schema::{DataType, Field, Schema};
-use paimon_mosaic_core::writer::{MosaicWriter, OutputFile, WriterOptions};
-
-struct FileOut {
-    f: File,
-    pos: u64,
-}
-impl OutputFile for FileOut {
-    fn write(&mut self, d: &[u8]) -> std::io::Result<()> {
-        self.f.write_all(d)?;
-        self.pos += d.len() as u64;
-        Ok(())
-    }
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.f.flush()
-    }
-    fn pos(&self) -> u64 {
-        self.pos
-    }
-}
-
-fn main() {
-    let path = std::env::args().nth(1).unwrap_or_else(|| "/tmp/sample.mosaic".into());
-    let schema = Schema::new(vec![
-        Field::new("id", DataType::Int32, false),
-        Field::new("name", DataType::Utf8, true),
-        Field::new("age", DataType::Int32, true),
-    ]);
-    let out = FileOut { f: File::create(&path).unwrap(), pos: 0 };
-    let opts = WriterOptions {
-        num_buckets: 2,
-        stats_columns: vec!["id".into(), "name".into(), "age".into()],
-        ..Default::default()
-    };
-    let mut w = MosaicWriter::new(out, &schema, opts).unwrap();
-    let batch = RecordBatch::try_new(
-        Arc::new(schema),
-        vec![
-            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
-            Arc::new(StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dan"), Some("eve")])),
-            Arc::new(Int32Array::from(vec![Some(30), Some(25), Some(40), None, Some(28)])),
-        ],
-    )
-    .unwrap();
-    w.write_batch(&batch).unwrap();
-    w.close().unwrap();
-    println!("wrote {path}");
-}