From 1ff6b327d365ec5a09a9291c8c7c31011b1e5013 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Tue, 2 Apr 2024 21:26:59 +1100 Subject: [PATCH] Update documentation and cleanup root level files --- .github/workflows/ci.yml | 2 +- .gitignore | 2 +- Cargo.toml | 13 +--- README.md | 154 ++++++++++++++++++++++++++++----------- regen.sh | 1 + rustfmt.toml | 1 - src/lib.rs | 13 ++++ typos.toml | 3 +- 8 files changed, 132 insertions(+), 57 deletions(-) delete mode 100644 rustfmt.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9584e3d..7b8da28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,7 +84,7 @@ jobs: - name: Install taplo run: cargo install taplo-cli --version ^0.8 --locked - name: Run taplo - run: taplo format --check --option "indent_string= " + run: taplo format --check fmt: name: Rustfmt diff --git a/.gitignore b/.gitignore index 91b027a..83ab408 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/target +**/target Cargo.lock venv diff --git a/Cargo.toml b/Cargo.toml index c88e495..d70fe7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,16 @@ -[workspace] -members = ["gen"] - [package] -name = "datafusion-orc" -version = "0.2.43" +name = "orc-rust" +version = "0.3.0" edition = "2021" homepage = "https://github.com/datafusion-contrib/datafusion-orc" repository = "https://github.com/datafusion-contrib/datafusion-orc" -authors = ["Weny "] +authors = ["Weny ", "Jeffrey Vo "] license = "Apache-2.0" -description = "Implementation of ORC file format" +description = "Implementation of Apache ORC file format using Apache Arrow in-memory format" keywords = ["arrow", "orc", "arrow-rs", "datafusion"] include = ["src/**/*.rs", "Cargo.toml"] rust-version = "1.70" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] arrow = { version = "50", features = ["prettyprint"] } bytes = "1.4" diff --git a/README.md b/README.md index 3a9351a..9629b6f 100644 --- 
a/README.md +++ b/README.md @@ -1,50 +1,116 @@ -# datafusion-orc -Implementation of ORC file format read/write with Arrow in-memory format - [![test](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml/badge.svg)](https://github.com/datafusion-contrib/datafusion-orc/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/WenyXu/orc-rs/branch/main/graph/badge.svg?token=2CSHZX02XM)](https://codecov.io/gh/WenyXu/orc-rs) [![Crates.io](https://img.shields.io/crates/v/orc-rust)](https://crates.io/crates/orc-rust) [![Crates.io](https://img.shields.io/crates/d/orc-rust)](https://crates.io/crates/orc-rust) -Read [Apache ORC](https://orc.apache.org/) in Rust. - -* Read ORC files -* Read stripes (the conversion from proto metadata to memory regions) -* Decode stripes (the math of decode stripes into e.g. booleans, runs of RLE, etc.) -* Decode ORC data to [Arrow Datatypes](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) (Async/Sync) - - -## Current Support - -| Column Encoding | Read | Write | Arrow DataType | -| ------------------------- | ---- | ----- | -------------------------- | -| SmallInt, Int, BigInt | ✓ | | Int16, Int32, Int64 | -| Float, Double | ✓ | | Float32, Float64 | -| String, Char, and VarChar | ✓ | | Utf8 | -| Boolean | ✓ | | Boolean | -| TinyInt | ✓ | | Int8 | -| Binary | ✓ | | Binary | -| Decimal | ✓ | | Decimal128 | -| Date | ✓ | | Date32 | -| Timestamp | ✓ | | Timestamp(Nanosecond,_) | -| Timestamp instant | ✓ | | Timestamp(Nanosecond, UTC) | -| Struct | ✓ | | Struct | -| List | ✓ | | List | -| Map | ✓ | | Map | -| Union | ✓ | | Union(_, Sparse) | - -## Compression Support - -| Compression | Read | Write | -| ----------- | ---- | ----- | -| None | ✓ | ✗ | -| ZLIB | ✓ | ✗ | -| SNAPPY | ✓ | ✗ | -| LZO | ✓ | ✗ | -| LZ4 | ✓ | ✗ | -| ZSTD | ✓ | ✗ | - -## Benchmark - -Run `cargo bench` for simple benchmarks. 
+# orc-rust + +A native Rust implementation of the [Apache ORC](https://orc.apache.org) file format, +providing APIs to read data into [Apache Arrow](https://arrow.apache.org) in-memory arrays. + +See the [documentation](https://docs.rs/orc-rust/latest/orc_rust/) for examples on how to use this crate. + +## Supported features + +This crate currently only supports reading ORC files into Arrow arrays. Write support is planned +(see [Roadmap](#roadmap)). The features listed below relate only to reading ORC files. +At this time, we aim to support the [ORCv1](https://orc.apache.org/specification/ORCv1/) specification only. + +- Read synchronously & asynchronously (using Tokio) +- All compression types (Zlib, Snappy, Lzo, Lz4, Zstd) +- All ORC data types +- All encodings +- Rudimentary support for retrieving statistics +- Retrieving user metadata into Arrow schema metadata + +## Roadmap + +The long-term vision for this crate is to be feature-complete enough to be donated to the +[arrow-rs](https://github.com/apache/arrow-rs) project. + +The following lists the rough roadmap for features to be implemented, from highest to lowest priority. + +- Performance enhancements +- DataFusion integration +- Predicate pushdown +- Row indices +- Bloom filters +- Write from Arrow arrays +- Encryption + +A non-Arrow API interface is not planned at the moment. Feel free to raise an issue if there is such +a use case. + +## Version compatibility + +No guarantees are provided about stability across versions. We will endeavour to keep the top-level APIs +(`ArrowReader` and `ArrowStreamReader`) as stable as we can, but other APIs provided may change as we +explore the interface we want the library to expose. + +Versions will be released on an ad-hoc basis (with no fixed schedule). 
+ +## Mapping ORC types to Arrow types + +The following table lists how ORC data types are read into Arrow data types: + +| ORC Data Type | Arrow Data Type | +| ----------------- | -------------------------- | +| Boolean | Boolean | +| TinyInt | Int8 | +| SmallInt | Int16 | +| Int | Int32 | +| BigInt | Int64 | +| Float | Float32 | +| Double | Float64 | +| String | Utf8 | +| Char | Utf8 | +| VarChar | Utf8 | +| Binary | Binary | +| Decimal | Decimal128 | +| Date | Date32 | +| Timestamp | Timestamp(Nanosecond, None) | +| Timestamp instant | Timestamp(Nanosecond, UTC) | +| Struct | Struct | +| List | List | +| Map | Map | +| Union | Union(_, Sparse) | + +## Contributing + +All contributions are welcome! Feel free to raise an issue if you have a feature request, bug report, +or a question. Feel free to raise a Pull Request without raising an issue first, as long as the Pull +Request is descriptive enough. + +Some tools we use in addition to the standard `cargo` that require installation are: + +- [taplo](https://taplo.tamasfe.dev/) +- [typos](https://crates.io/crates/typos) + +```shell +cargo install typos-cli +cargo install taplo-cli +``` + +```shell +# Building the crate +cargo build + +# Running the test suite +cargo test + +# Simple benchmarks +cargo bench + +# Formatting TOML files +taplo format + +# Detect any typos in the codebase +typos +``` + +To regenerate/update the [proto.rs](src/proto.rs) file, execute the [regen.sh](regen.sh) script. 
+ +```shell +./regen.sh +``` diff --git a/regen.sh b/regen.sh index d83f9d5..87086c8 100755 --- a/regen.sh +++ b/regen.sh @@ -19,3 +19,4 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) cd $SCRIPT_DIR && cargo run --manifest-path gen/Cargo.toml +rustfmt src/proto.rs diff --git a/rustfmt.toml b/rustfmt.toml deleted file mode 100644 index 3a26366..0000000 --- a/rustfmt.toml +++ /dev/null @@ -1 +0,0 @@ -edition = "2021" diff --git a/src/lib.rs b/src/lib.rs index d16705f..036d3aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,16 @@ +//! A native Rust implementation of the [Apache ORC](https://orc.apache.org) file format, +//! providing APIs to read data into [Apache Arrow](https://arrow.apache.org) in-memory arrays. +//! +//! # Example usage +//! +//! ```no_run +//! # use std::fs::File; +//! # use datafusion_orc::arrow_reader::{ArrowReader, ArrowReaderBuilder}; +//! let file = File::open("/path/to/file.orc").unwrap(); +//! let reader = ArrowReaderBuilder::try_new(file).unwrap().build(); +//! let record_batches = reader.collect::<Result<Vec<_>, _>>().unwrap(); +//! ``` + pub mod arrow_reader; #[cfg(feature = "async")] pub mod async_arrow_reader; diff --git a/typos.toml b/typos.toml index 9fb9682..90983b5 100644 --- a/typos.toml +++ b/typos.toml @@ -1,9 +1,10 @@ [default.extend-words] ue = "ue" datas = "datas" + [files] extend-exclude = [ - "corrupted", + "tests/**/data/**", "format/orc_proto.proto", "src/proto.rs" ]