Skip to content

Improve comments for avro #7449

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions arrow-avro/src/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,21 @@ impl AvroDataType {
Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
}

/// Returns a reference to the [`Codec`] used by this data type
///
/// The codec determines how Avro data is encoded and mapped to Arrow data types.
/// This is useful when we need to inspect or use the specific encoding of a field.
pub fn codec(&self) -> &Codec {
&self.codec
}

/// Returns the nullability status of this data type
///
/// In Avro, nullability is represented through unions with null types.
/// The returned [`Nullability`] (returned by value, not by reference) indicates
/// how nulls are encoded in the Avro format:
/// - `Some(Nullability::NullFirst)` - Nulls are encoded as the first union variant
/// - `Some(Nullability::NullSecond)` - Nulls are encoded as the second union variant
/// - `None` - The type is not nullable
pub fn nullability(&self) -> Option<Nullability> {
self.nullability
}
Expand All @@ -78,6 +89,10 @@ impl AvroField {
&self.data_type
}

/// Returns the name of this Avro field
///
/// This is the field name as defined in the Avro schema.
/// It's used to identify fields within a record structure.
/// The returned string slice borrows from this field.
pub fn name(&self) -> &str {
&self.name
}
Expand Down Expand Up @@ -108,24 +123,46 @@ impl<'a> TryFrom<&Schema<'a>> for AvroField {
/// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
#[derive(Debug, Clone)]
pub enum Codec {
/// Represents Avro null type, maps to Arrow's Null data type
Null,
/// Represents Avro boolean type, maps to Arrow's Boolean data type
Boolean,
/// Represents Avro int type, maps to Arrow's Int32 data type
Int32,
/// Represents Avro long type, maps to Arrow's Int64 data type
Int64,
/// Represents Avro float type, maps to Arrow's Float32 data type
Float32,
/// Represents Avro double type, maps to Arrow's Float64 data type
Float64,
/// Represents Avro bytes type, maps to Arrow's Binary data type
Binary,
/// Represents Avro string type (UTF-8 encoded bytes), corresponding to Arrow's StringArray
Utf8,
/// Represents Avro date logical type, maps to Arrow's Date32 data type
Date32,
/// Represents Avro time-millis logical type, maps to Arrow's Time32(TimeUnit::Millisecond) data type
TimeMillis,
/// Represents Avro time-micros logical type, maps to Arrow's Time64(TimeUnit::Microsecond) data type
TimeMicros,
/// Represents Avro timestamp-millis or local-timestamp-millis logical type
///
/// Maps to Arrow's Timestamp(TimeUnit::Millisecond) data type.
///
/// The `bool` payload is `is_utc`: `true` when the timestamp has a UTC timezone,
/// `false` when it is local time.
TimestampMillis(bool),
/// Represents Avro timestamp-micros or local-timestamp-micros logical type
///
/// Maps to Arrow's Timestamp(TimeUnit::Microsecond) data type.
///
/// The `bool` payload is `is_utc`: `true` when the timestamp has a UTC timezone,
/// `false` when it is local time.
TimestampMicros(bool),
/// Represents Avro fixed type, maps to Arrow's FixedSizeBinary data type
///
/// The `i32` payload indicates the fixed binary size.
Fixed(i32),
/// Represents Avro array type, maps to Arrow's List data type
///
/// The payload is the data type of the list elements.
List(Arc<AvroDataType>),
/// Represents Avro record type, maps to Arrow's Struct data type
///
/// The payload holds the fields of the record.
Struct(Arc<[AvroField]>),
/// Represents Avro duration logical type, maps to Arrow's Interval(IntervalUnit::MonthDayNano) data type
Interval,
}

Expand Down
7 changes: 7 additions & 0 deletions arrow-avro/src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,16 @@ use std::io::Read;
pub const CODEC_METADATA_KEY: &str = "avro.codec";

/// Supported compression codecs for Avro data
///
/// Avro supports multiple compression formats for data blocks.
/// This enum represents the compression codecs available in this implementation.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum CompressionCodec {
/// Deflate compression (RFC 1951)
Deflate,
/// Snappy compression
Snappy,
/// ZStandard compression
ZStandard,
}

Expand Down
15 changes: 15 additions & 0 deletions arrow-avro/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,26 @@
#![warn(missing_docs)]
#![allow(unused)] // Temporary

/// Core functionality for reading Avro data into Arrow arrays
///
/// Implements the primary reader interface and record decoding logic.
/// This is the only module declared `pub` here.
pub mod reader;

/// Avro schema parsing and representation
///
/// Provides the serde-backed types (such as `Schema`, `ComplexType` and `Record`)
/// for parsing and representing Avro schema definitions.
mod schema;

/// Compression codec implementations for Avro
///
/// Provides support for various compression algorithms used in Avro files,
/// including Deflate, Snappy, and ZStandard (see `CompressionCodec`).
mod compression;

/// Data type conversions between Avro and Arrow types
///
/// This module contains the necessary types and functions to convert between
/// Avro data types and Arrow data types (such as `AvroDataType`, `AvroField`
/// and `Codec`).
mod codec;

#[cfg(test)]
Expand Down
1 change: 1 addition & 0 deletions arrow-avro/src/reader/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ pub struct RecordDecoder {
}

impl RecordDecoder {
/// Create a new [`RecordDecoder`] from the provided [`AvroDataType`]
pub fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> {
match Decoder::try_new(data_type)? {
Decoder::Record(fields, encodings) => Ok(Self {
Expand Down
53 changes: 53 additions & 0 deletions arrow-avro/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,13 @@ pub const SCHEMA_METADATA_KEY: &str = "avro.schema";
/// <https://avro.apache.org/docs/1.11.1/specification/#names>
///
/// A type name in an Avro schema
///
/// This represents the different ways a type can be referenced in an Avro schema.
///
/// The enum is `#[serde(untagged)]`, so variants are tried in declaration order
/// when deserializing: a value that parses as a [`PrimitiveType`] becomes
/// `Primitive`, and any other string becomes `Ref`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum TypeName<'a> {
/// A primitive type like null, boolean, int, etc.
Primitive(PrimitiveType),
/// A reference to another named type, borrowed from the schema input
Ref(&'a str),
}

Expand All @@ -37,13 +42,21 @@ pub enum TypeName<'a> {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
// `camelCase` renaming makes each variant (de)serialize under its lowercase
// Avro name, e.g. `Null` <-> "null" and `Boolean` <-> "boolean".
#[serde(rename_all = "camelCase")]
pub enum PrimitiveType {
/// `null`: no value
Null,
/// `boolean`: a binary value
Boolean,
/// `int`: 32-bit signed integer
Int,
/// `long`: 64-bit signed integer
Long,
/// `float`: single precision (32-bit) IEEE 754 floating-point number
Float,
/// `double`: double precision (64-bit) IEEE 754 floating-point number
Double,
/// `bytes`: sequence of 8-bit unsigned bytes
Bytes,
/// `string`: Unicode character sequence
String,
}

Expand Down Expand Up @@ -78,22 +91,31 @@ impl Attributes<'_> {
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Type<'a> {
/// The type of this Avro data structure: either a primitive or a reference
/// to a named type (see [`TypeName`])
#[serde(borrow)]
pub r#type: TypeName<'a>,
/// Additional attributes associated with this type
///
/// Flattened by serde: these are read from / written to the same JSON
/// object as `type` rather than a nested object.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}

/// An Avro schema
///
/// This represents the different shapes of Avro schemas as defined in the specification.
/// See <https://avro.apache.org/docs/1.11.1/specification/#schemas> for more details.
///
/// The enum is `#[serde(untagged)]`: when deserializing, serde tries the variants
/// in declaration order and keeps the first one that matches, so the order of
/// the variants below is significant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Schema<'a> {
/// A direct type name (primitive or reference)
#[serde(borrow)]
TypeName(TypeName<'a>),
/// A union of multiple schemas (e.g., `["null", "string"]`)
#[serde(borrow)]
Union(Vec<Schema<'a>>),
/// A complex type such as record, array, map, etc. (see [`ComplexType`])
#[serde(borrow)]
Complex(ComplexType<'a>),
/// A type with attributes (see [`Type`])
#[serde(borrow)]
Type(Type<'a>),
}
Expand All @@ -104,14 +126,19 @@ pub enum Schema<'a> {
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
// Internally tagged: the JSON object's `type` field selects the variant, using
// the camelCase variant name ("record", "enum", "array", "map", "fixed").
#[serde(tag = "type", rename_all = "camelCase")]
pub enum ComplexType<'a> {
/// Record type: a sequence of fields with names and types
#[serde(borrow)]
Record(Record<'a>),
/// Enum type: a set of named values
#[serde(borrow)]
Enum(Enum<'a>),
/// Array type: a sequence of values of the same type
#[serde(borrow)]
Array(Array<'a>),
/// Map type: a mapping from strings to values of the same type
#[serde(borrow)]
Map(Map<'a>),
/// Fixed type: a fixed-size byte array
#[serde(borrow)]
Fixed(Fixed<'a>),
}
Expand All @@ -121,29 +148,39 @@ pub enum ComplexType<'a> {
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-record>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Record<'a> {
/// Name of the record
#[serde(borrow)]
pub name: &'a str,
/// Optional namespace for the record, provides a way to organize names
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub namespace: Option<&'a str>,
/// Optional documentation string for the record
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub doc: Option<&'a str>,
/// Alternative names for this record
///
/// Empty when the schema omits it.
#[serde(borrow, default)]
pub aliases: Vec<&'a str>,
/// The fields contained in this record
#[serde(borrow)]
pub fields: Vec<Field<'a>>,
/// Additional attributes for this record
///
/// Flattened by serde into the same JSON object as the fields above.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}

/// A field within a [`Record`]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Field<'a> {
/// Name of the field within the record
#[serde(borrow)]
pub name: &'a str,
/// Optional documentation for this field
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub doc: Option<&'a str>,
/// The field's type definition
#[serde(borrow)]
pub r#type: Schema<'a>,
/// Optional default value for this field
///
/// `None` when the schema omits it.
// NOTE(review): this is typed as a borrowed string; Avro field defaults may
// presumably be other JSON values too (numbers, objects, ...) — confirm how
// non-string defaults are meant to be handled.
#[serde(borrow, default)]
pub default: Option<&'a str>,
}
Expand All @@ -153,18 +190,25 @@ pub struct Field<'a> {
/// <https://avro.apache.org/docs/1.11.1/specification/#enums>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Enum<'a> {
/// Name of the enum
#[serde(borrow)]
pub name: &'a str,
/// Optional namespace for the enum, provides organizational structure
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub namespace: Option<&'a str>,
/// Optional documentation string describing the enum
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub doc: Option<&'a str>,
/// Alternative names for this enum
///
/// Empty when the schema omits it.
#[serde(borrow, default)]
pub aliases: Vec<&'a str>,
/// The symbols (values) that this enum can have
#[serde(borrow)]
pub symbols: Vec<&'a str>,
/// Optional default value for this enum
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub default: Option<&'a str>,
/// Additional attributes for this enum
///
/// Flattened by serde into the same JSON object as the fields above.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}
Expand All @@ -174,8 +218,10 @@ pub struct Enum<'a> {
/// <https://avro.apache.org/docs/1.11.1/specification/#arrays>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Array<'a> {
/// The schema for items in this array
///
/// Boxed because [`Schema`] is recursive.
#[serde(borrow)]
pub items: Box<Schema<'a>>,
/// Additional attributes for this array
///
/// Flattened by serde into the same JSON object as `items`.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}
Expand All @@ -185,8 +231,10 @@ pub struct Array<'a> {
/// <https://avro.apache.org/docs/1.11.1/specification/#maps>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Map<'a> {
/// The schema for values in this map
///
/// Boxed because [`Schema`] is recursive.
#[serde(borrow)]
pub values: Box<Schema<'a>>,
/// Additional attributes for this map
///
/// Flattened by serde into the same JSON object as `values`.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}
Expand All @@ -196,13 +244,18 @@ pub struct Map<'a> {
/// <https://avro.apache.org/docs/1.11.1/specification/#fixed>
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Fixed<'a> {
/// Name of the fixed type
#[serde(borrow)]
pub name: &'a str,
/// Optional namespace for the fixed type
///
/// `None` when the schema omits it.
#[serde(borrow, default)]
pub namespace: Option<&'a str>,
/// Alternative names for this fixed type
///
/// Empty when the schema omits it.
#[serde(borrow, default)]
pub aliases: Vec<&'a str>,
/// The number of bytes in this fixed type
pub size: usize,
/// Additional attributes for this fixed type
///
/// Flattened by serde into the same JSON object as the fields above.
#[serde(flatten)]
pub attributes: Attributes<'a>,
}
Expand Down
Loading