Skip to content

Commit c7f7dc0

Browse files
authored
Improve comments for avro (#7449)
1 parent 07093a4 commit c7f7dc0

File tree

5 files changed

+115
-2
lines changed

5 files changed

+115
-2
lines changed

arrow-avro/src/codec.rs

+39-2
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,21 @@ impl AvroDataType {
5151
Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
5252
}
5353

54+
/// Returns a reference to the codec used by this data type
55+
///
56+
/// The codec determines how Avro data is encoded and mapped to Arrow data types.
57+
/// This is useful when we need to inspect or use the specific encoding of a field.
5458
pub fn codec(&self) -> &Codec {
5559
&self.codec
5660
}
5761

62+
/// Returns the nullability status of this data type
63+
///
64+
/// In Avro, nullability is represented through unions with null types.
65+
/// The returned value indicates how nulls are encoded in the Avro format:
66+
/// - `Some(Nullability::NullFirst)` - Nulls are encoded as the first union variant
67+
/// - `Some(Nullability::NullSecond)` - Nulls are encoded as the second union variant
68+
/// - `None` - The type is not nullable
5869
pub fn nullability(&self) -> Option<Nullability> {
5970
self.nullability
6071
}
@@ -78,6 +89,10 @@ impl AvroField {
7889
&self.data_type
7990
}
8091

92+
/// Returns the name of this Avro field
93+
///
94+
/// This is the field name as defined in the Avro schema.
95+
/// It's used to identify fields within a record structure.
8196
pub fn name(&self) -> &str {
8297
&self.name
8398
}
@@ -108,24 +123,46 @@ impl<'a> TryFrom<&Schema<'a>> for AvroField {
108123
/// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
109124
#[derive(Debug, Clone)]
110125
pub enum Codec {
126+
/// Represents Avro null type, maps to Arrow's Null data type
111127
Null,
128+
/// Represents Avro boolean type, maps to Arrow's Boolean data type
112129
Boolean,
130+
/// Represents Avro int type, maps to Arrow's Int32 data type
113131
Int32,
132+
/// Represents Avro long type, maps to Arrow's Int64 data type
114133
Int64,
134+
/// Represents Avro float type, maps to Arrow's Float32 data type
115135
Float32,
136+
/// Represents Avro double type, maps to Arrow's Float64 data type
116137
Float64,
138+
/// Represents Avro bytes type, maps to Arrow's Binary data type
117139
Binary,
140+
/// Represents Avro string type (UTF-8 encoded), maps to Arrow's Utf8 data type
118141
Utf8,
142+
/// Represents Avro date logical type, maps to Arrow's Date32 data type
119143
Date32,
144+
/// Represents Avro time-millis logical type, maps to Arrow's Time32(TimeUnit::Millisecond) data type
120145
TimeMillis,
146+
/// Represents Avro time-micros logical type, maps to Arrow's Time64(TimeUnit::Microsecond) data type
121147
TimeMicros,
122-
/// TimestampMillis(is_utc)
148+
/// Represents Avro timestamp-millis or local-timestamp-millis logical type
149+
///
150+
/// Maps to Arrow's Timestamp(TimeUnit::Millisecond) data type.
151+
/// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false).
123152
TimestampMillis(bool),
124-
/// TimestampMicros(is_utc)
153+
/// Represents Avro timestamp-micros or local-timestamp-micros logical type
154+
///
155+
/// Maps to Arrow's Timestamp(TimeUnit::Microsecond) data type.
156+
/// The boolean parameter indicates whether the timestamp has a UTC timezone (true) or is local time (false).
125157
TimestampMicros(bool),
158+
/// Represents Avro fixed type, maps to Arrow's FixedSizeBinary data type.
159+
/// The i32 parameter indicates the fixed binary size in bytes.
126160
Fixed(i32),
161+
/// Represents Avro array type, maps to Arrow's List data type
127162
List(Arc<AvroDataType>),
163+
/// Represents Avro record type, maps to Arrow's Struct data type
128164
Struct(Arc<[AvroField]>),
165+
/// Represents Avro duration logical type, maps to Arrow's Interval(IntervalUnit::MonthDayNano) data type
129166
Interval,
130167
}
131168

arrow-avro/src/compression.rs

+7
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@ use std::io::Read;
2323
pub const CODEC_METADATA_KEY: &str = "avro.codec";
2424

2525
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
26+
/// Supported compression codecs for Avro data
27+
///
28+
/// Avro supports multiple compression formats for data blocks.
29+
/// This enum represents the compression codecs available in this implementation.
2630
pub enum CompressionCodec {
31+
/// Deflate compression (RFC 1951)
2732
Deflate,
33+
/// Snappy compression
2834
Snappy,
35+
/// ZStandard compression
2936
ZStandard,
3037
}
3138

arrow-avro/src/lib.rs

+15
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,26 @@
2828
#![warn(missing_docs)]
2929
#![allow(unused)] // Temporary
3030

31+
/// Core functionality for reading Avro data into Arrow arrays
32+
///
33+
/// Implements the primary reader interface and record decoding logic.
3134
pub mod reader;
35+
36+
/// Avro schema parsing and representation
37+
///
38+
/// Provides types for parsing and representing Avro schema definitions.
3239
mod schema;
3340

41+
/// Compression codec implementations for Avro
42+
///
43+
/// Provides support for various compression algorithms used in Avro files,
44+
/// including Deflate, Snappy, and ZStandard.
3445
mod compression;
3546

47+
/// Data type conversions between Avro and Arrow types
48+
///
49+
/// This module contains the necessary types and functions to convert between
50+
/// Avro data types and Arrow data types.
3651
mod codec;
3752

3853
#[cfg(test)]

arrow-avro/src/reader/record.rs

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ pub struct RecordDecoder {
3737
}
3838

3939
impl RecordDecoder {
40+
/// Create a new [`RecordDecoder`] from the provided [`AvroDataType`]
4041
pub fn try_new(data_type: &AvroDataType) -> Result<Self, ArrowError> {
4142
match Decoder::try_new(data_type)? {
4243
Decoder::Record(fields, encodings) => Ok(Self {

arrow-avro/src/schema.rs

+53
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,13 @@ pub const SCHEMA_METADATA_KEY: &str = "avro.schema";
2626
/// <https://avro.apache.org/docs/1.11.1/specification/#names>
2727
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
2828
#[serde(untagged)]
29+
/// A type name in an Avro schema
30+
///
31+
/// This represents the different ways a type can be referenced in an Avro schema.
2932
pub enum TypeName<'a> {
33+
/// A primitive type like null, boolean, int, etc.
3034
Primitive(PrimitiveType),
35+
/// A reference to another named type
3136
Ref(&'a str),
3237
}
3338

@@ -37,13 +42,21 @@ pub enum TypeName<'a> {
3742
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
3843
#[serde(rename_all = "camelCase")]
3944
pub enum PrimitiveType {
45+
/// null: no value
4046
Null,
47+
/// boolean: a binary value
4148
Boolean,
49+
/// int: 32-bit signed integer
4250
Int,
51+
/// long: 64-bit signed integer
4352
Long,
53+
/// float: single precision (32-bit) IEEE 754 floating-point number
4454
Float,
55+
/// double: double precision (64-bit) IEEE 754 floating-point number
4556
Double,
57+
/// bytes: sequence of 8-bit unsigned bytes
4658
Bytes,
59+
/// string: Unicode character sequence
4760
String,
4861
}
4962

@@ -78,22 +91,31 @@ impl Attributes<'_> {
7891
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
7992
#[serde(rename_all = "camelCase")]
8093
pub struct Type<'a> {
94+
/// The type of this Avro data structure
8195
#[serde(borrow)]
8296
pub r#type: TypeName<'a>,
97+
/// Additional attributes associated with this type
8398
#[serde(flatten)]
8499
pub attributes: Attributes<'a>,
85100
}
86101

87102
/// An Avro schema
103+
///
104+
/// This represents the different shapes of Avro schemas as defined in the specification.
105+
/// See <https://avro.apache.org/docs/1.11.1/specification/#schemas> for more details.
88106
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
89107
#[serde(untagged)]
90108
pub enum Schema<'a> {
109+
/// A direct type name (primitive or reference)
91110
#[serde(borrow)]
92111
TypeName(TypeName<'a>),
112+
/// A union of multiple schemas (e.g., `["null", "string"]`)
93113
#[serde(borrow)]
94114
Union(Vec<Schema<'a>>),
115+
/// A complex type such as record, array, map, etc.
95116
#[serde(borrow)]
96117
Complex(ComplexType<'a>),
118+
/// A type with attributes
97119
#[serde(borrow)]
98120
Type(Type<'a>),
99121
}
@@ -104,14 +126,19 @@ pub enum Schema<'a> {
104126
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
105127
#[serde(tag = "type", rename_all = "camelCase")]
106128
pub enum ComplexType<'a> {
129+
/// Record type: a sequence of fields with names and types
107130
#[serde(borrow)]
108131
Record(Record<'a>),
132+
/// Enum type: a set of named values
109133
#[serde(borrow)]
110134
Enum(Enum<'a>),
135+
/// Array type: a sequence of values of the same type
111136
#[serde(borrow)]
112137
Array(Array<'a>),
138+
/// Map type: a mapping from strings to values of the same type
113139
#[serde(borrow)]
114140
Map(Map<'a>),
141+
/// Fixed type: a fixed-size byte array
115142
#[serde(borrow)]
116143
Fixed(Fixed<'a>),
117144
}
@@ -121,29 +148,39 @@ pub enum ComplexType<'a> {
121148
/// <https://avro.apache.org/docs/1.11.1/specification/#schema-record>
122149
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
123150
pub struct Record<'a> {
151+
/// Name of the record
124152
#[serde(borrow)]
125153
pub name: &'a str,
154+
/// Optional namespace for the record, provides a way to organize names
126155
#[serde(borrow, default)]
127156
pub namespace: Option<&'a str>,
157+
/// Optional documentation string for the record
128158
#[serde(borrow, default)]
129159
pub doc: Option<&'a str>,
160+
/// Alternative names for this record
130161
#[serde(borrow, default)]
131162
pub aliases: Vec<&'a str>,
163+
/// The fields contained in this record
132164
#[serde(borrow)]
133165
pub fields: Vec<Field<'a>>,
166+
/// Additional attributes for this record
134167
#[serde(flatten)]
135168
pub attributes: Attributes<'a>,
136169
}
137170

138171
/// A field within a [`Record`]
139172
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
140173
pub struct Field<'a> {
174+
/// Name of the field within the record
141175
#[serde(borrow)]
142176
pub name: &'a str,
177+
/// Optional documentation for this field
143178
#[serde(borrow, default)]
144179
pub doc: Option<&'a str>,
180+
/// The field's type definition
145181
#[serde(borrow)]
146182
pub r#type: Schema<'a>,
183+
/// Optional default value for this field
147184
#[serde(borrow, default)]
148185
pub default: Option<&'a str>,
149186
}
@@ -153,18 +190,25 @@ pub struct Field<'a> {
153190
/// <https://avro.apache.org/docs/1.11.1/specification/#enums>
154191
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
155192
pub struct Enum<'a> {
193+
/// Name of the enum
156194
#[serde(borrow)]
157195
pub name: &'a str,
196+
/// Optional namespace for the enum, provides organizational structure
158197
#[serde(borrow, default)]
159198
pub namespace: Option<&'a str>,
199+
/// Optional documentation string describing the enum
160200
#[serde(borrow, default)]
161201
pub doc: Option<&'a str>,
202+
/// Alternative names for this enum
162203
#[serde(borrow, default)]
163204
pub aliases: Vec<&'a str>,
205+
/// The symbols (values) that this enum can have
164206
#[serde(borrow)]
165207
pub symbols: Vec<&'a str>,
208+
/// Optional default value for this enum
166209
#[serde(borrow, default)]
167210
pub default: Option<&'a str>,
211+
/// Additional attributes for this enum
168212
#[serde(flatten)]
169213
pub attributes: Attributes<'a>,
170214
}
@@ -174,8 +218,10 @@ pub struct Enum<'a> {
174218
/// <https://avro.apache.org/docs/1.11.1/specification/#arrays>
175219
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
176220
pub struct Array<'a> {
221+
/// The schema for items in this array
177222
#[serde(borrow)]
178223
pub items: Box<Schema<'a>>,
224+
/// Additional attributes for this array
179225
#[serde(flatten)]
180226
pub attributes: Attributes<'a>,
181227
}
@@ -185,8 +231,10 @@ pub struct Array<'a> {
185231
/// <https://avro.apache.org/docs/1.11.1/specification/#maps>
186232
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
187233
pub struct Map<'a> {
234+
/// The schema for values in this map
188235
#[serde(borrow)]
189236
pub values: Box<Schema<'a>>,
237+
/// Additional attributes for this map
190238
#[serde(flatten)]
191239
pub attributes: Attributes<'a>,
192240
}
@@ -196,13 +244,18 @@ pub struct Map<'a> {
196244
/// <https://avro.apache.org/docs/1.11.1/specification/#fixed>
197245
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
198246
pub struct Fixed<'a> {
247+
/// Name of the fixed type
199248
#[serde(borrow)]
200249
pub name: &'a str,
250+
/// Optional namespace for the fixed type
201251
#[serde(borrow, default)]
202252
pub namespace: Option<&'a str>,
253+
/// Alternative names for this fixed type
203254
#[serde(borrow, default)]
204255
pub aliases: Vec<&'a str>,
256+
/// The number of bytes in this fixed type
205257
pub size: usize,
258+
/// Additional attributes for this fixed type
206259
#[serde(flatten)]
207260
pub attributes: Attributes<'a>,
208261
}

0 commit comments

Comments
 (0)