Skip to content

Commit 3ed0f06

Browse files
authored
Hook up Avro Decoder (#6820)
* Hook up Avro Decoder * Docs * Improved varint decode * Docs * Clippy * More clippy
1 parent 7247e6b commit 3ed0f06

File tree

7 files changed

+670
-32
lines changed

7 files changed

+670
-32
lines changed

arrow-avro/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ deflate = ["flate2"]
3939
snappy = ["snap", "crc"]
4040

4141
[dependencies]
42-
arrow-schema = { workspace = true }
42+
arrow-schema = { workspace = true }
43+
arrow-buffer = { workspace = true }
44+
arrow-array = { workspace = true }
4345
serde_json = { version = "1.0", default-features = false, features = ["std"] }
4446
serde = { version = "1.0.188", features = ["derive"] }
4547
flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true }
@@ -49,4 +51,5 @@ crc = { version = "3.0", optional = true }
4951

5052

5153
[dev-dependencies]
54+
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
5255

arrow-avro/src/codec.rs

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use std::sync::Arc;
2929
/// To accommodate this we special case two-variant unions where one of the
3030
/// variants is the null type, and use this to derive arrow's notion of nullability
3131
#[derive(Debug, Copy, Clone)]
32-
enum Nulls {
32+
pub enum Nullability {
3333
/// The nulls are encoded as the first union variant
3434
NullFirst,
3535
/// The nulls are encoded as the second union variant
@@ -39,7 +39,7 @@ enum Nulls {
3939
/// An Avro datatype mapped to the arrow data model
4040
#[derive(Debug, Clone)]
4141
pub struct AvroDataType {
42-
nulls: Option<Nulls>,
42+
nullability: Option<Nullability>,
4343
metadata: HashMap<String, String>,
4444
codec: Codec,
4545
}
@@ -48,7 +48,15 @@ impl AvroDataType {
4848
/// Returns an arrow [`Field`] with the given name
4949
pub fn field_with_name(&self, name: &str) -> Field {
5050
let d = self.codec.data_type();
51-
Field::new(name, d, self.nulls.is_some()).with_metadata(self.metadata.clone())
51+
Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone())
52+
}
53+
54+
/// Returns a reference to the [`Codec`] stored in this data type
pub fn codec(&self) -> &Codec {
55+
&self.codec
56+
}
57+
58+
/// Returns the [`Nullability`] recorded for this data type, if any
pub fn nullability(&self) -> Option<Nullability> {
59+
self.nullability
5260
}
5361
}
5462

@@ -65,9 +73,13 @@ impl AvroField {
6573
self.data_type.field_with_name(&self.name)
6674
}
6775

68-
/// Returns the [`Codec`]
69-
pub fn codec(&self) -> &Codec {
70-
&self.data_type.codec
76+
/// Returns a reference to the [`AvroDataType`] of this field
77+
pub fn data_type(&self) -> &AvroDataType {
78+
&self.data_type
79+
}
80+
81+
/// Returns the name of this field
pub fn name(&self) -> &str {
82+
&self.name
7183
}
7284
}
7385

@@ -114,7 +126,7 @@ pub enum Codec {
114126
Fixed(i32),
115127
List(Arc<AvroDataType>),
116128
Struct(Arc<[AvroField]>),
117-
Duration,
129+
Interval,
118130
}
119131

120132
impl Codec {
@@ -137,7 +149,7 @@ impl Codec {
137149
Self::TimestampMicros(is_utc) => {
138150
DataType::Timestamp(TimeUnit::Microsecond, is_utc.then(|| "+00:00".into()))
139151
}
140-
Self::Duration => DataType::Interval(IntervalUnit::MonthDayNano),
152+
Self::Interval => DataType::Interval(IntervalUnit::MonthDayNano),
141153
Self::Fixed(size) => DataType::FixedSizeBinary(*size),
142154
Self::List(f) => {
143155
DataType::List(Arc::new(f.field_with_name(Field::LIST_FIELD_DEFAULT_NAME)))
@@ -200,7 +212,7 @@ fn make_data_type<'a>(
200212
) -> Result<AvroDataType, ArrowError> {
201213
match schema {
202214
Schema::TypeName(TypeName::Primitive(p)) => Ok(AvroDataType {
203-
nulls: None,
215+
nullability: None,
204216
metadata: Default::default(),
205217
codec: (*p).into(),
206218
}),
@@ -213,12 +225,12 @@ fn make_data_type<'a>(
213225
match (f.len() == 2, null) {
214226
(true, Some(0)) => {
215227
let mut field = make_data_type(&f[1], namespace, resolver)?;
216-
field.nulls = Some(Nulls::NullFirst);
228+
field.nullability = Some(Nullability::NullFirst);
217229
Ok(field)
218230
}
219231
(true, Some(1)) => {
220232
let mut field = make_data_type(&f[0], namespace, resolver)?;
221-
field.nulls = Some(Nulls::NullSecond);
233+
field.nullability = Some(Nullability::NullSecond);
222234
Ok(field)
223235
}
224236
_ => Err(ArrowError::NotYetImplemented(format!(
@@ -241,7 +253,7 @@ fn make_data_type<'a>(
241253
.collect::<Result<_, ArrowError>>()?;
242254

243255
let field = AvroDataType {
244-
nulls: None,
256+
nullability: None,
245257
codec: Codec::Struct(fields),
246258
metadata: r.attributes.field_metadata(),
247259
};
@@ -251,7 +263,7 @@ fn make_data_type<'a>(
251263
ComplexType::Array(a) => {
252264
let mut field = make_data_type(a.items.as_ref(), namespace, resolver)?;
253265
Ok(AvroDataType {
254-
nulls: None,
266+
nullability: None,
255267
metadata: a.attributes.field_metadata(),
256268
codec: Codec::List(Arc::new(field)),
257269
})
@@ -262,7 +274,7 @@ fn make_data_type<'a>(
262274
})?;
263275

264276
let field = AvroDataType {
265-
nulls: None,
277+
nullability: None,
266278
metadata: f.attributes.field_metadata(),
267279
codec: Codec::Fixed(size),
268280
};
@@ -298,7 +310,7 @@ fn make_data_type<'a>(
298310
(Some("local-timestamp-micros"), c @ Codec::Int64) => {
299311
*c = Codec::TimestampMicros(false)
300312
}
301-
(Some("duration"), c @ Codec::Fixed(12)) => *c = Codec::Duration,
313+
(Some("duration"), c @ Codec::Fixed(12)) => *c = Codec::Interval,
302314
(Some(logical), _) => {
303315
// Insert unrecognized logical type into metadata map
304316
field.metadata.insert("logicalType".into(), logical.into());

arrow-avro/src/reader/cursor.rs

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::reader::vlq::read_varint;
19+
use arrow_schema::ArrowError;
20+
21+
/// A wrapper around a byte slice, providing low-level decoding for Avro
22+
///
23+
/// <https://avro.apache.org/docs/1.11.1/specification/#encodings>
24+
#[derive(Debug)]
25+
pub(crate) struct AvroCursor<'a> {
26+
buf: &'a [u8],
27+
start_len: usize,
28+
}
29+
30+
impl<'a> AvroCursor<'a> {
31+
pub(crate) fn new(buf: &'a [u8]) -> Self {
32+
Self {
33+
buf,
34+
start_len: buf.len(),
35+
}
36+
}
37+
38+
/// Returns the current cursor position
39+
#[inline]
40+
pub(crate) fn position(&self) -> usize {
41+
self.start_len - self.buf.len()
42+
}
43+
44+
/// Read a single `u8`
45+
#[inline]
46+
pub(crate) fn get_u8(&mut self) -> Result<u8, ArrowError> {
47+
match self.buf.first().copied() {
48+
Some(x) => {
49+
self.buf = &self.buf[1..];
50+
Ok(x)
51+
}
52+
None => Err(ArrowError::ParseError("Unexpected EOF".to_string())),
53+
}
54+
}
55+
56+
#[inline]
57+
pub(crate) fn get_bool(&mut self) -> Result<bool, ArrowError> {
58+
Ok(self.get_u8()? != 0)
59+
}
60+
61+
pub(crate) fn read_vlq(&mut self) -> Result<u64, ArrowError> {
62+
let (val, offset) = read_varint(self.buf)
63+
.ok_or_else(|| ArrowError::ParseError("bad varint".to_string()))?;
64+
self.buf = &self.buf[offset..];
65+
Ok(val)
66+
}
67+
68+
#[inline]
69+
pub(crate) fn get_int(&mut self) -> Result<i32, ArrowError> {
70+
let varint = self.read_vlq()?;
71+
let val: u32 = varint
72+
.try_into()
73+
.map_err(|_| ArrowError::ParseError("varint overflow".to_string()))?;
74+
Ok((val >> 1) as i32 ^ -((val & 1) as i32))
75+
}
76+
77+
#[inline]
78+
pub(crate) fn get_long(&mut self) -> Result<i64, ArrowError> {
79+
let val = self.read_vlq()?;
80+
Ok((val >> 1) as i64 ^ -((val & 1) as i64))
81+
}
82+
83+
pub(crate) fn get_bytes(&mut self) -> Result<&'a [u8], ArrowError> {
84+
let len: usize = self.get_long()?.try_into().map_err(|_| {
85+
ArrowError::ParseError("offset overflow reading avro bytes".to_string())
86+
})?;
87+
88+
if (self.buf.len() < len) {
89+
return Err(ArrowError::ParseError(
90+
"Unexpected EOF reading bytes".to_string(),
91+
));
92+
}
93+
let ret = &self.buf[..len];
94+
self.buf = &self.buf[len..];
95+
Ok(ret)
96+
}
97+
98+
#[inline]
99+
pub(crate) fn get_float(&mut self) -> Result<f32, ArrowError> {
100+
if (self.buf.len() < 4) {
101+
return Err(ArrowError::ParseError(
102+
"Unexpected EOF reading float".to_string(),
103+
));
104+
}
105+
let ret = f32::from_le_bytes(self.buf[..4].try_into().unwrap());
106+
self.buf = &self.buf[4..];
107+
Ok(ret)
108+
}
109+
110+
#[inline]
111+
pub(crate) fn get_double(&mut self) -> Result<f64, ArrowError> {
112+
if (self.buf.len() < 8) {
113+
return Err(ArrowError::ParseError(
114+
"Unexpected EOF reading float".to_string(),
115+
));
116+
}
117+
let ret = f64::from_le_bytes(self.buf[..8].try_into().unwrap());
118+
self.buf = &self.buf[8..];
119+
Ok(ret)
120+
}
121+
}

arrow-avro/src/reader/header.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
2020
use crate::compression::{CompressionCodec, CODEC_METADATA_KEY};
2121
use crate::reader::vlq::VLQDecoder;
22-
use crate::schema::Schema;
22+
use crate::schema::{Schema, SCHEMA_METADATA_KEY};
2323
use arrow_schema::ArrowError;
2424

2525
#[derive(Debug)]
@@ -89,6 +89,17 @@ impl Header {
8989
))),
9090
}
9191
}
92+
93+
/// Returns the [`Schema`] if any
94+
pub fn schema(&self) -> Result<Option<Schema<'_>>, ArrowError> {
95+
self.get(SCHEMA_METADATA_KEY)
96+
.map(|x| {
97+
serde_json::from_slice(x).map_err(|e| {
98+
ArrowError::ParseError(format!("Failed to parse Avro schema JSON: {e}"))
99+
})
100+
})
101+
.transpose()
102+
}
92103
}
93104

94105
/// A decoder for [`Header`]

0 commit comments

Comments
 (0)