Skip to content

Commit 1ee59c1

Browse files
committed
Add Parser::from_bytes, with BOM detection and Content-Type charset.
1 parent 111e0f0 commit 1ee59c1

File tree

2 files changed

+151
-5
lines changed

2 files changed

+151
-5
lines changed

src/driver.rs

Lines changed: 142 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ use tokenizer::{Attribute, Tokenizer, TokenizerOpts};
1313
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
1414

1515
use std::borrow::Cow;
16+
use std::mem;
1617

18+
use encoding::{self, EncodingRef};
1719
use string_cache::QualName;
1820
use tendril;
19-
use tendril::StrTendril;
20-
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
21+
use tendril::{StrTendril, ByteTendril};
22+
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
2123

2224
/// All-encompassing options struct for the parser.
2325
#[derive(Clone, Default)]
@@ -30,13 +32,25 @@ pub struct ParseOpts {
3032
}
3133

3234
/// Parse an HTML document
35+
///
36+
/// The returned value implements `tendril::TendrilSink`
37+
/// so that Unicode input may be provided incrementally,
38+
/// or all at once with the `one` method.
39+
///
40+
/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
3341
pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink {
3442
let tb = TreeBuilder::new(sink, opts.tree_builder);
3543
let tok = Tokenizer::new(tb, opts.tokenizer);
3644
Parser { tokenizer: tok }
3745
}
3846

3947
/// Parse an HTML fragment
48+
///
49+
/// The returned value implements `tendril::TendrilSink`
50+
/// so that Unicode input may be provided incrementally,
51+
/// or all at once with the `one` method.
52+
///
53+
/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
4054
pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
4155
context_name: QualName, context_attrs: Vec<Attribute>)
4256
-> Parser<Sink>
@@ -51,8 +65,10 @@ pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
5165
Parser { tokenizer: tok }
5266
}
5367

68+
/// An HTML parser,
69+
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
5470
pub struct Parser<Sink> where Sink: TreeSink {
55-
tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>
71+
tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
5672
}
5773

5874
impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
@@ -74,7 +90,130 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
7490
}
7591

7692
impl<Sink: TreeSink> Parser<Sink> {
93+
/// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
94+
///
95+
/// Use this when your input is bytes that are known to be in the UTF-8 encoding.
96+
/// Decoding is lossy, like `String::from_utf8_lossy`.
7797
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
7898
Utf8LossyDecoder::new(self)
7999
}
100+
101+
/// Wrap this parser into a `TendrilSink` that accepts bytes
102+
/// and tries to detect the correct character encoding.
103+
///
104+
/// Currently this looks for a Byte Order Mark,
105+
/// then uses `BytesOpts::transport_layer_encoding`,
106+
/// then falls back to UTF-8.
107+
///
108+
/// FIXME: this should look for `<meta>` elements and other data per
109+
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
110+
pub fn from_bytes(self, opts: BytesOpts) -> BytesParser<Sink> {
111+
BytesParser {
112+
state: BytesParserState::Initial { parser: self },
113+
opts: opts,
114+
}
115+
}
116+
}
117+
118+
/// Options for choosing a character encoding
119+
#[derive(Clone, Default)]
120+
pub struct BytesOpts {
121+
/// The character encoding specified by the transport layer, if any.
122+
/// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
123+
pub transport_layer_encoding: Option<EncodingRef>,
124+
}
125+
126+
/// An HTML parser,
127+
/// ready to recieve bytes input through the `tendril::TendrilSink` trait’s methods.
128+
///
129+
/// See `Parser::from_bytes`.
130+
pub struct BytesParser<Sink> where Sink: TreeSink {
131+
state: BytesParserState<Sink>,
132+
opts: BytesOpts,
133+
}
134+
135+
enum BytesParserState<Sink> where Sink: TreeSink {
136+
Initial {
137+
parser: Parser<Sink>,
138+
},
139+
Buffering {
140+
parser: Parser<Sink>,
141+
buffer: ByteTendril
142+
},
143+
Parsing {
144+
decoder: LossyDecoder<Parser<Sink>>,
145+
},
146+
Transient
147+
}
148+
149+
impl<Sink: TreeSink> TendrilSink<tendril::fmt::Bytes> for BytesParser<Sink> {
150+
fn process(&mut self, t: ByteTendril) {
151+
if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state {
152+
return decoder.process(t)
153+
}
154+
let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) {
155+
BytesParserState::Initial{ parser } => (parser, t),
156+
BytesParserState::Buffering { parser, mut buffer } => {
157+
buffer.push_tendril(&t);
158+
(parser, buffer)
159+
}
160+
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
161+
};
162+
if buffer.len32() >= PRESCAN_BYTES {
163+
let encoding = detect_encoding(&buffer, &self.opts);
164+
let decoder = LossyDecoder::new(encoding, parser);
165+
self.state = BytesParserState::Parsing { decoder: decoder }
166+
} else {
167+
self.state = BytesParserState::Buffering {
168+
parser: parser,
169+
buffer: buffer,
170+
}
171+
}
172+
}
173+
174+
fn error(&mut self, desc: Cow<'static, str>) {
175+
match self.state {
176+
BytesParserState::Initial { ref mut parser } => parser.error(desc),
177+
BytesParserState::Buffering { ref mut parser, .. } => parser.error(desc),
178+
BytesParserState::Parsing { ref mut decoder } => decoder.error(desc),
179+
BytesParserState::Transient => unreachable!(),
180+
}
181+
}
182+
183+
type Output = Sink::Output;
184+
185+
fn finish(self) -> Self::Output {
186+
match self.state {
187+
BytesParserState::Initial { parser } => parser.finish(),
188+
BytesParserState::Buffering { parser, buffer } => {
189+
let encoding = detect_encoding(&buffer, &self.opts);
190+
let decoder = LossyDecoder::new(encoding, parser);
191+
decoder.finish()
192+
},
193+
BytesParserState::Parsing { decoder } => decoder.finish(),
194+
BytesParserState::Transient => unreachable!(),
195+
}
196+
}
197+
}
198+
199+
/// Number of bytes that `BytesParser::process` accumulates
/// before it calls `detect_encoding()`.
200+
// NOTE: 3 would be enough for a BOM, but 1024 is specified for <meta> elements.
201+
const PRESCAN_BYTES: u32 = 1024;
202+
203+
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
204+
fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef {
205+
if bytes.starts_with(b"\xEF\xBB\xBF") {
206+
return encoding::all::UTF_8
207+
}
208+
if bytes.starts_with(b"\xFE\xFF") {
209+
return encoding::all::UTF_16BE
210+
}
211+
if bytes.starts_with(b"\xFF\xFE") {
212+
return encoding::all::UTF_16LE
213+
}
214+
if let Some(encoding) = opts.transport_layer_encoding {
215+
return encoding
216+
}
217+
// FIXME: <meta> etc.
218+
return encoding::all::UTF_8
80219
}

src/lib.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ extern crate log;
2424
#[macro_use]
2525
extern crate string_cache;
2626

27-
extern crate tendril;
28-
2927
#[macro_use]
3028
extern crate mac;
3129

@@ -52,3 +50,12 @@ pub mod tree_builder;
5250
pub mod serialize;
5351
pub mod driver;
5452
pub mod rcdom;
53+
54+
/// Re-export the tendril crate.
///
/// The `extern crate` declaration is placed inside this module and all of
/// the crate's public items are glob re-exported from it, so they are
/// reachable through this crate's `tendril` module.
55+
pub mod tendril {
56+
extern crate tendril;
57+
pub use self::tendril::*;
58+
}
59+
60+
/// Re-export the encoding crate.
61+
pub use tendril::encoding;

0 commit comments

Comments
 (0)