Skip to content

Commit 2f4f64b

Browse files
committed
Add Parser::from_bytes, with BOM detection and Content-Type charset.
1 parent 874e369 commit 2f4f64b

File tree

1 file changed

+143
-3
lines changed

1 file changed

+143
-3
lines changed

src/driver.rs

Lines changed: 143 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ use tokenizer::{Attribute, Tokenizer, TokenizerOpts};
1313
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
1414

1515
use std::borrow::Cow;
16+
use std::mem;
1617

18+
use encoding::{self, EncodingRef};
1719
use string_cache::QualName;
1820
use tendril;
19-
use tendril::StrTendril;
20-
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
21+
use tendril::{StrTendril, ByteTendril};
22+
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
2123

2224
/// All-encompassing options struct for the parser.
2325
#[derive(Clone, Default)]
@@ -30,13 +32,25 @@ pub struct ParseOpts {
3032
}
3133

3234
/// Parse an HTML document
35+
///
36+
/// The returned value implements `tendril::TendrilSink`
37+
/// so that Unicode input may be provided incrementally,
38+
/// or all at once with the `one` method.
39+
///
40+
/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
3341
pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink {
3442
let tb = TreeBuilder::new(sink, opts.tree_builder);
3543
let tok = Tokenizer::new(tb, opts.tokenizer);
3644
Parser { tokenizer: tok }
3745
}
3846

3947
/// Parse an HTML fragment
48+
///
49+
/// The returned value implements `tendril::TendrilSink`
50+
/// so that Unicode input may be provided incrementally,
51+
/// or all at once with the `one` method.
52+
///
53+
/// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
4054
pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
4155
context_name: QualName, context_attrs: Vec<Attribute>)
4256
-> Parser<Sink>
@@ -51,8 +65,10 @@ pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
5165
Parser { tokenizer: tok }
5266
}
5367

68+
/// An HTML parser,
69+
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
5470
pub struct Parser<Sink> where Sink: TreeSink {
55-
tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>
71+
// The tokenizer; its token sink is the tree builder that wraps the caller's `Sink`.
tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
5672
}
5773

5874
impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
@@ -74,7 +90,131 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
7490
}
7591

7692
impl<Sink: TreeSink> Parser<Sink> {
93+
/// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
94+
///
95+
/// Use this when your input is bytes that are known to be in the UTF-8 encoding.
96+
/// Decoding is lossy, like `String::from_utf8_lossy`.
7797
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
7898
Utf8LossyDecoder::new(self)
7999
}
100+
101+
/// Wrap this parser into a `TendrilSink` that accepts bytes
102+
/// and tries to detect the correct character encoding.
103+
///
104+
/// Currently this looks for a Byte Order Mark,
105+
/// then uses `BytesOpts::transport_layer_encoding`,
106+
/// then falls back to UTF-8.
107+
///
108+
/// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `<meta>` elements
109+
/// and other data per
110+
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
111+
pub fn from_bytes(self, opts: BytesOpts) -> BytesParser<Sink> {
112+
BytesParser {
113+
state: BytesParserState::Initial { parser: self },
114+
opts: opts,
115+
}
116+
}
117+
}
118+
119+
/// Options for choosing a character encoding
///
/// Passed to `Parser::from_bytes`. The derived `Default` leaves
/// `transport_layer_encoding` as `None`.
120+
#[derive(Clone, Default)]
121+
pub struct BytesOpts {
122+
/// The character encoding specified by the transport layer, if any.
123+
/// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
///
/// `detect_encoding` consults this only when the input does not start with a Byte Order Mark.
124+
pub transport_layer_encoding: Option<EncodingRef>,
125+
}
126+
127+
/// An HTML parser,
128+
/// ready to receive bytes input through the `tendril::TendrilSink` trait’s methods.
129+
///
130+
/// See `Parser::from_bytes`.
131+
pub struct BytesParser<Sink> where Sink: TreeSink {
132+
// Current stage of the byte pipeline; owns the wrapped parser (or the decoder around it).
state: BytesParserState<Sink>,
133+
// Options handed to `detect_encoding` when the encoding is chosen.
opts: BytesOpts,
134+
}
135+
136+
/// Internal state machine of `BytesParser`.
enum BytesParserState<Sink> where Sink: TreeSink {
137+
/// No bytes have been received yet.
Initial {
138+
parser: Parser<Sink>,
139+
},
140+
/// Fewer than `PRESCAN_BYTES` bytes have been received so far;
/// they are accumulated in `buffer` until an encoding is chosen.
Buffering {
141+
parser: Parser<Sink>,
142+
buffer: ByteTendril
143+
},
144+
/// An encoding has been chosen; incoming bytes go straight to `decoder`.
Parsing {
145+
decoder: LossyDecoder<Parser<Sink>>,
146+
},
147+
/// Placeholder stored by `mem::replace` while `process` moves the parser
/// out of the previous state; every other code path treats it as unreachable.
Transient
148+
}
149+
150+
impl<Sink: TreeSink> TendrilSink<tendril::fmt::Bytes> for BytesParser<Sink> {
151+
fn process(&mut self, t: ByteTendril) {
152+
if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state {
153+
return decoder.process(t)
154+
}
155+
let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) {
156+
BytesParserState::Initial{ parser } => (parser, t),
157+
BytesParserState::Buffering { parser, mut buffer } => {
158+
buffer.push_tendril(&t);
159+
(parser, buffer)
160+
}
161+
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
162+
};
163+
if buffer.len32() >= PRESCAN_BYTES {
164+
let encoding = detect_encoding(&buffer, &self.opts);
165+
let decoder = LossyDecoder::new(encoding, parser);
166+
self.state = BytesParserState::Parsing { decoder: decoder }
167+
} else {
168+
self.state = BytesParserState::Buffering {
169+
parser: parser,
170+
buffer: buffer,
171+
}
172+
}
173+
}
174+
175+
fn error(&mut self, desc: Cow<'static, str>) {
176+
match self.state {
177+
BytesParserState::Initial { ref mut parser } => parser.error(desc),
178+
BytesParserState::Buffering { ref mut parser, .. } => parser.error(desc),
179+
BytesParserState::Parsing { ref mut decoder } => decoder.error(desc),
180+
BytesParserState::Transient => unreachable!(),
181+
}
182+
}
183+
184+
type Output = Sink::Output;
185+
186+
fn finish(self) -> Self::Output {
187+
match self.state {
188+
BytesParserState::Initial { parser } => parser.finish(),
189+
BytesParserState::Buffering { parser, buffer } => {
190+
let encoding = detect_encoding(&buffer, &self.opts);
191+
let decoder = LossyDecoder::new(encoding, parser);
192+
decoder.finish()
193+
},
194+
BytesParserState::Parsing { decoder } => decoder.finish(),
195+
BytesParserState::Transient => unreachable!(),
196+
}
197+
}
198+
}
199+
200+
/// How many bytes are buffered before `detect_encoding()` is called (fewer if the input ends first).
201+
// NOTE: 3 would be enough for a BOM, but 1024 is specified for <meta> elements.
202+
const PRESCAN_BYTES: u32 = 1024;
203+
204+
/// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
205+
fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef {
206+
if bytes.starts_with(b"\xEF\xBB\xBF") {
207+
return encoding::all::UTF_8
208+
}
209+
if bytes.starts_with(b"\xFE\xFF") {
210+
return encoding::all::UTF_16BE
211+
}
212+
if bytes.starts_with(b"\xFF\xFE") {
213+
return encoding::all::UTF_16LE
214+
}
215+
if let Some(encoding) = opts.transport_layer_encoding {
216+
return encoding
217+
}
218+
// FIXME: <meta> etc.
219+
return encoding::all::UTF_8
80220
}

0 commit comments

Comments
 (0)