@@ -13,11 +13,13 @@ use tokenizer::{Attribute, Tokenizer, TokenizerOpts};
13
13
use tree_builder:: { TreeBuilderOpts , TreeBuilder , TreeSink } ;
14
14
15
15
use std:: borrow:: Cow ;
16
+ use std:: mem;
16
17
18
+ use encoding:: { self , EncodingRef } ;
17
19
use string_cache:: QualName ;
18
20
use tendril;
19
- use tendril:: StrTendril ;
20
- use tendril:: stream:: { TendrilSink , Utf8LossyDecoder } ;
21
+ use tendril:: { StrTendril , ByteTendril } ;
22
+ use tendril:: stream:: { TendrilSink , Utf8LossyDecoder , LossyDecoder } ;
21
23
22
24
/// All-encompassing options struct for the parser.
23
25
#[ derive( Clone , Default ) ]
@@ -30,13 +32,25 @@ pub struct ParseOpts {
30
32
}
31
33
32
34
/// Parse an HTML document
35
+ ///
36
+ /// The returned value implements `tendril::TendrilSink`
37
+ /// so that Unicode input may be provided incrementally,
38
+ /// or all at once with the `one` method.
39
+ ///
40
+ /// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
33
41
pub fn parse_document < Sink > ( sink : Sink , opts : ParseOpts ) -> Parser < Sink > where Sink : TreeSink {
34
42
let tb = TreeBuilder :: new ( sink, opts. tree_builder ) ;
35
43
let tok = Tokenizer :: new ( tb, opts. tokenizer ) ;
36
44
Parser { tokenizer : tok }
37
45
}
38
46
39
47
/// Parse an HTML fragment
48
+ ///
49
+ /// The returned value implements `tendril::TendrilSink`
50
+ /// so that Unicode input may be provided incrementally,
51
+ /// or all at once with the `one` method.
52
+ ///
53
+ /// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
40
54
pub fn parse_fragment < Sink > ( mut sink : Sink , opts : ParseOpts ,
41
55
context_name : QualName , context_attrs : Vec < Attribute > )
42
56
-> Parser < Sink >
@@ -51,8 +65,10 @@ pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
51
65
Parser { tokenizer : tok }
52
66
}
53
67
68
+ /// An HTML parser,
69
+ /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
54
70
pub struct Parser < Sink > where Sink : TreeSink {
55
- tokenizer : Tokenizer < TreeBuilder < Sink :: Handle , Sink > >
71
+ tokenizer : Tokenizer < TreeBuilder < Sink :: Handle , Sink > > ,
56
72
}
57
73
58
74
impl < Sink : TreeSink > TendrilSink < tendril:: fmt:: UTF8 > for Parser < Sink > {
@@ -74,7 +90,131 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
74
90
}
75
91
76
92
impl < Sink : TreeSink > Parser < Sink > {
93
+ /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
94
+ ///
95
+ /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
96
+ /// Decoding is lossy, like `String::from_utf8_lossy`.
77
97
pub fn from_utf8 ( self ) -> Utf8LossyDecoder < Self > {
78
98
Utf8LossyDecoder :: new ( self )
79
99
}
100
+
101
+ /// Wrap this parser into a `TendrilSink` that accepts bytes
102
+ /// and tries to detect the correct character encoding.
103
+ ///
104
+ /// Currently this looks for a Byte Order Mark,
105
+ /// then uses `BytesOpts::transport_layer_encoding`,
106
+ /// then falls back to UTF-8.
107
+ ///
108
+ /// FIXME(https://github.com/servo/html5ever/issues/18): this should look for `<meta>` elements
109
+ /// and other data per
110
+ /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
111
+ pub fn from_bytes ( self , opts : BytesOpts ) -> BytesParser < Sink > {
112
+ BytesParser {
113
+ state : BytesParserState :: Initial { parser : self } ,
114
+ opts : opts,
115
+ }
116
+ }
117
+ }
118
+
119
+ /// Options for choosing a character encoding
120
+ #[ derive( Clone , Default ) ]
121
+ pub struct BytesOpts {
122
+ /// The character encoding specified by the transport layer, if any.
123
+ /// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
124
+ pub transport_layer_encoding : Option < EncodingRef > ,
125
+ }
126
+
127
+ /// An HTML parser,
128
+ /// ready to recieve bytes input through the `tendril::TendrilSink` trait’s methods.
129
+ ///
130
+ /// See `Parser::from_bytes`.
131
+ pub struct BytesParser < Sink > where Sink : TreeSink {
132
+ state : BytesParserState < Sink > ,
133
+ opts : BytesOpts ,
134
+ }
135
+
136
+ enum BytesParserState < Sink > where Sink : TreeSink {
137
+ Initial {
138
+ parser : Parser < Sink > ,
139
+ } ,
140
+ Buffering {
141
+ parser : Parser < Sink > ,
142
+ buffer : ByteTendril
143
+ } ,
144
+ Parsing {
145
+ decoder : LossyDecoder < Parser < Sink > > ,
146
+ } ,
147
+ Transient
148
+ }
149
+
150
+ impl < Sink : TreeSink > TendrilSink < tendril:: fmt:: Bytes > for BytesParser < Sink > {
151
+ fn process ( & mut self , t : ByteTendril ) {
152
+ if let & mut BytesParserState :: Parsing { ref mut decoder } = & mut self . state {
153
+ return decoder. process ( t)
154
+ }
155
+ let ( parser, buffer) = match mem:: replace ( & mut self . state , BytesParserState :: Transient ) {
156
+ BytesParserState :: Initial { parser } => ( parser, t) ,
157
+ BytesParserState :: Buffering { parser, mut buffer } => {
158
+ buffer. push_tendril ( & t) ;
159
+ ( parser, buffer)
160
+ }
161
+ BytesParserState :: Parsing { .. } | BytesParserState :: Transient => unreachable ! ( ) ,
162
+ } ;
163
+ if buffer. len32 ( ) >= PRESCAN_BYTES {
164
+ let encoding = detect_encoding ( & buffer, & self . opts ) ;
165
+ let decoder = LossyDecoder :: new ( encoding, parser) ;
166
+ self . state = BytesParserState :: Parsing { decoder : decoder }
167
+ } else {
168
+ self . state = BytesParserState :: Buffering {
169
+ parser : parser,
170
+ buffer : buffer,
171
+ }
172
+ }
173
+ }
174
+
175
+ fn error ( & mut self , desc : Cow < ' static , str > ) {
176
+ match self . state {
177
+ BytesParserState :: Initial { ref mut parser } => parser. error ( desc) ,
178
+ BytesParserState :: Buffering { ref mut parser, .. } => parser. error ( desc) ,
179
+ BytesParserState :: Parsing { ref mut decoder } => decoder. error ( desc) ,
180
+ BytesParserState :: Transient => unreachable ! ( ) ,
181
+ }
182
+ }
183
+
184
+ type Output = Sink :: Output ;
185
+
186
+ fn finish ( self ) -> Self :: Output {
187
+ match self . state {
188
+ BytesParserState :: Initial { parser } => parser. finish ( ) ,
189
+ BytesParserState :: Buffering { parser, buffer } => {
190
+ let encoding = detect_encoding ( & buffer, & self . opts ) ;
191
+ let decoder = LossyDecoder :: new ( encoding, parser) ;
192
+ decoder. finish ( )
193
+ } ,
194
+ BytesParserState :: Parsing { decoder } => decoder. finish ( ) ,
195
+ BytesParserState :: Transient => unreachable ! ( ) ,
196
+ }
197
+ }
198
+ }
199
+
200
/// Number of input bytes collected before `detect_encoding()` is run.
// NOTE: 3 would be enough for a BOM, but 1024 is specified for <meta> elements.
const PRESCAN_BYTES: u32 = 1024;
203
+
204
+ /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
205
+ fn detect_encoding ( bytes : & ByteTendril , opts : & BytesOpts ) -> EncodingRef {
206
+ if bytes. starts_with ( b"\xEF \xBB \xBF " ) {
207
+ return encoding:: all:: UTF_8
208
+ }
209
+ if bytes. starts_with ( b"\xFE \xFF " ) {
210
+ return encoding:: all:: UTF_16BE
211
+ }
212
+ if bytes. starts_with ( b"\xFF \xFE " ) {
213
+ return encoding:: all:: UTF_16LE
214
+ }
215
+ if let Some ( encoding) = opts. transport_layer_encoding {
216
+ return encoding
217
+ }
218
+ // FIXME: <meta> etc.
219
+ return encoding:: all:: UTF_8
80
220
}
0 commit comments