@@ -13,11 +13,13 @@ use tokenizer::{Attribute, Tokenizer, TokenizerOpts};
13
13
use tree_builder:: { TreeBuilderOpts , TreeBuilder , TreeSink } ;
14
14
15
15
use std:: borrow:: Cow ;
16
+ use std:: mem;
16
17
18
+ use encoding:: { self , EncodingRef } ;
17
19
use string_cache:: QualName ;
18
20
use tendril;
19
- use tendril:: StrTendril ;
20
- use tendril:: stream:: { TendrilSink , Utf8LossyDecoder } ;
21
+ use tendril:: { StrTendril , ByteTendril } ;
22
+ use tendril:: stream:: { TendrilSink , Utf8LossyDecoder , LossyDecoder } ;
21
23
22
24
/// All-encompassing options struct for the parser.
23
25
#[ derive( Clone , Default ) ]
@@ -30,13 +32,25 @@ pub struct ParseOpts {
30
32
}
31
33
32
34
/// Parse an HTML document
35
+ ///
36
+ /// The returned value implements `tendril::TendrilSink`
37
+ /// so that Unicode input may be provided incrementally,
38
+ /// or all at once with the `one` method.
39
+ ///
40
+ /// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
33
41
pub fn parse_document < Sink > ( sink : Sink , opts : ParseOpts ) -> Parser < Sink > where Sink : TreeSink {
34
42
let tb = TreeBuilder :: new ( sink, opts. tree_builder ) ;
35
43
let tok = Tokenizer :: new ( tb, opts. tokenizer ) ;
36
44
Parser { tokenizer : tok }
37
45
}
38
46
39
47
/// Parse an HTML fragment
48
+ ///
49
+ /// The returned value implements `tendril::TendrilSink`
50
+ /// so that Unicode input may be provided incrementally,
51
+ /// or all at once with the `one` method.
52
+ ///
53
+ /// If your input is bytes, use `Parser::from_utf8` or `Parser::from_bytes`.
40
54
pub fn parse_fragment < Sink > ( mut sink : Sink , opts : ParseOpts ,
41
55
context_name : QualName , context_attrs : Vec < Attribute > )
42
56
-> Parser < Sink >
@@ -51,8 +65,10 @@ pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
51
65
Parser { tokenizer : tok }
52
66
}
53
67
68
+ /// An HTML parser,
69
+ /// ready to recieve Unicode input through the `tendril::TendrilSink` trait’s methods.
54
70
pub struct Parser < Sink > where Sink : TreeSink {
55
- tokenizer : Tokenizer < TreeBuilder < Sink :: Handle , Sink > >
71
+ tokenizer : Tokenizer < TreeBuilder < Sink :: Handle , Sink > > ,
56
72
}
57
73
58
74
impl < Sink : TreeSink > TendrilSink < tendril:: fmt:: UTF8 > for Parser < Sink > {
@@ -74,7 +90,130 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
74
90
}
75
91
76
92
impl < Sink : TreeSink > Parser < Sink > {
93
+ /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
94
+ ///
95
+ /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
96
+ /// Decoding is lossy, like `String::from_utf8_lossy`.
77
97
pub fn from_utf8 ( self ) -> Utf8LossyDecoder < Self > {
78
98
Utf8LossyDecoder :: new ( self )
79
99
}
100
+
101
+ /// Wrap this parser into a `TendrilSink` that accepts bytes
102
+ /// and tries to detect the correct character encoding.
103
+ ///
104
+ /// Currently this looks for a Byte Order Mark,
105
+ /// then uses `BytesOpts::transport_layer_encoding`,
106
+ /// then falls back to UTF-8.
107
+ ///
108
+ /// FIXME: this should look for `<meta>` elements and other data per
109
+ /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
110
+ pub fn from_bytes ( self , opts : BytesOpts ) -> BytesParser < Sink > {
111
+ BytesParser {
112
+ state : BytesParserState :: Initial { parser : self } ,
113
+ opts : opts,
114
+ }
115
+ }
116
+ }
117
+
118
/// Options for choosing a character encoding.
///
/// Passed to `Parser::from_bytes`. The derived `Default` value specifies
/// no transport-layer encoding.
#[derive(Clone, Default)]
pub struct BytesOpts {
    /// The character encoding specified by the transport layer, if any.
    /// In HTTP for example, this is the `charset` parameter of the `Content-Type` response header.
    ///
    /// When set, it is used unless the input starts with a Byte Order Mark.
    pub transport_layer_encoding: Option<EncodingRef>,
}
125
+
126
+ /// An HTML parser,
127
+ /// ready to recieve bytes input through the `tendril::TendrilSink` trait’s methods.
128
+ ///
129
+ /// See `Parser::from_bytes`.
130
+ pub struct BytesParser < Sink > where Sink : TreeSink {
131
+ state : BytesParserState < Sink > ,
132
+ opts : BytesOpts ,
133
+ }
134
+
135
+ enum BytesParserState < Sink > where Sink : TreeSink {
136
+ Initial {
137
+ parser : Parser < Sink > ,
138
+ } ,
139
+ Buffering {
140
+ parser : Parser < Sink > ,
141
+ buffer : ByteTendril
142
+ } ,
143
+ Parsing {
144
+ decoder : LossyDecoder < Parser < Sink > > ,
145
+ } ,
146
+ Transient
147
+ }
148
+
149
+ impl < Sink : TreeSink > TendrilSink < tendril:: fmt:: Bytes > for BytesParser < Sink > {
150
+ fn process ( & mut self , t : ByteTendril ) {
151
+ if let & mut BytesParserState :: Parsing { ref mut decoder } = & mut self . state {
152
+ return decoder. process ( t)
153
+ }
154
+ let ( parser, buffer) = match mem:: replace ( & mut self . state , BytesParserState :: Transient ) {
155
+ BytesParserState :: Initial { parser } => ( parser, t) ,
156
+ BytesParserState :: Buffering { parser, mut buffer } => {
157
+ buffer. push_tendril ( & t) ;
158
+ ( parser, buffer)
159
+ }
160
+ BytesParserState :: Parsing { .. } | BytesParserState :: Transient => unreachable ! ( ) ,
161
+ } ;
162
+ if buffer. len32 ( ) >= PRESCAN_BYTES {
163
+ let encoding = detect_encoding ( & buffer, & self . opts ) ;
164
+ let decoder = LossyDecoder :: new ( encoding, parser) ;
165
+ self . state = BytesParserState :: Parsing { decoder : decoder }
166
+ } else {
167
+ self . state = BytesParserState :: Buffering {
168
+ parser : parser,
169
+ buffer : buffer,
170
+ }
171
+ }
172
+ }
173
+
174
+ fn error ( & mut self , desc : Cow < ' static , str > ) {
175
+ match self . state {
176
+ BytesParserState :: Initial { ref mut parser } => parser. error ( desc) ,
177
+ BytesParserState :: Buffering { ref mut parser, .. } => parser. error ( desc) ,
178
+ BytesParserState :: Parsing { ref mut decoder } => decoder. error ( desc) ,
179
+ BytesParserState :: Transient => unreachable ! ( ) ,
180
+ }
181
+ }
182
+
183
+ type Output = Sink :: Output ;
184
+
185
+ fn finish ( self ) -> Self :: Output {
186
+ match self . state {
187
+ BytesParserState :: Initial { parser } => parser. finish ( ) ,
188
+ BytesParserState :: Buffering { parser, buffer } => {
189
+ let encoding = detect_encoding ( & buffer, & self . opts ) ;
190
+ let decoder = LossyDecoder :: new ( encoding, parser) ;
191
+ decoder. finish ( )
192
+ } ,
193
+ BytesParserState :: Parsing { decoder } => decoder. finish ( ) ,
194
+ BytesParserState :: Transient => unreachable ! ( ) ,
195
+ }
196
+ }
197
+ }
198
+
199
/// How many bytes of input `detect_encoding()` needs to see.
///
/// `BytesParser` buffers input until it holds at least this many bytes
/// (or the input ends) before choosing an encoding.
// NOTE: 3 would be enough for a BOM, but 1024 is specified for <meta> elements.
const PRESCAN_BYTES: u32 = 1024;
202
+
203
+ /// https://html.spec.whatwg.org/multipage/syntax.html#determining-the-character-encoding
204
+ fn detect_encoding ( bytes : & ByteTendril , opts : & BytesOpts ) -> EncodingRef {
205
+ if bytes. starts_with ( b"\xEF \xBB \xBF " ) {
206
+ return encoding:: all:: UTF_8
207
+ }
208
+ if bytes. starts_with ( b"\xFE \xFF " ) {
209
+ return encoding:: all:: UTF_16BE
210
+ }
211
+ if bytes. starts_with ( b"\xFF \xFE " ) {
212
+ return encoding:: all:: UTF_16LE
213
+ }
214
+ if let Some ( encoding) = opts. transport_layer_encoding {
215
+ return encoding
216
+ }
217
+ // FIXME: <meta> etc.
218
+ return encoding:: all:: UTF_8
80
219
}
0 commit comments