Skip to content

Commit 111e0f0

Browse files
committed
Rewrite the driver module based on TendrilSink
Depends on servo/tendril#23
1 parent: 996fff2; commit: 111e0f0

13 files changed

+111 / -164 lines changed

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "html5ever"
4-
version = "0.2.11"
4+
version = "0.3.0"
55
authors = [ "The html5ever Project Developers" ]
66
license = "MIT / Apache-2.0"
77
repository = "https://github.com/servo/html5ever"
@@ -26,7 +26,7 @@ log = "0"
2626
phf = "0.7"
2727
string_cache = "0.2.0"
2828
mac = "0"
29-
tendril = "0.1.6"
29+
tendril = "0.2"
3030
heapsize = { version = "0.1.1", optional = true }
3131
heapsize_plugin = { version = "0.1.0", optional = true }
3232

examples/html2html.rs

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,24 +21,26 @@ extern crate html5ever;
2121
use std::io::{self, Write};
2222
use std::default::Default;
2323

24-
use tendril::{ByteTendril, ReadExt};
24+
use tendril::TendrilSink;
2525

2626
use html5ever::driver::ParseOpts;
2727
use html5ever::tree_builder::TreeBuilderOpts;
28-
use html5ever::{parse, one_input, serialize};
28+
use html5ever::{parse_document, serialize};
2929
use html5ever::rcdom::RcDom;
3030

3131
fn main() {
32-
let mut input = ByteTendril::new();
33-
io::stdin().read_to_tendril(&mut input).unwrap();
34-
let input = input.try_reinterpret().unwrap();
35-
let dom: RcDom = parse(one_input(input), ParseOpts {
32+
let opts = ParseOpts {
3633
tree_builder: TreeBuilderOpts {
3734
drop_doctype: true,
3835
..Default::default()
3936
},
4037
..Default::default()
41-
});
38+
};
39+
let stdin = io::stdin();
40+
let dom = parse_document(RcDom::default(), opts)
41+
.from_utf8()
42+
.read_from(&mut stdin.lock())
43+
.unwrap();
4244

4345
// The validator.nu HTML2HTML always prints a doctype at the very beginning.
4446
io::stdout().write_all(b"<!DOCTYPE html>\n")

examples/noop-tokenize.rs

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,7 @@ use std::default::Default;
1717

1818
use tendril::{ByteTendril, ReadExt};
1919

20-
use html5ever::tokenizer::{TokenSink, Token};
21-
use html5ever::driver::{tokenize_to, one_input};
20+
use html5ever::tokenizer::{TokenSink, Token, Tokenizer};
2221

2322
struct Sink(Vec<Token>);
2423

@@ -35,5 +34,7 @@ fn main() {
3534
io::stdin().read_to_tendril(&mut input).unwrap();
3635
let input = input.try_reinterpret().unwrap();
3736

38-
tokenize_to(Sink(Vec::new()), one_input(input), Default::default());
37+
let mut tok = Tokenizer::new(Sink(Vec::new()), Default::default());
38+
tok.feed(input);
39+
tok.end();
3940
}

examples/noop-tree-builder.rs

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,9 @@ use std::collections::HashMap;
1818
use std::borrow::Cow;
1919
use string_cache::QualName;
2020

21-
use tendril::{StrTendril, ByteTendril, ReadExt};
21+
use tendril::{StrTendril, TendrilSink};
2222

23-
use html5ever::{parse_to, one_input};
23+
use html5ever::parse_document;
2424
use html5ever::tokenizer::Attribute;
2525
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};
2626

@@ -39,6 +39,8 @@ impl Sink {
3939

4040
impl TreeSink for Sink {
4141
type Handle = usize;
42+
type Output = Self;
43+
fn finish(self) -> Self { self }
4244

4345
fn get_document(&mut self) -> usize {
4446
0
@@ -96,9 +98,9 @@ fn main() {
9698
next_id: 1,
9799
names: HashMap::new(),
98100
};
99-
100-
let mut input = ByteTendril::new();
101-
io::stdin().read_to_tendril(&mut input).unwrap();
102-
let input = input.try_reinterpret().unwrap();
103-
parse_to(sink, one_input(input), Default::default());
101+
let stdin = io::stdin();
102+
parse_document(sink, Default::default())
103+
.from_utf8()
104+
.read_from(&mut stdin.lock())
105+
.unwrap();
104106
}

examples/print-rcdom.rs

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@ use std::iter::repeat;
1818
use std::default::Default;
1919
use std::string::String;
2020

21-
use tendril::{ByteTendril, ReadExt};
22-
use html5ever::{parse, one_input};
21+
use tendril::TendrilSink;
22+
use html5ever::parse_document;
2323
use html5ever::rcdom::{Document, Doctype, Text, Comment, Element, RcDom, Handle};
2424

2525
// This is not proper HTML serialization, of course.
@@ -63,10 +63,11 @@ pub fn escape_default(s: &str) -> String {
6363
}
6464

6565
fn main() {
66-
let mut input = ByteTendril::new();
67-
io::stdin().read_to_tendril(&mut input).unwrap();
68-
let input = input.try_reinterpret().unwrap();
69-
let dom: RcDom = parse(one_input(input), Default::default());
66+
let stdin = io::stdin();
67+
let dom = parse_document(RcDom::default(), Default::default())
68+
.from_utf8()
69+
.read_from(&mut stdin.lock())
70+
.unwrap();
7071
walk(0, dom.document);
7172

7273
if !dom.errors.is_empty() {

examples/print-tree-actions.rs

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,11 @@ use std::collections::HashMap;
1818
use std::borrow::Cow;
1919
use string_cache::QualName;
2020

21-
use tendril::{ByteTendril, StrTendril, ReadExt};
21+
use tendril::{StrTendril, TendrilSink};
2222

23-
use html5ever::{parse_to, one_input};
2423
use html5ever::tokenizer::Attribute;
2524
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
25+
use html5ever::parse_document;
2626

2727
struct Sink {
2828
next_id: usize,
@@ -39,6 +39,8 @@ impl Sink {
3939

4040
impl TreeSink for Sink {
4141
type Handle = usize;
42+
type Output = Self;
43+
fn finish(self) -> Self { self }
4244

4345
fn parse_error(&mut self, msg: Cow<'static, str>) {
4446
println!("Parse error: {}", msg);
@@ -143,9 +145,9 @@ fn main() {
143145
next_id: 1,
144146
names: HashMap::new(),
145147
};
146-
147-
let mut input = ByteTendril::new();
148-
io::stdin().read_to_tendril(&mut input).unwrap();
149-
let input = input.try_reinterpret().unwrap();
150-
parse_to(sink, one_input(input), Default::default());
148+
let stdin = io::stdin();
149+
parse_document(sink, Default::default())
150+
.from_utf8()
151+
.read_from(&mut stdin.lock())
152+
.unwrap();
151153
}

examples/tokenize.rs

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,8 @@ use std::default::Default;
1515

1616
use tendril::{ByteTendril, ReadExt};
1717

18-
use html5ever::tokenizer::{TokenSink, Token, TokenizerOpts, ParseError};
18+
use html5ever::tokenizer::{TokenSink, Tokenizer, Token, TokenizerOpts, ParseError};
1919
use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken, StartTag, EndTag};
20-
use html5ever::driver::{tokenize_to, one_input};
2120

2221
#[derive(Copy, Clone)]
2322
struct TokenPrinter {
@@ -84,9 +83,12 @@ fn main() {
8483
let mut input = ByteTendril::new();
8584
io::stdin().read_to_tendril(&mut input).unwrap();
8685
let input = input.try_reinterpret().unwrap();
87-
tokenize_to(sink, one_input(input), TokenizerOpts {
86+
87+
let mut tok = Tokenizer::new(sink, TokenizerOpts {
8888
profile: true,
8989
.. Default::default()
9090
});
91+
tok.feed(input);
92+
tok.end();
9193
sink.is_char(false);
9294
}

src/driver.rs

Lines changed: 37 additions & 105 deletions
Original file line number | Diff line number | Diff line change
@@ -9,39 +9,15 @@
99

1010
//! High-level interface to the parser.
1111
12-
use tokenizer::{Attribute, TokenSink, Tokenizer, TokenizerOpts};
12+
use tokenizer::{Attribute, Tokenizer, TokenizerOpts};
1313
use tree_builder::{TreeBuilderOpts, TreeBuilder, TreeSink};
1414

15-
use std::option;
16-
use std::default::Default;
15+
use std::borrow::Cow;
1716

1817
use string_cache::QualName;
18+
use tendril;
1919
use tendril::StrTendril;
20-
21-
/// Convenience function to turn a single value into an iterator.
22-
pub fn one_input<T>(x: T) -> option::IntoIter<T> {
23-
Some(x).into_iter()
24-
}
25-
26-
/// Tokenize and send results to a `TokenSink`.
27-
///
28-
/// ## Example
29-
///
30-
/// ```ignore
31-
/// let sink = MySink;
32-
/// tokenize_to(sink, one_input(my_str), Default::default());
33-
/// ```
34-
pub fn tokenize_to<Sink, It>(sink: Sink, input: It, opts: TokenizerOpts) -> Sink
35-
where Sink: TokenSink,
36-
It: Iterator<Item=StrTendril>,
37-
{
38-
let mut tok = Tokenizer::new(sink, opts);
39-
for s in input {
40-
tok.feed(s);
41-
}
42-
tok.end();
43-
tok.unwrap()
44-
}
20+
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
4521

4622
/// All-encompassing options struct for the parser.
4723
#[derive(Clone, Default)]
@@ -53,96 +29,52 @@ pub struct ParseOpts {
5329
pub tree_builder: TreeBuilderOpts,
5430
}
5531

56-
/// Parse and send results to a `TreeSink`.
57-
///
58-
/// ## Example
59-
///
60-
/// ```ignore
61-
/// let sink = MySink;
62-
/// parse_to(sink, one_input(my_str), Default::default());
63-
/// ```
64-
pub fn parse_to<Sink, It>(sink: Sink, input: It, opts: ParseOpts) -> Sink
65-
where Sink: TreeSink,
66-
It: Iterator<Item=StrTendril>,
67-
{
32+
/// Parse an HTML document
33+
pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> where Sink: TreeSink {
6834
let tb = TreeBuilder::new(sink, opts.tree_builder);
69-
let mut tok = Tokenizer::new(tb, opts.tokenizer);
70-
for s in input {
71-
tok.feed(s);
72-
}
73-
tok.end();
74-
tok.unwrap().unwrap()
35+
let tok = Tokenizer::new(tb, opts.tokenizer);
36+
Parser { tokenizer: tok }
7537
}
7638

77-
/// Parse an HTML fragment and send results to a `TreeSink`.
78-
///
79-
/// ## Example
80-
///
81-
/// ```ignore
82-
/// let sink = MySink;
83-
/// parse_fragment_to(sink, one_input(my_str), context_name, context_attrs, Default::default());
84-
/// ```
85-
pub fn parse_fragment_to<Sink, It>(mut sink: Sink,
86-
input: It,
87-
context_name: QualName,
88-
context_attrs: Vec<Attribute>,
89-
opts: ParseOpts) -> Sink
90-
where Sink: TreeSink,
91-
It: Iterator<Item=StrTendril>
92-
{
39+
/// Parse an HTML fragment
40+
pub fn parse_fragment<Sink>(mut sink: Sink, opts: ParseOpts,
41+
context_name: QualName, context_attrs: Vec<Attribute>)
42+
-> Parser<Sink>
43+
where Sink: TreeSink {
9344
let context_elem = sink.create_element(context_name, context_attrs);
9445
let tb = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);
9546
let tok_opts = TokenizerOpts {
9647
initial_state: Some(tb.tokenizer_state_for_context_elem()),
9748
.. opts.tokenizer
9849
};
99-
let mut tok = Tokenizer::new(tb, tok_opts);
100-
for s in input {
101-
tok.feed(s);
102-
}
103-
tok.end();
104-
tok.unwrap().unwrap()
50+
let tok = Tokenizer::new(tb, tok_opts);
51+
Parser { tokenizer: tok }
10552
}
10653

107-
/// Results which can be extracted from a `TreeSink`.
108-
///
109-
/// Implement this for your parse tree data type so that it
110-
/// can be returned by `parse()`.
111-
pub trait ParseResult {
112-
type Sink: TreeSink + Default;
113-
fn get_result(sink: Self::Sink) -> Self;
54+
pub struct Parser<Sink> where Sink: TreeSink {
55+
tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>
11456
}
11557

116-
/// Parse into a type which implements `ParseResult`.
117-
///
118-
/// ## Example
119-
///
120-
/// ```ignore
121-
/// let dom: RcDom = parse(one_input(my_str), Default::default());
122-
/// ```
123-
pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
124-
where Output: ParseResult,
125-
It: Iterator<Item=StrTendril>,
126-
{
127-
let sink = parse_to(Default::default(), input, opts);
128-
ParseResult::get_result(sink)
58+
impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
59+
fn process(&mut self, t: StrTendril) {
60+
self.tokenizer.feed(t)
61+
}
62+
63+
// FIXME: Is it too noisy to report every character decoding error?
64+
fn error(&mut self, desc: Cow<'static, str>) {
65+
self.tokenizer.sink_mut().sink_mut().parse_error(desc)
66+
}
67+
68+
type Output = Sink::Output;
69+
70+
fn finish(mut self) -> Self::Output {
71+
self.tokenizer.end();
72+
self.tokenizer.unwrap().unwrap().finish()
73+
}
12974
}
13075

131-
/// Parse an HTML fragment into a type which implements `ParseResult`.
132-
///
133-
/// ## Example
134-
///
135-
/// ```ignore
136-
/// let dom: RcDom = parse_fragment(
137-
/// one_input(my_str), context_name, context_attrs, Default::default());
138-
/// ```
139-
pub fn parse_fragment<Output, It>(input: It,
140-
context_name: QualName,
141-
context_attrs: Vec<Attribute>,
142-
opts: ParseOpts) -> Output
143-
where Output: ParseResult,
144-
It: Iterator<Item=StrTendril>,
145-
{
146-
let sink = parse_fragment_to(Default::default(), input, context_name, context_attrs, opts);
147-
ParseResult::get_result(sink)
76+
impl<Sink: TreeSink> Parser<Sink> {
77+
pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
78+
Utf8LossyDecoder::new(self)
79+
}
14880
}

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ extern crate phf;
3434
extern crate time;
3535

3636
pub use tokenizer::Attribute;
37-
pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment};
37+
pub use driver::{ParseOpts, parse_document, parse_fragment, Parser};
3838

3939
pub use serialize::serialize;
4040

0 commit comments

Comments
 (0)