Skip to content

Commit 7be620c

Browse files
committed
Implement zero-copy parsing
Based on servo#60 and servo#114. Fixes servo#20. Fixes servo#115.
1 parent 1e1f5a1 commit 7be620c

31 files changed

+344
-353
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ git = "https://github.com/servo/string-cache"
2121
[dependencies.string_cache_plugin]
2222
git = "https://github.com/servo/string-cache"
2323

24+
[dependencies.tendril]
25+
git = "https://github.com/kmcallister/tendril"
26+
2427
[dependencies.mac]
2528
git = "https://github.com/reem/rust-mac"
2629

benches/tokenizer.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#![feature(box_syntax, std_misc, start, test)]
1111

1212
extern crate test;
13+
extern crate tendril;
1314
extern crate html5ever;
1415

1516
use std::{fs, env, cmp, rt};
@@ -21,6 +22,7 @@ use test::{black_box, Bencher, TestDesc, TestDescAndFn};
2122
use test::{DynTestName, DynBenchFn, TDynBenchFn};
2223
use test::ShouldPanic::No;
2324

25+
use tendril::{ByteTendril, StrTendril, ReadExt, SliceExt};
2426
use html5ever::tokenizer::{TokenSink, Token, Tokenizer, TokenizerOpts};
2527

2628
struct Sink;
@@ -36,7 +38,7 @@ impl TokenSink for Sink {
3638
// This could almost be the TokenSink too, but it's not
3739
// mut within run().
3840
struct Bench {
39-
input: Vec<String>,
41+
input: Vec<StrTendril>,
4042
clone_only: bool,
4143
opts: TokenizerOpts,
4244
}
@@ -50,8 +52,9 @@ impl Bench {
5052
let mut file = fs::File::open(&path).ok().expect("can't open file");
5153

5254
// Read the file and treat it as an infinitely repeating sequence of characters.
53-
let mut file_input = String::new();
54-
file.read_to_string(&mut file_input).ok().expect("can't read file");
55+
let mut file_input = ByteTendril::new();
56+
file.read_to_tendril(&mut file_input).ok().expect("can't read file");
57+
let file_input: StrTendril = file_input.try_reinterpret().unwrap();
5558
let size = size.unwrap_or(file_input.len());
5659
let mut stream = file_input.chars().cycle();
5760

@@ -63,7 +66,7 @@ impl Bench {
6366
// The by_ref() call is important, otherwise we get wrong results!
6467
// See rust-lang/rust#18045.
6568
let sz = cmp::min(1024, size - total);
66-
input.push(stream.by_ref().take(sz).collect());
69+
input.push(stream.by_ref().take(sz).collect::<String>().to_tendril());
6770
total += sz;
6871
}
6972

capi/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,6 @@ path = "../"
1818
git = "https://github.com/servo/string-cache"
1919
[dependencies.string_cache_plugin]
2020
git = "https://github.com/servo/string-cache"
21+
22+
[dependencies.tendril]
23+
git = "https://github.com/kmcallister/tendril"

capi/src/lib.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
extern crate libc;
1111
extern crate string_cache;
12+
extern crate tendril;
1213
extern crate html5ever;
1314

1415
use std::{ptr, slice, str};
@@ -19,6 +20,8 @@ use libc::{size_t, c_int, c_char, strlen};
1920

2021
use string_cache::Atom;
2122

23+
use tendril::StrTendril;
24+
2225
#[repr(C)]
2326
pub struct h5e_buf {
2427
data: *const u8,
@@ -86,6 +89,12 @@ impl AsLifetimeBuf for String {
8689
}
8790
}
8891

92+
impl AsLifetimeBuf for StrTendril {
93+
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
94+
LifetimeBuf::from_str(self)
95+
}
96+
}
97+
8998
impl AsLifetimeBuf for Atom {
9099
fn as_lifetime_buf<'a>(&'a self) -> LifetimeBuf<'a> {
91100
LifetimeBuf::from_str(self)

capi/src/tokenizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use {LifetimeBuf, AsLifetimeBuf, h5e_buf, c_bool};
1414
use html5ever::tokenizer::{TokenSink, Token, Doctype, Tag, ParseError, DoctypeToken};
1515
use html5ever::tokenizer::{CommentToken, CharacterTokens, NullCharacterToken};
1616
use html5ever::tokenizer::{TagToken, StartTag, EndTag, EOFToken, Tokenizer};
17+
use html5ever::Tendril;
1718

1819
use std::mem;
1920
use std::default::Default;
@@ -71,7 +72,7 @@ impl TokenSink for *mut h5e_token_sink {
7172
($name:ident) => (call!($name,)); // bleh
7273
}
7374

74-
fn opt_str_to_buf<'a>(s: &'a Option<String>) -> LifetimeBuf<'a> {
75+
fn opt_str_to_buf<'a>(s: &'a Option<Tendril>) -> LifetimeBuf<'a> {
7576
match *s {
7677
None => LifetimeBuf::null(),
7778
Some(ref s) => s.as_lifetime_buf(),

dom_sink/src/common.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
use html5ever::tokenizer::Attribute;
1111

1212
use string_cache::QualName;
13+
use tendril::StrTendril;
1314

1415
pub use self::NodeEnum::{Document, Doctype, Text, Comment, Element};
1516

@@ -20,13 +21,13 @@ pub enum NodeEnum {
2021
Document,
2122

2223
/// A `DOCTYPE` with name, public id, and system id.
23-
Doctype(String, String, String),
24+
Doctype(StrTendril, StrTendril, StrTendril),
2425

2526
/// A text node.
26-
Text(String),
27+
Text(StrTendril),
2728

2829
/// A comment.
29-
Comment(String),
30+
Comment(StrTendril),
3031

3132
/// An element with attributes.
3233
Element(QualName, Vec<Attribute>),

dom_sink/src/owned_dom.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use std::collections::HashSet;
3838
use std::ops::{Deref, DerefMut};
3939

4040
use string_cache::QualName;
41+
use tendril::StrTendril;
4142

4243
/// The internal type we use for nodes during parsing.
4344
pub struct SquishyNode {
@@ -135,7 +136,7 @@ fn get_parent_and_index(child: Handle) -> Option<(Handle, usize)> {
135136
fn append_to_existing_text(mut prev: Handle, text: &str) -> bool {
136137
match prev.deref_mut().node {
137138
Text(ref mut existing) => {
138-
existing.push_str(text);
139+
existing.push_slice(text);
139140
true
140141
}
141142
_ => false,
@@ -208,7 +209,7 @@ impl TreeSink for Sink {
208209
self.new_node(Element(name, attrs))
209210
}
210211

211-
fn create_comment(&mut self, text: String) -> Handle {
212+
fn create_comment(&mut self, text: StrTendril) -> Handle {
212213
self.new_node(Comment(text))
213214
}
214215

@@ -262,7 +263,10 @@ impl TreeSink for Sink {
262263
Ok(())
263264
}
264265

265-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
266+
fn append_doctype_to_document(&mut self,
267+
name: StrTendril,
268+
public_id: StrTendril,
269+
system_id: StrTendril) {
266270
append(self.document, self.new_node(Doctype(name, public_id, system_id)));
267271
}
268272

dom_sink/src/rcdom.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use std::io::{self, Write};
3030
use std::ops::{Deref, DerefMut};
3131

3232
use string_cache::QualName;
33+
use tendril::StrTendril;
3334

3435
/// A DOM node.
3536
pub struct Node {
@@ -99,7 +100,7 @@ fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
99100
fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
100101
match prev.borrow_mut().deref_mut().node {
101102
Text(ref mut existing) => {
102-
existing.push_str(text);
103+
existing.push_slice(text);
103104
true
104105
}
105106
_ => false,
@@ -159,7 +160,7 @@ impl TreeSink for RcDom {
159160
new_node(Element(name, attrs))
160161
}
161162

162-
fn create_comment(&mut self, text: String) -> Handle {
163+
fn create_comment(&mut self, text: StrTendril) -> Handle {
163164
new_node(Comment(text))
164165
}
165166

@@ -214,7 +215,10 @@ impl TreeSink for RcDom {
214215
Ok(())
215216
}
216217

217-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
218+
fn append_doctype_to_document(&mut self,
219+
name: StrTendril,
220+
public_id: StrTendril,
221+
system_id: StrTendril) {
218222
append(&self.document, new_node(Doctype(name, public_id, system_id)));
219223
}
220224

examples/html2html.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,25 @@
1515
//!
1616
//! where htmlparser-1.4.jar comes from http://about.validator.nu/htmlparser/
1717
18+
extern crate tendril;
1819
extern crate html5ever;
1920
extern crate html5ever_dom_sink;
2021

21-
use std::io::{self, Read, Write};
22+
use std::io::{self, Write};
2223
use std::default::Default;
2324

25+
use tendril::{ByteTendril, ReadExt};
26+
27+
use html5ever::sink::rcdom::RcDom;
2428
use html5ever::driver::ParseOpts;
2529
use html5ever_dom_sink::rcdom::RcDom;
2630
use html5ever::tree_builder::TreeBuilderOpts;
2731
use html5ever::{parse, one_input, serialize};
2832

2933
fn main() {
30-
let mut input = String::new();
31-
io::stdin().read_to_string(&mut input).unwrap();
34+
let mut input = ByteTendril::new();
35+
io::stdin().read_to_tendril(&mut input).unwrap();
36+
let input = input.try_reinterpret().unwrap();
3237
let dom: RcDom = parse(one_input(input), ParseOpts {
3338
tree_builder: TreeBuilderOpts {
3439
drop_doctype: true,

examples/noop-tokenize.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@
1313

1414
extern crate test;
1515
extern crate html5ever;
16+
extern crate tendril;
1617

1718
use std::io;
18-
use std::io::prelude::*;
1919
use std::default::Default;
2020

2121
use test::black_box;
2222

23+
use tendril::{ByteTendril, ReadExt};
24+
2325
use html5ever::tokenizer::{TokenSink, Token};
2426
use html5ever::driver::{tokenize_to, one_input};
2527

@@ -34,8 +36,9 @@ impl TokenSink for Sink {
3436
}
3537

3638
fn main() {
37-
let mut input = String::new();
38-
io::stdin().read_to_string(&mut input).unwrap();
39+
let mut input = ByteTendril::new();
40+
io::stdin().read_to_tendril(&mut input).unwrap();
41+
let input = input.try_reinterpret().unwrap();
3942

4043
tokenize_to(Sink, one_input(input), Default::default());
4144
}

examples/noop-tree-builder.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,17 @@
88
// except according to those terms.
99

1010
extern crate string_cache;
11-
11+
extern crate tendril;
1212
extern crate html5ever;
1313

14-
use std::io::{self, Read};
14+
use std::io;
1515
use std::default::Default;
16-
use std::string::String;
1716
use std::collections::HashMap;
1817
use std::borrow::Cow;
1918
use string_cache::QualName;
2019

20+
use tendril::{StrTendril, ByteTendril, ReadExt};
21+
2122
use html5ever::{parse_to, one_input};
2223
use html5ever::tokenizer::Attribute;
2324
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText};
@@ -56,7 +57,7 @@ impl TreeSink for Sink {
5657
id
5758
}
5859

59-
fn create_comment(&mut self, _text: String) -> usize {
60+
fn create_comment(&mut self, _text: StrTendril) -> usize {
6061
self.get_id()
6162
}
6263

@@ -72,7 +73,7 @@ impl TreeSink for Sink {
7273
fn set_quirks_mode(&mut self, _mode: QuirksMode) { }
7374
fn append(&mut self, _parent: usize, _child: NodeOrText<usize>) { }
7475

75-
fn append_doctype_to_document(&mut self, _name: String, _public_id: String, _system_id: String) { }
76+
fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) { }
7677
fn add_attrs_if_missing(&mut self, _target: usize, _attrs: Vec<Attribute>) { }
7778
fn remove_from_parent(&mut self, _target: usize) { }
7879
fn reparent_children(&mut self, _node: usize, _new_parent: usize) { }
@@ -85,7 +86,8 @@ fn main() {
8586
names: HashMap::new(),
8687
};
8788

88-
let mut input = String::new();
89-
io::stdin().read_to_string(&mut input).unwrap();
89+
let mut input = ByteTendril::new();
90+
io::stdin().read_to_tendril(&mut input).unwrap();
91+
let input = input.try_reinterpret().unwrap();
9092
parse_to(sink, one_input(input), Default::default());
9193
}

examples/print-rcdom.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ extern crate html5ever_dom_sink;
1515

1616
#[macro_use]
1717
extern crate string_cache;
18+
extern crate tendril;
1819

1920
use std::io::{self, Read};
2021
use std::iter::repeat;
2122
use std::default::Default;
2223
use std::string::String;
2324

25+
use tendril::{ByteTendril, ReadExt};
2426
use html5ever::{parse, one_input};
2527
use html5ever_dom_sink::common::{Document, Doctype, Text, Comment, Element};
2628
use html5ever_dom_sink::rcdom::{RcDom, Handle};
@@ -61,8 +63,9 @@ fn walk(indent: usize, handle: Handle) {
6163
}
6264

6365
fn main() {
64-
let mut input = String::new();
65-
io::stdin().read_to_string(&mut input).unwrap();
66+
let mut input = ByteTendril::new();
67+
io::stdin().read_to_tendril(&mut input).unwrap();
68+
let input = input.try_reinterpret().unwrap();
6669
let dom: RcDom = parse(one_input(input), Default::default());
6770
walk(0, dom.document);
6871

examples/print-tree-actions.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,17 @@
1010
#![feature(collections)]
1111

1212
extern crate string_cache;
13-
13+
extern crate tendril;
1414
extern crate html5ever;
1515

16-
use std::io::{self, Read};
16+
use std::io;
1717
use std::default::Default;
18-
use std::string::String;
1918
use std::collections::HashMap;
2019
use std::borrow::Cow;
2120
use string_cache::QualName;
2221

22+
use tendril::{ByteTendril, StrTendril, ReadExt};
23+
2324
use html5ever::{parse_to, one_input};
2425
use html5ever::tokenizer::Attribute;
2526
use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
@@ -67,7 +68,7 @@ impl TreeSink for Sink {
6768
id
6869
}
6970

70-
fn create_comment(&mut self, text: String) -> usize {
71+
fn create_comment(&mut self, text: StrTendril) -> usize {
7172
let id = self.get_id();
7273
println!("Created comment \"{}\" as {}", text.escape_default(), id);
7374
id
@@ -97,7 +98,10 @@ impl TreeSink for Sink {
9798
Ok(())
9899
}
99100

100-
fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
101+
fn append_doctype_to_document(&mut self,
102+
name: StrTendril,
103+
public_id: StrTendril,
104+
system_id: StrTendril) {
101105
println!("Append doctype: {} {} {}", name, public_id, system_id);
102106
}
103107

@@ -127,7 +131,8 @@ fn main() {
127131
names: HashMap::new(),
128132
};
129133

130-
let mut input = String::new();
131-
io::stdin().read_to_string(&mut input).unwrap();
134+
let mut input = ByteTendril::new();
135+
io::stdin().read_to_tendril(&mut input).unwrap();
136+
let input = input.try_reinterpret().unwrap();
132137
parse_to(sink, one_input(input), Default::default());
133138
}

0 commit comments

Comments
 (0)