Skip to content

Commit 84a8113

Browse files
committed
Reset parser before correct elements, texts, etc
Closes GH-11.
1 parent 5fa3795 commit 84a8113

File tree

2 files changed

+56
-22
lines changed

2 files changed

+56
-22
lines changed

index.js

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -130,22 +130,19 @@ function wrap(tree, file) {
130130
function element(node) {
131131
var empty = voids.indexOf(node.tagName) !== -1
132132

133+
resetTokenizer()
133134
parser._processToken(startTag(node), ns.html)
134135

135136
all(node.children)
136137

137138
if (!empty) {
139+
resetTokenizer()
138140
parser._processToken(endTag(node))
139-
140-
// Put the parser back in the data state: some elements, like textareas
141-
// and iframes, change the state.
142-
// See <syntax-tree/hast-util-raw#7>.
143-
// See <https://github.com/inikulin/parse5/blob/2528196/packages/parse5/lib/tokenizer/index.js#L222>.
144-
tokenizer.state = dataState
145141
}
146142
}
147143

148144
function text(node) {
145+
resetTokenizer()
149146
parser._processToken({
150147
type: characterToken,
151148
chars: node.value,
@@ -155,7 +152,7 @@ function wrap(tree, file) {
155152

156153
function doctype(node) {
157154
var p5 = toParse5(node)
158-
155+
resetTokenizer()
159156
parser._processToken({
160157
type: doctypeToken,
161158
name: p5.name,
@@ -167,6 +164,7 @@ function wrap(tree, file) {
167164
}
168165

169166
function comment(node) {
167+
resetTokenizer()
170168
parser._processToken({
171169
type: commentToken,
172170
data: node.value,
@@ -182,35 +180,38 @@ function wrap(tree, file) {
182180
var token
183181

184182
// Reset preprocessor:
185-
// See: <https://github.com/inikulin/parse5/blob/0491902/packages/parse5/lib/tokenizer/preprocessor.js>.
183+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/tokenizer/preprocessor.js>.
186184
preprocessor.html = null
187-
preprocessor.endOfChunkHit = false
188-
preprocessor.lastChunkWritten = false
189-
preprocessor.lastCharPos = -1
190185
preprocessor.pos = -1
186+
preprocessor.lastGapPos = -1
187+
preprocessor.lastCharPos = -1
188+
preprocessor.gapStack = []
189+
preprocessor.skipNextNewLine = false
190+
preprocessor.lastChunkWritten = false
191+
preprocessor.endOfChunkHit = false
191192

192193
// Reset preprocessor mixin:
193-
// See: <https://github.com/inikulin/parse5/blob/0491902/packages/parse5/lib/extensions/position-tracking/preprocessor-mixin.js>.
194-
posTracker.droppedBufferSize = 0
195-
posTracker.line = line
196-
posTracker.col = 1
197-
posTracker.offset = 0
198-
posTracker.lineStartPos = -column + 1
194+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/extensions/position-tracking/preprocessor-mixin.js>.
195+
posTracker.isEol = false
196+
posTracker.lineStartPos = -column + 1 // Looks weird, but ensures we get correct positional info.
199197
posTracker.droppedBufferSize = offset
198+
posTracker.offset = 0
199+
posTracker.col = 1
200+
posTracker.line = line
200201

201202
// Reset location tracker:
202-
// See: <https://github.com/inikulin/parse5/blob/0491902/packages/parse5/lib/extensions/location-info/tokenizer-mixin.js>.
203+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/extensions/location-info/tokenizer-mixin.js>.
203204
locationTracker.currentAttrLocation = null
204205
locationTracker.ctLoc = createParse5Location(node)
205206

206207
// See the code for `parse` and `parseFragment`:
207-
// See: <https://github.com/inikulin/parse5/blob/0491902/packages/parse5/lib/parser/index.js#L371>.
208+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/parser/index.js#L371>.
208209
tokenizer.write(node.value)
209210
parser._runParsingLoop(null)
210211

211212
// Process final characters if they’re still there after hibernating.
212213
// Similar to:
213-
// See: <https://github.com/inikulin/parse5/blob/3bfa7d9/packages/parse5/lib/extensions/location-info/tokenizer-mixin.js#L95>.
214+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/extensions/location-info/tokenizer-mixin.js#L95>.
214215
token = tokenizer.currentCharacterToken
215216

216217
if (token) {
@@ -219,11 +220,26 @@ function wrap(tree, file) {
219220
token.location.endOffset = posTracker.offset + 1
220221
parser._processToken(token)
221222
}
223+
}
222224

225+
function resetTokenizer() {
223226
// Reset tokenizer:
224-
// See: <https://github.com/inikulin/parse5/blob/8b0048e/packages/parse5/lib/tokenizer/index.js#L215>.
225-
tokenizer.currentToken = null
227+
// See: <https://github.com/inikulin/parse5/blob/9c683e1/packages/parse5/lib/tokenizer/index.js#L218-L234>.
228+
// Especially putting it back in the `data` state is useful: some elements,
229+
// like textareas and iframes, change the state.
230+
// See GH-7.
231+
// But also if broken HTML is in `raw`, and then a correct element is given.
232+
// See GH-11.
233+
tokenizer.tokenQueue = []
234+
tokenizer.state = dataState
235+
tokenizer.returnState = ''
236+
tokenizer.charRefCode = -1
237+
tokenizer.tempBuff = []
238+
tokenizer.lastStartTagName = ''
239+
tokenizer.consumedAfterSnapshot = -1
240+
tokenizer.active = false
226241
tokenizer.currentCharacterToken = null
242+
tokenizer.currentToken = null
227243
tokenizer.currentAttr = null
228244
}
229245
}

test.js

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,24 @@ test('raw', function (t) {
167167
'should support template nodes'
168168
)
169169

170+
t.deepEqual(
171+
raw(u('root', [u('raw', '<i'), h('b')])),
172+
u('root', {data: {quirksMode: false}}, [h('b')]),
173+
'should discard broken HTML when a proper element node is found'
174+
)
175+
176+
t.deepEqual(
177+
raw(u('root', [u('raw', '<i'), u('text', 'a')])),
178+
u('root', {data: {quirksMode: false}}, [u('text', 'a')]),
179+
'should discard broken HTML when a proper text node is found'
180+
)
181+
182+
t.deepEqual(
183+
raw(u('root', [u('raw', '<i'), u('raw', '>'), h('b')])),
184+
u('root', {data: {quirksMode: false}}, [h('i', [h('b')])]),
185+
'should not discard HTML broken over several raw nodes'
186+
)
187+
170188
t.deepEqual(
171189
raw(u('root', [u('raw', '<script>alert(1)</script>')])),
172190
u('root', {data: {quirksMode: false}}, [

0 commit comments

Comments
 (0)