Skip to content

Commit 82ce95b

Browse files
authored
perf(parser): fast-path name tokenization to avoid TextDecoder (#33)
Skip intermediate number[] array, Uint8Array allocation, and TextDecoder.decode() for the 99%+ of PDF names that contain no #XX hex escapes. Build string directly via String.fromCharCode loop. 5-10% improvement on parsing benchmarks (CPU profile showed readName + TextDecoder.decode at ~20% of total parse time).
1 parent 13256ad commit 82ce95b

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

src/parser/token-reader.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,27 @@ describe("TokenReader", () => {
323323
expect(token).toMatchObject({ type: "name", value: "Test#5" });
324324
});
325325

326+
it("uses fast path for plain ASCII name", () => {
327+
const r = reader("/Type");
328+
const token = r.nextToken();
329+
330+
expect(token).toMatchObject({ type: "name", value: "Type" });
331+
});
332+
333+
it("falls back to slow path when # is first char", () => {
334+
const r = reader("/#48ello"); // #48 = 'H'
335+
const token = r.nextToken();
336+
337+
expect(token).toMatchObject({ type: "name", value: "Hello" });
338+
});
339+
340+
it("falls back to slow path when # appears mid-name", () => {
341+
const r = reader("/Type#20Name"); // #20 = space
342+
const token = r.nextToken();
343+
344+
expect(token).toMatchObject({ type: "name", value: "Type Name" });
345+
});
346+
326347
it("stops at whitespace", () => {
327348
const r = reader("/Type /Page");
328349

src/parser/token-reader.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,48 @@ export class TokenReader {
284284
// Skip the leading /
285285
this.scanner.advance();
286286

287+
const data = this.scanner.bytes;
288+
const start = this.scanner.position;
289+
let pos = start;
290+
const len = data.length;
291+
292+
// Fast path: scan for end of name, checking for # escapes
293+
let hasEscape = false;
294+
295+
while (pos < len) {
296+
const byte = data[pos];
297+
298+
if (!isRegularChar(byte)) {
299+
break;
300+
}
301+
302+
if (byte === CHAR_HASH) {
303+
hasEscape = true;
304+
break;
305+
}
306+
307+
pos++;
308+
}
309+
310+
if (!hasEscape) {
311+
// Common case: pure ASCII name with no escapes.
312+
// Build string directly from byte range — no intermediate array,
313+
// no Uint8Array allocation, no TextDecoder.
314+
this.scanner.moveTo(pos);
315+
316+
let value = "";
317+
318+
for (let i = start; i < pos; i++) {
319+
value += String.fromCharCode(data[i]);
320+
}
321+
322+
return { type: "name", value, position };
323+
}
324+
325+
// Slow path: name contains # escapes, need byte-by-byte processing.
326+
// Reset scanner to start and use the original accumulation approach.
327+
this.scanner.moveTo(start);
328+
287329
const bytes: number[] = [];
288330

289331
while (true) {

0 commit comments

Comments
 (0)