Skip to content

Commit 5e8443d

Browse files
authored
Merge pull request #929 from nobu/tokenizer
Refactor `RDoc::Markup::Parser#tokenize`
2 parents b118bc9 + 41ceae9 commit 5e8443d

File tree

2 files changed

+118
-117
lines changed

2 files changed

+118
-117
lines changed

lib/rdoc/markup/parser.rb

Lines changed: 108 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -272,44 +272,11 @@ def build_verbatim margin
272272
end
273273

274274
case type
275-
when :HEADER then
276-
line << '=' * data
277-
_, _, peek_column, = peek_token
278-
peek_column ||= column + data
279-
indent = peek_column - column - data
280-
line << ' ' * indent
281-
when :RULE then
282-
width = 2 + data
283-
line << '-' * width
284-
_, _, peek_column, = peek_token
285-
peek_column ||= column + width
286-
indent = peek_column - column - width
287-
line << ' ' * indent
288275
when :BREAK, :TEXT then
289276
line << data
290-
when :BLOCKQUOTE then
291-
line << '>>>'
292-
peek_type, _, peek_column = peek_token
293-
if peek_type != :NEWLINE and peek_column
294-
line << ' ' * (peek_column - column - 3)
295-
end
296-
else # *LIST_TOKENS
297-
list_marker = case type
298-
when :BULLET then data
299-
when :LABEL then "[#{data}]"
300-
when :NOTE then "#{data}::"
301-
else # :LALPHA, :NUMBER, :UALPHA
302-
"#{data}."
303-
end
304-
line << list_marker
305-
peek_type, _, peek_column = peek_token
306-
unless peek_type == :NEWLINE then
307-
peek_column ||= column + list_marker.length
308-
indent = peek_column - column - list_marker.length
309-
line << ' ' * indent
310-
end
277+
else
278+
raise TypeError, "unexpected token under verbatim: #{type}"
311279
end
312-
313280
end
314281

315282
verbatim << line << "\n" unless line.empty?
@@ -481,11 +448,37 @@ def skip token_type, error = true
481448
##
482449
# Turns text +input+ into a stream of tokens
483450

484-
def tokenize input
451+
def tokenize(input)
485452
setup_scanner input
453+
margin = @s.pos[0]
454+
tokenize_indented(margin)
455+
tokenize_input(margin)
456+
end
457+
458+
def newline!(pos = nil)
459+
if pos or (@s.scan(/ *(?=\r?\n)/) and pos = @s.pos and @s.scan(/\r?\n/))
460+
@tokens << [:NEWLINE, @s.matched, *pos]
461+
@s.newline!
462+
end
463+
end
486464

487-
until @s.eos? do
465+
def tokenize_indented(column)
466+
indent = / {#{column+1},}(?=\S)| *(?=\r?\n)/
467+
while @s.scan(indent)
488468
pos = @s.pos
469+
if @s.scan(/(.+)(?=\r?\n)?/)
470+
@tokens << [:TEXT, @s.matched, *pos]
471+
end
472+
newline! or break
473+
end
474+
end
475+
476+
def tokenize_input(margin)
477+
column = 0
478+
479+
until @s.eos?
480+
pos = @s.pos
481+
break if pos[0] < (margin ||= pos[0])
489482

490483
# leading spaces will be reflected by the column of the next token
491484
# the only things we lose are trailing spaces at the end of the file
@@ -494,75 +487,84 @@ def tokenize input
494487
# note: after BULLET, LABEL, etc.,
495488
# indent will be the column of the next non-newline token
496489

497-
@tokens << case
498-
# [CR]LF => :NEWLINE
499-
when @s.scan(/\r?\n/) then
500-
token = [:NEWLINE, @s.matched, *pos]
501-
@s.newline!
502-
token
503-
# === text => :HEADER then :TEXT
504-
when @s.scan(/(=+)(\s*)/) then
505-
level = @s[1].length
506-
header = [:HEADER, level, *pos]
507-
508-
if @s[2] =~ /^\r?\n/ then
509-
@s.unscan(@s[2])
510-
header
511-
else
512-
pos = @s.pos
513-
@s.scan(/.*/)
514-
@tokens << header
515-
[:TEXT, @s.matched.sub(/\r$/, ''), *pos]
516-
end
517-
# --- (at least 3) and nothing else on the line => :RULE
518-
when @s.scan(/(-{3,}) *\r?$/) then
519-
[:RULE, @s[1].length - 2, *pos]
520-
# * or - followed by white space and text => :BULLET
521-
when @s.scan(/([*-]) +(\S)/) then
522-
@s.unscan(@s[2])
523-
[:BULLET, @s[1], *pos]
524-
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
525-
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
526-
# FIXME if tab(s), the column will be wrong
527-
# either support tabs everywhere by first expanding them to
528-
# spaces, or assume that they will have been replaced
529-
# before (and provide a check for that at least in debug
530-
# mode)
531-
list_label = @s[1]
532-
@s.unscan(@s[2])
533-
list_type =
534-
case list_label
535-
when /[a-z]/ then :LALPHA
536-
when /[A-Z]/ then :UALPHA
537-
when /\d/ then :NUMBER
538-
else
539-
raise ParseError, "BUG token #{list_label}"
540-
end
541-
[list_type, list_label, *pos]
542-
# [text] followed by spaces or end of line => :LABEL
543-
when @s.scan(/\[(.*?)\]( +|\r?$)/) then
544-
[:LABEL, @s[1], *pos]
545-
# text:: followed by spaces or end of line => :NOTE
546-
when @s.scan(/(.*?)::( +|\r?$)/) then
547-
[:NOTE, @s[1], *pos]
548-
# >>> followed by end of line => :BLOCKQUOTE
549-
when @s.scan(/>>> *(\w+)?$/) then
550-
if word = @s[1]
551-
@s.unscan(word)
552-
end
553-
[:BLOCKQUOTE, word, *pos]
554-
# anything else: :TEXT
555-
else
556-
@s.scan(/(.*?)( )?\r?$/)
557-
token = [:TEXT, @s[1], *pos]
558-
559-
if @s[2] then
560-
@tokens << token
561-
[:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
562-
else
563-
token
564-
end
565-
end
490+
case
491+
# [CR]LF => :NEWLINE
492+
when @s.scan(/\r?\n/)
493+
newline!(pos)
494+
next
495+
496+
# === text => :HEADER then :TEXT
497+
when @s.scan(/(=+)(\s*)/)
498+
level = @s[1].length
499+
header = [:HEADER, level, *pos]
500+
501+
if @s[2] =~ /^\r?\n/
502+
@s.unscan(@s[2])
503+
@tokens << header
504+
else
505+
pos = @s.pos
506+
@s.scan(/.*/)
507+
@tokens << header
508+
@tokens << [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
509+
end
510+
511+
# --- (at least 3) and nothing else on the line => :RULE
512+
when @s.scan(/(-{3,}) *\r?$/)
513+
@tokens << [:RULE, @s[1].length - 2, *pos]
514+
515+
# * or - followed by white space and text => :BULLET
516+
when @s.scan(/([*-]) +(?=\S)/)
517+
@tokens << [:BULLET, @s[1], *pos]
518+
tokenize_input(nil)
519+
520+
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
521+
when @s.scan(/([a-z]|\d+)\. +(?=\S)/i)
522+
# FIXME if tab(s), the column will be wrong
523+
# either support tabs everywhere by first expanding them to
524+
# spaces, or assume that they will have been replaced
525+
# before (and provide a check for that at least in debug
526+
# mode)
527+
list_label = @s[1]
528+
list_type =
529+
case list_label
530+
when /[a-z]/ then :LALPHA
531+
when /[A-Z]/ then :UALPHA
532+
when /\d/ then :NUMBER
533+
else
534+
raise ParseError, "BUG token #{list_label}"
535+
end
536+
@tokens << [list_type, list_label, *pos]
537+
tokenize_input(nil)
538+
539+
# [text] followed by spaces or end of line => :LABEL
540+
when @s.scan(/\[(.*?)\]( +|\r?$)/)
541+
@tokens << [:LABEL, @s[1], *pos]
542+
tokenize_input(nil)
543+
544+
# text:: followed by spaces or end of line => :NOTE
545+
when @s.scan(/(.*?)::( +|\r?$)/)
546+
@tokens << [:NOTE, @s[1], *pos]
547+
tokenize_input(nil)
548+
549+
# >>> followed by end of line => :BLOCKQUOTE
550+
when @s.scan(/>>> *(\w+)?\r?$/)
551+
@tokens << [:BLOCKQUOTE, @s[1], *pos]
552+
newline!
553+
tokenize_input(nil)
554+
555+
# anything else: :TEXT
556+
else
557+
column = pos[0]
558+
@s.scan(/(.*?)( )?\r?$/)
559+
@tokens << [:TEXT, @s[1], *pos]
560+
561+
if @s[2]
562+
@tokens << [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
563+
end
564+
if newline!
565+
tokenize_indented(column)
566+
end
567+
end
566568
end
567569

568570
self

test/rdoc/test_rdoc_markup_parser.rb

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,8 +1591,7 @@ def test_tokenize_verbatim_heading
15911591
[:TEXT, 'Example heading:', 0, 0],
15921592
[:NEWLINE, "\n", 16, 0],
15931593
[:NEWLINE, "\n", 0, 1],
1594-
[:HEADER, 3, 3, 2],
1595-
[:TEXT, 'heading three', 7, 2],
1594+
[:TEXT, '=== heading three', 3, 2],
15961595
[:NEWLINE, "\n", 20, 2],
15971596
]
15981597

@@ -1608,7 +1607,7 @@ def test_tokenize_verbatim_rule
16081607
expected = [
16091608
[:TEXT, 'Verbatim section here that is double-underlined', 2, 0],
16101609
[:NEWLINE, "\n", 49, 0],
1611-
[:HEADER, 47, 2, 1],
1610+
[:TEXT, '='*47, 2, 1],
16121611
[:NEWLINE, "\n", 49, 1],
16131612
]
16141613

@@ -1624,14 +1623,14 @@ def test_tokenize_verbatim_rule_fancy
16241623
STR
16251624

16261625
expected = [
1627-
[:TEXT, 'A', 2, 0],
1628-
[:NEWLINE, "\n", 3, 0],
1629-
[:TEXT, 'b', 4, 1],
1630-
[:NEWLINE, "\n", 5, 1],
1631-
[:HEADER, 47, 2, 2],
1632-
[:NEWLINE, "\n", 49, 2],
1633-
[:TEXT, 'c', 4, 3],
1634-
[:NEWLINE, "\n", 5, 3],
1626+
[:TEXT, 'A', 2, 0],
1627+
[:NEWLINE, "\n", 3, 0],
1628+
[:TEXT, 'b', 4, 1],
1629+
[:NEWLINE, "\n", 5, 1],
1630+
[:TEXT, '='*47, 2, 2],
1631+
[:NEWLINE, "\n", 49, 2],
1632+
[:TEXT, 'c', 4, 3],
1633+
[:NEWLINE, "\n", 5, 3],
16351634
]
16361635

16371636
assert_equal expected, @RMP.tokenize(str)

0 commit comments

Comments
 (0)