Merge pull request #929 from nobu/tokenizer

nobu · web-flow · commit 5e8443d34a73 · 2022-11-28T02:24:34.000+09:00
Refactor `RDoc::Markup::Parser#tokenize`
diff --git a/lib/rdoc/markup/parser.rb b/lib/rdoc/markup/parser.rb
@@ -272,44 +272,11 @@ def build_verbatim margin
       end
 
       case type
-      when :HEADER then
-        line << '=' * data
-        _, _, peek_column, = peek_token
-        peek_column ||= column + data
-        indent = peek_column - column - data
-        line << ' ' * indent
-      when :RULE then
-        width = 2 + data
-        line << '-' * width
-        _, _, peek_column, = peek_token
-        peek_column ||= column + width
-        indent = peek_column - column - width
-        line << ' ' * indent
       when :BREAK, :TEXT then
         line << data
-      when :BLOCKQUOTE then
-        line << '>>>'
-        peek_type, _, peek_column = peek_token
-        if peek_type != :NEWLINE and peek_column
-          line << ' ' * (peek_column - column - 3)
-        end
-      else # *LIST_TOKENS
-        list_marker = case type
-                      when :BULLET then data
-                      when :LABEL  then "[#{data}]"
-                      when :NOTE   then "#{data}::"
-                      else # :LALPHA, :NUMBER, :UALPHA
-                        "#{data}."
-                      end
-        line << list_marker
-        peek_type, _, peek_column = peek_token
-        unless peek_type == :NEWLINE then
-          peek_column ||= column + list_marker.length
-          indent = peek_column - column - list_marker.length
-          line << ' ' * indent
-        end
+      else
+        raise TypeError, "unexpected token under verbatim: #{type}"
       end
-
     end
 
     verbatim << line << "\n" unless line.empty?
@@ -481,11 +448,37 @@ def skip token_type, error = true
   ##
   # Turns text +input+ into a stream of tokens
 
-  def tokenize input
+  def tokenize(input) # Entry point: scans +input+; the helpers below append tokens to @tokens
+    setup_scanner input
+    margin = @s.pos[0] # column part of the scanner position right after setup
+    tokenize_indented(margin) # first take lines indented deeper than the margin as plain :TEXT
+    tokenize_input(margin) # then tokenize the remaining markup; its result is what this method returns
+  end
+
+  def newline!(pos = nil) # Emits a :NEWLINE token; a given +pos+ means the caller already scanned the line break
+    if pos or (@s.scan(/ *(?=\r?\n)/) and pos = @s.pos and @s.scan(/\r?\n/)) # else: skip trailing spaces, note the position, scan [CR]LF
+      @tokens << [:NEWLINE, @s.matched, *pos] # @s.matched is the text of the most recent scan (the line break)
+      @s.newline! # presumably advances the scanner's line bookkeeping (confirm); its result is returned and tested by callers
+    end # when the condition fails, nothing is emitted and nil is returned
+  end
 
-    until @s.eos? do
+  def tokenize_indented(column) # Consumes lines indented past +column+ (and spaces-only lines) as :TEXT + :NEWLINE
+    indent = / {#{column+1},}(?=\S)| *(?=\r?\n)/ # more than +column+ spaces before a non-space, or only spaces before a line break
+    while @s.scan(indent)
       pos = @s.pos
+      if @s.scan(/(.+)(?=\r?\n)?/) # rest of the line; no match at a bare "\n". NOTE(review): "." also matches "\r" - confirm CRLF is normalized earlier
+        @tokens << [:TEXT, @s.matched, *pos]
+      end
+      newline! or break # leave the loop unless newline! reports a line break (e.g. at end of input)
+    end
+  end
+
+  def tokenize_input(margin)
+    column = 0
+
+    until @s.eos?
+      pos = @s.pos
+      break if pos[0] < (margin ||= pos[0])
 
       # leading spaces will be reflected by the column of the next token
       # the only thing we loose are trailing spaces at the end of the file
@@ -494,75 +487,84 @@ def tokenize input
       # note: after BULLET, LABEL, etc.,
       # indent will be the column of the next non-newline token
 
-      @tokens << case
-                 # [CR]LF => :NEWLINE
-                 when @s.scan(/\r?\n/) then
-                   token = [:NEWLINE, @s.matched, *pos]
-                   @s.newline!
-                   token
-                 # === text => :HEADER then :TEXT
-                 when @s.scan(/(=+)(\s*)/) then
-                   level = @s[1].length
-                   header = [:HEADER, level, *pos]
-
-                   if @s[2] =~ /^\r?\n/ then
-                     @s.unscan(@s[2])
-                     header
-                   else
-                     pos = @s.pos
-                     @s.scan(/.*/)
-                     @tokens << header
-                     [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
-                   end
-                 # --- (at least 3) and nothing else on the line => :RULE
-                 when @s.scan(/(-{3,}) *\r?$/) then
-                   [:RULE, @s[1].length - 2, *pos]
-                 # * or - followed by white space and text => :BULLET
-                 when @s.scan(/([*-]) +(\S)/) then
-                   @s.unscan(@s[2])
-                   [:BULLET, @s[1], *pos]
-                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
-                 when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
-                   # FIXME if tab(s), the column will be wrong
-                   # either support tabs everywhere by first expanding them to
-                   # spaces, or assume that they will have been replaced
-                   # before (and provide a check for that at least in debug
-                   # mode)
-                   list_label = @s[1]
-                   @s.unscan(@s[2])
-                   list_type =
-                     case list_label
-                     when /[a-z]/ then :LALPHA
-                     when /[A-Z]/ then :UALPHA
-                     when /\d/    then :NUMBER
-                     else
-                       raise ParseError, "BUG token #{list_label}"
-                     end
-                   [list_type, list_label, *pos]
-                 # [text] followed by spaces or end of line => :LABEL
-                 when @s.scan(/\[(.*?)\]( +|\r?$)/) then
-                   [:LABEL, @s[1], *pos]
-                 # text:: followed by spaces or end of line => :NOTE
-                 when @s.scan(/(.*?)::( +|\r?$)/) then
-                   [:NOTE, @s[1], *pos]
-                 # >>> followed by end of line => :BLOCKQUOTE
-                 when @s.scan(/>>> *(\w+)?$/) then
-                   if word = @s[1]
-                     @s.unscan(word)
-                   end
-                   [:BLOCKQUOTE, word, *pos]
-                 # anything else: :TEXT
-                 else
-                   @s.scan(/(.*?)(  )?\r?$/)
-                   token = [:TEXT, @s[1], *pos]
-
-                   if @s[2] then
-                     @tokens << token
-                     [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
-                   else
-                     token
-                   end
-                 end
+      case
+      # [CR]LF => :NEWLINE
+      when @s.scan(/\r?\n/)
+        newline!(pos)
+        next
+
+      # === text => :HEADER then :TEXT
+      when @s.scan(/(=+)(\s*)/)
+        level = @s[1].length
+        header = [:HEADER, level, *pos]
+
+        if @s[2] =~ /^\r?\n/
+          @s.unscan(@s[2])
+          @tokens << header
+        else
+          pos = @s.pos
+          @s.scan(/.*/)
+          @tokens << header
+          @tokens << [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
+        end
+
+      # --- (at least 3) and nothing else on the line => :RULE
+      when @s.scan(/(-{3,}) *\r?$/)
+        @tokens << [:RULE, @s[1].length - 2, *pos]
+
+      # * or - followed by white space and text => :BULLET
+      when @s.scan(/([*-]) +(?=\S)/)
+        @tokens << [:BULLET, @s[1], *pos]
+        tokenize_input(nil)
+
+      # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
+      when @s.scan(/([a-z]|\d+)\. +(?=\S)/i)
+        # FIXME if tab(s), the column will be wrong
+        # either support tabs everywhere by first expanding them to
+        # spaces, or assume that they will have been replaced
+        # before (and provide a check for that at least in debug
+        # mode)
+        list_label = @s[1]
+        list_type =
+          case list_label
+          when /[a-z]/ then :LALPHA
+          when /[A-Z]/ then :UALPHA
+          when /\d/    then :NUMBER
+          else
+            raise ParseError, "BUG token #{list_label}"
+          end
+        @tokens << [list_type, list_label, *pos]
+        tokenize_input(nil)
+
+      # [text] followed by spaces or end of line => :LABEL
+      when @s.scan(/\[(.*?)\]( +|\r?$)/)
+        @tokens << [:LABEL, @s[1], *pos]
+        tokenize_input(nil)
+
+      # text:: followed by spaces or end of line => :NOTE
+      when @s.scan(/(.*?)::( +|\r?$)/)
+        @tokens << [:NOTE, @s[1], *pos]
+        tokenize_input(nil)
+
+      # >>> followed by end of line => :BLOCKQUOTE
+      when @s.scan(/>>> *(\w+)?\r?$/)
+        @tokens << [:BLOCKQUOTE, @s[1], *pos]
+        newline!
+        tokenize_input(nil)
+
+      # anything else: :TEXT
+      else
+        column = pos[0]
+        @s.scan(/(.*?)(  )?\r?$/)
+        @tokens << [:TEXT, @s[1], *pos]
+
+        if @s[2]
+          @tokens << [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
+        end
+        if newline!
+          tokenize_indented(column)
+        end
+      end
     end
 
     self
diff --git a/test/rdoc/test_rdoc_markup_parser.rb b/test/rdoc/test_rdoc_markup_parser.rb
@@ -1591,8 +1591,7 @@ def test_tokenize_verbatim_heading
       [:TEXT,    'Example heading:',  0, 0],
       [:NEWLINE, "\n",               16, 0],
       [:NEWLINE, "\n",                0, 1],
-      [:HEADER,  3,                   3, 2],
-      [:TEXT,    'heading three',     7, 2],
+      [:TEXT,    '=== heading three', 3, 2],
       [:NEWLINE, "\n",               20, 2],
     ]
 
@@ -1608,7 +1607,7 @@ def test_tokenize_verbatim_rule
     expected = [
       [:TEXT,    'Verbatim section here that is double-underlined',  2, 0],
       [:NEWLINE, "\n",                                              49, 0],
-      [:HEADER,  47,                                                 2, 1],
+      [:TEXT,    '='*47,                                             2, 1],
       [:NEWLINE, "\n",                                              49, 1],
     ]
 
@@ -1624,14 +1623,14 @@ def test_tokenize_verbatim_rule_fancy
     STR
 
     expected = [
-      [:TEXT,    'A',   2, 0],
-      [:NEWLINE, "\n",  3, 0],
-      [:TEXT,    'b',   4, 1],
-      [:NEWLINE, "\n",  5, 1],
-      [:HEADER,  47,    2, 2],
-      [:NEWLINE, "\n", 49, 2],
-      [:TEXT,    'c',   4, 3],
-      [:NEWLINE, "\n",  5, 3],
+      [:TEXT,    'A',     2, 0],
+      [:NEWLINE, "\n",    3, 0],
+      [:TEXT,    'b',     4, 1],
+      [:NEWLINE, "\n",    5, 1],
+      [:TEXT,    '='*47,  2, 2],
+      [:NEWLINE, "\n",   49, 2],
+      [:TEXT,    'c',     4, 3],
+      [:NEWLINE, "\n",    5, 3],
     ]
 
     assert_equal expected, @RMP.tokenize(str)