diff --git a/CHANGELOG.md b/CHANGELOG.md index acfff98..de69eb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- fixed `#char` & `#codepoint` errors for single-digit hex escapes + * e.g. `\xA` + ## [2.9.3] - 2024-11-29 - Janosch Müller ### Fixed diff --git a/lib/regexp_parser/expression.rb b/lib/regexp_parser/expression.rb index e865e43..f8a247a 100644 --- a/lib/regexp_parser/expression.rb +++ b/lib/regexp_parser/expression.rb @@ -25,6 +25,8 @@ require_relative 'expression/classes/unicode_property' require_relative 'expression/methods/construct' +require_relative 'expression/methods/escape_sequence_char' +require_relative 'expression/methods/escape_sequence_codepoint' require_relative 'expression/methods/human_name' require_relative 'expression/methods/match' require_relative 'expression/methods/match_length' diff --git a/lib/regexp_parser/expression/classes/escape_sequence.rb b/lib/regexp_parser/expression/classes/escape_sequence.rb index c6c51a8..b18f7a2 100644 --- a/lib/regexp_parser/expression/classes/escape_sequence.rb +++ b/lib/regexp_parser/expression/classes/escape_sequence.rb @@ -1,100 +1,28 @@ module Regexp::Expression module EscapeSequence - class Base < Regexp::Expression::Base - def codepoint - char.ord - end + Base = Class.new(Regexp::Expression::Base) - if ''.respond_to?(:undump) - def char - %("#{text}").undump - end - else - # poor man's unescape without using eval - require 'yaml' - def char - YAML.load(%Q(---\n"#{text}"\n)) - end - end - end + AsciiEscape = Class.new(Base) # \e + Backspace = Class.new(Base) # \b + Bell = Class.new(Base) # \a + FormFeed = Class.new(Base) # \f + Newline = Class.new(Base) # \n + Return = Class.new(Base) # \r + Tab = Class.new(Base) # \t + VerticalTab = Class.new(Base) # \v - class Literal < EscapeSequence::Base - def char - text[1..-1] - end - end + Literal = Class.new(Base) # e.g. \j, \@, \😀 (ineffectual escapes) - class AsciiEscape < EscapeSequence::Base; end - class Backspace < EscapeSequence::Base; end - class Bell < EscapeSequence::Base; end - class FormFeed < EscapeSequence::Base; end - class Newline < EscapeSequence::Base; end - class Return < EscapeSequence::Base; end - class Tab < EscapeSequence::Base; end - class VerticalTab < EscapeSequence::Base; end + Octal = Class.new(Base) # e.g. \012 + Hex = Class.new(Base) # e.g. \x0A + Codepoint = Class.new(Base) # e.g. \u000A - class Hex < EscapeSequence::Base; end - class Codepoint < EscapeSequence::Base; end + CodepointList = Class.new(Base) # e.g. \u{A B} - class CodepointList < EscapeSequence::Base - def char - raise NoMethodError, 'CodepointList responds only to #chars' - end - - def codepoint - raise NoMethodError, 'CodepointList responds only to #codepoints' - end - - def chars - codepoints.map { |cp| cp.chr('utf-8') } - end - - def codepoints - text.scan(/\h+/).map(&:hex) - end - end - - class Octal < EscapeSequence::Base - def char - text[1..-1].to_i(8).chr('utf-8') - end - end - - class AbstractMetaControlSequence < EscapeSequence::Base - def char - codepoint.chr('utf-8') - end - - private - - def control_sequence_to_s(control_sequence) - five_lsb = control_sequence.unpack('B*').first[-5..-1] - ["000#{five_lsb}"].pack('B*') - end - - def meta_char_to_codepoint(meta_char) - byte_value = meta_char.ord - byte_value < 128 ? byte_value + 128 : byte_value - end - end - - class Control < AbstractMetaControlSequence - def codepoint - control_sequence_to_s(text).ord - end - end - - class Meta < AbstractMetaControlSequence - def codepoint - meta_char_to_codepoint(text[-1]) - end - end - - class MetaControl < AbstractMetaControlSequence - def codepoint - meta_char_to_codepoint(control_sequence_to_s(text)) - end - end + AbstractMetaControlSequence = Class.new(Base) + Control = Class.new(AbstractMetaControlSequence) # e.g. \cB + Meta = Class.new(AbstractMetaControlSequence) # e.g. \M-Z + MetaControl = Class.new(AbstractMetaControlSequence) # e.g. \M-\cX end # alias for symmetry between Token::* and Expression::* diff --git a/lib/regexp_parser/expression/methods/escape_sequence_char.rb b/lib/regexp_parser/expression/methods/escape_sequence_char.rb new file mode 100644 index 0000000..0925d12 --- /dev/null +++ b/lib/regexp_parser/expression/methods/escape_sequence_char.rb @@ -0,0 +1,5 @@ +Regexp::Expression::EscapeSequence::Base.class_eval do + def char + codepoint.chr('utf-8') + end +end diff --git a/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb b/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb new file mode 100644 index 0000000..4decec2 --- /dev/null +++ b/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb @@ -0,0 +1,68 @@ +module Regexp::Expression::EscapeSequence + AsciiEscape.class_eval { def codepoint; 0x1B end } + Backspace.class_eval { def codepoint; 0x8 end } + Bell.class_eval { def codepoint; 0x7 end } + FormFeed.class_eval { def codepoint; 0xC end } + Newline.class_eval { def codepoint; 0xA end } + Return.class_eval { def codepoint; 0xD end } + Tab.class_eval { def codepoint; 0x9 end } + VerticalTab.class_eval { def codepoint; 0xB end } + + Literal.class_eval { def codepoint; text[1].ord end } + + Octal.class_eval { def codepoint; text[/\d+/].to_i(8) end } + + Hex.class_eval { def codepoint; text[/\h+/].hex end } + Codepoint.class_eval { def codepoint; text[/\h+/].hex end } + + CodepointList.class_eval do + # Maybe this should be a unique top-level expression class? + def char + raise NoMethodError, 'CodepointList responds only to #chars' + end + + def codepoint + raise NoMethodError, 'CodepointList responds only to #codepoints' + end + + def chars + codepoints.map { |cp| cp.chr('utf-8') } + end + + def codepoints + text.scan(/\h+/).map(&:hex) + end + end + + AbstractMetaControlSequence.class_eval do + private + + def control_sequence_to_s(control_sequence) + five_lsb = control_sequence.unpack('B*').first[-5..-1] + ["000#{five_lsb}"].pack('B*') + end + + def meta_char_to_codepoint(meta_char) + byte_value = meta_char.ord + byte_value < 128 ? byte_value + 128 : byte_value + end + end + + Control.class_eval do + def codepoint + control_sequence_to_s(text).ord + end + end + + Meta.class_eval do + def codepoint + meta_char_to_codepoint(text[-1]) + end + end + + MetaControl.class_eval do + def codepoint + meta_char_to_codepoint(control_sequence_to_s(text)) + end + end +end diff --git a/spec/parser/escapes_spec.rb b/spec/parser/escapes_spec.rb index 5fc170a..e5817a8 100644 --- a/spec/parser/escapes_spec.rb +++ b/spec/parser/escapes_spec.rb @@ -38,6 +38,7 @@ include_examples 'parse', /\?/, 0 => [char: '?', codepoint: 63 ] include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ] include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ] + include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ] include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ] include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]]