From 02b4e29608216d045f3927cb8cb976f57bd034ed Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 01:03:24 +1000
Subject: [PATCH 1/9] Initial PDF COS rouge lexer

---
 lib/rouge/lexers/pdf.rb | 92 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 lib/rouge/lexers/pdf.rb

diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb
new file mode 100644
index 0000000000..08220ced2f
--- /dev/null
+++ b/lib/rouge/lexers/pdf.rb
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*- #
+# frozen_string_literal: true
+
+# Adapted from Rouge lib/rouge/lexers/PostScript.rb
+module Rouge
+  module Lexers
+    class Pdf < RegexLexer
+      title "PDF"
+      desc "PDF - Portable Document Format (ISO 32000)"
+      tag "pdf"
+      aliases "fdf", "cos"
+      filenames "*.pdf", "*.fdf"
+      mimetypes "application/pdf", "application/fdf" # IANA registered media types
+
+      # PDF and FDF files must start with "%PDF-x.y" or "%FDF-x.y"
+      # where x is the emajor version (1-9) and y is the minor version (0-9)
+      # Supports invalid PDF versions.
+      def self.detect?(text)
+        return true if /^%[PF]DF-[1-9]\.\d/ =~ text
+      end
+
+      # PDF Delimiters (ISO 32000-2:2020, Table 2) including Ruby whitespace 
+      delimiter = %s"()<>\[\]/%\s"
+
+      delimiter_end = Regexp.new("(?=[#{delimiter}])")
+      valid_name_chars = Regexp.new("[^#{delimiter}]")
+
+      state :root do
+        # PDF only has single-line comments: from "%"" to EOL
+        rule %r'%.*?$', Comment::Single
+
+        # PDF Boolean object
+        rule %r'(false|true)#{delimiter_end}', Keyword::Constant
+
+        # PDF Null object
+        rule %r'(null)#{delimiter_end}', Keyword::Constant
+
+        # PDF Hex string - can contain whitespace and span multiple lines
+        rule %r/<[0-9A-Fa-f\s]+>/m, String::Other
+
+        # PDF Dictionary
+        rule %r/<</, Variable::Instance
+        rule %r/>>/, Variable::Instance
+
+        # PDF Arrays
+        rule %r/\[/, Variable::Instance
+        rule %r/\]/, Variable::Instance
+
+        # PDF literal strings are complex (multi-line, escapes, etc.); enter separate state.
+        rule %r'\(', String, :stringliteral
+
+        # PDF Name objects - can be empty (nothing after "/")
+        # No special processing needed for 2-digit hex codes starting with "#"
+        rule %r'/\/#{valid_name_chars}*#{delimiter_end}', Name::Variable
+
+        # PDF objects and stream (no checking of object number)
+        rule %r/\d+\s\d+obj#{delimiter_end}/, Keyword::Declaration
+        rule %r/stream/, Keyword::Declaration
+        rule %r/endstream/, Keyword::Declaration
+        rule %r/endobj/, Keyword::Declaration
+
+        # PDF file layout
+        rule %r/xref/, Keyword::Constant
+        rule %r/trailer/, Keyword::Constant
+        rule %r/startxref/, Keyword::Constant
+
+        # PDF cross reference section entries (supposedly 20 bytes including EOL)
+        rule %r/^\d{10} \d{5} [nf]\s?/, Keyword::Namespace
+
+        # PDF Indirect reference (lax, allows zero as the object number)
+        rule %r/\d+\s\d+R#{delimiter_end}/. Keyword::Pseudo
+
+        # PDF Real object
+        rule %r/(\-|\+)?([0-9]+\.?|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)#{delimiter_end}/, Number::Float
+
+        # PDF Integer object
+        rule %r/(\-|\+)?[0-9]+#{delimiter_end}/, Number::Integer
+
+        # most likely PDF content stream operators
+        rule valid_name_chars, Operator::Word
+      end
+
+      # PDF literal string - see ISO 32000-2:2020 clause 7.3.4.2 and Table 3
+      state :stringliteral do
+        rule %r/\(/, String, :stringliteral     # recursive for internal balanced literal strings
+        rule %r/\)/, String, :pop!
+        rule %r/\\([0-7]{3}|n|r|t|b|f|\\)/, String::Escape
+        rule %r/[^\(\)\\]+/, String
+      end
+    end
+  end
+end

From 82785e3e76463c8e0faf2a75052eb558030f115b Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:09:55 +1000
Subject: [PATCH 2/9] Update pdf.rb

---
 lib/rouge/lexers/pdf.rb | 114 ++++++++++++++++++++++------------------
 1 file changed, 64 insertions(+), 50 deletions(-)

diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb
index 08220ced2f..7c0d48d541 100644
--- a/lib/rouge/lexers/pdf.rb
+++ b/lib/rouge/lexers/pdf.rb
@@ -1,7 +1,20 @@
 # -*- coding: utf-8 -*- #
 # frozen_string_literal: true
 
-# Adapted from Rouge lib/rouge/lexers/PostScript.rb
+#
+# PDF = Portable Document Format page description language
+# As defined by ISO 32000-2:2020 including resolved errata from https://pdf-issues.pdfa.org/
+#
+# The PDF syntax is also know as "COS" and can also be used with FDF (Forms Data Field) files. 
+#
+# This is a token-based parser ONLY! It is intended to syntax highlight full or partial fragments 
+# of nicely written hand-writteen PDF syntax in documentation such as ISO specifications. It is NOT
+# intended to cope with real-world PDFs that will contain arbitrary binary data (that form invalid
+# UTF-8 sequences and generate "ArgumentError: invalid byte sequence in UTF-8" Ruby errors) and 
+# other types of malformation/syntax errors. 
+#
+# Author: Peter Wyatt, CTO, PDF Association. 2024
+#
 module Rouge
   module Lexers
     class Pdf < RegexLexer
@@ -13,79 +26,80 @@ class Pdf < RegexLexer
       mimetypes "application/pdf", "application/fdf" # IANA registered media types
 
       # PDF and FDF files must start with "%PDF-x.y" or "%FDF-x.y"
-      # where x is the emajor version (1-9) and y is the minor version (0-9)
-      # Supports invalid PDF versions.
+      # where x is the single digit major version and y is the single digit minor version.
       def self.detect?(text)
-        return true if /^%[PF]DF-[1-9]\.\d/ =~ text
+        return true if /^%(P|F)DF-\d\.\d/ =~ text
       end
 
-      # PDF Delimiters (ISO 32000-2:2020, Table 2) including Ruby whitespace 
-      delimiter = %s"()<>\[\]/%\s"
-
-      delimiter_end = Regexp.new("(?=[#{delimiter}])")
-      valid_name_chars = Regexp.new("[^#{delimiter}]")
+      # PDF Delimiters (ISO 32000-2:2020, Table 1 and Table 2).
+      # Ruby whitespace "\s" is /[ \t\r\n\f\v]/ which does not include NUL (ISO 32000-2:2020, Table 1).
+      # PDF also support 2 character EOL sequences.
+      delimiter = %r/\(\)<>\[\]\/%\s/
 
       state :root do
-        # PDF only has single-line comments: from "%"" to EOL
-        rule %r'%.*?$', Comment::Single
+        # Start-of-file header comment is special (comment is up to EOL)
+        rule %r/^%(P|F)DF-\d\.\d.*$/, Comment::Special
 
-        # PDF Boolean object
-        rule %r'(false|true)#{delimiter_end}', Keyword::Constant
+        # End-of-file marker comment is special (comment is up to EOL)
+        rule %r/^%%EOF.*$/, Comment::Special
 
-        # PDF Null object
-        rule %r'(null)#{delimiter_end}', Keyword::Constant
+        # PDF only has single-line comments: from "%" to EOL
+        rule %r/%.*$/, Comment::Single
 
-        # PDF Hex string - can contain whitespace and span multiple lines
-        rule %r/<[0-9A-Fa-f\s]+>/m, String::Other
+        # PDF Boolean and null object keywords
+        rule %r/(false|true|null)/, Keyword::Constant
 
-        # PDF Dictionary
-        rule %r/<</, Variable::Instance
-        rule %r/>>/, Variable::Instance
+        # PDF Dictionary and array object start and end tokens
+        rule %r/(<<|>>|\[|\])/, Punctuation
 
-        # PDF Arrays
-        rule %r/\[/, Variable::Instance
-        rule %r/\]/, Variable::Instance
+        # PDF Hex string - can contain whitespace and span multiple lines.
+        # This rule must be after "<<"/">>"
+        rule %r/<[0-9A-Fa-f\s]*>/m, Str::Other
 
-        # PDF literal strings are complex (multi-line, escapes, etc.); enter separate state.
-        rule %r'\(', String, :stringliteral
+        # PDF literal strings are complex (multi-line, escapes, etc.). Use separate state machine.
+        rule %r/\(/, Str, :stringliteral
 
-        # PDF Name objects - can be empty (nothing after "/")
-        # No special processing needed for 2-digit hex codes starting with "#"
-        rule %r'/\/#{valid_name_chars}*#{delimiter_end}', Name::Variable
+        # PDF Name objects - can be empty (i.e., nothing after "/").
+        # No special processing required for 2-digit hex codes that start with "#".
+        rule %r/\/[^\(\)<>\[\]\/%\s]*/, Name::Entity
 
-        # PDF objects and stream (no checking of object number)
-        rule %r/\d+\s\d+obj#{delimiter_end}/, Keyword::Declaration
-        rule %r/stream/, Keyword::Declaration
-        rule %r/endstream/, Keyword::Declaration
-        rule %r/endobj/, Keyword::Declaration
+        # PDF objects and stream (no checking of object ID)
+        # Note that object number and generation numbers do not have sign.
+        rule %r/\d+\s\d+\sobj/, Keyword::Declaration
+        rule %r/(endstream|endobj|stream)/, Keyword::Declaration
 
-        # PDF file layout
-        rule %r/xref/, Keyword::Constant
-        rule %r/trailer/, Keyword::Constant
-        rule %r/startxref/, Keyword::Constant
+        # PDF conventional file layout keywords
+        rule %r/(startxref|trailer|xref)/, Keyword::Constant
 
-        # PDF cross reference section entries (supposedly 20 bytes including EOL)
-        rule %r/^\d{10} \d{5} [nf]\s?/, Keyword::Namespace
+        # PDF cross reference section entries (20 bytes including EOL).
+        # Explicit single SPACE separators.
+        rule %r/^\d{10} \d{5} (n|f)\s*$/, Keyword::Namespace
 
-        # PDF Indirect reference (lax, allows zero as the object number)
-        rule %r/\d+\s\d+R#{delimiter_end}/. Keyword::Pseudo
+        # PDF Indirect reference (lax, allows zero as the object number).
+        # Requires terminating delimiter lookahead to disambiguate from "RG" operator
+        rule %r/\d+\s\d+\sR(?=[\(\)<>\[\]\/%\s])/, Keyword::Variable
 
         # PDF Real object
-        rule %r/(\-|\+)?([0-9]+\.?|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)#{delimiter_end}/, Number::Float
+        rule %r/(\-|\+)?([0-9]+\.?|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)/, Num::Float
 
         # PDF Integer object
-        rule %r/(\-|\+)?[0-9]+#{delimiter_end}/, Number::Integer
+        rule %r/(\-|\+)?[0-9]+/, Num::Integer
+
+        # A run of non-delimiters is most likely a PDF content stream 
+        # operator (ISO 32000-2:2020, Annex A).
+        rule %r/[^\(\)<>\[\]\/%\s]+/, Operator::Word
 
-        # most likely PDF content stream operators
-        rule valid_name_chars, Operator::Word
+        # Whitespace (except inside strings and comments) is ignored = /[ \t\r\n\f\v]/.
+        # Ruby doesn't include NUL as whitespace (vs ISO 32000-2:2020 Table 1)
+        rule %r/\s+/, Text::Whitespace
       end
 
-      # PDF literal string - see ISO 32000-2:2020 clause 7.3.4.2 and Table 3
+      # PDF literal string. See ISO 32000-2:2020 clause 7.3.4.2 and Table 3
       state :stringliteral do
-        rule %r/\(/, String, :stringliteral     # recursive for internal balanced literal strings
-        rule %r/\)/, String, :pop!
-        rule %r/\\([0-7]{3}|n|r|t|b|f|\\)/, String::Escape
-        rule %r/[^\(\)\\]+/, String
+        rule %r/\(/, Str, :stringliteral     # recursive for internal balanced(!) literal strings
+        rule %r/\)/, Str, :pop!
+        rule %r/\\([0-7]{3}|n|r|t|b|f|\\)/, Str::Escape
+        rule %r/[^\(\)\\]+/, Str
       end
     end
   end

From e54e1d31ff76f36c25f702c0ce5de8016554c319 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:13:09 +1000
Subject: [PATCH 3/9] Create demo PDF (functional)

Needs to be treated as binary for xref to remain valid
---
 lib/rouge/demos/pdf | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 lib/rouge/demos/pdf

diff --git a/lib/rouge/demos/pdf b/lib/rouge/demos/pdf
new file mode 100644
index 0000000000..849cf59d52
--- /dev/null
+++ b/lib/rouge/demos/pdf
@@ -0,0 +1,29 @@
+%PDF-1.6
+%©©©©
+
+1 0 obj<</Type/Catalog/Pages 2 0 R/StructTreeRoot null/MarkInfo<</Marked false>>>>
+endobj
+2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>
+endobj
+3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[.0 0 200 200]/Contents 4 0 R/Resources<<>>>>
+endobj
+4 0 obj<</Length 60>>
+stream
+  +8 w 1 j
+  1.0 0 0 rg
+  0 0 1 RG
+  10 10 180 180 re B
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000021 00000 n
+0000000113 00000 n
+0000000165 00000 n
+0000000261 00000 n
+trailer
+<</Root 1 0 R/Size 5/ID[<18D6B641245C03F28E67D93AD879D6EC><18D6B641245C03F28E67D93AD879D6EC>]>>
+startxref
+371
+%%EOF
\ No newline at end of file

From 91d499cfda718f1ba6a68e44f37fb14edfae8370 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:22:14 +1000
Subject: [PATCH 4/9] Update pdf.rb

---
 lib/rouge/lexers/pdf.rb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb
index 7c0d48d541..c80dfcd95c 100644
--- a/lib/rouge/lexers/pdf.rb
+++ b/lib/rouge/lexers/pdf.rb
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*- #
 # frozen_string_literal: true
 
-#
 # PDF = Portable Document Format page description language
 # As defined by ISO 32000-2:2020 including resolved errata from https://pdf-issues.pdfa.org/
 #
@@ -20,7 +19,7 @@ module Lexers
     class Pdf < RegexLexer
       title "PDF"
       desc "PDF - Portable Document Format (ISO 32000)"
-      tag "pdf"
+      tag "Pdf"
       aliases "fdf", "cos"
       filenames "*.pdf", "*.fdf"
       mimetypes "application/pdf", "application/fdf" # IANA registered media types

From 062647e54a9473f080250ce6056659669cf49427 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:22:31 +1000
Subject: [PATCH 5/9] Add basic spec checker

---
 spec/lexers/pdf_spec.rb | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 spec/lexers/pdf_spec.rb

diff --git a/spec/lexers/pdf_spec.rb b/spec/lexers/pdf_spec.rb
new file mode 100644
index 0000000000..9fbb001327
--- /dev/null
+++ b/spec/lexers/pdf_spec.rb
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*- #
+# frozen_string_literal: true
+
+describe Rouge::Lexers::Pdf do
+  let(:subject) { Rouge::Lexers::Pdf.new }
+
+  describe 'guessing' do
+    include Support::Guessing
+
+    it 'guesses by filename' do
+      assert_guess :filename => 'foo.pdf'
+      assert_guess :filename => 'foo.fdf'
+    end
+
+    it 'guesses by mimetype' do
+      assert_guess :mimetype => 'application/pdf'
+      assert_guess :mimetype => 'application/fdf'
+    end
+
+    it 'guesses by source' do
+      assert_guess :source => '%PDF-1.6'
+      assert_guess :source => '%PDF-2.0'
+      assert_guess :source => '%PDF-0.3' # Fake PDF version
+      assert_guess :source => '%PDF-6.8' # Fake PDF version
+      assert_guess :source => '%FDF-1.2'
+    end
+  end
+
+end

From 9cf372f293ea7f617330ce410b964f7440bf0fe4 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:47:46 +1000
Subject: [PATCH 6/9] Fixups

---
 lib/rouge/lexers/pdf.rb | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb
index c80dfcd95c..37f7240240 100644
--- a/lib/rouge/lexers/pdf.rb
+++ b/lib/rouge/lexers/pdf.rb
@@ -1,16 +1,18 @@
 # -*- coding: utf-8 -*- #
 # frozen_string_literal: true
+# vim: set ts=2 sw=2 et:
 
 # PDF = Portable Document Format page description language
 # As defined by ISO 32000-2:2020 including resolved errata from https://pdf-issues.pdfa.org/
 #
-# The PDF syntax is also know as "COS" and can also be used with FDF (Forms Data Field) files. 
+# The PDF syntax is also known as "COS" and can be used with FDF (Forms Data Field) files as
+# per ISO 32000-2:2020 clause 12.7.8. 
 #
 # This is a token-based parser ONLY! It is intended to syntax highlight full or partial fragments 
 # of nicely written hand-writteen PDF syntax in documentation such as ISO specifications. It is NOT
 # intended to cope with real-world PDFs that will contain arbitrary binary data (that form invalid
 # UTF-8 sequences and generate "ArgumentError: invalid byte sequence in UTF-8" Ruby errors) and 
-# other types of malformation/syntax errors. 
+# other types of malformations or syntax errors. 
 #
 # Author: Peter Wyatt, CTO, PDF Association. 2024
 #
@@ -19,10 +21,10 @@ module Lexers
     class Pdf < RegexLexer
       title "PDF"
       desc "PDF - Portable Document Format (ISO 32000)"
-      tag "Pdf"
-      aliases "fdf", "cos"
-      filenames "*.pdf", "*.fdf"
-      mimetypes "application/pdf", "application/fdf" # IANA registered media types
+      tag 'pdf'
+      aliases "fdf", 'cos'
+      filenames '*.pdf', '*.fdf'
+      mimetypes 'application/pdf', 'application/fdf' # IANA registered media types
 
       # PDF and FDF files must start with "%PDF-x.y" or "%FDF-x.y"
       # where x is the single digit major version and y is the single digit minor version.
@@ -33,7 +35,7 @@ def self.detect?(text)
       # PDF Delimiters (ISO 32000-2:2020, Table 1 and Table 2).
       # Ruby whitespace "\s" is /[ \t\r\n\f\v]/ which does not include NUL (ISO 32000-2:2020, Table 1).
       # PDF also support 2 character EOL sequences.
-      delimiter = %r/\(\)<>\[\]\/%\s/
+      # NOT USED: delimiter = %r/\(\)<>\[\]\/%\s/
 
       state :root do
         # Start-of-file header comment is special (comment is up to EOL)

From 24889094faa55875dd118b83fde49bf0fc28c2e3 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:05:22 +1000
Subject: [PATCH 7/9] Altered tokens for better color

---
 lib/rouge/lexers/pdf.rb | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb
index 37f7240240..8b38cc89e0 100644
--- a/lib/rouge/lexers/pdf.rb
+++ b/lib/rouge/lexers/pdf.rb
@@ -35,14 +35,13 @@ def self.detect?(text)
       # PDF Delimiters (ISO 32000-2:2020, Table 1 and Table 2).
       # Ruby whitespace "\s" is /[ \t\r\n\f\v]/ which does not include NUL (ISO 32000-2:2020, Table 1).
       # PDF also support 2 character EOL sequences.
-      # NOT USED: delimiter = %r/\(\)<>\[\]\/%\s/
 
       state :root do
         # Start-of-file header comment is special (comment is up to EOL)
-        rule %r/^%(P|F)DF-\d\.\d.*$/, Comment::Special
+        rule %r/^%(P|F)DF-\d\.\d.*$/, Comment::Preproc
 
         # End-of-file marker comment is special (comment is up to EOL)
-        rule %r/^%%EOF.*$/, Comment::Special
+        rule %r/^%%EOF.*$/, Comment::Preproc
 
         # PDF only has single-line comments: from "%" to EOL
         rule %r/%.*$/, Comment::Single
@@ -62,7 +61,7 @@ def self.detect?(text)
 
         # PDF Name objects - can be empty (i.e., nothing after "/").
         # No special processing required for 2-digit hex codes that start with "#".
-        rule %r/\/[^\(\)<>\[\]\/%\s]*/, Name::Entity
+        rule %r/\/[^\(\)<>\[\]\/%\s]*/, Name::Other
 
         # PDF objects and stream (no checking of object ID)
         # Note that object number and generation numbers do not have sign.
@@ -70,7 +69,7 @@ def self.detect?(text)
         rule %r/(endstream|endobj|stream)/, Keyword::Declaration
 
         # PDF conventional file layout keywords
-        rule %r/(startxref|trailer|xref)/, Keyword::Constant
+        rule %r/(startxref|trailer|xref)/, Keyword::Declaration
 
         # PDF cross reference section entries (20 bytes including EOL).
         # Explicit single SPACE separators.
@@ -78,10 +77,10 @@ def self.detect?(text)
 
         # PDF Indirect reference (lax, allows zero as the object number).
         # Requires terminating delimiter lookahead to disambiguate from "RG" operator
-        rule %r/\d+\s\d+\sR(?=[\(\)<>\[\]\/%\s])/, Keyword::Variable
+        rule %r/\d+\s\d+\sR(?=[\(\)<>\[\]\/%\s])/, Name::Decorator
 
-        # PDF Real object
-        rule %r/(\-|\+)?([0-9]+\.?|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)/, Num::Float
+        # PDF Real object - needs a "." so that plain integers reach the Integer rule
+        rule %r/(\-|\+)?([0-9]+\.[0-9]*|\.[0-9]+)/, Num::Float
 
         # PDF Integer object
         rule %r/(\-|\+)?[0-9]+/, Num::Integer
@@ -97,8 +96,10 @@ def self.detect?(text)
 
       # PDF literal string. See ISO 32000-2:2020 clause 7.3.4.2 and Table 3
       state :stringliteral do
-        rule %r/\(/, Str, :stringliteral     # recursive for internal balanced(!) literal strings
+        rule %r/\(/, Str, :stringliteral             # recursive for internal bracketed strings
+        rule %r/\\\(/, Str::Escape                   # escaped "(" is literal text: must not nest
         rule %r/\)/, Str, :pop!
+        rule %r/\\\)/, Str::Escape                   # escaped ")" is literal text: must not close
         rule %r/\\([0-7]{3}|n|r|t|b|f|\\)/, Str::Escape
         rule %r/[^\(\)\\]+/, Str
       end

From a8e8c8b0fbadcd2262eae8376fa3566273a1cc77 Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:05:41 +1000
Subject: [PATCH 8/9] More complex PDF for visual test

---
 spec/visual/samples/pdf | 58 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 spec/visual/samples/pdf

diff --git a/spec/visual/samples/pdf b/spec/visual/samples/pdf
new file mode 100644
index 0000000000..40bdca1e75
--- /dev/null
+++ b/spec/visual/samples/pdf
@@ -0,0 +1,58 @@
+%PDF-1.7
+%©©
+1 0 obj
+<</Type/Catalog/MarkInfo<<%comment after dictionary start
+/Marked true/Suspects true%comment after a boolean
+/UserProperties true>>/StructTreeRoot null/AA<</WP<</S/JavaScript/JS(//JavaScript comment
+app.alert\( "Document Will-Print Action!!"\))>>>>/Pages 3 0 R>>%comment after dictionary close
+endobj
+2 0 obj
+null%comment after null
+endobj
+3 0 obj
+<</FakeBigDataArray[true[[[]]]true<686931>null<686932>null[/Dummy](hi3)[(hi4)(hi5)true(hi6)null(hi7)12(hi8)]-1.<</ABC +.123/DEF +.0>>[](hi99)[]null[]<</DEF null>>true<</GHI/JKL>>[<</MNO +.0>>]<686933>1 0 R[.1 -2 +.3]6 0 R<686934>4 0 R(hi9)2 0 R<</QRS true>>[true]<</TUV true>><686935><</XYZ true>>3 0 R<</AAB true>>(hi10)<</AAC true>>null<686936>true(hi11)<686937>(hi12)+.0<686938>]
+/Type/Pages/Count 1/Kids[4 0 R%comment after indirect ref
+]>>endobj
+4 0 obj
+<</Type/Page/Parent 3 0 R/MediaBox[%comment after array start 
++0 .0 999 999.]%comment after array end token
+/CropBox[+0 .0 999%comment after an integer
+999.]/Contents[5 0 R]/UserUnit +0.88
+/Resources<</Pattern<<>>/ProcSet[null]/ExtGState<</ 6 0 R>>/Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Times-Bold/Encoding/WinAnsiEncoding>>>>>>>>
+endobj
+5 0 obj
+<</Length 757 >>
+stream
+BX /BreakMyParser <</FakeBigDataArray[true[[[]]]true<686931>null<686932>null[/Dummy](hi3)[(hi4)(hi5)true(hi6)null(hi7)12(hi8)]-1.<</ABC +.123/DEF +.0>>[](hi99)[]null[]<</DEF null>>true<</GHI/JKL>>[<</MNO +.0>>]<686933>[1 2 3]<686934>(hi9)<</QRS true>>[true]<</TUV true>><686935><</XYZ true>><</AAB true>>(hi10)<</AAC true>>null<686936>true(hi11)<686937>(hi12)+.0<686938>]>> DP EX
+BT/F1 30 Tf 0 Tr 1 0 0 1 10 950 Tm(PDF Ruby Rouge test file)Tj 1 0 0 1 10 900 Tm 
+(This file must NOT be resaved or modified by any tool!!)Tj ET% 3 colored vector graphic squares that are clipped
+/ gs q 40 w 75 75 400 400 re W S % stroke then clip a path with a wide black border
+1 0. .0 rg 75 75 200 200 re f 0 1 0 rg 275 75 200 200 re f .0 0 1 rg 275 275 200 200 re f Q
+endstream
+endobj
+6 0 obj<</Type/ExtGState/ca 0.33/CA 0.66%comment after a real
+>>
+endobj
+7 0 obj
+<</Subject(Compacted Syntax v3.0)%comment after literal string end
+/Title<436f6d7061637465642073796e746178>%comment after hex string end
+/Keywords(PDF,Compacted,Syntax,ISO 32000-2:2020)/CreationDate(D:20200317)/Author(Peter Wyatt)/Creator< 48616e
+642d65646974>/Producer<48616e 6 4 2 d 6 5646974>>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000017 00000 n
+0000000332 00000 n
+0000000374 00000 n
+0000000837 00000 n
+0000001198 00000 n
+0000002009 00000 n
+0000002084 00000 n
+trailer
+<</Root 1 0 R/Info%comment after name
+7 0 R/ID[<18D6B6412
+45C033A6E67D93AD879D6EC><18D 6B 641245C033A6E67D93AD879D6EC>]/Size 8>>
+startxref
+  2403
+%%EOF
\ No newline at end of file

From 643179c07907953272dfdb493bed06436da2047f Mon Sep 17 00:00:00 2001
From: petervwyatt <26521615+petervwyatt@users.noreply.github.com>
Date: Fri, 5 Jul 2024 10:58:09 +1000
Subject: [PATCH 9/9] Added EOL to last line of PDF

Added EOL to last line of PDF to pass linelint CI check used by Rouge. This is not required by real PDF files.
---
 lib/rouge/demos/pdf     | 2 +-
 spec/visual/samples/pdf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/rouge/demos/pdf b/lib/rouge/demos/pdf
index 849cf59d52..9c8c326987 100644
--- a/lib/rouge/demos/pdf
+++ b/lib/rouge/demos/pdf
@@ -26,4 +26,4 @@ trailer
 <</Root 1 0 R/Size 5/ID[<18D6B641245C03F28E67D93AD879D6EC><18D6B641245C03F28E67D93AD879D6EC>]>>
 startxref
 371
-%%EOF
\ No newline at end of file
+%%EOF
diff --git a/spec/visual/samples/pdf b/spec/visual/samples/pdf
index 40bdca1e75..10da023287 100644
--- a/spec/visual/samples/pdf
+++ b/spec/visual/samples/pdf
@@ -55,4 +55,4 @@ trailer
 45C033A6E67D93AD879D6EC><18D 6B 641245C033A6E67D93AD879D6EC>]/Size 8>>
 startxref
   2403
-%%EOF
\ No newline at end of file
+%%EOF