diff --git a/lib/rouge/demos/pdf b/lib/rouge/demos/pdf new file mode 100644 index 0000000000..9c8c326987 --- /dev/null +++ b/lib/rouge/demos/pdf @@ -0,0 +1,29 @@ +%PDF-1.6 +%©©©© + +1 0 obj<>>> +endobj +2 0 obj<> +endobj +3 0 obj<>>> +endobj +4 0 obj<> +stream + +8 w 1 j + 1.0 0 0 rg + 0 0 1 RG + 10 10 180 180 re B +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000021 00000 n +0000000113 00000 n +0000000165 00000 n +0000000261 00000 n +trailer +<<18D6B641245C03F28E67D93AD879D6EC>]>> +startxref +371 +%%EOF diff --git a/lib/rouge/lexers/pdf.rb b/lib/rouge/lexers/pdf.rb new file mode 100644 index 0000000000..8b38cc89e0 --- /dev/null +++ b/lib/rouge/lexers/pdf.rb @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true +# vim: set ts=2 sw=2 et: + +# PDF = Portable Document Format page description language +# As defined by ISO 32000-2:2020 including resolved errata from https://pdf-issues.pdfa.org/ +# +# The PDF syntax is also known as "COS" and can be used with FDF (Forms Data Format) files as +# per ISO 32000-2:2020 clause 12.7.8. +# +# This is a token-based parser ONLY! It is intended to syntax highlight full or partial fragments +# of nicely written, handwritten PDF syntax in documentation such as ISO specifications. It is NOT +# intended to cope with real-world PDFs that will contain arbitrary binary data (that form invalid +# UTF-8 sequences and generate "ArgumentError: invalid byte sequence in UTF-8" Ruby errors) and +# other types of malformations or syntax errors. +# +# Author: Peter Wyatt, CTO, PDF Association. 2024 +# +module Rouge + module Lexers + class Pdf < RegexLexer + title "PDF" + desc "PDF - Portable Document Format (ISO 32000)" + tag 'pdf' + aliases "fdf", 'cos' + filenames '*.pdf', '*.fdf' + mimetypes 'application/pdf', 'application/fdf' # IANA registered media types + + # PDF and FDF files must start with "%PDF-x.y" or "%FDF-x.y" + # where x is the single digit major version and y is the single digit minor version.
+ def self.detect?(text) + return true if /^%(P|F)DF-\d.\d/ =~ text + end + + # PDF Delimiters (ISO 32000-2:2020, Table 1 and Table 2). + # Ruby whitespace "\s" is /[ \t\r\n\f\v]/ which does not include NUL (ISO 32000-2:2020, Table 1). + # PDF also support 2 character EOL sequences. + + state :root do + # Start-of-file header comment is special (comment is up to EOL) + rule %r/^%(P|F)DF-\d\.\d.*$/, Comment::Preproc + + # End-of-file marker comment is special (comment is up to EOL) + rule %r/^%%EOF.*$/, Comment::Preproc + + # PDF only has single-line comments: from "%" to EOL + rule %r/%.*$/, Comment::Single + + # PDF Boolean and null object keywords + rule %r/(false|true|null)/, Keyword::Constant + + # PDF Dictionary and array object start and end tokens + rule %r/(<<|>>|\[|\])/, Punctuation + + # PDF Hex string - can contain whitespace and span multiple lines. + # This rule must be after "<<"/">>" + rule %r/<[0-9A-Fa-f\s]*>/m, Str::Other + + # PDF literal strings are complex (multi-line, escapes, etc.). Use separate state machine. + rule %r/\(/, Str, :stringliteral + + # PDF Name objects - can be empty (i.e., nothing after "/"). + # No special processing required for 2-digit hex codes that start with "#". + rule %r/\/[^\(\)<>\[\]\/%\s]*/, Name::Other + + # PDF objects and stream (no checking of object ID) + # Note that object number and generation numbers do not have sign. + rule %r/\d+\s\d+\sobj/, Keyword::Declaration + rule %r/(endstream|endobj|stream)/, Keyword::Declaration + + # PDF conventional file layout keywords + rule %r/(startxref|trailer|xref)/, Keyword::Declaration + + # PDF cross reference section entries (20 bytes including EOL). + # Explicit single SPACE separators. + rule %r/^\d{10} \d{5} (n|f)\s*$/, Keyword::Namespace + + # PDF Indirect reference (lax, allows zero as the object number). 
+ # Requires terminating delimiter lookahead to disambiguate from "RG" operator + rule %r/\d+\s\d+\sR(?=[\(\)<>\[\]\/%\s])/, Name::Decorator + + # PDF Real object + rule %r/(\-|\+)?([0-9]+\.?|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)/, Num::Float + + # PDF Integer object + rule %r/(\-|\+)?[0-9]+/, Num::Integer + + # A run of non-delimiters is most likely a PDF content stream + # operator (ISO 32000-2:2020, Annex A). + rule %r/[^\(\)<>\[\]\/%\s]+/, Operator::Word + + # Whitespace (except inside strings and comments) is ignored = /[ \t\r\n\f\v]/. + # Ruby doesn't include NUL as whitespace (vs ISO 32000-2:2020 Table 1) + rule %r/\s+/, Text::Whitespace + end + + # PDF literal string. See ISO 32000-2:2020 clause 7.3.4.2 and Table 3 + state :stringliteral do + rule %r/\(/, Str, :stringliteral # recursive for internal bracketed strings + rule %r/\\\(/, Str::Escape, :stringliteral # recursive for internal escaped bracketed strings + rule %r/\)/, Str, :pop! + rule %r/\\\)/, Str::Escape, :pop! + rule %r/\\([0-7]{3}|n|r|t|b|f|\\)/, Str::Escape + rule %r/[^\(\)\\]+/, Str + end + end + end +end diff --git a/spec/lexers/pdf_spec.rb b/spec/lexers/pdf_spec.rb new file mode 100644 index 0000000000..9fbb001327 --- /dev/null +++ b/spec/lexers/pdf_spec.rb @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true + +describe Rouge::Lexers::Pdf do + let(:subject) { Rouge::Lexers::Pdf.new } + + describe 'guessing' do + include Support::Guessing + + it 'guesses by filename' do + assert_guess :filename => 'foo.pdf' + assert_guess :filename => 'foo.fdf' + end + + it 'guesses by mimetype' do + assert_guess :mimetype => 'application/pdf' + assert_guess :mimetype => 'application/fdf' + end + + it 'guesses by source' do + assert_guess :source => '%PDF-1.6' + assert_guess :source => '%PDF-2.0' + assert_guess :source => '%PDF-0.3' # Fake PDF version + assert_guess :source => '%PDF-6.8' # Fake PDF version + assert_guess :source => '%FDF-1.2' + end + end + +end diff --git 
a/spec/visual/samples/pdf b/spec/visual/samples/pdf new file mode 100644 index 0000000000..10da023287 --- /dev/null +++ b/spec/visual/samples/pdf @@ -0,0 +1,58 @@ +%PDF-1.7 +%©© +1 0 obj +<>/StructTreeRoot null/AA<>>>/Pages 3 0 R>>%comment after dictionary close +endobj +2 0 obj +null%comment after null +endobj +3 0 obj +<null<686932>null[/Dummy](hi3)[(hi4)(hi5)true(hi6)null(hi7)12(hi8)]-1.<>[](hi99)[]null[]<>true<>[<>]<686933>1 0 R[.1 -2 +.3]6 0 R<686934>4 0 R(hi9)2 0 R<>[true]<><686935><>3 0 R<>(hi10)<>null<686936>true(hi11)<686937>(hi12)+.0<686938>] +/Type/Pages/Count 1/Kids[4 0 R%comment after indirect ref +]>>endobj +4 0 obj +<>/ProcSet[null]/ExtGState<>/Font<>>>>>>> +endobj +5 0 obj +<> +stream +BX /BreakMyParser <null<686932>null[/Dummy](hi3)[(hi4)(hi5)true(hi6)null(hi7)12(hi8)]-1.<>[](hi99)[]null[]<>true<>[<>]<686933>[1 2 3]<686934>(hi9)<>[true]<><686935><><>(hi10)<>null<686936>true(hi11)<686937>(hi12)+.0<686938>]>> DP EX +BT/F1 30 Tf 0 Tr 1 0 0 1 10 950 Tm(PDF Ruby Rouge test file)Tj 1 0 0 1 10 900 Tm +(This file must NOT be resaved or modified by any tool!!)Tj ET% 3 colored vector graphic squares that are clipped +/ gs q 40 w 75 75 400 400 re W S % stroke then clip a path with a wide black border +1 0. .0 rg 75 75 200 200 re f 0 1 0 rg 275 75 200 200 re f .0 0 1 rg 275 275 200 200 re f Q +endstream +endobj +6 0 obj<> +endobj +7 0 obj +<%comment after hex string end +/Keywords(PDF,Compacted,Syntax,ISO 32000-2:2020)/CreationDate(D:20200317)/Author(Peter Wyatt)/Creator< 48616e +642d65646974>/Producer<48616e 6 4 2 d 6 5646974>>> +endobj +xref +0 8 +0000000000 65535 f +0000000017 00000 n +0000000332 00000 n +0000000374 00000 n +0000000837 00000 n +0000001198 00000 n +0000002009 00000 n +0000002084 00000 n +trailer +<<18D 6B 641245C033A6E67D93AD879D6EC>]/Size 8>> +startxref + 2403 +%%EOF