diff --git a/CHANGELOG.md b/CHANGELOG.md
index f7ff74a..9eb5cb3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,9 @@
*Mike Dalessio*
+* `FullSanitizer#sanitize` now accepts the option `preserve_whitespace: true` to keep whitespace around block elements and line break elements.
+
+ *Earlopain*
## 1.5.0 / 2023-01-20
diff --git a/README.md b/README.md
index 059368e..a043095 100644
--- a/README.md
+++ b/README.md
@@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit
full_sanitizer = Rails::HTML5::FullSanitizer.new
full_sanitizer.sanitize("Bold no more! See more here...")
# => Bold no more! See more here...
-```
-or, if you insist on parsing the content as HTML4:
+# Whitespace is swallowed by default. If whitespace is significant, pass `preserve_whitespace: true` to keep it.
+# This option is slower, but it is clever about whitespace around block elements and line break elements.
-```ruby
-full_sanitizer = Rails::HTML4::FullSanitizer.new
-full_sanitizer.sanitize("Bold no more! See more here...")
-# => Bold no more! See more here...
+full_sanitizer = Rails::HTML5::FullSanitizer.new
+full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+# => \nParagraphs\n and \n newlines
```
-HTML5 version:
-
-
-
#### LinkSanitizer
```ruby
diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb
index dfbdb1d..632bd35 100644
--- a/lib/rails/html/sanitizer.rb
+++ b/lib/rails/html/sanitizer.rb
@@ -66,6 +66,19 @@ def parse_fragment(html)
end if Rails::HTML::Sanitizer.html5_support?
end
+ module Sanitizer
+   # Layers an opt-in +preserve_whitespace+ mode on top of an existing +sanitize+.
+   #
+   # The including class must provide +parse_fragment+ and an ancestor
+   # +sanitize+ that handles the default case (option absent or falsy).
+   module PreserveWhitespace
+     # Sanitizes +html+, optionally keeping whitespace.
+     #
+     # html    - the markup to sanitize; nil (or false) short-circuits to nil.
+     # options - Hash of options. When +:preserve_whitespace+ is truthy the
+     #           parsed fragment is converted with +to_text+; otherwise the
+     #           call is delegated unchanged to +super+.
+     def sanitize(html, options = {})
+       return unless html
+       return super unless options[:preserve_whitespace]
+
+       fragment = parse_fragment(html)
+       fragment.to_text
+     end
+   end
+ end
+
module Scrubber
module Full
def scrub(fragment, options = {})
@@ -217,11 +230,20 @@ module HTML4
# full_sanitizer.sanitize("Bold no more! See more here...")
# # => "Bold no more! See more here..."
#
+ # === Options
+ #
+ # If whitespace is significant you can pass preserve_whitespace: true.
+ # This option is slower, but is clever about whitespace around block elements and line break elements.
+ #
+ # full_sanitizer = Rails::HTML4::FullSanitizer.new
+ # full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+ # # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML4
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
+ include HTML::Concern::Sanitizer::PreserveWhitespace # must come after the concern defining the default #sanitize, which it reaches via super
end
# == Rails::HTML4::LinkSanitizer
@@ -307,11 +329,20 @@ module HTML5
# full_sanitizer.sanitize("Bold no more! See more here...")
# # => "Bold no more! See more here..."
#
+ # === Options
+ #
+ # If whitespace is significant you can pass preserve_whitespace: true.
+ # This option is slower, but is clever about whitespace around block elements and line break elements.
+ #
+ # full_sanitizer = Rails::HTML5::FullSanitizer.new
+ # full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+ # # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML5
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
+ include HTML::Concern::Sanitizer::PreserveWhitespace # must come after the concern defining the default #sanitize, which it reaches via super
end
# == Rails::HTML5::LinkSanitizer
diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb
index 3cde41a..f9583bc 100644
--- a/test/sanitizer_test.rb
+++ b/test/sanitizer_test.rb
@@ -80,37 +80,41 @@ module FullSanitizerTest
def test_strip_tags_with_quote
input = '<" hi'
- result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{<" hi},
# other libxml2
%{ hi},
+ # preserve_whitespace: true
+ "<" hi",
]
- assert_includes(acceptable_results, result)
+ assert_full_sanitized(acceptable_results, input)
end
def test_strip_invalid_html
- assert_equal "<<", full_sanitize("<<This is a test.\n\n\n\nIt no longer contains any HTML.
\n}
+ acceptable_results = [
+ %{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
+ # preserve_whitespace: true
+ %{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
+ ]
- assert_equal expected, full_sanitize(input)
+ assert_full_sanitized acceptable_results, input
end
def test_remove_unclosed_tags
input = "This is <-- not\n a comment here."
- result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{This is <-- not\n a comment here.},
@@ -118,12 +122,11 @@ def test_remove_unclosed_tags
%{This is },
]
- assert_includes(acceptable_results, result)
+ assert_full_sanitized(acceptable_results, input)
end
def test_strip_cdata
input = "This has a ]]> here."
- result = full_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has a <![CDATA[]]> here.},
@@ -133,51 +136,68 @@ def test_strip_cdata
%{This has a here.},
]
- assert_includes(acceptable_results, result)
+ assert_full_sanitized(acceptable_results, input)
end
def test_strip_blank_string
assert_nil full_sanitize(nil)
- assert_equal "", full_sanitize("")
- assert_equal " ", full_sanitize(" ")
+ assert_nil full_sanitize(nil, preserve_whitespace: true) # nil must short-circuit in the opt-in mode as well
+ assert_full_sanitized "", "" # the helper checks both the default and the preserve_whitespace mode
+ assert_full_sanitized " ", " "
end
def test_strip_tags_with_plaintext
- assert_equal "Don't touch me", full_sanitize("Don't touch me")
+ assert_full_sanitized "Don't touch me", "Don't touch me"
end
def test_strip_tags_with_tags
- assert_equal "This is a test.", full_sanitize("This is a test.
")
+ assert_full_sanitized "This is a test.", "This is a test."
end
def test_escape_tags_with_many_open_quotes
- assert_equal "<<", full_sanitize("<<")
+ assert_full_sanitized "<<", "<<"
end
def test_strip_tags_with_sentence
- assert_equal "This is a test.", full_sanitize("This is a test.")
+ assert_full_sanitized "This is a test.", "This is a test."
end
def test_strip_tags_with_comment
- assert_equal "This has a here.", full_sanitize("This has a here.")
+ assert_full_sanitized "This has a here.", "This has a here."
end
def test_strip_tags_with_frozen_string
- assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
+ assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
end
def test_full_sanitize_respect_html_escaping_of_the_given_string
- assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
- assert_equal "&", full_sanitize("&")
- assert_equal "&", full_sanitize("&")
- assert_equal "&", full_sanitize("&")
- assert_equal "omg <script>BOM</script>", full_sanitize("omg <script>BOM</script>")
+ assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
+ assert_full_sanitized "&", "&"
+ assert_full_sanitized "&", "&"
+ assert_full_sanitized "&", "&"
+ assert_full_sanitized "omg <script>BOM</script>", "omg <script>BOM</script>"
+ end
+
+ def test_full_sanitize_preserve_whitespace
+   # <p> is a block element and <br> a line break element; with
+   # preserve_whitespace each is expected to surface as a newline in the text.
+   input = "<p>Paragraphs</p> and <br> newlines"
+   expected = "\nParagraphs\n and \n newlines"
+
+   assert_equal expected, full_sanitize(input, preserve_whitespace: true)
+ end
+
+ def test_full_sanitize_preserve_whitespace_ascii_8bit_string
+   # Exercise the preserve_whitespace path itself (the option was previously
+   # not passed, so only the default path was covered): binary-encoded input
+   # must still come back as UTF-8 text.
+   sanitized = full_sanitize("hello".encode("ASCII-8BIT"), preserve_whitespace: true)
+
+   assert_equal "hello", sanitized
+   assert_equal Encoding::UTF_8, sanitized.encoding
+ end
protected
def full_sanitize(input, options = {})
module_under_test::FullSanitizer.new.sanitize(input, options)
end
+
+ # Asserts that sanitizing +input+ yields one of +acceptable_results+, both in
+ # the default mode and with preserve_whitespace: true.
+ #
+ # acceptable_results - a single expected value or an Array of expected values.
+ # input              - the string handed to full_sanitize.
+ def assert_full_sanitized(acceptable_results, input)
+   acceptable = Array(acceptable_results)
+
+   [{}, { preserve_whitespace: true }].each do |options|
+     # Name the mode in the failure message; otherwise the two checks are
+     # indistinguishable when one of them fails.
+     assert_includes(acceptable, full_sanitize(input, options), "sanitized with options #{options.inspect}")
+   end
+ end
end
class HTML4FullSanitizerTest < Minitest::Test