From 52d4885e2c605f2c5a0caf9f0cfe589badda4be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Wed, 20 May 2020 01:18:22 +0200 Subject: [PATCH] Initial commit --- .circleci/config.yml | 94 ++++ .editorconfig | 9 + .gitignore | 9 + LICENSE | 202 ++++++++ Makefile | 56 +++ README.md | 128 +++++ scripts/generate-docs.sh | 18 + shard.yml | 13 + spec/html_sanitizer/basic.hrx | 70 +++ spec/html_sanitizer/class.hrx | 34 ++ spec/html_sanitizer/combined_policies.hrx | 42 ++ spec/html_sanitizer/combined_policies_spec.cr | 11 + spec/html_sanitizer/default.hrx | 138 +++++ spec/html_sanitizer/html_sanitizer_spec.cr | 102 ++++ spec/html_sanitizer/img.hrx | 46 ++ spec/html_sanitizer/links.hrx | 89 ++++ .../protocol-based-javascript.hrx | 160 ++++++ spec/html_sanitizer/protocol_javascript.hrx | 67 +++ spec/html_sanitizer/url_spec.cr | 8 + spec/html_sanitizer/xss.hrx | 475 ++++++++++++++++++ spec/spec_helper.cr | 1 + spec/support/hrx.cr | 83 +++ spec/text_policy.hrx | 66 +++ spec/text_policy_spec.cr | 17 + spec/uri_sanitizer_spec.cr | 113 +++++ src/adapter/libxml2.cr | 137 +++++ src/policy.cr | 45 ++ src/policy/html_sanitizer.cr | 348 +++++++++++++ src/policy/html_sanitizer/safelist.cr | 70 +++ src/policy/text.cr | 23 + src/policy/whitelist.cr | 57 +++ src/processor.cr | 110 ++++ src/sanitize.cr | 5 + src/uri_sanitizer.cr | 91 ++++ 34 files changed, 2937 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100755 scripts/generate-docs.sh create mode 100644 shard.yml create mode 100644 spec/html_sanitizer/basic.hrx create mode 100644 spec/html_sanitizer/class.hrx create mode 100644 spec/html_sanitizer/combined_policies.hrx create mode 100644 spec/html_sanitizer/combined_policies_spec.cr create mode 100644 spec/html_sanitizer/default.hrx create mode 100644 spec/html_sanitizer/html_sanitizer_spec.cr create mode 100644 spec/html_sanitizer/img.hrx create mode 100644 spec/html_sanitizer/links.hrx create mode 100644 spec/html_sanitizer/protocol-based-javascript.hrx create mode 100644 spec/html_sanitizer/protocol_javascript.hrx create mode 100644 spec/html_sanitizer/url_spec.cr create mode 100644 spec/html_sanitizer/xss.hrx create mode 100644 spec/spec_helper.cr create mode 100644 spec/support/hrx.cr create mode 100644 spec/text_policy.hrx create mode 100644 spec/text_policy_spec.cr create mode 100644 spec/uri_sanitizer_spec.cr create mode 100644 src/adapter/libxml2.cr create mode 100644 src/policy.cr create mode 100644 src/policy/html_sanitizer.cr create mode 100644 src/policy/html_sanitizer/safelist.cr create mode 100644 src/policy/text.cr create mode 100644 src/policy/whitelist.cr create mode 100644 src/processor.cr create mode 100644 src/sanitize.cr create mode 100644 src/uri_sanitizer.cr diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..df9b752 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,94 @@ +version: 2 + +dry: + restore_shards_cache: &restore_shards_cache + keys: + - shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} + - shards-cache-v1-{{ .Branch }} + - shards-cache-v1 + + save_shards_cache: &save_shards_cache + key: shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} + paths: + - ./shards-cache + +jobs: + test: + docker: + - image: crystallang/crystal:latest + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - 
restore_cache: *restore_shards_cache + - run: shards + - save_cache: *save_shards_cache + + - run: make test + + - run: crystal tool format --check spec src + + deploy-docs: + docker: + - image: crystallang/crystal:latest + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - run: scripts/generate-docs.sh + + - run: apt update && apt install -y curl rsync + - run: + command: curl https://raw.githubusercontent.com/straight-shoota/autodeploy-docs/master/autodeploy-docs.sh | bash + environment: + GIT_COMMITTER_NAME: cirlceci + GIT_COMMITTER_EMAIL: circle@circleci.com + + test-on-nightly: + docker: + - image: crystallang/crystal:nightly + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - restore_cache: *restore_shards_cache + - run: shards + + - run: make test + + - run: crystal tool format --check spec src + +workflows: + version: 2 + # Run tests on every single commit + ci: + jobs: + - test + # Build and depoy docs only on master branch + - deploy-docs: + requires: + - test + filters: &master-only + branches: + only: + - master + # Run tests every night using crystal nightly + nightly: + triggers: + - schedule: + cron: "0 4 * * *" + filters: + branches: + only: + - master + jobs: + - test-on-nightly diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..163eb75 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*.cr] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bbd4a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +/docs/ +/lib/ +/bin/ +/.shards/ +*.dwarf + +# Libraries don't need dependency lock +# Dependencies will be locked in applications that use them +/shard.lock diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b88d98d --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +-include Makefile.local # for optional local options + +BUILD_TARGET ::= bin/app + +# The shards command to use +SHARDS ?= shards +# The crystal command to use +CRYSTAL ?= crystal + +SRC_SOURCES ::= $(shell find src -name '*.cr' 2>/dev/null) +LIB_SOURCES ::= $(shell find lib -name '*.cr' 2>/dev/null) +SPEC_SOURCES ::= $(shell find spec -name '*.cr' 2>/dev/null) + +.PHONY: test +test: ## Run the test suite +test: lib + $(CRYSTAL) spec + +.PHONY: format +format: ## Apply source code formatting +format: $(SRC_SOURCES) $(SPEC_SOURCES) + $(CRYSTAL) tool format src spec + +docs: ## Generate API docs +docs: $(SRC_SOURCES) lib + $(CRYSTAL) docs -o docs + +lib: shard.lock + $(SHARDS) install + # Touch is necessary because `shards install` always touches shard.lock + touch lib + +shard.lock: shard.yml + $(SHARDS) update + +.PHONY: clean +clean: ## Remove application binary +clean: + @rm -f $(BUILD_TARGET) + +.PHONY: help +help: ## Show this help + @echo + @printf '\033[34mtargets:\033[0m\n' + @grep -hE '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ + sort |\ + awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + @echo + @printf '\033[34moptional variables:\033[0m\n' + @grep -hE '^[a-zA-Z_-]+ \?=.*?## .*$$' $(MAKEFILE_LIST) |\ + sort |\ + awk 'BEGIN {FS = " \\?=.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + @echo + @printf '\033[34mrecipes:\033[0m\n' + @grep -hE '^##.*$$' $(MAKEFILE_LIST) |\ + awk 'BEGIN {FS = "## "}; /^## [a-zA-Z_-]/ {printf " \033[36m%s\033[0m\n", $$2}; /^## / {printf " %s\n", $$2}' diff --git a/README.md b/README.md new file mode 100644 index 0000000..fdca90d --- /dev/null +++ b/README.md @@ -0,0 +1,128 @@ +# sanitize + +`sanitize` is a Crystal library for transforming HTML/XML trees. It's primarily +used to sanitize HTML from untrusted sources in order to prevent +[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting) and other +adversities. + +It builds on stdlib's [`XML`](https://crystal-lang.org/api/XML.html) module to +parse HTML/XML. Based on [libxml2](http://xmlsoft.org/) it's a solid parser and +turns malformed and malicious input into valid and safe markup. + +* Code: [https://github.com/straight-shoota/sanitize](https://github.com/straight-shoota/sanitize) +* API docs: [https://straight-shoota.github.io/sanitize/api/latest/](https://straight-shoota.github.io/sanitize/api/latest/) +* Issue tracker: [https://github.com/straight-shoota/sanitize/issues](https://github.com/straight-shoota/sanitize/issues) +* Shardbox: [https://shardbox.org/shards/sanitize](https://shardbox.org/shards/sanitize) + +## Installation + +1. Add the dependency to your `shard.yml`: + + ```yaml + dependencies: + sanitize: + github: straight-shoota/sanitize + ``` + +2. Run `shards install` + +## Sanitization Features + +The `Sanitize::Policy::HTMLSanitizer` policy applies the following sanitization steps. Except +for the first one (which is essential to the entire process), all can be disabled +or configured. + +* Turns malformed and malicious HTML into valid and safe markup. +* Strips HTML elements and attributes not included in the safe list. +* Sanitizes URL attributes (like `href` or `src`) with customizable sanitization + policy. +* Adds `rel="nofollow"` to all links and `rel="noopener"` to links with `target`. +* Validates values of accepted attributes `align`, `width` and `height`. 
+* Filters `class` attributes based on a whitelist (by default all classes are + rejected). + +## Usage + +Transformation is based on rules defined by `Sanitize::Policy` implementations. + +The recommended standard policy for HTML sanitization is `Sanitize::Policy::HTMLSanitizer.common` +which represents good defaults for most use cases. +It sanitizes user input against a known safe list of accepted elements and their +attributes. + +```crystal +require "sanitize" + +sanitizer = Sanitize::Policy::HTMLSanitizer.common +sanitizer.process(%(foo)) # => %(foo) +sanitizer.process(%(

foo

)) # => %(

foo

) +sanitizer.process(%()) # => %() +sanitizer.process(%(
foobar
)) # => %(
foobar
)
+```
+
+Sanitization should always run after any other processing (for example rendering
+Markdown) and is a must when including HTML from untrusted sources into a web
+page.
+
+### With Markd
+
+A typical format for user-generated content is `Markdown`. Even though it has
+only a very limited feature set compared to HTML, it can still produce
+potentially harmful HTML and it is usually possible to embed raw HTML directly.
+So sanitization is necessary.
+
+The most common Markdown renderer is [markd](https://shardbox.org/shards/markd),
+so here is a sample of how to use it with `sanitize`:
+
+````crystal
+sanitizer = Sanitize::Policy::HTMLSanitizer.common
+# Allow classes with `language-` prefix which are used for syntax highlighting.
+sanitizer.valid_classes << /language-.+/
+
+markdown = <<-MD
+  Sanitization with [https://shardbox.org/shards/sanitize](sanitize) is not that
+  **difficult**.
+  ```cr
+  puts "Hello World!"
+  ```
+  

Hello world!

+ MD + +html = Markd.to_html(markdown) +sanitized = sanitizer.process(html) +puts sanitized +```` + +The result: + +```html +

Sanitization with https://shardbox.org/shards/sanitize is not that +difficult.

+
puts "Hello World!"
+
+

Hello world!

+``` + +## Limitations + +Sanitizing CSS is not supported. Thus `style` attributes can't be accepted in a +safe way. +CSS sanitization features may be added when a CSS parsing library is available. + +## Security + +If you want to privately disclose security-issues, please contact +[straightshoota](https://keybase.io/straightshoota) on Keybase or +[straightshoota@gmail.com](mailto:straightshoota@gmail.com) (PGP: `DF2D C9E9 FFB9 6AE0 2070 D5BC F0F3 4963 7AC5 087A`). + +## Contributing + +1. Fork it ([https://github.com/straight-shoota/sanitize/fork](https://github.com/straight-shoota/sanitize/fork)) +2. Create your feature branch (`git checkout -b my-new-feature`) +3. Commit your changes (`git commit -am 'Add some feature'`) +4. Push to the branch (`git push origin my-new-feature`) +5. Create a new Pull Request + +## Contributors + +- [Johannes Müller](https://github.com/straight-shoota) - creator and maintainer diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh new file mode 100755 index 0000000..5dbaf34 --- /dev/null +++ b/scripts/generate-docs.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +set -e + +GENERATED_DOCS_DIR="./docs" + +echo -e "Building docs into ${GENERATED_DOCS_DIR}" +echo -e "Clearing ${GENERATED_DOCS_DIR} directory" +rm -rf "${GENERATED_DOCS_DIR}" + +echo -e "Running \`make docs\`..." +make docs + +echo -e "Copying README.md" + +# "{{" and "{%"" need to be escaped, otherwise Jekyll might interpret the expressions (on Github Pages) +ESCAPE_TEMPLATE='s/{{/{{"{{"}}/g; s/{\%/{{"{%"}}/g;' +sed "${ESCAPE_TEMPLATE}" README.md > "${GENERATED_DOCS_DIR}/README.md" diff --git a/shard.yml b/shard.yml new file mode 100644 index 0000000..92498c0 --- /dev/null +++ b/shard.yml @@ -0,0 +1,13 @@ +name: sanitize +version: 0.1.0 + +authors: + - Johannes Müller + +crystal: 0.34.0 + +license: Apache-2.0 + +development_dependencies: + hrx: + github: straight-shoota/hrx diff --git a/spec/html_sanitizer/basic.hrx b/spec/html_sanitizer/basic.hrx new file mode 100644 index 0000000..e2c1047 --- /dev/null +++ b/spec/html_sanitizer/basic.hrx @@ -0,0 +1,70 @@ +<===> empty/document.html +<===> + + +<===> pending:skeleton/document.html + + + + + +<===> + + +<===> invalid/fragment.html +foo

bar

bazz
quux
+<===> invalid/common.html +foo

bar

bazz
quux
+<===> + + + +<===> invalid-div/fragment.html +foo

bar

bazz
quux
+<===> invalid-div/common.html +foo

bar

bazz quux +<===> + + +<===> basic/fragment.html +Lorem ipsum dolor sit
amet +<===> basic/common.html +Lorem ipsum dolor sit
amet +<===> + + +<===> malformed/fragment.html +Lorem dolor sit
amet +<===> malicious/common.html +Lorem ipsum dolor sit
amet <script>alert("hello world"); +<===> + + +<===> target="_blank"/fragment.html +foo +<===> target="_blank"/common.html +foo +<===> + + +<===> percent encoded URL/fragment.html +CI Status +<===> percent encoded URL/common.html +CI Status +<===> diff --git a/spec/html_sanitizer/class.hrx b/spec/html_sanitizer/class.hrx new file mode 100644 index 0000000..897c1c2 --- /dev/null +++ b/spec/html_sanitizer/class.hrx @@ -0,0 +1,34 @@ +<===> reject/fragment.html +
+<===> reject/common.html +
+<===> reject/allow-prefix.html +
+<===> + + +<===> allow-with-prefix/fragment.html +
+<===> allow-with-prefix/common.html +
+<===> allow-with-prefix/allow-prefix.html +
+<===> + + +<===> reject-non-prefix/fragment.html +
+<===> reject-non-prefix/common.html +
+<===> reject-non-prefix/allow-prefix.html +
+<===> + + +<===> allow-explicit/fragment.html +
+<===> allow-explicit/common.html +
+<===> allow-explicit/allow-prefix.html +
+<===> diff --git a/spec/html_sanitizer/combined_policies.hrx b/spec/html_sanitizer/combined_policies.hrx new file mode 100644 index 0000000..b648ae6 --- /dev/null +++ b/spec/html_sanitizer/combined_policies.hrx @@ -0,0 +1,42 @@ +<===> basic/fragment.html +Lorem ipsum dolor sit
amet +<===> basic/text.html +Lorem ipsum dolor sit amet +<===> basic/inline.html +Lorem ipsum dolor sit amet +<===> basic/common.html +Lorem ipsum dolor sit
amet +<===> + + +<===> malformed/fragment.html +Lorem dolor sit
amet +<===> malicious/text.html +Lorem ipsum dolor sit amet <script>alert("hello world"); +<===> malicious/inline.html +Lorem ipsum dolor sit amet <script>alert("hello world"); +<===> malicious/common.html +Lorem ipsum dolor sit
amet <script>alert("hello world"); +<===> diff --git a/spec/html_sanitizer/combined_policies_spec.cr b/spec/html_sanitizer/combined_policies_spec.cr new file mode 100644 index 0000000..5751fba --- /dev/null +++ b/spec/html_sanitizer/combined_policies_spec.cr @@ -0,0 +1,11 @@ +require "../support/hrx" +require "../../src/processor" +require "../../src/policy/html_sanitizer" +require "../../src/policy/text" + +run_hrx_samples Path["./combined_policies.hrx"], { + "text" => Sanitize::Policy::Text.new, + "inline" => Sanitize::Policy::HTMLSanitizer.inline.no_links, + "basic" => Sanitize::Policy::HTMLSanitizer.basic, + "common" => Sanitize::Policy::HTMLSanitizer.common, +} diff --git a/spec/html_sanitizer/default.hrx b/spec/html_sanitizer/default.hrx new file mode 100644 index 0000000..627adb8 --- /dev/null +++ b/spec/html_sanitizer/default.hrx @@ -0,0 +1,138 @@ +<===> invalid/fragment.html +foo

bar

bazz
quux
+<===> invalid/stripped.html +foo

bar

bazz
quux
+<===> invalid/escaped.html +<invalid>foo<p>bar</p>bazz</invalid>
quux
+<===> invalid/pruned.html +
quux
+<===> + + +<===> bad_argument/fragment.html +
foo
+<===> bad_argument/stripped.html +
foo
+<===> + +<==> whitewash/fragment.html +no
foo
bar +<==> whitewash/pruned.html +
foo
+<==> + + +<===> nofollow/fragment.html +Click here +<===> nofollow/stripped.html +Click here +<===> + + +<===> nofollow-rel/fragment.html +Click here +<===> nofollow-rel/stripped.html +Click here +<===> + + +<===> unprintable/fragment.html +Lo\u2029ofah ro\u2028cks! +<===> unprintable/stripped.html +Loofah rocks! +<===> + + +<===> msword/fragment.html + + +

Foo BOLD

+<===> msword/stripped.html + + +

Foo BOLD

+<===> + + +<===> entities/fragment.html +

foo bar

+<===> + + +<===> align/fragment.html +

foo

+<===> + + +<===> align-empty/fragment.html +

foo

+<===> align-empty/common.html +

foo

+<===> + + +<===> align-invalid/fragment.html +

foo

+<===> align-invalid/common.html +

foo

+<===> diff --git a/spec/html_sanitizer/html_sanitizer_spec.cr b/spec/html_sanitizer/html_sanitizer_spec.cr new file mode 100644 index 0000000..f70a965 --- /dev/null +++ b/spec/html_sanitizer/html_sanitizer_spec.cr @@ -0,0 +1,102 @@ +require "../support/hrx" +require "../../src/policy/html_sanitizer" + +describe Sanitize::Policy::HTMLSanitizer do + it "removes invalid element" do + Sanitize::Policy::HTMLSanitizer.common.process("

foobar

").should eq "

foobar

" + end + + it "inserts whitespace for removed block tag" do + Sanitize::Policy::HTMLSanitizer.common.process("

foo

bar
baz

").should eq "

foo bar baz

" + end + + it "strips tag with invalid URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + Sanitize::Policy::HTMLSanitizer.common.process(%(foo)).should eq "foo" + end + + it "escapes URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + end + + it %(adds rel="noopener" on target="_blank") do + policy = Sanitize::Policy::HTMLSanitizer.common + policy.process(%(foo)).should eq(%(foo)) + policy.accepted_attributes["a"] << "target" + policy.process(%(foo)).should eq(%(foo)) + end + + it "doesn't leak configuration" do + policy = Sanitize::Policy::HTMLSanitizer.common + policy.accepted_attributes["p"] << "invalid" + policy.process(%(

bar

)).should eq(%(

bar

)) + Sanitize::Policy::HTMLSanitizer.common.process(%(

bar

)).should eq(%(

bar

)) + end + + describe "html scaffold" do + it "fragment" do + Sanitize::Policy::HTMLSanitizer.common.process("FOO

BAR

").should eq "FOO

BAR

" + end + + it "document" do + sanitizer = Sanitize::Policy::HTMLSanitizer.common + sanitizer.accept_tag("html") + sanitizer.accept_tag("head") + sanitizer.accept_tag("body") + sanitizer.process_document("FOO

BAR

").should eq "FOO

BAR

\n" + end + end + + describe "#transform_classes" do + it "strips classes by default" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foo bar baz"} + attributes = orig_attributes.clone + policy.transform_classes("div", attributes) + attributes.should eq Hash(String, String).new + end + + it "accepts classes" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foo bar baz"} + attributes = orig_attributes.clone + + policy.valid_classes << /fo*/ + policy.valid_classes << "bar" + policy.transform_classes("div", attributes) + attributes.should eq({"class" => "foo bar"}) + end + + it "only matches full class name" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foobar barfoo barfoobaz foo fom"} + attributes = orig_attributes.clone + + policy.valid_classes << /fo./ + policy.transform_classes("div", attributes) + attributes.should eq({"class" => "foo fom"}) + end + end + + run_hrx_samples Path["basic.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["protocol_javascript.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["links.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["xss.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["img.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["class.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + "allow-prefix" => Sanitize::Policy::HTMLSanitizer.common.tap { |sanitizer| + sanitizer.valid_classes = Set{/allowed-.+/, "explicitly-allowed"} + }, + } +end diff --git a/spec/html_sanitizer/img.hrx b/spec/html_sanitizer/img.hrx new file mode 100644 index 0000000..1fd81d0 --- /dev/null +++ b/spec/html_sanitizer/img.hrx @@ -0,0 +1,46 @@ +<===> img/fragment.html + +<===> + + +<===> img with width/fragment.html + +<===> + + +<===> img with height/fragment.html + +<===> + + +<===> img with width and height/fragment.html + +<===> + + +<===> img invalid height/fragment.html + +<===> img invalid height/common.html + +<===> + + +<===> img invalid width/fragment.html + +<===> img invalid width/common.html + +<===> + + + +<===> img invalid width and height/fragment.html + +<===> img invalid width and height/common.html + +<===> + + + +<===> img percent width and height/fragment.html + +<===> diff --git a/spec/html_sanitizer/links.hrx b/spec/html_sanitizer/links.hrx new file mode 100644 index 0000000..1047408 --- /dev/null +++ b/spec/html_sanitizer/links.hrx @@ -0,0 +1,89 @@ +<===> links/1/fragment.html + +<===> links/1/common.html + +<===> + + +<===> links/2/fragment.html + +<===> links/2/common.html + +<===> + + +<===> links/3/fragment.html + +<===> links/3/common.html + +<===> + + +<===> links/4/fragment.html + +<===> links/4/common.html + +<===> + + +<===> links/5/fragment.html + +<===> links/5/common.html + +<===> + + +<===> links/6/fragment.html + +<===> links/6/common.html + +<===> + + +<===> links/7/fragment.html + +<===> links/7/common.html + +<===> + + +<===> links/8/fragment.html + +<===> links/8/common.html + +<===> + + +<===> links/9/fragment.html + +<===> links/9/common.html + +<===> + + +<===> links/10/fragment.html + +<===> links/10/common.html + +<===> + + +<===> links/11/fragment.html +Red dot +<===> links/11/common.html +Red dot +<===> + + +<===> links/12/fragment.html + +<===> links/12/common.html + +<===> + + +<===> 
links/13/fragment.html + +<===> links/13/common.html + +<===> diff --git a/spec/html_sanitizer/protocol-based-javascript.hrx b/spec/html_sanitizer/protocol-based-javascript.hrx new file mode 100644 index 0000000..16576ea --- /dev/null +++ b/spec/html_sanitizer/protocol-based-javascript.hrx @@ -0,0 +1,160 @@ + +<===> simple, no spaces/fragment.html +foo +<===> simple, no spaces/common.html +foo +<===> simple, no spaces/restricted.html +foo +<===> simple, no spaces/basic.html +foo +<===> simple, no spaces/relaxed.html +foo + +<===> simple, spaces before/fragment.html +foo +<===> simple, spaces before/common.html +foo +<===> simple, spaces before/restricted.html +foo +<===> simple, spaces before/basic.html +foo +<===> simple, spaces before/relaxed.html +foo + +<===> simple, spaces after/fragment.html +foo +<===> simple, spaces after/common.html +foo +<===> simple, spaces after/restricted.html +foo +<===> simple, spaces after/basic.html +foo +<===> simple, spaces after/relaxed.html +foo + +<===> simple, spaces before and after/fragment.html +foo +<===> simple, spaces before and after/common.html +foo +<===> simple, spaces before and after/restricted.html +foo +<===> simple, spaces before and after/basic.html +foo +<===> simple, spaces before and after/relaxed.html +foo + +<===> preceding colon/fragment.html +foo +<===> preceding colon/common.html +foo +<===> preceding colon/restricted.html +foo +<===> preceding colon/basic.html +foo +<===> preceding colon/relaxed.html +foo + +<===> UTF-8 encoding/fragment.html +foo +<===> UTF-8 encoding/common.html +foo +<===> UTF-8 encoding/restricted.html +foo +<===> UTF-8 encoding/basic.html +foo +<===> UTF-8 encoding/relaxed.html +foo + +<===> long UTF-8 encoding/fragment.html +foo +<===> long UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding/restricted.html +foo +<===> long UTF-8 encoding/basic.html +foo +<===> long UTF-8 encoding/relaxed.html +foo + +<===> long UTF-8 encoding without semicolons/fragment.html +foo +<===> long UTF-8 encoding without semicolons/common.html +foo +<===> long UTF-8 encoding without semicolons/restricted.html +foo +<===> long UTF-8 encoding without semicolons/basic.html +foo +<===> long UTF-8 encoding without semicolons/relaxed.html +foo + +<===> hex encoding/fragment.html +foo +<===> hex encoding/common.html +foo +<===> hex encoding/restricted.html +foo +<===> hex encoding/basic.html +foo +<===> hex encoding/relaxed.html +foo + +<===> long hex encoding/fragment.html +foo +<===> long hex encoding/common.html +foo +<===> long hex encoding/restricted.html +foo +<===> long hex encoding/basic.html +foo +<===> long hex encoding/relaxed.html +foo + +<===> hex encoding without semicolons/fragment.html +foo +<===> hex encoding without semicolons/common.html +foo +<===> hex encoding without semicolons/restricted.html +foo +<===> hex encoding without semicolons/basic.html +foo +<===> hex encoding without semicolons/relaxed.html +foo + +<===> null char/fragment.html + +<===> null char/common.html +<===> null char/restricted.html +<===> null char/basic.html +<===> null char/relaxed.html +<===> invalid URL char/fragment.html + +<===> invalid URL char/common.html + +<===> invalid URL char/restricted.html + +<===> invalid URL char/basic.html + +<===> invalid URL char/relaxed.html + + +<===> spaces and entities/fragment.html + +<===> spaces and entities/common.html + +<===> spaces and entities/restricted.html + +<===> spaces and entities/basic.html + +<===> spaces and entities/relaxed.html + + +<===> protocol whitespace/fragment.html 
+ +<===> protocol whitespace/common.html + +<===> protocol whitespace/restricted.html + +<===> protocol whitespace/basic.html + +<===> protocol whitespace/relaxed.html + diff --git a/spec/html_sanitizer/protocol_javascript.hrx b/spec/html_sanitizer/protocol_javascript.hrx new file mode 100644 index 0000000..fc4b86c --- /dev/null +++ b/spec/html_sanitizer/protocol_javascript.hrx @@ -0,0 +1,67 @@ +<===> simple, no spaces/fragment.html +foo +<===> simple, no spaces/common.html +foo +<===> simple, spaces before/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> simple, spaces before/common.html +foo +<===> simple, spaces after/fragment.html +foo +<===> simple, spaces after/common.html +foo +<===> simple, spaces before and after/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> simple, spaces before and after/common.html +foo +<===> preceding colon/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> preceding colon/common.html +foo +<===> UTF-8 encoding/fragment.html +foo +<===> UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding/fragment.html +foo +<===> long UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding without semicolons/fragment.html +foo +<===> long UTF-8 encoding without semicolons/common.html +foo +<===> hex encoding/fragment.html +foo +<===> hex encoding/common.html +foo +<===> long hex encoding/fragment.html +foo +<===> long hex encoding/common.html +foo +<===> hex encoding without semicolons/fragment.html +foo +<===> hex encoding without semicolons/common.html +foo +<===> null char/fragment.html + +<===> +# TODO: Maybe this should strip the a tag +<===> null char/common.html + +<===> invalid URL char/fragment.html + +<===> +# TODO: Maybe this should strip the a tag +<===> invalid URL char/common.html + +<===> spaces and entities/fragment.html + +<===> spaces and entities/common.html + +<===> diff --git a/spec/html_sanitizer/url_spec.cr b/spec/html_sanitizer/url_spec.cr new file mode 100644 index 0000000..5e1aade --- /dev/null +++ b/spec/html_sanitizer/url_spec.cr @@ -0,0 +1,8 @@ +require "../support/hrx" +require "../../src/policy/html_sanitizer" + +describe "Sanitize::Policy::HTMLSanitizer" do + it "escapes URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + end +end diff --git a/spec/html_sanitizer/xss.hrx b/spec/html_sanitizer/xss.hrx new file mode 100644 index 0000000..d573491 --- /dev/null +++ b/spec/html_sanitizer/xss.hrx @@ -0,0 +1,475 @@ +<===> # Basic XSS +<===> fragment.html +test +<===> common.html +test +<===> + + +<===> fragment.html +<<<>< +<===> common.html + +<===> + + +<===> fragment.html + +<===> +` +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html +PT SRC="http://ha.ckers.org/xss.js"> +<===> common.html +PT SRC="http://ha.ckers.org/xss.js"> +<===> + + +<===> fragment.html + +<==> complex/text.html +Lorem ipsum dolor sit amet +<==> + + +<==> html-special-chars/fragment.html +<script> +<==> html-special-chars/text.html +<script> +<==> + + +<==> prune script/fragment.html + +<==> prune script/text.html +<==> + + +<==> prune style/fragment.html + +<==> prune script/text.html +<==> diff --git a/spec/text_policy_spec.cr b/spec/text_policy_spec.cr new file mode 100644 index 0000000..8b02a15 --- /dev/null +++ b/spec/text_policy_spec.cr @@ -0,0 +1,17 @@ +require "./support/hrx" +require "../src/policy/text" +require "../src/processor" + +describe Sanitize::Policy::Text do + it "continues on tag" do + Sanitize::Policy::Text.new.transform_tag("foo", {} of String => String).should eq Sanitize::Policy::CONTINUE + end + + it "adds whitespace" do + Sanitize::Policy::Text.new.process("foo
bar").should eq "foo bar" + end + + run_hrx_samples Path["./text_policy.hrx"], { + "text" => Sanitize::Policy::Text.new, + } +end diff --git a/spec/uri_sanitizer_spec.cr b/spec/uri_sanitizer_spec.cr new file mode 100644 index 0000000..34161cc --- /dev/null +++ b/spec/uri_sanitizer_spec.cr @@ -0,0 +1,113 @@ +require "../src/uri_sanitizer" +require "spec" +require "uri" + +private def assert_sanitize(source : String, expected : String? = source, sanitizer = Sanitize::URISanitizer.new) + if expected + expected = URI.parse(expected) + end + sanitizer.sanitize(URI.parse(source)).should eq expected +end + +describe Sanitize::URISanitizer do + describe "#accepted_schemes" do + it "has default value" do + Sanitize::URISanitizer.new.accepted_schemes.should eq Set{"http", "https", "mailto", "tel"} + end + + it "accepts minimal schemes" do + assert_sanitize("http://example.com") + assert_sanitize("https://example.com") + assert_sanitize("mailto:mail@example.com") + assert_sanitize("tel:example.com") + end + + it "refutes unsafe schemes" do + assert_sanitize("javascript:alert();", nil) + assert_sanitize("ssh:git@github.com", nil) + end + + it "custom schemes" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accept_scheme "javascript" + assert_sanitize("javascript:alert();", sanitizer: sanitizer) + end + + it "can be disabled" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_schemes = nil + assert_sanitize("javascript:alert();", sanitizer: sanitizer) + assert_sanitize("foo:bar", sanitizer: sanitizer) + end + end + + describe "#base_url" do + it "disabled by default" do + Sanitize::URISanitizer.new.base_url.should be_nil + assert_sanitize("foo") + end + + it "set to absolute URL" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.base_url = URI.parse("https://example.com/base/") + + assert_sanitize("foo", "https://example.com/base/foo", sanitizer: sanitizer) + assert_sanitize("/foo", "https://example.com/foo", sanitizer: sanitizer) + end + end + + describe "#accepted_hosts" do + it "disabled by default" do + Sanitize::URISanitizer.new.accepted_hosts.should be_nil + end + + it "restricts hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_hosts = Set{"foo.example.com"} + assert_sanitize("http://foo.example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://foo.foo.example.com", nil, sanitizer: sanitizer) + assert_sanitize("foo", sanitizer: sanitizer) + end + + it "works with base_url" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_hosts = Set{"foo.example.com"} + sanitizer.base_url = URI.parse("http://bar.example.com/") + assert_sanitize("foo", "http://bar.example.com/foo", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com/foo", nil, sanitizer: sanitizer) + end + end + + describe "#rejected_hosts" do + it "disabled by default" do + Sanitize::URISanitizer.new.rejected_hosts.should be_a(Set(String)) + end + + it "restricts hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"bar.example.com"} + assert_sanitize("http://foo.example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.bar.example.com", sanitizer: sanitizer) + assert_sanitize("foo", sanitizer: sanitizer) + end + + it "works with base_url" do + sanitizer = 
Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"foo.example.com"} + sanitizer.base_url = URI.parse("http://foo.example.com/") + assert_sanitize("foo", "http://foo.example.com/foo", sanitizer: sanitizer) + assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) + end + + it "overrides accepted_hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"foo.example.com"} + sanitizer.accepted_hosts = Set{"foo.example.com"} + assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) + end + end +end diff --git a/src/adapter/libxml2.cr b/src/adapter/libxml2.cr new file mode 100644 index 0000000..51d8994 --- /dev/null +++ b/src/adapter/libxml2.cr @@ -0,0 +1,137 @@ +struct Sanitize::Adapter::LibXML2 + include Adapter + + def self.process(policy : Policy, html : String, fragment : Bool = false) + return "" if html.empty? + + node = parse(html, fragment) + process(policy, node, fragment) + end + + def self.process(policy : Policy, node : XML::Node, fragment : Bool = false) + build(fragment) do |builder| + process(policy, node, builder, fragment) + end + end + + def self.process(policy : Policy, node : XML::Node, builder : XML::Builder, fragment : Bool = false) + processor = Processor.new(policy, new(builder)) + visit(processor, node, fragment) + builder.end_document + builder.flush + end + + def self.parse(html : String, fragment : Bool) + if fragment + html = "#{html}" + end + + node = XML.parse_html(html, XML::HTMLParserOptions.default | XML::HTMLParserOptions::NOIMPLIED | XML::HTMLParserOptions::NODEFDTD) + end + + def self.build(fragment : Bool) + result = String.build do |io| + builder = XML::Builder.new(io) + + if fragment + builder.start_element("fragment") + end + + yield(builder) + end + + if fragment + result = "" if result == "\n" + result = result.lchop("").rchop("\n") + end + # strip trailing non-linebreak whitespace + if result.ends_with?("\n") + result + else + result.rstrip + end + end + + def self.visit(processor : Processor, node : XML::Node, fragment : Bool) + visitor = Visitor.new(processor, fragment) + visitor.visit(node) + end + + # :nodoc: + struct Visitor + @attributes = Hash(String, String).new + + def initialize(@processor : Processor, @fragment : Bool) + end + + # :nodoc: + def visit(node : XML::Node) + case node.type + when .html_document_node? + visit_children(node) + when .dtd_node? + # skip DTD + when .text_node? + visit_text(node) + when .element_node? + visit_element(node) + when .comment_node? + # skip comments + when .cdata_section_node? 
+ # skip CDATA + else + raise "Not implemented for: #{node.type}:#{node.name}:#{node.content}" + end + end + + def visit_children(node) + node.children.each do |child| + visit(child) + end + end + + def visit_text(node) + @processor.process_text(node.content) + end + + def visit_element(node) + if @fragment && node.name.in?({"html", "body"}) + @attributes.clear + @processor.process_element(node.name, @attributes, Processor::CONTINUE) do + visit_children(node) + end + return + end + + @attributes.clear + node.attributes.each do |attribute| + @attributes[attribute.name] = attribute.content + end + + name = node.name + if namespace = node.namespace + name = "#{namespace}:#{name}" + end + + @processor.process_element(name, @attributes) do + visit_children(node) + end + end + end + + def initialize(@builder : XML::Builder) + end + + def start_tag(name : String, attributes : Hash(String, String)) : Nil + @builder.start_element(name) + @builder.attributes(attributes) + end + + def end_tag(name : String, attributes : Hash(String, String)) : Nil + @builder.end_element + end + + def write_text(text : String) : Nil + @builder.text(text) + end +end diff --git a/src/policy.cr b/src/policy.cr new file mode 100644 index 0000000..d1ce31c --- /dev/null +++ b/src/policy.cr @@ -0,0 +1,45 @@ +# A policy defines the rules for transforming an HTML/XML tree. +# +# * `HTMLSanitizer` is a policy for HTML sanitization. +# * `Whitelist` is a whitelist-based transformer that's useful either for +# simple stripping applications or as a building block for more advanced +# sanitization policies. +# * `Text` is a policy that turns HTML into plain text. +abstract class Sanitize::Policy + # :nodoc: + alias CONTINUE = Processor::CONTINUE + # :nodoc: + alias STOP = Processor::STOP + + # Defines the string that is added when whitespace is needed when a block tag + # is stripped. + property block_whitespace = " " + + # Receives the content of a text node and returns the transformed content. + # + # If the return value is `nil`, the content is skipped. + abstract def transform_text(text : String) : String? + + # Receives the element name and attributes of an opening tag and returns the + # transformed element name (usually the same as the input name). + # + # *attributes* are transformed directly in place. + # + # Special return values: + # * `Processor::CONTINUE`: Tells the processor to strip the current tag but + # continue traversing its children. + # * `Processor::CONTINUE`: Tells the processor to skip the current tag and its + # children completely and move to the next sibling. + abstract def transform_tag(name : String, attributes : Hash(String, String)) : String | Processor::CONTINUE | Processor::STOP + + HTML_BLOCK_ELEMENTS = Set{ + "address", "article", "aside", "audio", "video", "blockquote", "br", + "canvas", "dd", "div", "dl", "fieldset", "figcaption", "figure", "footer", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", + "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", + } + + def block_tag?(name) + HTML_BLOCK_ELEMENTS.includes?(name) + end +end diff --git a/src/policy/html_sanitizer.cr b/src/policy/html_sanitizer.cr new file mode 100644 index 0000000..f751bfd --- /dev/null +++ b/src/policy/html_sanitizer.cr @@ -0,0 +1,348 @@ +require "./whitelist" +require "../uri_sanitizer" + +# This policy serves as a good default configuration that should fit most +# typical use cases for HTML sanitization. 
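+#
+# A minimal end-to-end sketch (the class pattern and base URL below are example
+# values, not defaults; each option is described in the sections that follow):
+#
+# ```
+# require "sanitize"
+#
+# sanitizer = Sanitize::Policy::HTMLSanitizer.common
+# # Accept `class` values used by a syntax highlighter (example prefix).
+# sanitizer.valid_classes << /language-.+/
+# # Resolve relative URLs in `href`/`src` against a base URL.
+# sanitizer.uri_sanitizer.base_url = URI.parse("https://example.com/")
+#
+# safe_html = sanitizer.process(%(<p class="language-crystal">puts 1</p>))
+# ```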
+# +# ## Configurations +# It comes in three different configurations with different sets of supported +# HTML tags. +# +# They only differ in the default configuration of allowed tags and attributes. +# The transformation behaviour is otherwise the same. +# +# ### Common Configuration +# `.common`: Accepts most standard tags and thus allows using a good +# amount of HTML features (see `COMMON_SAFELIST`). +# +# This is the recommended default configuration and should work for typical use +# cases unless strong restrictions on allowed content is required. +# +# ``` +# sanitizer = Sanitize::Policy::HTMLSanitizer.common +# sanitizer.process(%(foo)) # => %(foo) +# sanitizer.process(%(

foo

)) # => %(

foo

) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(
foobar
)
+# ```
+#
+# NOTE: Neither this configuration nor any other accepts `<html>`,
+# `<head>`, or `<body>` tags by default. In order to use
+# `#process_document` they need to be accepted explicitly (for example via `#accept_tag`).
+#
+# ### Basic Configuration
+#
+# `.basic`: This set accepts some basic tags including paragraphs, headlines,
+# lists, and images (see `BASIC_SAFELIST`).
+#
+# ```
+# sanitizer = Sanitize::Policy::HTMLSanitizer.basic
+# sanitizer.process(%(foo)) # => %(foo)
+# sanitizer.process(%(

foo

)) # => %(

foo

) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(foo bar) +# ``` +# +# ### Inline Configuration +# +# `.inline`: Accepts only a limited set of inline tags (see `INLINE_SAFELIST`). +# +# ``` +# sanitizer = Sanitize::Policy::HTMLSanitizer.inline +# sanitizer.process(%(foo)) # => %(foo) +# sanitizer.process(%(

foo

)) # => %(foo) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(foo bar)
+# ```
+#
+# ## Attribute Transformations
+#
+# Attribute transformations are identical in all three configurations. But more
+# advanced transforms won't apply if the respective attribute is not allowed in
+# `accepted_attributes`.
+# So you can easily add additional elements and attributes to lower-tier sets
+# and get the same attribute validation. For example: `.inline` doesn't include
+# `<img>` tags, but when `img` is added to `accepted_attributes`,
+# the policy validates `<img>` tags the same way as in `.common`.
+#
+# ### URL Sanitization
+#
+# This transformation applies to attributes that contain a URL (configurable
+# through `url_attributes`).
+#
+# * Makes sure the value is a valid URI (via `URI.parse`). If it does not parse,
+# the attribute value is set to an empty string.
+# * Sanitizes the URI via `URISanitizer` (configurable through `uri_sanitizer`).
+# If the sanitizer returns `nil`, the attribute value is set to an empty string.
+#
+# The same `URISanitizer` is used for all URL attributes.
+#
+# ### Anchor Tags
+#
+# For `<a>` tags with an `href` attribute, there are two transforms:
+#
+# * `rel="nofollow"` is added (can be disabled with `add_rel_nofollow`).
+# * `rel="noopener"` is added to links with a `target` attribute (can be disabled
+# with `add_rel_noopener`).
+#
+# Anchor tags that have neither an `href`, `name`, nor `id` attribute are stripped.
+#
+# NOTE: `name` and `id` attributes are not in any of the default sets of
+# accepted attributes, so they can only be used when explicitly enabled.
+#
+# ### Image Tags
+#
+# `<img>` tags are stripped if they don't have a `src` attribute.
+#
+# ### Size Attributes
+#
+# If a tag has `width` or `height` attributes, the values are validated to be
+# numerical or percent values.
+# By default, these attributes are only accepted for `<img>` tags.
+#
+# ### Alignment Attribute
+#
+# The `align` attribute is validated against allowed values for this attribute:
+# `center, left, right, justify, char`.
+# If the value is invalid, the attribute is stripped.
+#
+# ### Classes
+#
+# `class` attributes are filtered to accept only classes described by
+# `valid_classes`. String values need to match the class name exactly, regex
+# values need to match the entire class name.
+#
+# `class` is accepted as a global attribute in the default configuration, but no
+# values are allowed in `valid_classes`.
+#
+# All classes can be accepted by adding the match-all regular expression `/.*/`
+# to `valid_classes`.
+class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist
+  # Add `rel="nofollow"` to every `<a>` tag with an `href` attribute.
+  property add_rel_nofollow = true
+
+  # Add `rel="noopener"` to every `<a>` tag with `href` and `target` attributes.
+  property add_rel_noopener = true
+
+  # Configures the `URISanitizer` to use for sanitizing URL attributes.
+  property uri_sanitizer = URISanitizer.new
+
+  # Configures which attributes are considered to contain URLs. If empty, URL
+  # sanitization is disabled.
+  #
+  # Default value: `Set{"src", "href", "action", "cite", "longdesc"}`.
+  property url_attributes : Set(String) = Set{"src", "href", "action", "cite", "longdesc"}
+
+  # Configures which classes are valid for `class` attributes.
+  #
+  # String values need to match the class name exactly, regex
+  # values need to match the entire class name.
+ # + # Default value: empty + property valid_classes : Set(String | Regex) = Set(String | Regex).new + + def valid_classes=(classes) + valid_classes = classes.map(&.as(String | Regex)).to_set + end + + # Creates an instance which accepts a limited set of inline tags (see + # `INLINE_SAFELIST`). + def self.inline : HTMLSanitizer + new( + accepted_attributes: INLINE_SAFELIST.clone + ) + end + + # Creates an instance which accepts more basic tags including paragraphs, + # headlines, lists, and images (see `BASIC_SAFELIST`). + def self.basic : HTMLSanitizer + new( + accepted_attributes: BASIC_SAFELIST.clone + ) + end + + # Creates an instance which accepts even more standard tags and thus allows + # using a good amount of HTML features (see `COMMON_SAFELIST`). + # + # Unless you need tight restrictions on allowed content, this is the + # recommended default. + def self.common : HTMLSanitizer + new( + accepted_attributes: COMMON_SAFELIST.clone + ) + end + + # Removes anchor tag (`<a>` from the list of accepted tags). + # + # NOTE: This doesn't reject attributes with URL values for other tags. + def no_links + accepted_attributes.delete("a") + + self + end + + def accept_tag(tag : String, attributes : Set(String) = Set(String).new) + accepted_attributes[tag] = attributes + end + + def transform_attributes(tag : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + transform_url_attributes(tag, attributes) + + tag_result = case tag + when "a" + transform_tag_a(attributes) + when "img" + transform_tag_img(attributes) + end + + if tag_result + return tag_result + end + + limit_numeric_or_percent(attributes, "width") + limit_numeric_or_percent(attributes, "height") + limit_enum(attributes, "align", ["center", "left", "right", "justify", "char"]) + + transform_classes(tag, attributes) + + tag + end + + def transform_tag_img(attributes) + unless attributes.has_key?("src") + return CONTINUE + end + end + + def transform_tag_a(attributes) + if href = attributes["href"]? + if add_rel_nofollow + append_attribute(attributes, "rel", "nofollow") + end + if add_rel_noopener && attributes.has_key?("target") + append_attribute(attributes, "rel", "noopener") + end + end + if !(((href = attributes["href"]?) && !href.empty?) || attributes.has_key?("id") || attributes.has_key?("tag")) + return CONTINUE + end + end + + def transform_url_attributes(tag, attributes) + all_ok = true + url_attributes.each do |key| + if value = attributes[key]? + all_ok &&= transform_url_attribute(tag, attributes, key, value) + end + end + all_ok + end + + def transform_url_attribute(tag, attributes, attribute, value) + begin + uri = URI.parse(value.strip) + rescue URI::Error + attributes[attribute] = "" + return false + end + + uri = transform_uri(tag, attributes, attribute, uri) + if uri.nil? || (uri.blank? || uri == "#") + attributes[attribute] = "" + return false + end + + attributes[attribute] = uri + true + end + + def transform_uri(tag, attributes, attribute, uri : URI) : String? + if uri_sanitizer = self.uri_sanitizer + uri = uri_sanitizer.sanitize(uri) + + return unless uri + end + + # Make sure special characters are properly encoded to avoid interpretation + # of tweaked relative paths as "javascript:" URI (for example) + if path = uri.path + uri.path = URI.encode(URI.decode(path)) + end + + uri.to_s + end + + def transform_classes(tag, attributes) + attribute = attributes["class"]? 
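+    # No `class` attribute present: nothing to filter.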
+ return unless attribute + + classes = attribute.split + classes = classes.select { |klass| valid_class?(tag, klass, valid_classes) } + if classes.empty? + attributes.delete("class") + else + attributes["class"] = classes.join(" ") + end + end + + private def limit_numeric_or_percent(attributes, attribute) + if value = attributes[attribute]? + value = value.strip + if value.ends_with?("%") + value = value.byte_slice(0, value.size - 1) + end + value.each_char do |char| + unless char.ascii_number? + attributes.delete(attribute) + break + end + end + end + end + + private def limit_enum(attributes, attribute, list) + if value = attributes[attribute]? + value = value.strip + if valid_with_list?(value, list) + attributes[attribute] = value + else + attributes.delete(attribute) + end + end + end + + def valid_class?(tag, klass, valid_classes) + valid_with_list?(klass, valid_classes) + end + + private def valid_with_list?(value, list) + list.any? { |validator| + case validator + when String + validator == value + when Regex + data = validator.match(value) + next unless data + data.byte_begin == 0 && data.byte_end == value.bytesize + end + } + end + + def append_attribute(attributes, attribute, value) + if curr_value = attributes[attribute]? + values = curr_value.split + if values.includes?(value) + return false + else + values << value + attributes[attribute] = values.join(" ") + end + else + attributes[attribute] = value + end + + true + end +end + +require "./html_sanitizer/safelist" diff --git a/src/policy/html_sanitizer/safelist.cr b/src/policy/html_sanitizer/safelist.cr new file mode 100644 index 0000000..2d5a7ed --- /dev/null +++ b/src/policy/html_sanitizer/safelist.cr @@ -0,0 +1,70 @@ +class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist + # Only limited elements for inline text markup. + INLINE_SAFELIST = { + "a" => Set{"href", "hreflang"}, + "abbr" => Set(String).new, + "acronym" => Set(String).new, + "b" => Set(String).new, + "code" => Set(String).new, + "em" => Set(String).new, + "i" => Set(String).new, + "strong" => Set(String).new, + "*" => Set{ + "dir", + "lang", + "title", + "class", + }, + } + + # Compatible with basic Markdown features. + BASIC_SAFELIST = INLINE_SAFELIST.merge({ + "blockquote" => Set{"cite"}, + "br" => Set(String).new, + "h1" => Set(String).new, + "h2" => Set(String).new, + "h3" => Set(String).new, + "h4" => Set(String).new, + "h5" => Set(String).new, + "h6" => Set(String).new, + "hr" => Set(String).new, + "img" => Set{"alt", "src", "longdesc", "width", "height", "align"}, + "li" => Set(String).new, + "ol" => Set{"start"}, + "p" => Set{"align"}, + "pre" => Set(String).new, + "ul" => Set(String).new, + }) + + # Accepts most standard tags and thus allows using a good amount of HTML features. 
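+  #
+  # `HTMLSanitizer.common` builds its policy from a clone of this safelist, so
+  # constructing one directly is equivalent:
+  #
+  # ```
+  # sanitizer = Sanitize::Policy::HTMLSanitizer.new(COMMON_SAFELIST.clone)
+  # ```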
+ COMMON_SAFELIST = BASIC_SAFELIST.merge({ + "dd" => Set(String).new, + "del" => Set{"cite"}, + "details" => Set(String).new, + "dl" => Set(String).new, + "dt" => Set(String).new, + "div" => Set(String).new, + "ins" => Set{"cite"}, + "kbd" => Set(String).new, + "q" => Set{"cite"}, + "ruby" => Set(String).new, + "rp" => Set(String).new, + "rt" => Set(String).new, + "s" => Set(String).new, + "samp" => Set(String).new, + "strike" => Set(String).new, + "sub" => Set(String).new, + "summary" => Set(String).new, + "sup" => Set(String).new, + "table" => Set(String).new, + "time" => Set{"datetime"}, + "tbody" => Set(String).new, + "td" => Set(String).new, + "tfoot" => Set(String).new, + "th" => Set(String).new, + "thead" => Set(String).new, + "tr" => Set(String).new, + "tt" => Set(String).new, + "var" => Set(String).new, + }) +end diff --git a/src/policy/text.cr b/src/policy/text.cr new file mode 100644 index 0000000..82a2e67 --- /dev/null +++ b/src/policy/text.cr @@ -0,0 +1,23 @@ +require "../policy" + +# Reduces an HTML tree to the content of its text nodes. +# It renders a plain text result, similar to copying HTML content rendered by +# a browser to a text editor. +# HTML special characters are escaped. +# +# ``` +# policy = Sanitize::Policy::Text.new +# policy.process(%(foo bar!)) # => "foo bar!" +# policy.process(%(
+#   <p>foo</p><p>bar</p>
+# )) # => "foo bar"
+# policy.block_whitespace = "\n"
+# policy.process(%(
+#   <p>foo</p><p>bar</p>
)) # => "foo\nbar" +# ``` +class Sanitize::Policy::Text < Sanitize::Policy + def transform_text(text : String) : String? + text + end + + def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + CONTINUE + end +end diff --git a/src/policy/whitelist.cr b/src/policy/whitelist.cr new file mode 100644 index 0000000..6dc3c45 --- /dev/null +++ b/src/policy/whitelist.cr @@ -0,0 +1,57 @@ +require "../policy" + +# This is a simple policy based on a tag and attribute whitelist. +# +# This policy accepts only `<div>` and `<p>` tags with optional `title` attributes: +# ``` +# policy = Sanitize::Policy::Whitelist.new({ +# "div" => Set{"title"}, +# "p" => Set{"title"}, +# }) +# ``` +# +# The special `*` key applies to *all* tag names and can be used to allow global +# attributes: +# +# This example is equivalent to the above. If more tag names were added, they +# would also accept `title` attributes. +# ``` +# policy = Sanitize::Policy::Whitelist.new({ +# "div" => Set(String).new, +# "p" => Set(String).new, +# "*" => Set{"title"}, +# }) +# ``` +# +# Attributes are always optional, so this policy won't enforce the presence of +# an attribute. +# +# If a tag's attribute list is empty, no attributes are allowed for this tag. +# +# Attribute values are not changed by this policy. +class Sanitize::Policy::Whitelist < Sanitize::Policy + # Mapping of accepted tag names and attributes. + property accepted_attributes : Hash(String, Set(String)) + + # Short cut to `accepted_attributes["*"]`. + getter global_attributes : Set(String) { accepted_attributes.fetch("*") { Set(String).new } } + + def initialize(@accepted_attributes : Hash(String, Set(String))) + end + + def transform_text(text : String) : String? + text + end + + def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + acceptable_attributes = accepted_attributes.fetch(name) { return CONTINUE } + + attributes.delete_if { |attr, _| !acceptable_attributes.includes?(attr) && !global_attributes.includes?(attr) } + + transform_attributes(name, attributes) + end + + def transform_attributes(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + name + end +end diff --git a/src/processor.cr b/src/processor.cr new file mode 100644 index 0000000..6d4e4ac --- /dev/null +++ b/src/processor.cr @@ -0,0 +1,110 @@ +require "xml" +require "log" +require "./adapter/libxml2" + +module Sanitize + abstract class Policy + # Processes the HTML fragment *html* with this policy using the default + # adapter (`Adapter::LibXML2`). + def process(html : String | XML::Node) : String + Adapter::LibXML2.process(self, html, fragment: true) + end + + # Processes the HTML document *html* with this policy using the default + # adapter (`Adapter::LibXML2`). + def process_document(html : String | XML::Node) : String + Adapter::LibXML2.process(self, html, fragment: false) + end + end + + module Adapter + abstract def write_text(text : String) : Nil + abstract def start_tag(name : String, attributes : Hash(String, String)) : Nil + abstract def end_tag(name : String, attributes : Hash(String, String)) : Nil + end + + # A processor traverses the HTML/XML tree, applies transformations through the + # policy and passes the result to the adapter which then builds the result. + class Processor + Log = ::Log.for(self) + + # This module serves as a singleton constant that signals the processor to + # skip the current tag but continue to traverse its children. 
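+    #
+    # For example, a policy that returns `CONTINUE` for a `<span>` tag reduces
+    # `<span>foo</span>` to `foo` in the output, while its children are still
+    # processed.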
+ module CONTINUE + extend self + end + + # This module serves as a singleton constant that signals the processor to + # skip the current tag and its children. + module STOP + extend self + end + + @last_text_ended_with_whitespace = true + @stripped_block_tag = false + + def initialize(@policy : Policy, @adapter : Adapter) + end + + def process_text(text : String) + text = @policy.transform_text(text) + + if @stripped_block_tag && !@last_text_ended_with_whitespace && !text.try(&.[0].whitespace?) + @adapter.write_text(@policy.block_whitespace) + end + + @stripped_block_tag = false + + if text + @adapter.write_text(text) + @last_text_ended_with_whitespace = text.[-1].whitespace? + else + @last_text_ended_with_whitespace = false + end + end + + def process_element(name : String, attributes : Hash(String, String), &) + process_element(name, attributes, @policy.transform_tag(name, attributes)) do + yield + end + end + + def process_element(orig_name : String, attributes : Hash(String, String), name, &) + case name + when STOP + Log.debug { "#{@policy.class} stopping at tag #{orig_name} #{attributes}" } + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + return + when CONTINUE + Log.debug { "#{@policy.class} stripping tag #{orig_name} #{attributes}" } + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + when String + @stripped_block_tag = false + @adapter.start_tag(name, attributes) + end + + yield + + case name + when CONTINUE + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + when String + @stripped_block_tag = false + @adapter.end_tag(name, attributes) + end + end + + def reset + @last_text_ended_with_whitespace = true + @stripped_block_tag = false + end + end +end + +require "./adapter/libxml2" diff --git a/src/sanitize.cr b/src/sanitize.cr new file mode 100644 index 0000000..a94e7c6 --- /dev/null +++ b/src/sanitize.cr @@ -0,0 +1,5 @@ +require "./policy/*" +require "./processor" + +module Sanitize +end diff --git a/src/uri_sanitizer.cr b/src/uri_sanitizer.cr new file mode 100644 index 0000000..d835b7c --- /dev/null +++ b/src/uri_sanitizer.cr @@ -0,0 +1,91 @@ +require "uri" + +# A `URISanitizer` is used to validate and transform a URI based on specified +# rules. +class Sanitize::URISanitizer + # Specifies a whitelist of URI schemes this sanitizer accepts. + # + # If empty, no schemes are accepted (i.e. only relative URIs are valid). + # If `nil`, all schemes are accepted (this setting is potentially dangerous). + # + # Relative URIs are not affected by this setting. + property accepted_schemes : Set(String)? + + # Specifies a whitelist of hosts this sanitizer accepts. + # + # If empty, no hosts are accepted (i.e. only relative URIs are valid). + # If `nil`, all hosts are accepted (default). + # + # The blacklist `rejected_hosts` has precedence over this whitelist. + property accepted_hosts : Set(String)? + + # Specifies a blacklist of hosts this sanitizer rejects. + # + # If empty, no hosts are rejected. + # + # This blacklist has precedence over the whitelist `accepted_hosts`. + property rejected_hosts : Set(String) = Set(String).new + + # Specifies a base URL all relative URLs are resolved against. + # + # If `nil`, relative URLs are not resolved. + property base_url : URI? + + def initialize(@accepted_schemes : Set(String)? = Set{"http", "https", "mailto", "tel"}) + end + + # Adds *scheme* to `accepted_schemes`. 
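+  #
+  # For example (the `"ftp"` scheme is just an illustration):
+  #
+  # ```
+  # sanitizer = Sanitize::URISanitizer.new
+  # sanitizer.accept_scheme("ftp")
+  # ```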
+ def accept_scheme(scheme : String) + schemes = self.accepted_schemes ||= Set(String).new + schemes << scheme + end + + def sanitize(uri : URI) : URI? + unless accepts_scheme?(uri.scheme) + return nil + end + + unless accepts_host?(uri.host) + return nil + end + + uri = resolve_base_url(uri) + + uri + end + + def accepts_scheme?(scheme) + if scheme.nil? + return true + end + + if accepted_schemes = self.accepted_schemes + return accepted_schemes.includes?(scheme) + end + + true + end + + def accepts_host?(host) + if host.nil? + return true + end + + return false if rejected_hosts.includes?(host) + + if accepted_hosts = self.accepted_hosts + return false unless accepted_hosts.includes?(host) + end + + true + end + + def resolve_base_url(uri) + if base_url = self.base_url + unless uri.absolute? + uri = base_url.resolve(uri) + end + end + uri + end +end
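+
+# Usage example (values are illustrative):
+#
+# ```
+# sanitizer = Sanitize::URISanitizer.new
+# sanitizer.base_url = URI.parse("https://example.com/")
+# sanitizer.sanitize(URI.parse("/foo")).to_s           # => "https://example.com/foo"
+# sanitizer.sanitize(URI.parse("javascript:alert(1)")) # => nil
+# ```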