github
diff --git a/‎javascript/change-notes/2021-01-08-js-incomplete-multi-character-sanitization.md
Lines changed: 2 additions & 0 deletions b/‎javascript/change-notes/2021-01-08-js-incomplete-multi-character-sanitization.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
Lines changed: 138 additions & 52 deletions b/‎javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql
Lines changed: 138 additions & 52 deletions
@@ -0,0 +1,2 @@
+lgtm,codescanning
+* The query "Incomplete multi-character sanitization" (`js/incomplete-multi-character-sanitization`) has been improved to produce additional true positives and fewer false positives.
@@ -12,69 +12,155 @@
  */
 
 import javascript
-import semmle.javascript.security.IncompleteBlacklistSanitizer
 
-predicate isDangerous(RegExpTerm t) {
-  // path traversals
-  t.getAMatchedString() = ["..", "/..", "../"]
-  or
-  exists(RegExpTerm start |
-    start = t.(RegExpSequence).getAChild() and
-    start.getConstantValue() = "." and
-    start.getSuccessor().getConstantValue() = "." and
-    not [start.getPredecessor(), start.getSuccessor().getSuccessor()].getConstantValue() = "."
+/**
+ * A regexp term that matches substrings that should be replaced with the empty string.
+ */
+class EmptyReplaceRegExpTerm extends RegExpTerm {
+  EmptyReplaceRegExpTerm() {
+    exists(StringReplaceCall replace |
+      [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
+      this = replace.getRegExp().getRoot().getAChild*()
+    )
+  }
+}
+
+/**
+ * A prefix that may be dangerous to sanitize explicitly.
+ *
+ * Note that this class exists solely as a (necessary) optimization for this query.
+ */
+class DangerousPrefix extends string {
+  DangerousPrefix() {
+    this = ["/..", "../"] or
+    this = "<!--" or
+    this = "<" + ["iframe", "script", "cript", "scrip", "style"]
+  }
+}
+
+/**
+ * A substring of a prefix that may be dangerous to sanitize explicitly.
+ */
+class DangerousPrefixSubstring extends string {
+  DangerousPrefixSubstring() {
+    exists(DangerousPrefix s | this = s.substring([0 .. s.length()], [0 .. s.length()]))
+  }
+}
+
+/**
+ * Gets a dangerous prefix that is in the prefix language of `t`.
+ */
+DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
+  result = getADangerousMatchedPrefixSubstring(t) and
+  not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
+}
+
+/**
+ * Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
+ *
+ * Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
+ */
+DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
+  exists(string left |
+    t.isNullable() and left = ""
+    or
+    t.getAMatchedString() = left
+    or
+    (
+      t instanceof RegExpOpt or
+      t instanceof RegExpStar or
+      t instanceof RegExpPlus or
+      t instanceof RegExpGroup or
+      t instanceof RegExpAlt
+    ) and
+    left = getADangerousMatchedPrefixSubstring(t.getAChild())
+  |
+    result = left + getADangerousMatchedPrefixSubstring(t.getSuccessor()) or
+    result = left
   )
-  or
-  // HTML comments
-  t.getAMatchedString() = "<!--"
-  or
-  // HTML scripts
-  t.getAMatchedString().regexpMatch("(?i)<script.*")
-  or
-  exists(RegExpSequence seq | seq = t |
-    t.getChild(0).getConstantValue() = "<" and
-    // the `cript|scrip` case has been observed in the wild, not sure what the goal of that pattern is...
-    t.getChild(0)
-        .getSuccessor+()
-        .getAMatchedString()
-        .regexpMatch("(?i)iframe|script|cript|scrip|style")
+}
+
+/**
+ * Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerablity of kind `kind`.
+ */
+predicate matchesDangerousPrefix(EmptyReplaceRegExpTerm t, string prefix, string kind) {
+  prefix = getADangerousMatchedPrefix(t) and
+  (
+    kind = "path injection" and
+    // upwards navigation
+    prefix = ["/..", "../"] and
+    not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_-].*") // explicit path name mentions make this an unlikely sanitizer
+    or
+    kind = "HTML element injection" and
+    (
+      // comments
+      prefix = "<!--" and
+      not t.getSuccessor*().getAMatchedString().regexpMatch("(?is).*[a-z0-9_].*") // explicit comment content mentions make this an unlikely sanitizer
+      or
+      // specific tags
+      prefix = "<" + ["iframe", "script", "cript", "scrip", "style"] // the `cript|scrip` case has been observed in the wild several times
+    )
   )
   or
-  // HTML attributes
-  exists(string dangerousPrefix | dangerousPrefix = ["ng-", "on"] |
-    t.getAMatchedString().regexpMatch("(i?)" + dangerousPrefix + "[a-z]+")
+  kind = "HTML attribute injection" and
+  prefix =
+    [
+      // ordinary event handler prefix
+      "on",
+      // angular prefixes
+      "ng-", "ng:", "data-ng-", "x-ng-"
+    ] and
+  (
+    // explicit matching: `onclick` and `ng-bind`
+    t.getAMatchedString().regexpMatch("(?i)" + prefix + "[a-z]+")
     or
-    exists(RegExpTerm start, RegExpTerm event | start = t.getAChild() |
-      start.getConstantValue().regexpMatch("(?i)[^a-z]*" + dangerousPrefix) and
-      event = start.getSuccessor() and
-      exists(RegExpTerm quantified | quantified = event.(RegExpQuantifier).getChild(0) |
-        quantified
-            .(RegExpCharacterClass)
-            .getAChild()
-            .(RegExpCharacterRange)
-            .isRange(["a", "A"], ["z", "Z"]) or
-        [quantified, quantified.(RegExpRange).getAChild()].(RegExpCharacterClassEscape).getValue() =
-          "w"
-      )
+    // regexp-based matching: `on[a-z]+`
+    exists(EmptyReplaceRegExpTerm start | start = t.getAChild() |
+      start.getConstantValue().regexpMatch("(?i)[^a-z]*" + prefix) and
+      isCommonWordMatcher(start.getSuccessor())
     )
   )
 }
 
-from StringReplaceCall replace, RegExpTerm regexp, RegExpTerm dangerous
+/**
+ * Holds if `t` is a common pattern for matching words
+ */
+predicate isCommonWordMatcher(RegExpTerm t) {
+  exists(RegExpTerm quantified | quantified = t.(RegExpQuantifier).getChild(0) |
+    // [a-z]+ and similar
+    quantified
+        .(RegExpCharacterClass)
+        .getAChild()
+        .(RegExpCharacterRange)
+        .isRange(["a", "A"], ["z", "Z"])
+    or
+    // \w+ or [\w]+
+    [quantified, quantified.(RegExpCharacterClass).getAChild()]
+        .(RegExpCharacterClassEscape)
+        .getValue() = "w"
+  )
+}
+
+from
+  StringReplaceCall replace, EmptyReplaceRegExpTerm regexp, EmptyReplaceRegExpTerm dangerous,
+  string prefix, string kind
 where
-  [replace.getRawReplacement(), replace.getCallback(1).getAReturn()].mayHaveStringValue("") and
-  replace.isGlobal() and
   regexp = replace.getRegExp().getRoot() and
   dangerous.getRootTerm() = regexp and
-  isDangerous(dangerous) and
-  // avoid anchored terms
-  not exists(RegExpAnchor a | a.getRootTerm() = regexp) and
-  // avoid flagging wrappers
-  not (
-    dangerous instanceof RegExpAlt or
-    dangerous instanceof RegExpGroup
+  // skip leading optional elements
+  not dangerous.isNullable() and
+  // only warn about the longest match (presumably the most descriptive)
+  prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length()) and
+  // only warn once per kind
+  not exists(EmptyReplaceRegExpTerm other |
+    other = dangerous.getAChild+() or other = dangerous.getPredecessor+()
+  |
+    matchesDangerousPrefix(other, _, kind) and
+    not other.isNullable()
   ) and
   // don't flag replace operations in a loop
-  not replace.getReceiver().getALocalSource() = replace
-select replace, "The replaced string may still contain a substring that starts matching at $@.",
-  dangerous, dangerous.toString()
+  not replace.getAMethodCall*().flowsTo(replace.getReceiver()) and
+  // avoid anchored terms
+  not exists(RegExpAnchor a | regexp = a.getRootTerm())
+select replace, "This string may still contain $@, which may cause a " + kind + " vulnerability.",
+  dangerous, prefix
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+lgtm,codescanning`
	`2`	+* The query "Incomplete multi-character sanitization" (`js/incomplete-multi-character-sanitization`) has been improved to produce additional true positives and fewer false positives.