Skip to content

Commit 1a3c9c8

Browse files
committed
improve performance of regular-expression type-tracking by adding an exploratory initial analysis
1 parent b8f6feb commit 1a3c9c8

File tree

1 file changed

+132
-57
lines changed

1 file changed

+132
-57
lines changed
Lines changed: 132 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
/**
2+
* Provides predicates that track strings and regular expressions to where they are used.
3+
* This is implemented using TypeTracking in two phases:
4+
*
5+
* 1: An exploratory analysis that just imprecisely tracks all strings and regular expressions
6+
* to all places where regular expressions (as strings or as regular expression objects) can be used.
7+
* The exploratory phase then ends with a backwards analysis from the uses that were reached.
8+
* This is similar to the exploratory phase of the JavaScript global DataFlow library.
9+
*
10+
* 2: A precise type tracking analysis that tracks
11+
* strings and regular expressions to the places where they are used.
12+
* This phase keeps track of which strings and regular expressions end up in which places.
13+
*/
14+
115
private import codeql.ruby.Regexp as RE
216
private import codeql.ruby.AST as Ast
317
private import codeql.ruby.CFG
@@ -11,41 +25,114 @@ private import codeql.ruby.dataflow.internal.DataFlowPrivate as DataFlowPrivate
1125
private import codeql.ruby.TaintTracking
1226
private import codeql.ruby.frameworks.core.String
1327

14-
/**
15-
* Gets a node that has been tracked from the string constant `start` to some node.
16-
* This is used to figure out where `start` is evaluated as a regular expression against an input string,
17-
* or where `start` is compiled into a regular expression.
18-
*/
19-
private DataFlow::LocalSourceNode strToReg(DataFlow::Node start, TypeTracker t) {
20-
t.start() and
21-
start = result and
28+
/** Gets a constant string value that may be used as a regular expression. */
29+
DataFlow::LocalSourceNode strStart() {
2230
result.asExpr() =
2331
any(ExprCfgNode e |
2432
e.getConstantValue().isString(_) and
2533
not e instanceof ExprNodes::VariableReadAccessCfgNode and
2634
not e instanceof ExprNodes::ConstantReadAccessCfgNode
2735
)
36+
}
37+
38+
/** Gets a dataflow node for a regular expression literal. */
39+
DataFlow::LocalSourceNode regStart() { result.asExpr().getExpr() instanceof Ast::RegExpLiteral }
40+
41+
/**
42+
* Holds if the analysis should track flow from `nodeFrom` to `nodeTo` on top of the ordinary type-tracking steps.
43+
* `nodeFrom` and `nodeTo` have types `fromType` and `toType`, respectively.
44+
* The types are either "string" or "reg".
45+
*/
46+
predicate step(
47+
DataFlow::Node nodeFrom, DataFlow::LocalSourceNode nodeTo, string fromType, string toType
48+
) {
49+
fromType = toType and
50+
fromType = "string" and
51+
(
52+
// include taint flow through `String` summaries
53+
TaintTracking::localTaintStep(nodeFrom, nodeTo) and
54+
nodeFrom.(DataFlowPrivate::SummaryNode).getSummarizedCallable() instanceof
55+
String::SummarizedCallable
56+
or
57+
// string concatenations, and
58+
exists(CfgNodes::ExprNodes::OperationCfgNode op |
59+
op = nodeTo.asExpr() and
60+
op.getAnOperand() = nodeFrom.asExpr() and
61+
op.getExpr().(Ast::BinaryOperation).getOperator() = "+"
62+
)
63+
or
64+
// string interpolations
65+
nodeFrom.asExpr() =
66+
nodeTo.asExpr().(CfgNodes::ExprNodes::StringlikeLiteralCfgNode).getAComponent()
67+
)
68+
or
69+
fromType = "string" and
70+
toType = "reg" and
71+
exists(DataFlow::CallNode call |
72+
call = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]) and
73+
nodeFrom = call.getArgument(0) and
74+
nodeTo = call
75+
)
76+
}
77+
78+
/** Gets a node where string values that flow to the node are interpreted as regular expressions. */
79+
DataFlow::Node stringSink() {
80+
result instanceof RE::RegExpInterpretation::Range and
81+
not exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
82+
// receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
83+
result = mce.getReceiver() and
84+
mce.getArgument(0) = trackRegexpType()
85+
or
86+
// first argument of https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
87+
result = mce.getArgument(0) and
88+
mce.getReceiver() = trackRegexpType()
89+
)
90+
}
91+
92+
/** Gets a node where regular expressions that flow to the node are used. */
93+
DataFlow::Node regSink() { result = any(RegexExecution exec).getRegex() }
94+
95+
/** Gets a node that is reachable by type-tracking from any string or regular expression. */
96+
DataFlow::LocalSourceNode forward(TypeTracker t) {
97+
t.start() and
98+
result = [strStart(), regStart()]
99+
or
100+
exists(TypeTracker t2 | result = forward(t2).track(t2, t))
101+
or
102+
exists(TypeTracker t2 | t2 = t.continue() | step(forward(t2).getALocalUse(), result, _, _))
103+
}
104+
105+
/**
106+
* Gets a node that is backwards reachable from any regular expression use,
107+
* where that use is reachable by type-tracking from any string or regular expression.
108+
*/
109+
DataFlow::LocalSourceNode backwards(TypeBackTracker t) {
110+
t.start() and
111+
result.flowsTo([stringSink(), regSink()]) and
112+
result = forward(TypeTracker::end())
28113
or
29-
exists(TypeTracker t2 | result = strToReg(start, t2).track(t2, t))
114+
exists(TypeBackTracker t2 | result = backwards(t2).backtrack(t2, t))
30115
or
31-
exists(TypeTracker t2, DataFlow::Node nodeFrom | t2 = t.continue() |
32-
strToReg(start, t2).flowsTo(nodeFrom) and
33-
(
34-
// include taint flow through `String` summaries
35-
TaintTracking::localTaintStep(nodeFrom, result) and
36-
nodeFrom.(DataFlowPrivate::SummaryNode).getSummarizedCallable() instanceof
37-
String::SummarizedCallable
38-
or
39-
// string concatenations, and
40-
exists(CfgNodes::ExprNodes::OperationCfgNode op |
41-
op = result.asExpr() and
42-
op.getAnOperand() = nodeFrom.asExpr() and
43-
op.getExpr().(Ast::BinaryOperation).getOperator() = "+"
44-
)
45-
or
46-
// string interpolations
47-
nodeFrom.asExpr() =
48-
result.asExpr().(CfgNodes::ExprNodes::StringlikeLiteralCfgNode).getAComponent()
116+
exists(TypeBackTracker t2 | t2 = t.continue() | step(result.getALocalUse(), backwards(t2), _, _))
117+
}
118+
119+
/**
120+
* Gets a node that has been tracked from the string constant `start` to some node.
121+
* This is used to figure out where `start` is evaluated as a regular expression against an input string,
122+
* or where `start` is compiled into a regular expression.
123+
*/
124+
private DataFlow::LocalSourceNode trackStrings(DataFlow::Node start, TypeTracker t) {
125+
result = backwards(_) and
126+
(
127+
t.start() and
128+
start = result and
129+
result = strStart()
130+
or
131+
exists(TypeTracker t2 | result = trackStrings(start, t2).track(t2, t))
132+
or
133+
// an additional step from string to string
134+
exists(TypeTracker t2 | t2 = t.continue() |
135+
step(trackStrings(start, t2).getALocalUse(), result, "string", "string")
49136
)
50137
)
51138
}
@@ -54,19 +141,18 @@ private DataFlow::LocalSourceNode strToReg(DataFlow::Node start, TypeTracker t)
54141
* Gets a node that has been tracked from the regular expression `start` to some node.
55142
* This is used to figure out where `start` is executed against an input string.
56143
*/
57-
private DataFlow::LocalSourceNode regToReg(DataFlow::Node start, TypeTracker t) {
58-
t.start() and
59-
start = result and
60-
result.asExpr().getExpr() instanceof Ast::RegExpLiteral
61-
or
62-
exists(TypeTracker t2 | result = regToReg(start, t2).track(t2, t))
63-
or
64-
exists(TypeTracker t2 |
65-
t2 = t.continue() and
66-
exists(DataFlow::CallNode call |
67-
call = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]) and
68-
strToReg(start, t2).flowsTo(call.getArgument(0)) and
69-
result = call
144+
private DataFlow::LocalSourceNode trackRegs(DataFlow::Node start, TypeTracker t) {
145+
result = backwards(_) and
146+
(
147+
t.start() and
148+
start = result and
149+
result = regStart()
150+
or
151+
exists(TypeTracker t2 | result = trackRegs(start, t2).track(t2, t))
152+
or
153+
// an additional step where a string is converted to a regular expression
154+
exists(TypeTracker t2 | t2 = t.continue() |
155+
step(trackStrings(start, t2).getALocalUse(), result, "string", "reg")
70156
)
71157
)
72158
}
@@ -75,7 +161,7 @@ private DataFlow::LocalSourceNode regToReg(DataFlow::Node start, TypeTracker t)
75161
private DataFlow::LocalSourceNode trackRegexpType(TypeTracker t) {
76162
t.start() and
77163
(
78-
result.asExpr().getExpr() instanceof Ast::RegExpLiteral or
164+
result = regStart() or
79165
result = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"])
80166
)
81167
or
@@ -85,25 +171,14 @@ private DataFlow::LocalSourceNode trackRegexpType(TypeTracker t) {
85171
/** Gets a node that references a regular expression. */
86172
DataFlow::Node trackRegexpType() { trackRegexpType(TypeTracker::end()).flowsTo(result) }
87173

88-
/** Gets a the value for the regular expression that is evaluated at `re`. */
174+
/** Gets a node holding a value for the regular expression that is evaluated at `re`. */
89175
cached
90176
DataFlow::Node regExpSource(DataFlow::Node re) {
91-
exists(DataFlow::LocalSourceNode end | end = strToReg(result, TypeTracker::end()) |
92-
end.flowsTo(re) and
93-
re instanceof RE::RegExpInterpretation::Range and
94-
not exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
95-
// receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
96-
re = mce.getReceiver() and
97-
mce.getArgument(0) = trackRegexpType()
98-
or
99-
// first argument of https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
100-
re = mce.getArgument(0) and
101-
mce.getReceiver() = trackRegexpType()
102-
)
177+
exists(DataFlow::LocalSourceNode end | end = trackStrings(result, TypeTracker::end()) |
178+
end.getALocalUse() = re and re = stringSink()
103179
)
104180
or
105-
exists(DataFlow::LocalSourceNode end | end = regToReg(result, TypeTracker::end()) |
106-
end.flowsTo(re) and
107-
re = any(RegexExecution exec).getRegex()
181+
exists(DataFlow::LocalSourceNode end | end = trackRegs(result, TypeTracker::end()) |
182+
end.getALocalUse() = re and re = regSink()
108183
)
109184
}

0 commit comments

Comments
 (0)