Skip to content

Commit 78683e4

Browse files
authored
Merge pull request github#11879 from erik-krogh/rbRegConcept
RB: add a RegexExecution concept, and use it for better regexp tracking
2 parents 3508a4b + f04a9cb commit 78683e4

File tree

12 files changed

+391
-5030
lines changed

12 files changed

+391
-5030
lines changed

config/identical-files.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
"python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImpl4.qll",
3737
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImpl.qll",
3838
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImpl2.qll",
39-
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplForRegExp.qll",
4039
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplForHttpClientLibraries.qll",
4140
"ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplForPathname.qll",
4241
"swift/ql/lib/codeql/swift/dataflow/internal/DataFlowImpl.qll"

ruby/ql/lib/codeql/ruby/Concepts.qll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ private import codeql.ruby.DataFlow
1010
private import codeql.ruby.Frameworks
1111
private import codeql.ruby.dataflow.RemoteFlowSources
1212
private import codeql.ruby.ApiGraphs
13+
private import codeql.ruby.Regexp as RE
1314

1415
/**
1516
* A data-flow node that constructs a SQL statement.
@@ -77,6 +78,49 @@ module SqlExecution {
7778
}
7879
}
7980

81+
/**
82+
* A data-flow node that executes a regular expression.
83+
*
84+
* Extend this class to refine existing API models. If you want to model new APIs,
85+
* extend `RegexExecution::Range` instead.
86+
*/
87+
class RegexExecution extends DataFlow::Node instanceof RegexExecution::Range {
88+
/** Gets the data flow node for the regex being executed by this node. */
89+
DataFlow::Node getRegex() { result = super.getRegex() }
90+
91+
/** Gets a dataflow node for the string to be searched or matched against. */
92+
DataFlow::Node getString() { result = super.getString() }
93+
94+
/**
95+
* Gets the name of this regex execution, typically the name of an executing method.
96+
* This is used for nice alert messages and should include the module if possible.
97+
*/
98+
string getName() { result = super.getName() }
99+
}
100+
101+
/** Provides classes for modeling new regular-expression execution APIs. */
102+
module RegexExecution {
103+
/**
104+
* A data-flow node that executes a regular expression.
105+
*
106+
* Extend this class to model new APIs. If you want to refine existing API models,
107+
* extend `RegexExecution` instead.
108+
*/
109+
abstract class Range extends DataFlow::Node {
110+
/** Gets the data flow node for the regex being executed by this node. */
111+
abstract DataFlow::Node getRegex();
112+
113+
/** Gets a dataflow node for the string to be searched or matched against. */
114+
abstract DataFlow::Node getString();
115+
116+
/**
117+
* Gets the name of this regex execution, typically the name of an executing method.
118+
* This is used for nice alert messages and should include the module if possible.
119+
*/
120+
abstract string getName();
121+
}
122+
}
123+
80124
/**
81125
* A data flow node that performs a file system access, including reading and writing data,
82126
* creating and deleting files and folders, checking and updating permissions, and so on.

ruby/ql/lib/codeql/ruby/Regexp.qll

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77

88
import regexp.RegExpTreeView // re-export
99
private import regexp.internal.ParseRegExp
10-
private import regexp.internal.RegExpConfiguration
11-
private import codeql.ruby.ast.Literal as Ast
10+
private import regexp.internal.RegExpTracking as RegExpTracking
11+
private import codeql.ruby.AST as Ast
12+
private import codeql.ruby.CFG
1213
private import codeql.ruby.DataFlow
1314
private import codeql.ruby.ApiGraphs
15+
private import codeql.ruby.Concepts
1416

1517
/**
1618
* Provides utility predicates related to regular expressions.
@@ -63,7 +65,11 @@ private class RegExpLiteralPatternSource extends RegExpPatternSource {
6365
private class StringRegExpPatternSource extends RegExpPatternSource {
6466
private DataFlow::Node parse;
6567

66-
StringRegExpPatternSource() { this = regExpSource(parse) }
68+
StringRegExpPatternSource() {
69+
this = regExpSource(parse) and
70+
// `regExpSource()` tracks both strings and regex literals, narrow it down to strings.
71+
this.asExpr().getConstantValue().isString(_)
72+
}
6773

6874
override DataFlow::Node getAParse() { result = parse }
6975

@@ -104,6 +110,7 @@ module RegExpInterpretation {
104110

105111
/**
106112
* A node interpreted as a regular expression.
113+
* Specifically, nodes where string values are interpreted as regular expressions.
107114
*/
108115
class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
109116
StdLibRegExpInterpretation() {
@@ -115,16 +122,100 @@ class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
115122
mce.getMethodName() = ["match", "match?"] and
116123
this = mce.getArgument(0) and
117124
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
118-
not mce.getReceiver() = trackRegexpType()
125+
not mce.getReceiver() = RegExpTracking::trackRegexpType()
119126
)
120127
}
121128
}
122129

130+
/**
131+
* Holds if `exec` is a node where `regexp` is interpreted as a regular expression and
132+
* tested against the string value of `input`.
133+
* `name` describes the regexp execution, typically the name of the method being called.
134+
*/
135+
private predicate regexExecution(
136+
DataFlow::Node exec, DataFlow::Node input, DataFlow::Node regexp, string name
137+
) {
138+
// `=~` or `!~`
139+
exists(CfgNodes::ExprNodes::BinaryOperationCfgNode op |
140+
name = op.getOperator() and
141+
exec.asExpr() = op and
142+
(
143+
op.getExpr() instanceof Ast::RegExpMatchExpr or
144+
op.getExpr() instanceof Ast::NoRegExpMatchExpr
145+
) and
146+
(
147+
input.asExpr() = op.getLeftOperand() and regexp.asExpr() = op.getRightOperand()
148+
or
149+
input.asExpr() = op.getRightOperand() and regexp.asExpr() = op.getLeftOperand()
150+
)
151+
)
152+
or
153+
// Any of the methods on `String` that take a regexp.
154+
exists(DataFlow::CallNode call | exec = call |
155+
name = "String#" + call.getMethodName() and
156+
call.getMethodName() =
157+
[
158+
"[]", "gsub", "gsub!", "index", "match", "match?", "partition", "rindex", "rpartition",
159+
"scan", "slice!", "split", "sub", "sub!"
160+
] and
161+
input = call.getReceiver() and
162+
regexp = call.getArgument(0) and
163+
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match; those calls are handled in the next case of this disjunction
164+
// also see `StdLibRegExpInterpretation`
165+
not (
166+
call.getMethodName() = ["match", "match?"] and
167+
call.getReceiver() = RegExpTracking::trackRegexpType()
168+
)
169+
)
170+
or
171+
// A call to `match` or `match?` where the regexp is the receiver.
172+
exists(DataFlow::CallNode call | exec = call |
173+
name = "Regexp#" + call.getMethodName() and
174+
call.getMethodName() = ["match", "match?"] and
175+
regexp = call.getReceiver() and
176+
input = call.getArgument(0)
177+
)
178+
or
179+
// a case-when statement
180+
exists(CfgNodes::ExprNodes::CaseExprCfgNode caseExpr |
181+
exec.asExpr() = caseExpr and
182+
input.asExpr() = caseExpr.getValue()
183+
|
184+
name = "case-when" and
185+
regexp.asExpr() = caseExpr.getBranch(_).(CfgNodes::ExprNodes::WhenClauseCfgNode).getPattern(_)
186+
or
187+
name = "case-in" and
188+
regexp.asExpr() = caseExpr.getBranch(_).(CfgNodes::ExprNodes::InClauseCfgNode).getPattern()
189+
)
190+
}
191+
192+
/**
193+
* An execution of a regular expression by the standard library.
194+
*/
195+
private class StdRegexpExecution extends RegexExecution::Range {
196+
DataFlow::Node regexp;
197+
DataFlow::Node input;
198+
string name;
199+
200+
StdRegexpExecution() { regexExecution(this, input, regexp, name) }
201+
202+
override DataFlow::Node getRegex() { result = regexp }
203+
204+
override DataFlow::Node getString() { result = input }
205+
206+
override string getName() { result = name }
207+
}
208+
123209
/**
124210
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
125211
* as a part of a regular expression.
126212
*/
127213
cached
128-
DataFlow::Node regExpSource(DataFlow::Node re) {
129-
exists(RegExpConfiguration c | c.hasFlow(result, re))
214+
DataFlow::Node regExpSource(DataFlow::Node re) { result = RegExpTracking::regExpSource(re) }
215+
216+
/** Gets a parsed regular expression term that is executed at `exec`. */
217+
RegExpTerm getTermForExecution(RegexExecution exec) {
218+
exists(RegExpPatternSource source | source = regExpSource(exec.getRegex()) |
219+
result = source.getRegExpTerm()
220+
)
130221
}

0 commit comments

Comments
 (0)