Skip to content

Commit b83c743

Browse files
authored
Merge pull request #14944 from yoff/python/captured-variables-basic
Python: Basic implementation of variable capture
2 parents 706dee9 + da4aef8 commit b83c743

File tree

18 files changed

+539
-22
lines changed

18 files changed

+539
-22
lines changed

python/ql/consistency-queries/DataFlowConsistency.ql

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ private module Input implements InputSig<PythonDataFlow> {
1414
private import Private
1515
private import Public
1616

17+
predicate postWithInFlowExclude(Node n) { n instanceof FlowSummaryNode }
18+
1719
predicate argHasPostUpdateExclude(ArgumentNode n) {
1820
// TODO: Implement post-updates for *args, see tests added in https://github.com/github/codeql/pull/14936
1921
exists(ArgumentPosition apos | n.argumentOf(_, apos) and apos.isStarArgs(_))
@@ -44,6 +46,13 @@ private module Input implements InputSig<PythonDataFlow> {
4446
)
4547
}
4648

49+
predicate uniqueEnclosingCallableExclude(Node n) {
50+
// We only have a selection of valid callables.
51+
// For instance, we do not have classes as `DataFlowCallable`s.
52+
not n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Function and
53+
not n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Module
54+
}
55+
4756
predicate uniqueCallEnclosingCallableExclude(DataFlowCall call) {
4857
not exists(call.getLocation().getFile().getRelativePath())
4958
}
@@ -53,7 +62,7 @@ private module Input implements InputSig<PythonDataFlow> {
5362
}
5463

5564
predicate multipleArgumentCallExclude(ArgumentNode arg, DataFlowCall call) {
56-
// since we can have multiple DataFlowCall for a CallNode (for example if can
65+
// since we can have multiple DataFlowCall for a CallNode (for example if it can
5766
// resolve to multiple functions), but we only make _one_ ArgumentNode for each
5867
// argument in the CallNode, we end up violating this consistency check in those
5968
// cases. (see `getCallArg` in DataFlowDispatch.qll)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: majorAnalysis
3+
---
4+
* Added support for global data-flow through captured variables.

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ private import semmle.python.dataflow.new.internal.TypeTrackingImpl::CallGraphCo
4242
newtype TParameterPosition =
4343
/** Used for `self` in methods, and `cls` in classmethods. */
4444
TSelfParameterPosition() or
45+
/**
46+
* This is used for tracking flow through captured variables, and
47+
* we use separate parameter/argument positions in order to distinguish
48+
* "lambda self" from "normal self", as lambdas may also access outer `self`
49+
* variables (through variable capture).
50+
*/
51+
TLambdaSelfParameterPosition() or
4552
TPositionalParameterPosition(int index) {
4653
index = any(Parameter p).getPosition()
4754
or
@@ -78,6 +85,9 @@ class ParameterPosition extends TParameterPosition {
7885
/** Holds if this position represents a `self`/`cls` parameter. */
7986
predicate isSelf() { this = TSelfParameterPosition() }
8087

88+
/** Holds if this position represents a reference to a lambda itself. Only used for tracking flow through captured variables. */
89+
predicate isLambdaSelf() { this = TLambdaSelfParameterPosition() }
90+
8191
/** Holds if this position represents a positional parameter at (0-based) `index`. */
8292
predicate isPositional(int index) { this = TPositionalParameterPosition(index) }
8393

@@ -109,6 +119,8 @@ class ParameterPosition extends TParameterPosition {
109119
string toString() {
110120
this.isSelf() and result = "self"
111121
or
122+
this.isLambdaSelf() and result = "lambda self"
123+
or
112124
exists(int index | this.isPositional(index) and result = "position " + index)
113125
or
114126
exists(string name | this.isKeyword(name) and result = "keyword " + name)
@@ -129,6 +141,13 @@ class ParameterPosition extends TParameterPosition {
129141
newtype TArgumentPosition =
130142
/** Used for `self` in methods, and `cls` in classmethods. */
131143
TSelfArgumentPosition() or
144+
/**
145+
* This is used for tracking flow through captured variables, and
146+
* we use separate parameter/argument positions in order to distinguish
147+
* "lambda self" from "normal self", as lambdas may also access outer `self`
148+
* variables (through variable capture).
149+
*/
150+
TLambdaSelfArgumentPosition() or
132151
TPositionalArgumentPosition(int index) {
133152
exists(any(CallNode c).getArg(index))
134153
or
@@ -153,6 +172,9 @@ class ArgumentPosition extends TArgumentPosition {
153172
/** Holds if this position represents a `self`/`cls` argument. */
154173
predicate isSelf() { this = TSelfArgumentPosition() }
155174

175+
/** Holds if this position represents a lambda `self` argument. Only used for tracking flow through captured variables. */
176+
predicate isLambdaSelf() { this = TLambdaSelfArgumentPosition() }
177+
156178
/** Holds if this position represents a positional argument at (0-based) `index`. */
157179
predicate isPositional(int index) { this = TPositionalArgumentPosition(index) }
158180

@@ -169,6 +191,8 @@ class ArgumentPosition extends TArgumentPosition {
169191
string toString() {
170192
this.isSelf() and result = "self"
171193
or
194+
this.isLambdaSelf() and result = "lambda self"
195+
or
172196
exists(int pos | this.isPositional(pos) and result = "position " + pos)
173197
or
174198
exists(string name | this.isKeyword(name) and result = "keyword " + name)
@@ -183,6 +207,8 @@ class ArgumentPosition extends TArgumentPosition {
183207
predicate parameterMatch(ParameterPosition ppos, ArgumentPosition apos) {
184208
ppos.isSelf() and apos.isSelf()
185209
or
210+
ppos.isLambdaSelf() and apos.isLambdaSelf()
211+
or
186212
exists(int index | ppos.isPositional(index) and apos.isPositional(index))
187213
or
188214
exists(string name | ppos.isKeyword(name) and apos.isKeyword(name))
@@ -1506,6 +1532,37 @@ abstract class ParameterNodeImpl extends Node {
15061532
}
15071533
}
15081534

1535+
/**
1536+
* A synthetic parameter representing the values of the variables captured
1537+
* by the callable being called. This parameter represents a single object
1538+
* where all the values are stored as attributes.
1539+
* This is also known as the environment part of a closure.
1540+
*
1541+
* This is used for tracking flow through captured variables.
1542+
*/
1543+
class SynthCapturedVariablesParameterNode extends ParameterNodeImpl,
1544+
TSynthCapturedVariablesParameterNode
1545+
{
1546+
private Function callable;
1547+
1548+
SynthCapturedVariablesParameterNode() { this = TSynthCapturedVariablesParameterNode(callable) }
1549+
1550+
final Function getCallable() { result = callable }
1551+
1552+
override Parameter getParameter() { none() }
1553+
1554+
override predicate isParameterOf(DataFlowCallable c, ParameterPosition pos) {
1555+
c = TFunction(callable) and
1556+
pos.isLambdaSelf()
1557+
}
1558+
1559+
override Scope getScope() { result = callable }
1560+
1561+
override Location getLocation() { result = callable.getLocation() }
1562+
1563+
override string toString() { result = "lambda self in " + callable }
1564+
}
1565+
15091566
/** A parameter for a library callable with a flow summary. */
15101567
class SummaryParameterNode extends ParameterNodeImpl, FlowSummaryNode {
15111568
SummaryParameterNode() {
@@ -1580,6 +1637,39 @@ private class SummaryPostUpdateNode extends FlowSummaryNode, PostUpdateNodeImpl
15801637
override Node getPreUpdateNode() { result = pre }
15811638
}
15821639

1640+
/**
1641+
* A synthetic argument representing the values of the variables captured
1642+
* by the callable being called. This argument represents a single object
1643+
* where all the values are stored as attributes.
1644+
* This is also known as the environment part of a closure.
1645+
*
1646+
* This is used for tracking flow through captured variables.
1647+
*
1648+
* TODO:
1649+
* We might want a synthetic node here, but currently that incurs problems
1650+
* with non-monotonic recursion, because of the use of `resolveCall` in the
1651+
* char pred. This may be solvable by using
1652+
* `CallGraphConstruction::Make` instead of
1653+
* `CallGraphConstruction::Simple::Make` appropriately.
1654+
*/
1655+
class CapturedVariablesArgumentNode extends CfgNode, ArgumentNode {
1656+
CallNode callNode;
1657+
1658+
CapturedVariablesArgumentNode() {
1659+
node = callNode.getFunction() and
1660+
exists(Function target | resolveCall(callNode, target, _) |
1661+
target = any(VariableCapture::CapturedVariable v).getACapturingScope()
1662+
)
1663+
}
1664+
1665+
override string toString() { result = "Capturing closure argument" }
1666+
1667+
override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
1668+
callNode = call.getNode() and
1669+
pos.isLambdaSelf()
1670+
}
1671+
}
1672+
15831673
/** Gets a viable run-time target for the call `call`. */
15841674
DataFlowCallable viableCallable(DataFlowCall call) {
15851675
call instanceof ExtractedDataFlowCall and

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ private import semmle.python.Frameworks
1717
import MatchUnpacking
1818
import IterableUnpacking
1919
import DataFlowDispatch
20+
import VariableCapture as VariableCapture
2021

2122
/** Gets the callable in which this node occurs. */
2223
DataFlowCallable nodeGetEnclosingCallable(Node n) { result = n.getEnclosingCallable() }
@@ -474,6 +475,8 @@ predicate simpleLocalFlowStep(Node nodeFrom, Node nodeTo) {
474475
simpleLocalFlowStepForTypetracking(nodeFrom, nodeTo)
475476
or
476477
summaryFlowSteps(nodeFrom, nodeTo)
478+
or
479+
variableCaptureLocalFlowStep(nodeFrom, nodeTo)
477480
}
478481

479482
/**
@@ -496,6 +499,16 @@ predicate summaryFlowSteps(Node nodeFrom, Node nodeTo) {
496499
IncludePostUpdateFlow<PhaseDependentFlow<summaryLocalStep/2>::step/2>::step(nodeFrom, nodeTo)
497500
}
498501

502+
predicate variableCaptureLocalFlowStep(Node nodeFrom, Node nodeTo) {
503+
// Blindly applying use-use flow can result in a node that steps to itself, for
504+
// example in while-loops. To uphold dataflow consistency checks, we don't want
505+
// that. However, we do want to allow `[post] n` to `n` (to handle while loops), so
506+
// we should only do the filtering after `IncludePostUpdateFlow` has been applied.
507+
IncludePostUpdateFlow<PhaseDependentFlow<VariableCapture::valueStep/2>::step/2>::step(nodeFrom,
508+
nodeTo) and
509+
nodeFrom != nodeTo
510+
}
511+
499512
/** `ModuleVariable`s are accessed via jump steps at runtime. */
500513
predicate runtimeJumpStep(Node nodeFrom, Node nodeTo) {
501514
// Module variable read
@@ -559,7 +572,7 @@ predicate compatibleTypes(DataFlowType t1, DataFlowType t2) { any() }
559572

560573
predicate typeStrongerThan(DataFlowType t1, DataFlowType t2) { none() }
561574

562-
predicate localMustFlowStep(Node node1, Node node2) { none() }
575+
predicate localMustFlowStep(Node nodeFrom, Node nodeTo) { none() }
563576

564577
/**
565578
* Gets the type of `node`.
@@ -663,6 +676,38 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
663676
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
664677
or
665678
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
679+
or
680+
VariableCapture::storeStep(nodeFrom, c, nodeTo)
681+
}
682+
683+
/**
684+
* A synthesized data flow node representing a closure object that tracks
685+
* captured variables.
686+
*/
687+
class SynthCaptureNode extends Node, TSynthCaptureNode {
688+
private VariableCapture::Flow::SynthesizedCaptureNode cn;
689+
690+
SynthCaptureNode() { this = TSynthCaptureNode(cn) }
691+
692+
/** Gets the `SynthesizedCaptureNode` that this node represents. */
693+
VariableCapture::Flow::SynthesizedCaptureNode getSynthesizedCaptureNode() { result = cn }
694+
695+
override Scope getScope() { result = cn.getEnclosingCallable() }
696+
697+
override Location getLocation() { result = cn.getLocation() }
698+
699+
override string toString() { result = cn.toString() }
700+
}
701+
702+
private class SynthCapturePostUpdateNode extends PostUpdateNodeImpl, SynthCaptureNode {
703+
private SynthCaptureNode pre;
704+
705+
SynthCapturePostUpdateNode() {
706+
VariableCapture::Flow::capturePostUpdateNode(this.getSynthesizedCaptureNode(),
707+
pre.getSynthesizedCaptureNode())
708+
}
709+
710+
override Node getPreUpdateNode() { result = pre }
666711
}
667712

668713
/**
@@ -866,6 +911,8 @@ predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
866911
nodeTo.(FlowSummaryNode).getSummaryNode())
867912
or
868913
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
914+
or
915+
VariableCapture::readStep(nodeFrom, c, nodeTo)
869916
}
870917

871918
/** Data flows from a sequence to a subscript of the sequence. */
@@ -995,6 +1042,10 @@ predicate nodeIsHidden(Node n) {
9951042
n instanceof SynthDictSplatArgumentNode
9961043
or
9971044
n instanceof SynthDictSplatParameterNode
1045+
or
1046+
n instanceof SynthCaptureNode
1047+
or
1048+
n instanceof SynthCapturedVariablesParameterNode
9981049
}
9991050

10001051
class LambdaCallKind = Unit;
@@ -1034,6 +1085,11 @@ predicate allowParameterReturnInSelf(ParameterNode p) {
10341085
p.(ParameterNodeImpl).isParameterOf(c, pos) and
10351086
FlowSummaryImpl::Private::summaryAllowParameterReturnInSelf(c.asLibraryCallable(), pos)
10361087
)
1088+
or
1089+
exists(Function f |
1090+
VariableCapture::Flow::heuristicAllowInstanceParameterReturnInSelf(f) and
1091+
p = TSynthCapturedVariablesParameterNode(f)
1092+
)
10371093
}
10381094

10391095
/** An approximated `Content`. */

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,14 @@ newtype TNode =
117117
/** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */
118118
TSynthDictSplatParameterNode(DataFlowCallable callable) {
119119
exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos)))
120+
} or
121+
/** A synthetic node representing a captured variable. */
122+
TSynthCaptureNode(VariableCapture::Flow::SynthesizedCaptureNode cn) or
123+
/** A synthetic node representing the heap of a function. Used for variable capture. */
124+
TSynthCapturedVariablesParameterNode(Function f) {
125+
f = any(VariableCapture::CapturedVariable v).getACapturingScope() and
126+
// TODO: Remove this restriction when adding proper support for captured variables in the body of the function we generate for comprehensions
127+
exists(TFunction(f))
120128
}
121129

122130
private import semmle.python.internal.CachedStages
@@ -627,7 +635,9 @@ newtype TContent =
627635
exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) |
628636
attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim()
629637
)
630-
}
638+
} or
639+
/** A captured variable. */
640+
TCapturedVariableContent(VariableCapture::CapturedVariable v)
631641

632642
/**
633643
* A data-flow value can have associated content.
@@ -690,6 +700,18 @@ class AttributeContent extends TAttributeContent, Content {
690700
override string toString() { result = "Attribute " + attr }
691701
}
692702

703+
/** A captured variable. */
704+
class CapturedVariableContent extends Content, TCapturedVariableContent {
705+
private VariableCapture::CapturedVariable v;
706+
707+
CapturedVariableContent() { this = TCapturedVariableContent(v) }
708+
709+
/** Gets the captured variable. */
710+
VariableCapture::CapturedVariable getVariable() { result = v }
711+
712+
override string toString() { result = "captured " + v }
713+
}
714+
693715
/**
694716
* An entity that represents a set of `Content`s.
695717
*

python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,16 @@ private import DataFlowImplSpecific::Public
1212
module Input implements InputSig<DataFlowImplSpecific::PythonDataFlow> {
1313
class SummarizedCallableBase = string;
1414

15-
ArgumentPosition callbackSelfParameterPosition() { none() }
15+
ArgumentPosition callbackSelfParameterPosition() { result.isLambdaSelf() }
1616

1717
ReturnKind getStandardReturnValueKind() { any() }
1818

1919
string encodeParameterPosition(ParameterPosition pos) {
2020
pos.isSelf() and result = "self"
2121
or
22+
pos.isLambdaSelf() and
23+
result = "lambda-self"
24+
or
2225
exists(int i |
2326
pos.isPositional(i) and
2427
result = i.toString()
@@ -33,6 +36,9 @@ module Input implements InputSig<DataFlowImplSpecific::PythonDataFlow> {
3336
string encodeArgumentPosition(ArgumentPosition pos) {
3437
pos.isSelf() and result = "self"
3538
or
39+
pos.isLambdaSelf() and
40+
result = "lambda-self"
41+
or
3642
exists(int i |
3743
pos.isPositional(i) and
3844
result = i.toString()

0 commit comments

Comments
 (0)