diff --git a/python/ql/consistency-queries/DataFlowConsistency.ql b/python/ql/consistency-queries/DataFlowConsistency.ql index 0afc4f4884d9..f0a0d0356ca2 100644 --- a/python/ql/consistency-queries/DataFlowConsistency.ql +++ b/python/ql/consistency-queries/DataFlowConsistency.ql @@ -14,6 +14,8 @@ private module Input implements InputSig { private import Private private import Public + predicate postWithInFlowExclude(Node n) { n instanceof FlowSummaryNode } + predicate argHasPostUpdateExclude(ArgumentNode n) { // TODO: Implement post-updates for *args, see tests added in https://github.com/github/codeql/pull/14936 exists(ArgumentPosition apos | n.argumentOf(_, apos) and apos.isStarArgs(_)) @@ -44,6 +46,13 @@ private module Input implements InputSig { ) } + predicate uniqueEnclosingCallableExclude(Node n) { + // We only have a selection of valid callables. + // For instance, we do not have classes as `DataFlowCallable`s. + not n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Function and + not n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Module + } + predicate uniqueCallEnclosingCallableExclude(DataFlowCall call) { not exists(call.getLocation().getFile().getRelativePath()) } @@ -53,7 +62,7 @@ private module Input implements InputSig { } predicate multipleArgumentCallExclude(ArgumentNode arg, DataFlowCall call) { - // since we can have multiple DataFlowCall for a CallNode (for example if can + // since we can have multiple DataFlowCall for a CallNode (for example if it can // resolve to multiple functions), but we only make _one_ ArgumentNode for each // argument in the CallNode, we end up violating this consistency check in those // cases. (see `getCallArg` in DataFlowDispatch.qll) diff --git a/python/ql/lib/change-notes/2023-12-18-support-variable-capture.md b/python/ql/lib/change-notes/2023-12-18-support-variable-capture.md new file mode 100644 index 000000000000..e7aee047fa15 --- /dev/null +++ b/python/ql/lib/change-notes/2023-12-18-support-variable-capture.md @@ -0,0 +1,4 @@ +--- +category: majorAnalysis +--- +* Added support for global data-flow through captured variables. \ No newline at end of file diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll index df9329058951..87a278e0f6bc 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll @@ -42,6 +42,13 @@ private import semmle.python.dataflow.new.internal.TypeTrackingImpl::CallGraphCo newtype TParameterPosition = /** Used for `self` in methods, and `cls` in classmethods. */ TSelfParameterPosition() or + /** + * This is used for tracking flow through captured variables, and + * we use separate parameter/argument positions in order to distinguish + * "lambda self" from "normal self", as lambdas may also access outer `self` + * variables (through variable capture). + */ + TLambdaSelfParameterPosition() or TPositionalParameterPosition(int index) { index = any(Parameter p).getPosition() or @@ -78,6 +85,9 @@ class ParameterPosition extends TParameterPosition { /** Holds if this position represents a `self`/`cls` parameter. */ predicate isSelf() { this = TSelfParameterPosition() } + /** Holds if this position represents a reference to a lambda itself. Only used for tracking flow through captured variables. */ + predicate isLambdaSelf() { this = TLambdaSelfParameterPosition() } + /** Holds if this position represents a positional parameter at (0-based) `index`. */ predicate isPositional(int index) { this = TPositionalParameterPosition(index) } @@ -109,6 +119,8 @@ class ParameterPosition extends TParameterPosition { string toString() { this.isSelf() and result = "self" or + this.isLambdaSelf() and result = "lambda self" + or exists(int index | this.isPositional(index) and result = "position " + index) or exists(string name | this.isKeyword(name) and result = "keyword " + name) @@ -129,6 +141,13 @@ class ParameterPosition extends TParameterPosition { newtype TArgumentPosition = /** Used for `self` in methods, and `cls` in classmethods. */ TSelfArgumentPosition() or + /** + * This is used for tracking flow through captured variables, and + * we use separate parameter/argument positions in order to distinguish + * "lambda self" from "normal self", as lambdas may also access outer `self` + * variables (through variable capture). + */ + TLambdaSelfArgumentPosition() or TPositionalArgumentPosition(int index) { exists(any(CallNode c).getArg(index)) or @@ -153,6 +172,9 @@ class ArgumentPosition extends TArgumentPosition { /** Holds if this position represents a `self`/`cls` argument. */ predicate isSelf() { this = TSelfArgumentPosition() } + /** Holds if this position represents a lambda `self` argument. Only used for tracking flow through captured variables. */ + predicate isLambdaSelf() { this = TLambdaSelfArgumentPosition() } + /** Holds if this position represents a positional argument at (0-based) `index`. */ predicate isPositional(int index) { this = TPositionalArgumentPosition(index) } @@ -169,6 +191,8 @@ class ArgumentPosition extends TArgumentPosition { string toString() { this.isSelf() and result = "self" or + this.isLambdaSelf() and result = "lambda self" + or exists(int pos | this.isPositional(pos) and result = "position " + pos) or exists(string name | this.isKeyword(name) and result = "keyword " + name) @@ -183,6 +207,8 @@ class ArgumentPosition extends TArgumentPosition { predicate parameterMatch(ParameterPosition ppos, ArgumentPosition apos) { ppos.isSelf() and apos.isSelf() or + ppos.isLambdaSelf() and apos.isLambdaSelf() + or exists(int index | ppos.isPositional(index) and apos.isPositional(index)) or exists(string name | ppos.isKeyword(name) and apos.isKeyword(name)) @@ -1506,6 +1532,37 @@ abstract class ParameterNodeImpl extends Node { } } +/** + * A synthetic parameter representing the values of the variables captured + * by the callable being called. This parameter represents a single object + * where all the values are stored as attributes. + * This is also known as the environment part of a closure. + * + * This is used for tracking flow through captured variables. + */ +class SynthCapturedVariablesParameterNode extends ParameterNodeImpl, + TSynthCapturedVariablesParameterNode +{ + private Function callable; + + SynthCapturedVariablesParameterNode() { this = TSynthCapturedVariablesParameterNode(callable) } + + final Function getCallable() { result = callable } + + override Parameter getParameter() { none() } + + override predicate isParameterOf(DataFlowCallable c, ParameterPosition pos) { + c = TFunction(callable) and + pos.isLambdaSelf() + } + + override Scope getScope() { result = callable } + + override Location getLocation() { result = callable.getLocation() } + + override string toString() { result = "lambda self in " + callable } +} + /** A parameter for a library callable with a flow summary. */ class SummaryParameterNode extends ParameterNodeImpl, FlowSummaryNode { SummaryParameterNode() { @@ -1580,6 +1637,39 @@ private class SummaryPostUpdateNode extends FlowSummaryNode, PostUpdateNodeImpl override Node getPreUpdateNode() { result = pre } } +/** + * A synthetic argument representing the values of the variables captured + * by the callable being called. This argument represents a single object + * where all the values are stored as attributes. + * This is also known as the environment part of a closure. + * + * This is used for tracking flow through captured variables. + * + * TODO: + * We might want a synthetic node here, but currently that incurs problems + * with non-monotonic recursion, because of the use of `resolveCall` in the + * char pred. This may be solvable by using + * `CallGraphConstruction::Make` in stead of + * `CallGraphConstruction::Simple::Make` appropriately. + */ +class CapturedVariablesArgumentNode extends CfgNode, ArgumentNode { + CallNode callNode; + + CapturedVariablesArgumentNode() { + node = callNode.getFunction() and + exists(Function target | resolveCall(callNode, target, _) | + target = any(VariableCapture::CapturedVariable v).getACapturingScope() + ) + } + + override string toString() { result = "Capturing closure argument" } + + override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) { + callNode = call.getNode() and + pos.isLambdaSelf() + } +} + /** Gets a viable run-time target for the call `call`. */ DataFlowCallable viableCallable(DataFlowCall call) { call instanceof ExtractedDataFlowCall and diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll index 0043aa3b2fda..61d84764d87c 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll @@ -17,6 +17,7 @@ private import semmle.python.Frameworks import MatchUnpacking import IterableUnpacking import DataFlowDispatch +import VariableCapture as VariableCapture /** Gets the callable in which this node occurs. */ DataFlowCallable nodeGetEnclosingCallable(Node n) { result = n.getEnclosingCallable() } @@ -474,6 +475,8 @@ predicate simpleLocalFlowStep(Node nodeFrom, Node nodeTo) { simpleLocalFlowStepForTypetracking(nodeFrom, nodeTo) or summaryFlowSteps(nodeFrom, nodeTo) + or + variableCaptureLocalFlowStep(nodeFrom, nodeTo) } /** @@ -496,6 +499,16 @@ predicate summaryFlowSteps(Node nodeFrom, Node nodeTo) { IncludePostUpdateFlow::step/2>::step(nodeFrom, nodeTo) } +predicate variableCaptureLocalFlowStep(Node nodeFrom, Node nodeTo) { + // Blindly applying use-use flow can result in a node that steps to itself, for + // example in while-loops. To uphold dataflow consistency checks, we don't want + // that. However, we do want to allow `[post] n` to `n` (to handle while loops), so + // we should only do the filtering after `IncludePostUpdateFlow` has ben applied. + IncludePostUpdateFlow::step/2>::step(nodeFrom, + nodeTo) and + nodeFrom != nodeTo +} + /** `ModuleVariable`s are accessed via jump steps at runtime. */ predicate runtimeJumpStep(Node nodeFrom, Node nodeTo) { // Module variable read @@ -559,7 +572,7 @@ predicate compatibleTypes(DataFlowType t1, DataFlowType t2) { any() } predicate typeStrongerThan(DataFlowType t1, DataFlowType t2) { none() } -predicate localMustFlowStep(Node node1, Node node2) { none() } +predicate localMustFlowStep(Node nodeFrom, Node nodeTo) { none() } /** * Gets the type of `node`. @@ -663,6 +676,38 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) { synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo) or synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo) + or + VariableCapture::storeStep(nodeFrom, c, nodeTo) +} + +/** + * A synthesized data flow node representing a closure object that tracks + * captured variables. + */ +class SynthCaptureNode extends Node, TSynthCaptureNode { + private VariableCapture::Flow::SynthesizedCaptureNode cn; + + SynthCaptureNode() { this = TSynthCaptureNode(cn) } + + /** Gets the `SynthesizedCaptureNode` that this node represents. */ + VariableCapture::Flow::SynthesizedCaptureNode getSynthesizedCaptureNode() { result = cn } + + override Scope getScope() { result = cn.getEnclosingCallable() } + + override Location getLocation() { result = cn.getLocation() } + + override string toString() { result = cn.toString() } +} + +private class SynthCapturePostUpdateNode extends PostUpdateNodeImpl, SynthCaptureNode { + private SynthCaptureNode pre; + + SynthCapturePostUpdateNode() { + VariableCapture::Flow::capturePostUpdateNode(this.getSynthesizedCaptureNode(), + pre.getSynthesizedCaptureNode()) + } + + override Node getPreUpdateNode() { result = pre } } /** @@ -866,6 +911,8 @@ predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) { nodeTo.(FlowSummaryNode).getSummaryNode()) or synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo) + or + VariableCapture::readStep(nodeFrom, c, nodeTo) } /** Data flows from a sequence to a subscript of the sequence. */ @@ -995,6 +1042,10 @@ predicate nodeIsHidden(Node n) { n instanceof SynthDictSplatArgumentNode or n instanceof SynthDictSplatParameterNode + or + n instanceof SynthCaptureNode + or + n instanceof SynthCapturedVariablesParameterNode } class LambdaCallKind = Unit; @@ -1034,6 +1085,11 @@ predicate allowParameterReturnInSelf(ParameterNode p) { p.(ParameterNodeImpl).isParameterOf(c, pos) and FlowSummaryImpl::Private::summaryAllowParameterReturnInSelf(c.asLibraryCallable(), pos) ) + or + exists(Function f | + VariableCapture::Flow::heuristicAllowInstanceParameterReturnInSelf(f) and + p = TSynthCapturedVariablesParameterNode(f) + ) } /** An approximated `Content`. */ diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll index 35eae2f99cb8..8f08efb0f93d 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll @@ -117,6 +117,14 @@ newtype TNode = /** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */ TSynthDictSplatParameterNode(DataFlowCallable callable) { exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos))) + } or + /** A synthetic node representing a captured variable. */ + TSynthCaptureNode(VariableCapture::Flow::SynthesizedCaptureNode cn) or + /** A synthetic node representing the heap of a function. Used for variable capture. */ + TSynthCapturedVariablesParameterNode(Function f) { + f = any(VariableCapture::CapturedVariable v).getACapturingScope() and + // TODO: Remove this restriction when adding proper support for captured variables in the body of the function we generate for comprehensions + exists(TFunction(f)) } private import semmle.python.internal.CachedStages @@ -627,7 +635,9 @@ newtype TContent = exists(string input, string output | ModelOutput::relevantSummaryModel(_, _, input, output, _) | attr = [input, output].regexpFind("(?<=(^|\\.)Attribute\\[)[^\\]]+(?=\\])", _, _).trim() ) - } + } or + /** A captured variable. */ + TCapturedVariableContent(VariableCapture::CapturedVariable v) /** * A data-flow value can have associated content. @@ -690,6 +700,18 @@ class AttributeContent extends TAttributeContent, Content { override string toString() { result = "Attribute " + attr } } +/** A captured variable. */ +class CapturedVariableContent extends Content, TCapturedVariableContent { + private VariableCapture::CapturedVariable v; + + CapturedVariableContent() { this = TCapturedVariableContent(v) } + + /** Gets the captured variable. */ + VariableCapture::CapturedVariable getVariable() { result = v } + + override string toString() { result = "captured " + v } +} + /** * An entity that represents a set of `Content`s. * diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll b/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll index 055a7cee03b2..4a55d38edb6c 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll @@ -12,13 +12,16 @@ private import DataFlowImplSpecific::Public module Input implements InputSig { class SummarizedCallableBase = string; - ArgumentPosition callbackSelfParameterPosition() { none() } + ArgumentPosition callbackSelfParameterPosition() { result.isLambdaSelf() } ReturnKind getStandardReturnValueKind() { any() } string encodeParameterPosition(ParameterPosition pos) { pos.isSelf() and result = "self" or + pos.isLambdaSelf() and + result = "lambda-self" + or exists(int i | pos.isPositional(i) and result = i.toString() @@ -33,6 +36,9 @@ module Input implements InputSig { string encodeArgumentPosition(ArgumentPosition pos) { pos.isSelf() and result = "self" or + pos.isLambdaSelf() and + result = "lambda-self" + or exists(int i | pos.isPositional(i) and result = i.toString() diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/VariableCapture.qll b/python/ql/lib/semmle/python/dataflow/new/internal/VariableCapture.qll new file mode 100644 index 000000000000..dd4d3f038677 --- /dev/null +++ b/python/ql/lib/semmle/python/dataflow/new/internal/VariableCapture.qll @@ -0,0 +1,207 @@ +/** Provides logic related to captured variables. */ + +private import python +private import DataFlowPublic +private import semmle.python.dataflow.new.internal.DataFlowPrivate +private import codeql.dataflow.VariableCapture as Shared + +// Note: The Javascript implementation (on the branch https://github.com/github/codeql/pull/14412) +// had some tweaks related to performance. See these two commits: +// - JS: Capture flow: https://github.com/github/codeql/pull/14412/commits/7bcf8b858babfea0a3e36ce61145954c249e13ac +// - JS: Disallow consecutive captured contents: https://github.com/github/codeql/pull/14412/commits/46e4cdc6232604ea7f58138a336d5a222fad8567 +// The first is the main implementation, the second is a performance motivated restriction. +// The restriction is to clear any `CapturedVariableContent` before writing a new one +// to avoid long access paths (see the link for a nice explanation). +private module CaptureInput implements Shared::InputSig { + private import python as PY + + additional class ExprCfgNode extends ControlFlowNode { + ExprCfgNode() { isExpressionNode(this) } + } + + class Callable extends Scope { + predicate isConstructor() { none() } + } + + class BasicBlock extends PY::BasicBlock { + Callable getEnclosingCallable() { result = this.getScope() } + + // Note `PY:BasicBlock` does not have a `getLocation`. + // (Instead it has a complicated location info logic.) + // Using the location of the first node is simple + // and we just need a way to identify the basic block + // during debugging, so this will be serviceable. + Location getLocation() { result = super.getNode(0).getLocation() } + } + + BasicBlock getImmediateBasicBlockDominator(BasicBlock bb) { result = bb.getImmediateDominator() } + + BasicBlock getABasicBlockSuccessor(BasicBlock bb) { result = bb.getASuccessor() } + + class CapturedVariable extends LocalVariable { + Function f; + + CapturedVariable() { + // note: captured variables originating on module scope is currently + // covered by global variable handling. + this.getScope() = f and + this.getAnAccess().getScope() != f + } + + Callable getCallable() { result = f } + + Location getLocation() { result = f.getLocation() } + + /** Gets a scope that captures this variable. */ + Scope getACapturingScope() { + result = this.getAnAccess().getScope().getScope*() and + result.getScope+() = f + } + } + + class CapturedParameter extends CapturedVariable { + CapturedParameter() { this.isParameter() } + + ControlFlowNode getCfgNode() { result.getNode().(Parameter) = this.getAnAccess() } + } + + class Expr extends ExprCfgNode { + predicate hasCfgNode(BasicBlock bb, int i) { this = bb.getNode(i) } + } + + class VariableWrite extends ControlFlowNode { + CapturedVariable v; + + VariableWrite() { this = v.getAStore().getAFlowNode().(DefinitionNode).getValue() } + + CapturedVariable getVariable() { result = v } + + predicate hasCfgNode(BasicBlock bb, int i) { this = bb.getNode(i) } + } + + class VariableRead extends Expr { + CapturedVariable v; + + VariableRead() { this = v.getALoad().getAFlowNode() } + + CapturedVariable getVariable() { result = v } + } + + private predicate closureFlowStep(ExprCfgNode nodeFrom, ExprCfgNode nodeTo) { + // TODO: Other languages have an extra case here looking like + // simpleAstFlowStep(nodeFrom, nodeTo) + // we should investigate the potential benefit of adding that. + exists(SsaVariable def | + def.getAUse() = nodeTo and + def.getAnUltimateDefinition().getDefinition().(DefinitionNode).getValue() = nodeFrom + ) + } + + class ClosureExpr extends Expr { + ClosureExpr() { + this.getNode() instanceof CallableExpr + or + this.getNode() instanceof Comp + } + + predicate hasBody(Callable body) { + body = this.getNode().(CallableExpr).getInnerScope() + or + body = this.getNode().(Comp).getFunction() + } + + predicate hasAliasedAccess(Expr f) { closureFlowStep+(this, f) and not closureFlowStep(f, _) } + } +} + +class CapturedVariable = CaptureInput::CapturedVariable; + +class ClosureExpr = CaptureInput::ClosureExpr; + +module Flow = Shared::Flow; + +private Flow::ClosureNode asClosureNode(Node n) { + result = n.(SynthCaptureNode).getSynthesizedCaptureNode() + or + result.(Flow::ExprNode).getExpr() = n.(CfgNode).getNode() + or + result.(Flow::VariableWriteSourceNode).getVariableWrite() = n.(CfgNode).getNode() + or + result.(Flow::ExprPostUpdateNode).getExpr() = + n.(PostUpdateNode).getPreUpdateNode().(CfgNode).getNode() + or + result.(Flow::ParameterNode).getParameter().getCfgNode() = n.(CfgNode).getNode() + or + result.(Flow::ThisParameterNode).getCallable() = + n.(SynthCapturedVariablesParameterNode).getCallable() +} + +predicate storeStep(Node nodeFrom, CapturedVariableContent c, Node nodeTo) { + Flow::storeStep(asClosureNode(nodeFrom), c.getVariable(), asClosureNode(nodeTo)) +} + +predicate readStep(Node nodeFrom, CapturedVariableContent c, Node nodeTo) { + Flow::readStep(asClosureNode(nodeFrom), c.getVariable(), asClosureNode(nodeTo)) +} + +predicate valueStep(Node nodeFrom, Node nodeTo) { + Flow::localFlowStep(asClosureNode(nodeFrom), asClosureNode(nodeTo)) +} + +/** + * Provides predicates to understand the behavior of the variable capture + * library instantiation on Python code bases. + * + * The predicates in here are meant to be run by quick-eval on databases of + * interest. The `unmapped*`-predicates should ideally be empty. + */ +private module Debug { + predicate flowStoreStep( + Node nodeFrom, Flow::ClosureNode closureNodeFrom, CapturedVariable v, + Flow::ClosureNode closureNodeTo, Node nodeTo + ) { + closureNodeFrom = asClosureNode(nodeFrom) and + closureNodeTo = asClosureNode(nodeTo) and + Flow::storeStep(closureNodeFrom, v, closureNodeTo) + } + + predicate unmappedFlowStoreStep( + Flow::ClosureNode closureNodeFrom, CapturedVariable v, Flow::ClosureNode closureNodeTo + ) { + Flow::storeStep(closureNodeFrom, v, closureNodeTo) and + not flowStoreStep(_, closureNodeFrom, v, closureNodeTo, _) + } + + predicate flowReadStep( + Node nodeFrom, Flow::ClosureNode closureNodeFrom, CapturedVariable v, + Flow::ClosureNode closureNodeTo, Node nodeTo + ) { + closureNodeFrom = asClosureNode(nodeFrom) and + closureNodeTo = asClosureNode(nodeTo) and + Flow::readStep(closureNodeFrom, v, closureNodeTo) + } + + predicate unmappedFlowReadStep( + Flow::ClosureNode closureNodeFrom, CapturedVariable v, Flow::ClosureNode closureNodeTo + ) { + Flow::readStep(closureNodeFrom, v, closureNodeTo) and + not flowReadStep(_, closureNodeFrom, v, closureNodeTo, _) + } + + predicate flowValueStep( + Node nodeFrom, Flow::ClosureNode closureNodeFrom, Flow::ClosureNode closureNodeTo, Node nodeTo + ) { + closureNodeFrom = asClosureNode(nodeFrom) and + closureNodeTo = asClosureNode(nodeTo) and + Flow::localFlowStep(closureNodeFrom, closureNodeTo) + } + + predicate unmappedFlowValueStep(Flow::ClosureNode closureNodeFrom, Flow::ClosureNode closureNodeTo) { + Flow::localFlowStep(closureNodeFrom, closureNodeTo) and + not flowValueStep(_, closureNodeFrom, closureNodeTo, _) + } + + predicate unmappedFlowClosureNode(Flow::ClosureNode closureNode) { + not closureNode = asClosureNode(_) + } +} diff --git a/python/ql/test/experimental/dataflow/typetracking/tracked.ql b/python/ql/test/experimental/dataflow/typetracking/tracked.ql index 56e0c19eed3c..ca893688256c 100644 --- a/python/ql/test/experimental/dataflow/typetracking/tracked.ql +++ b/python/ql/test/experimental/dataflow/typetracking/tracked.ql @@ -3,6 +3,7 @@ import semmle.python.dataflow.new.DataFlow import semmle.python.dataflow.new.TypeTracking import TestUtilities.InlineExpectationsTest import semmle.python.ApiGraphs +private import semmle.python.dataflow.new.internal.DataFlowPrivate as DP // ----------------------------------------------------------------------------- // tracked @@ -27,6 +28,8 @@ module TrackedTest implements TestSig { // We do not wish to annotate scope entry definitions, // as they do not appear in the source code. not e instanceof DataFlow::ScopeEntryDefinitionNode and + // ...same for `SynthCaptureNode`s + not e instanceof DP::SynthCaptureNode and tag = "tracked" and location = e.getLocation() and value = t.getAttr() and diff --git a/python/ql/test/experimental/dataflow/validTest.py b/python/ql/test/experimental/dataflow/validTest.py index 5e12e7cccc71..76bd7bb52a53 100644 --- a/python/ql/test/experimental/dataflow/validTest.py +++ b/python/ql/test/experimental/dataflow/validTest.py @@ -74,6 +74,8 @@ def check_tests_valid_after_version(testFile, version): check_tests_valid("variable-capture.dict") check_tests_valid("variable-capture.test_collections") check_tests_valid("variable-capture.by_value") + check_tests_valid("variable-capture.test_library_calls") + check_tests_valid("variable-capture.test_fields") check_tests_valid("module-initialization.multiphase") check_tests_valid("fieldflow.test") check_tests_valid("fieldflow.test_dict") diff --git a/python/ql/test/experimental/dataflow/variable-capture/by_value.py b/python/ql/test/experimental/dataflow/variable-capture/by_value.py index f43b1c6b120a..fa7546b8f2b6 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/by_value.py +++ b/python/ql/test/experimental/dataflow/variable-capture/by_value.py @@ -34,14 +34,14 @@ def by_value1(): a = SOURCE def inner(a_val=a): SINK(a_val) #$ captured - SINK_F(a) + SINK_F(a) #$ SPURIOUS: captured a = NONSOURCE inner() def by_value2(): a = NONSOURCE def inner(a_val=a): - SINK(a) #$ MISSING:captured + SINK(a) #$ captured SINK_F(a_val) a = SOURCE inner() diff --git a/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.expected b/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.expected new file mode 100644 index 000000000000..35f4edcf1fb9 --- /dev/null +++ b/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.expected @@ -0,0 +1,17 @@ +uniqueToString +uniqueEnclosingCallable +uniqueDominator +localDominator +localSuccessor +uniqueDefiningScope +variableIsCaptured +uniqueLocation +uniqueCfgNode +uniqueWriteTarget +uniqueWriteCfgNode +uniqueReadVariable +closureMustHaveBody +closureAliasMustBeInSameScope +variableAccessAstNesting +uniqueCallableLocation +consistencyOverview diff --git a/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.ql b/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.ql new file mode 100644 index 000000000000..653d00b70398 --- /dev/null +++ b/python/ql/test/experimental/dataflow/variable-capture/dataflow-capture-consistency.ql @@ -0,0 +1,2 @@ +import python +import semmle.python.dataflow.new.internal.DataFlowPrivate::VariableCapture::Flow::ConsistencyChecks diff --git a/python/ql/test/experimental/dataflow/variable-capture/dict.py b/python/ql/test/experimental/dataflow/variable-capture/dict.py index 3cfdfea7a1c4..8a8165d66747 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/dict.py +++ b/python/ql/test/experimental/dataflow/variable-capture/dict.py @@ -37,7 +37,7 @@ def out(): def captureOut1(): sinkO1["x"] = SOURCE captureOut1() - SINK(sinkO1["x"]) #$ MISSING:captured + SINK(sinkO1["x"]) #$ captured sinkO2 = { "x": "" } def captureOut2(): @@ -45,7 +45,7 @@ def m(): sinkO2["x"] = SOURCE m() captureOut2() - SINK(sinkO2["x"]) #$ MISSING:captured + SINK(sinkO2["x"]) #$ captured nonSink0 = { "x": "" } def captureOut1NotCalled(): @@ -67,7 +67,7 @@ def through(tainted): def captureOut1(): sinkO1["x"] = tainted captureOut1() - SINK(sinkO1["x"]) #$ MISSING:captured + SINK(sinkO1["x"]) #$ captured sinkO2 = { "x": "" } def captureOut2(): @@ -75,7 +75,7 @@ def m(): sinkO2["x"] = tainted m() captureOut2() - SINK(sinkO2["x"]) #$ MISSING:captured + SINK(sinkO2["x"]) #$ captured nonSink1 = { "x": "" } def captureOut1NotCalled(): diff --git a/python/ql/test/experimental/dataflow/variable-capture/global.py b/python/ql/test/experimental/dataflow/variable-capture/global.py index ef36b4a4d573..92d273592b81 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/global.py +++ b/python/ql/test/experimental/dataflow/variable-capture/global.py @@ -78,7 +78,7 @@ def captureOut1(): global sinkT1 sinkT1 = tainted captureOut1() - SINK(sinkT1) #$ MISSING:captured + SINK(sinkT1) #$ captured def captureOut2(): def m(): @@ -86,7 +86,7 @@ def m(): sinkT2 = tainted m() captureOut2() - SINK(sinkT2) #$ MISSING:captured + SINK(sinkT2) #$ captured def captureOut1NotCalled(): global nonSinkT1 diff --git a/python/ql/test/experimental/dataflow/variable-capture/in.py b/python/ql/test/experimental/dataflow/variable-capture/in.py index fffedfb1e77f..35fad035355c 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/in.py +++ b/python/ql/test/experimental/dataflow/variable-capture/in.py @@ -34,17 +34,17 @@ def SINK_F(x): def inParam(tainted): def captureIn1(): sinkI1 = tainted - SINK(sinkI1) #$ MISSING:captured + SINK(sinkI1) #$ captured captureIn1() def captureIn2(): def m(): sinkI2 = tainted - SINK(sinkI2) #$ MISSING:captured + SINK(sinkI2) #$ captured m() captureIn2() - captureIn3 = lambda arg: SINK(tainted) + captureIn3 = lambda arg: SINK(tainted) #$ captured captureIn3("") def captureIn1NotCalled(): @@ -68,17 +68,17 @@ def inLocal(): def captureIn1(): sinkI1 = tainted - SINK(sinkI1) #$ MISSING:captured + SINK(sinkI1) #$ captured captureIn1() def captureIn2(): def m(): sinkI2 = tainted - SINK(sinkI2) #$ MISSING:captured + SINK(sinkI2) #$ captured m() captureIn2() - captureIn3 = lambda arg: SINK(tainted) + captureIn3 = lambda arg: SINK(tainted) #$ captured captureIn3("") def captureIn1NotCalled(): diff --git a/python/ql/test/experimental/dataflow/variable-capture/nonlocal.py b/python/ql/test/experimental/dataflow/variable-capture/nonlocal.py index 0e8233532af9..91a06aae639f 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/nonlocal.py +++ b/python/ql/test/experimental/dataflow/variable-capture/nonlocal.py @@ -38,7 +38,7 @@ def captureOut1(): nonlocal sinkO1 sinkO1 = SOURCE captureOut1() - SINK(sinkO1) #$ MISSING:captured + SINK(sinkO1) #$ captured sinkO2 = "" def captureOut2(): @@ -47,7 +47,7 @@ def m(): sinkO2 = SOURCE m() captureOut2() - SINK(sinkO2) #$ MISSING:captured + SINK(sinkO2) #$ captured nonSink1 = "" def captureOut1NotCalled(): @@ -74,7 +74,7 @@ def captureOut1(): nonlocal sinkO1 sinkO1 = tainted captureOut1() - SINK(sinkO1) #$ MISSING:captured + SINK(sinkO1) #$ captured sinkO2 = "" def captureOut2(): @@ -83,7 +83,7 @@ def m(): sinkO2 = tainted m() captureOut2() - SINK(sinkO2) #$ MISSING:captured + SINK(sinkO2) #$ captured nonSink1 = "" def captureOut1NotCalled(): diff --git a/python/ql/test/experimental/dataflow/variable-capture/test_fields.py b/python/ql/test/experimental/dataflow/variable-capture/test_fields.py new file mode 100644 index 000000000000..79f2c4d04f49 --- /dev/null +++ b/python/ql/test/experimental/dataflow/variable-capture/test_fields.py @@ -0,0 +1,51 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.dirname((__file__)))) +from testlib import expects + +# These are defined so that we can evaluate the test code. +NONSOURCE = "not a source" +SOURCE = "source" + +def is_source(x): + return x == "source" or x == b"source" or x == 42 or x == 42.0 or x == 42j + + +def SINK(x): + if is_source(x): + print("OK") + else: + print("Unexpected flow", x) + + +def SINK_F(x): + if is_source(x): + print("Unexpected flow", x) + else: + print("OK") + +class MyObj(object): + def setFoo(self, foo): + self.foo = foo + + def getFoo(self): + return self.foo + +@expects(3) +def test_captured_field(): + foo = MyObj() + foo.setFoo(NONSOURCE) + + def test(): + SINK(foo.getFoo()) #$ captured + + def read(): + return foo.getFoo() + + SINK_F(read()) + + foo.setFoo(SOURCE) + test() + + SINK(read()) #$ captured \ No newline at end of file diff --git a/python/ql/test/experimental/dataflow/variable-capture/test_library_calls.py b/python/ql/test/experimental/dataflow/variable-capture/test_library_calls.py new file mode 100644 index 000000000000..70b07f66557a --- /dev/null +++ b/python/ql/test/experimental/dataflow/variable-capture/test_library_calls.py @@ -0,0 +1,48 @@ +# Here we test the case where a captured variable is being written inside a library call. + +# All functions starting with "test_" should run and execute `print("OK")` exactly once. +# This can be checked by running validTest.py. + +import sys +import os + +sys.path.append(os.path.dirname(os.path.dirname((__file__)))) +from testlib import expects + +# These are defined so that we can evaluate the test code. +NONSOURCE = "not a source" +SOURCE = "source" + +def is_source(x): + return x == "source" or x == b"source" or x == 42 or x == 42.0 or x == 42j + + +def SINK(x): + if is_source(x): + print("OK") + else: + print("Unexpected flow", x) + + +def SINK_F(x): + if is_source(x): + print("Unexpected flow", x) + else: + print("OK") + +# Actual tests start here + +@expects(2) +def test_library_call(): + captured = {"x": NONSOURCE} + + def set(x): + captured["x"] = SOURCE + return x + + SINK_F(captured["x"]) + + for x in map(set, [1]): + pass + + SINK(captured["x"]) #$ MISSING: captured