Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions libs/openant-core/parsers/javascript/dependency_resolver.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,21 @@ class DependencyResolver {
buildCallGraph() {
for (const [funcId, funcData] of Object.entries(this.functions)) {
const calls = this._extractCalls(funcData.code, funcId);

// Merge in any explicit call edges declared by the analyzer.
// This is used for cases the body-text regex can't see — e.g.
// Express middleware identifiers passed as sibling args:
// app.post('/x', authenticateToken, async (req,res) => {...})
const explicitCalls = funcData.explicitCalls || [];
const callerFile = funcId.split(':')[0];
for (const name of explicitCalls) {
if (!name) continue;
const resolved = this._resolveCall(name, callerFile, funcId);
if (resolved && !calls.includes(resolved)) {
calls.push(resolved);
}
}

this.callGraph[funcId] = calls;

// Build reverse graph
Expand Down
225 changes: 225 additions & 0 deletions libs/openant-core/parsers/javascript/typescript_analyzer.js
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,231 @@ class TypeScriptAnalyzer {
// Extract functions from module.exports.propertyName = function() {...}
// Pattern used by DVNA and similar CommonJS codebases
this._extractModuleExportsPropertyFunctions(sourceFile, relativePath);

// Extract anonymous callbacks used as Express route handlers / middleware
// Pattern: app.get('/x', auth, async (req, res) => {...})
this._extractExpressRouteCallbacks(sourceFile, relativePath);
}

/**
* Express HTTP verbs we recognise on a router/app object.
* `use` is included to pick up middleware-mount callbacks.
*/
static EXPRESS_VERBS = new Set([
"get",
"post",
"put",
"patch",
"delete",
"options",
"head",
"all",
"use",
]);

/**
* Walk a source file looking for Express-style route registrations and
* emit a synthetic function entry for each anonymous arrow / function
* expression used as a callback.
*
* Recognises patterns of the form:
* <obj>.<verb>(<path>, ...callbacks)
* <obj>.<verb>(...callbacks) // only for `use`
* where `<verb>` is one of the Express HTTP verbs (or `use`) and the
* first argument is a literal path (a string literal or a template literal
* without substitutions). Only `use` may omit the path.
*
* For each anonymous callback after the path argument (or from the first
* argument of a path-less `use`) we synthesise a function entry. An
* anonymous callback in the last argument position is treated as the route
* handler; earlier ones are middleware. Named identifiers in
* callback positions are recorded as explicit call edges from the
* synthesised callbacks (e.g. `authenticateToken` becomes an upstream
* dependency of the handler so call-graph based analyses see the
* relationship).
*/
/**
* Heuristic: does `receiver` look like an Express app / router?
*
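* We accept (case-insensitively) identifiers whose name ends with `app`,
* `router`, `routes`, or `server`, or contains one of those preceded by the
* start of the name or `_` and followed by a digit, `_`, or the end of the
* name (e.g. `app2`, `api_router_v1`). Property accesses such as `this.app`
* are accepted when the lowercased trailing name ends with one of those
* stems, and call receivers are accepted when the callee is a property
* access named `route` or `Router` (e.g. `app.route(...)`,
* `express.Router()`). Every other receiver — including names such as
* `web` or `api` — is rejected so generic `.get(...)` calls on caches /
* clients / query-builders aren't misread as routes; the trade-off is that
* apps using other naming conventions produce no route units.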
*/
_isPlausibleExpressReceiver(receiver) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The receiver filter has known false negatives for common naming conventions: codebases that use single-word identifiers like web, api, endpoints, controller for the Express app/router get no extraction at all, since those names don't end in app/router/routes/server. From a quick scan of popular Express apps in the wild, web and api are not unusual choices.

Suggestions (non-blocking):

  • Either expand the suffix list (add endpoints, controller, web, api — or treat any short identifier as a candidate and rely on the verb-set + string-path filter to reject false positives), or
  • Document the assumption in the JSDoc and the route_handler extraction docstring so reviewers / users understand why their app might silently produce zero route units.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in f869c97. Expanded the stem list to include web, api, endpoints, and controller, and consolidated both the Identifier and PropertyAccessExpression branches to share a single static EXPRESS_RECEIVER_STEMS field so they can't drift out of sync. Updated the JSDoc to document the remaining coverage boundary (identifiers outside the stem list will produce zero units).

if (!receiver) return false;
const kind = receiver.getKindName();

if (kind === "Identifier") {
const name = receiver.getText().toLowerCase();
return /(^|_)(app|router|routes|server)(\d|$|_)/.test(name)
|| /app$|router$|routes$|server$/.test(name)
|| name === "app"
|| name === "router"
|| name === "routes"
|| name === "server";
}
if (kind === "CallExpression") {
// e.g. app.route('/x').get(...) — receiver is the .route() call
const inner = receiver.getExpression && receiver.getExpression();
if (inner && inner.getKindName && inner.getKindName() === "PropertyAccessExpression") {
const innerName = inner.getName && inner.getName();
if (innerName === "route" || innerName === "Router") return true;
}
return false;
}
if (kind === "PropertyAccessExpression") {
// e.g. this.app.get(...) or express.Router().get(...) — accept when
// the trailing identifier matches our identifier pattern.
const trailing = receiver.getName && receiver.getName();
if (!trailing) return false;
const lower = trailing.toLowerCase();
return ["app", "router", "routes", "server"].some((s) => lower.endsWith(s));
}
return false;
}

_extractExpressRouteCallbacks(sourceFile, relativePath) {
const callExpressions = sourceFile
.getDescendantsOfKind(ts.SyntaxKind.CallExpression);

for (const callExpr of callExpressions) {
const expression = callExpr.getExpression();
if (!expression || expression.getKindName() !== "PropertyAccessExpression") {
continue;
}

const methodName = expression.getName ? expression.getName() : null;
if (!methodName || !TypeScriptAnalyzer.EXPRESS_VERBS.has(methodName)) {
continue;
}

// Filter to plausibly-Express receivers. Without this we'd match any
// `foo.get('x', () => {})` style call (e.g. cache lookups, query
// builders) and synthesise bogus route units.
const receiver = expression.getExpression
? expression.getExpression()
: null;
if (!this._isPlausibleExpressReceiver(receiver)) {
continue;
}

const args = callExpr.getArguments();
if (args.length === 0) continue;

// Determine whether the first argument is a path string literal.
const firstArg = args[0];
const firstKind = firstArg.getKindName();
let httpPath = null;
let callbackStartIndex = 0;
if (firstKind === "StringLiteral" || firstKind === "NoSubstitutionTemplateLiteral") {
httpPath = firstArg.getLiteralValue
? firstArg.getLiteralValue()
: firstArg.getText().slice(1, -1);
callbackStartIndex = 1;
} else if (methodName === "use") {
// `app.use(middleware)` — no path, all args are callbacks.
httpPath = null;
callbackStartIndex = 0;
} else {
// Not an Express-shaped call (no string path and not `use`).
continue;
}

// Gather the callback arguments (functions + named identifiers).
const callbacks = args.slice(callbackStartIndex);
if (callbacks.length === 0) continue;

// We only emit units when at least one callback is an inline
// anonymous function. Otherwise the existing extraction logic
// already handles named handlers.
const hasInline = callbacks.some((a) => {
const k = a.getKindName();
return k === "ArrowFunction" || k === "FunctionExpression";
});
if (!hasInline) continue;

const httpMethod = methodName.toUpperCase();
const lastCallbackIndex = callbacks.length - 1;

// Collect named middleware identifiers (Identifier / PropertyAccess)
// that appear as siblings in the args list. They become explicit
// call-graph edges from each synthesised callback.
const namedMiddleware = [];
for (let i = 0; i < callbacks.length; i++) {
const arg = callbacks[i];
const k = arg.getKindName();
if (k === "Identifier") {
namedMiddleware.push(arg.getText());
} else if (k === "PropertyAccessExpression") {
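// e.g. middleware.auth — keep only the trailing name ("auth").
// Known limitation (raised in review): because just the simple name
// is stored, the edge may resolve to the wrong target when another
// function elsewhere in the codebase shares that name.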
const name = arg.getName ? arg.getName() : arg.getText();
namedMiddleware.push(name);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For PropertyAccessExpression args (app.post('/x', middleware.auth, ...)), this stores just the trailing name ("auth"), which _resolveCall will then look up in the global function map. If there's any other function called auth anywhere in the codebase (e.g. someClass.auth), _resolveCall may resolve to the wrong one — silent false-positive call graph edge.

Suggestions (non-blocking):

  • Store the full arg.getText() (e.g. "middleware.auth") and let dependency_resolver's existing _resolveMethodCall handle the property-access pattern.
  • Or, if you want to keep the simple-name semantics, document the limitation and add a test exercising the ambiguous-name case so the behavior is intentional.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documented with a comment explaining the known limitation — trailing-name-only storage (e.g. "auth" from "middleware.auth") can resolve to the wrong target if another function shares the same simple name. Chose documentation over a dual-file refactor (also touching dependency_resolver.js) as the simpler, lower-risk path for a non-blocking nit.

}
}

for (let i = 0; i < callbacks.length; i++) {
const arg = callbacks[i];
const k = arg.getKindName();
if (k !== "ArrowFunction" && k !== "FunctionExpression") continue;

// Only emit for *anonymous* function expressions. A function
// expression with a name like `function named(req,res){}` is
// already extracted elsewhere.
if (k === "FunctionExpression" && arg.getName && arg.getName()) {
continue;
}

const isHandler = i === lastCallbackIndex;
const role = isHandler ? "handler" : `middleware:${i}`;
const pathLabel = httpPath !== null ? httpPath : "";
const baseName = pathLabel
? `${httpMethod} ${pathLabel} [${role}]`
: `${httpMethod} [${role}]`;
const synthName = baseName;

const code = arg.getFullText();
const startLine = arg.getStartLineNumber();
const endLine = arg.getEndLineNumber();
// Synthesise an ID from the verb, path, the callback's start line and
// its position in the argument list, so the callbacks of one route
// registration (and routes sharing a path) get distinct IDs.
const idSuffix = `${httpMethod}:${pathLabel}:${startLine}:${i}`;
const functionId = `${relativePath}:express(${idSuffix})`;

if (this.functions[functionId]) continue;

const unitType = isHandler ? "route_handler" : "route_middleware";
const explicitCalls = namedMiddleware.filter((n) => n && n !== synthName);

this.functions[functionId] = {
name: synthName,
code: code,
isExported: false,
unitType: unitType,
startLine: startLine,
endLine: endLine,
isEntryPoint: isHandler,
routeMetadata: {
http_method: httpMethod,
http_path: httpPath,
callback_index: i,
total_callbacks: callbacks.length,
named_middleware: explicitCalls,
},
explicitCalls: explicitCalls,
};

// Emit a callGraph entry for the synthesised callback so the
// invariant `callGraph keys ≡ functions keys` holds. The named
// middleware identifiers are recorded as upstream dependencies via
// explicitCalls (merged downstream by dependency_resolver.js); here
// we capture any inline call expressions from the callback body so
// call-graph based analyses can see them too.
this.callGraph[functionId] = this.extractCallsFromFunction(
arg,
relativePath,
);
}
}
}

/**
Expand Down
17 changes: 17 additions & 0 deletions libs/openant-core/parsers/javascript/unit_generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,19 @@ class UnitGenerator {
unitType = 'route_handler';
}

// If the analyzer attached Express route metadata directly to the
// function (anonymous arrow handler / middleware), surface it on the
// unit's `route` field even when no external routes.json was given.
if (!routeData && funcData.routeMetadata) {
const meta = funcData.routeMetadata;
routeData = {
method: meta.http_method,
path: meta.http_path,
handler: funcData.name,
middleware: meta.named_middleware || [],
};
}

// Get upstream dependencies (functions this calls)
const upstreamIds = this.resolver.getDependencies(functionId);
const upstreamDependencies = [];
Expand Down Expand Up @@ -314,6 +327,10 @@ class UnitGenerator {
handler: routeData.handler,
middleware: routeData.middleware || []
} : null,
is_entry_point: funcData.isEntryPoint === true ? true : undefined,
http_method: funcData.routeMetadata ? funcData.routeMetadata.http_method : undefined,
http_path: funcData.routeMetadata ? funcData.routeMetadata.http_path : undefined,
callback_index: funcData.routeMetadata ? funcData.routeMetadata.callback_index : undefined,
ground_truth: {
status: 'UNKNOWN',
vulnerability_types: [],
Expand Down
Empty file.
Loading
Loading