diff --git a/libs/openant-core/parsers/javascript/dependency_resolver.js b/libs/openant-core/parsers/javascript/dependency_resolver.js
index 52d130e..30ab96f 100644
--- a/libs/openant-core/parsers/javascript/dependency_resolver.js
+++ b/libs/openant-core/parsers/javascript/dependency_resolver.js
@@ -60,6 +60,21 @@ class DependencyResolver {
   buildCallGraph() {
     for (const [funcId, funcData] of Object.entries(this.functions)) {
       const calls = this._extractCalls(funcData.code, funcId);
+
+      // Merge in any explicit call edges declared by the analyzer.
+      // This is used for cases the body-text regex can't see — e.g.
+      // Express middleware identifiers passed as sibling args:
+      //   app.post('/x', authenticateToken, async (req,res) => {...})
+      const explicitCalls = funcData.explicitCalls || [];
+      const callerFile = funcId.split(':')[0];
+      for (const name of explicitCalls) {
+        if (!name) continue;
+        const resolved = this._resolveCall(name, callerFile, funcId);
+        if (resolved && !calls.includes(resolved)) {
+          calls.push(resolved);
+        }
+      }
+
       this.callGraph[funcId] = calls;
 
       // Build reverse graph
diff --git a/libs/openant-core/parsers/javascript/typescript_analyzer.js b/libs/openant-core/parsers/javascript/typescript_analyzer.js
index a41a80d..b99decb 100644
--- a/libs/openant-core/parsers/javascript/typescript_analyzer.js
+++ b/libs/openant-core/parsers/javascript/typescript_analyzer.js
@@ -240,6 +240,255 @@ class TypeScriptAnalyzer {
     // Extract functions from module.exports.propertyName = function() {...}
     // Pattern used by DVNA and similar CommonJS codebases
     this._extractModuleExportsPropertyFunctions(sourceFile, relativePath);
+
+    // Extract anonymous callbacks used as Express route handlers / middleware
+    // Pattern: app.get('/x', auth, async (req, res) => {...})
+    this._extractExpressRouteCallbacks(sourceFile, relativePath);
+  }
+
+  /**
+   * Express HTTP verbs we recognise on a router/app object.
+   * `use` is included to pick up middleware-mount callbacks.
+   */
+  static EXPRESS_VERBS = new Set([
+    "get",
+    "post",
+    "put",
+    "patch",
+    "delete",
+    "options",
+    "head",
+    "all",
+    "use",
+  ]);
+
+  /**
+   * Alternation of identifier stems that strongly suggest an Express
+   * app/router object. Codebases using single-word identifiers outside this
+   * list (e.g. `http`) will not be extracted; add the stem here if needed.
+   */
+  static EXPRESS_RECEIVER_STEMS =
+    "app|router|routes|server|web|api|endpoints|controller";
+
+  // Receiver patterns are compiled once here rather than on every call
+  // expression inspected. Names are lower-cased before matching.
+  // Stem as a whole word, delimited by start/underscore on the left and a
+  // digit/end/underscore on the right: "app", "my_router", "api2".
+  static EXPRESS_RECEIVER_WORD_RE = new RegExp(
+    `(^|_)(${TypeScriptAnalyzer.EXPRESS_RECEIVER_STEMS})(\\d|$|_)`,
+  );
+  // Stem as the suffix of a longer name: "myApp", "userRouter".
+  static EXPRESS_RECEIVER_SUFFIX_RE = new RegExp(
+    `(${TypeScriptAnalyzer.EXPRESS_RECEIVER_STEMS})$`,
+  );
+
+  /**
+   * Heuristic: does `receiver` look like an Express app / router?
+   *
+   * Accepted receivers:
+   *  - identifiers whose lower-cased name is a stem, ends with a stem, or
+   *    has a stem delimited by underscores / a trailing digit
+   *    (`app`, `myApp`, `app_server`);
+   *  - calls whose callee is a `.route` / `.Router` property access
+   *    (`app.route('/x')`, `express.Router()`);
+   *  - property accesses whose trailing name ends with a stem (`this.app`).
+   *
+   * Other receivers are deliberately rejected so generic `.get(...)` calls on
+   * caches / clients / query-builders aren't misread as routes.
+   *
+   * @param {object} receiver - AST node the verb is called on; may be null.
+   * @returns {boolean} true when the receiver plausibly is an Express object.
+   */
+  _isPlausibleExpressReceiver(receiver) {
+    if (!receiver) return false;
+    const kind = receiver.getKindName();
+
+    if (kind === "Identifier") {
+      const name = receiver.getText().toLowerCase();
+      return (
+        TypeScriptAnalyzer.EXPRESS_RECEIVER_WORD_RE.test(name) ||
+        TypeScriptAnalyzer.EXPRESS_RECEIVER_SUFFIX_RE.test(name)
+      );
+    }
+    if (kind === "CallExpression") {
+      // e.g. app.route('/x').get(...) or express.Router().get(...) — the
+      // receiver is the inner call, accepted by the name of its callee.
+      const inner = receiver.getExpression && receiver.getExpression();
+      if (inner && inner.getKindName && inner.getKindName() === "PropertyAccessExpression") {
+        const innerName = inner.getName && inner.getName();
+        return innerName === "route" || innerName === "Router";
+      }
+      return false;
+    }
+    if (kind === "PropertyAccessExpression") {
+      // e.g. this.app.get(...) — accept when the trailing property name
+      // ends with one of the stems.
+      const trailing = receiver.getName && receiver.getName();
+      if (!trailing) return false;
+      return TypeScriptAnalyzer.EXPRESS_RECEIVER_SUFFIX_RE.test(trailing.toLowerCase());
+    }
+    return false;
+  }
+
+  /**
+   * Walk a source file looking for Express-style route registrations and
+   * emit a synthetic function entry for each anonymous arrow / function
+   * expression used as a callback.
+   *
+   * Recognises patterns of the form:
+   *   <receiver>.<verb>(<path>, ...callbacks)
+   *   <receiver>.<verb>(...callbacks)        // only for `use`
+   * where `<verb>` is one of the Express HTTP verbs (or `use`) and the
+   * first argument (when present) is a string-literal path.
+   *
+   * Every anonymous callback after the path (or every anonymous argument of
+   * a path-less `use`) gets a synthetic function entry. The callback in the
+   * last argument position is treated as the route handler; earlier
+   * callbacks are middleware. Named identifiers in callback positions are
+   * recorded as explicit call edges from the synthesised callbacks (e.g.
+   * `authenticateToken` becomes an upstream dependency of the handler so
+   * call-graph based analyses see the relationship).
+   *
+   * @param {object} sourceFile - source file whose call expressions are scanned.
+   * @param {string} relativePath - file path used as the function-id prefix.
+   */
+  _extractExpressRouteCallbacks(sourceFile, relativePath) {
+    const isInlineFunction = (node) => {
+      const kind = node.getKindName();
+      return kind === "ArrowFunction" || kind === "FunctionExpression";
+    };
+
+    const callExpressions = sourceFile.getDescendantsOfKind(
+      ts.SyntaxKind.CallExpression,
+    );
+
+    for (const callExpr of callExpressions) {
+      const expression = callExpr.getExpression();
+      if (!expression || expression.getKindName() !== "PropertyAccessExpression") {
+        continue;
+      }
+
+      const methodName = expression.getName ? expression.getName() : null;
+      if (!methodName || !TypeScriptAnalyzer.EXPRESS_VERBS.has(methodName)) {
+        continue;
+      }
+
+      // Filter to plausibly-Express receivers. Without this we'd match any
+      // `foo.get('x', () => {})` style call (e.g. cache lookups, query
+      // builders) and synthesise bogus route units.
+      const receiver = expression.getExpression
+        ? expression.getExpression()
+        : null;
+      if (!this._isPlausibleExpressReceiver(receiver)) {
+        continue;
+      }
+
+      const args = callExpr.getArguments();
+      if (args.length === 0) continue;
+
+      // A leading string literal is the route path and callbacks follow it.
+      // `app.use(middleware)` has no path, so every argument is a callback.
+      // Anything else (e.g. a dynamically built path) is not extracted.
+      const firstArg = args[0];
+      const firstKind = firstArg.getKindName();
+      let httpPath = null;
+      let callbackStartIndex = 0;
+      if (firstKind === "StringLiteral" || firstKind === "NoSubstitutionTemplateLiteral") {
+        httpPath = firstArg.getLiteralValue
+          ? firstArg.getLiteralValue()
+          : firstArg.getText().slice(1, -1);
+        callbackStartIndex = 1;
+      } else if (methodName !== "use") {
+        continue;
+      }
+
+      const callbacks = args.slice(callbackStartIndex);
+      if (callbacks.length === 0) continue;
+
+      // We only emit units when at least one callback is an inline
+      // anonymous function. Otherwise the existing extraction logic
+      // already handles named handlers.
+      if (!callbacks.some(isInlineFunction)) continue;
+
+      const httpMethod = methodName.toUpperCase();
+      const lastCallbackIndex = callbacks.length - 1;
+
+      // Collect named middleware identifiers (Identifier / PropertyAccess)
+      // that appear as siblings in the args list. They become explicit
+      // call-graph edges from each synthesised callback.
+      const namedMiddleware = [];
+      for (const arg of callbacks) {
+        const kind = arg.getKindName();
+        if (kind === "Identifier") {
+          namedMiddleware.push(arg.getText());
+        } else if (kind === "PropertyAccessExpression") {
+          // Stores only the trailing name (e.g. "auth" from "middleware.auth").
+          // dependency_resolver._resolveCall looks up by simple name, so if
+          // another unrelated function shares the same name the edge may
+          // resolve to the wrong target (silent false-positive). This is a
+          // known limitation of the current simple-name resolution model.
+          namedMiddleware.push(arg.getName ? arg.getName() : arg.getText());
+        }
+      }
+
+      for (let i = 0; i < callbacks.length; i++) {
+        const arg = callbacks[i];
+        if (!isInlineFunction(arg)) continue;
+
+        // Only emit for *anonymous* function expressions. A function
+        // expression with a name like `function named(req,res){}` is
+        // already extracted elsewhere.
+        if (arg.getKindName() === "FunctionExpression" && arg.getName && arg.getName()) {
+          continue;
+        }
+
+        const isHandler = i === lastCallbackIndex;
+        const role = isHandler ? "handler" : `middleware:${i}`;
+        const pathLabel = httpPath !== null ? httpPath : "";
+        const synthName = pathLabel
+          ? `${httpMethod} ${pathLabel} [${role}]`
+          : `${httpMethod} [${role}]`;
+
+        // The id embeds verb, path, start line and callback position so
+        // callbacks of the same registration never collide.
+        const startLine = arg.getStartLineNumber();
+        const idSuffix = `${httpMethod}:${pathLabel}:${startLine}:${i}`;
+        const functionId = `${relativePath}:express(${idSuffix})`;
+        if (this.functions[functionId]) continue;
+
+        // Fresh array per unit; empty names are dropped defensively.
+        const explicitCalls = namedMiddleware.filter(Boolean);
+
+        this.functions[functionId] = {
+          name: synthName,
+          code: arg.getFullText(),
+          isExported: false,
+          unitType: isHandler ? "route_handler" : "route_middleware",
+          startLine,
+          endLine: arg.getEndLineNumber(),
+          isEntryPoint: isHandler,
+          routeMetadata: {
+            http_method: httpMethod,
+            http_path: httpPath,
+            callback_index: i,
+            total_callbacks: callbacks.length,
+            named_middleware: explicitCalls,
+          },
+          explicitCalls,
+        };
+
+        // Emit a callGraph entry for the synthesised callback so the
+        // invariant `callGraph keys ≡ functions keys` holds. The named
+        // middleware identifiers are recorded as upstream dependencies via
+        // explicitCalls (merged downstream by dependency_resolver.js); here
+        // we capture any inline call expressions from the callback body so
+        // call-graph based analyses can see them too.
+        this.callGraph[functionId] = this.extractCallsFromFunction(
+          arg,
+          relativePath,
+        );
+      }
+    }
   }
 
   /**
diff --git a/libs/openant-core/parsers/javascript/unit_generator.js b/libs/openant-core/parsers/javascript/unit_generator.js
index 3650792..7b76219 100644
--- a/libs/openant-core/parsers/javascript/unit_generator.js
+++ b/libs/openant-core/parsers/javascript/unit_generator.js
@@ -239,6 +239,19 @@ class UnitGenerator {
         unitType = 'route_handler';
       }
 
+      // If the analyzer attached Express route metadata directly to the
+      // function (anonymous arrow handler / middleware), surface it on the
+      // unit's `route` field even when no external routes.json was given.
+ if (!routeData && funcData.routeMetadata) { + const meta = funcData.routeMetadata; + routeData = { + method: meta.http_method, + path: meta.http_path, + handler: funcData.name, + middleware: meta.named_middleware || [], + }; + } + // Get upstream dependencies (functions this calls) const upstreamIds = this.resolver.getDependencies(functionId); const upstreamDependencies = []; @@ -314,6 +327,10 @@ class UnitGenerator { handler: routeData.handler, middleware: routeData.middleware || [] } : null, + is_entry_point: funcData.isEntryPoint === true ? true : undefined, + http_method: funcData.routeMetadata ? funcData.routeMetadata.http_method : undefined, + http_path: funcData.routeMetadata ? funcData.routeMetadata.http_path : undefined, + callback_index: funcData.routeMetadata ? funcData.routeMetadata.callback_index : undefined, ground_truth: { status: 'UNKNOWN', vulnerability_types: [], diff --git a/libs/openant-core/tests/parsers/javascript/__init__.py b/libs/openant-core/tests/parsers/javascript/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/openant-core/tests/parsers/javascript/test_express_route_handlers.py b/libs/openant-core/tests/parsers/javascript/test_express_route_handlers.py new file mode 100644 index 0000000..804e207 --- /dev/null +++ b/libs/openant-core/tests/parsers/javascript/test_express_route_handlers.py @@ -0,0 +1,400 @@ +"""Tests for Express anonymous route handler extraction in the JS parser. + +These exercise the typescript_analyzer.js + unit_generator.js pipeline by +running the Node.js scripts as subprocesses (mirroring tests/test_js_parser.py). + +Skips when Node.js or the parser's npm dependencies aren't installed. 
+""" +import json +import shutil +import subprocess +from pathlib import Path + +import pytest + + +PARSERS_JS_DIR = Path(__file__).parent.parent.parent.parent / "parsers" / "javascript" +NODE_MODULES = PARSERS_JS_DIR / "node_modules" + +pytestmark = pytest.mark.skipif( + not shutil.which("node") or not NODE_MODULES.exists(), + reason="Node.js or JS parser npm dependencies not available", +) + + +def _run_node(script_name, *args): + cmd = ["node", str(PARSERS_JS_DIR / script_name)] + list(args) + return subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + +def _analyze(repo_path, file_path): + """Run the analyzer on a single file and return parsed output.""" + result = _run_node("typescript_analyzer.js", str(repo_path), str(file_path)) + assert result.returncode == 0, ( + f"analyzer failed:\nstdout={result.stdout}\nstderr={result.stderr}" + ) + return json.loads(result.stdout) + + +def _generate_units(analyzer_output_path, dataset_output_path): + result = _run_node( + "unit_generator.js", + str(analyzer_output_path), + "--output", str(dataset_output_path), + ) + assert result.returncode == 0, ( + f"unit_generator failed:\nstdout={result.stdout}\nstderr={result.stderr}" + ) + return json.loads(Path(dataset_output_path).read_text()) + + +def _write_fixture(tmp_path: Path, name: str, content: str) -> Path: + repo = tmp_path / name + repo.mkdir(parents=True, exist_ok=True) + file_path = repo / "server.js" + file_path.write_text(content) + return file_path + + +def _express_units(dataset): + return [u for u in dataset["units"] if "express(" in u["id"]] + + +def test_anonymous_handler_with_named_middleware(tmp_path): + """router.post(path, namedMiddleware, async (req, res) => {...}).""" + file_path = _write_fixture( + tmp_path, + "anon_with_mw", + """ +const express = require('express'); +const router = express.Router(); + +function authenticateToken(req, res, next) { next(); } + +router.post('/orders', authenticateToken, async (req, res) => { + const { 
productId, quantity } = req.body; + res.json({ productId, quantity }); +}); + +module.exports = router; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1, f"expected 1 anon handler, got {express_funcs}" + + fid, fdata = next(iter(express_funcs.items())) + assert fdata["unitType"] == "route_handler" + assert fdata["isEntryPoint"] is True + meta = fdata["routeMetadata"] + assert meta["http_method"] == "POST" + assert meta["http_path"] == "/orders" + assert meta["named_middleware"] == ["authenticateToken"] + + # Run unit_generator and verify the call-graph edge to authenticateToken. + analyzer_path = tmp_path / "analyzer.json" + analyzer_path.write_text(json.dumps(out)) + dataset_path = tmp_path / "dataset.json" + dataset = _generate_units(analyzer_path, dataset_path) + + handler_unit = next(u for u in dataset["units"] if u["id"] == fid) + assert handler_unit["unit_type"] == "route_handler" + assert handler_unit["is_entry_point"] is True + assert handler_unit["http_method"] == "POST" + assert handler_unit["http_path"] == "/orders" + assert handler_unit["route"]["method"] == "POST" + assert handler_unit["route"]["path"] == "/orders" + assert handler_unit["route"]["middleware"] == ["authenticateToken"] + + # Call-graph edge: handler -> authenticateToken + upstream_ids = handler_unit["metadata"]["direct_calls"] + auth_id = "server.js:authenticateToken" + assert auth_id in upstream_ids, ( + f"expected handler to call authenticateToken; direct_calls={upstream_ids}" + ) + + +def test_handler_no_middleware(tmp_path): + """app.get(path, (req, res) => res.json([])) — no extra edges.""" + file_path = _write_fixture( + tmp_path, + "no_mw", + """ +const express = require('express'); +const app = express(); +app.get('/users', (req, res) => res.json([])); +module.exports = app; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + 
express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1 + fid, fdata = next(iter(express_funcs.items())) + meta = fdata["routeMetadata"] + assert meta["http_method"] == "GET" + assert meta["http_path"] == "/users" + assert meta["named_middleware"] == [] + assert fdata["isEntryPoint"] is True + + +def test_use_with_multiple_anonymous_callbacks(tmp_path): + """router.use(path, anonMw1, anonMw2, anonHandler) — + one route_handler + two route_middleware units.""" + file_path = _write_fixture( + tmp_path, + "use_multi", + """ +const express = require('express'); +const router = express.Router(); + +router.use('/api', + (req, res, next) => { req.start = Date.now(); next(); }, + (req, res, next) => { console.log(req.path); next(); }, + async (req, res, next) => { + if (!req.headers.authorization) return res.status(401).end(); + next(); + } +); + +module.exports = router; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 3, f"expected 3 callbacks, got {list(express_funcs)}" + + by_type = {} + for fdata in express_funcs.values(): + by_type.setdefault(fdata["unitType"], []).append(fdata) + + assert len(by_type.get("route_handler", [])) == 1 + assert len(by_type.get("route_middleware", [])) == 2 + + handler = by_type["route_handler"][0] + assert handler["isEntryPoint"] is True + assert handler["routeMetadata"]["http_method"] == "USE" + assert handler["routeMetadata"]["http_path"] == "/api" + + for mw in by_type["route_middleware"]: + assert mw.get("isEntryPoint") in (False, None)  # .get(): a missing key must not raise KeyError + assert mw["routeMetadata"]["http_method"] == "USE" + assert mw["routeMetadata"]["http_path"] == "/api" + assert mw["routeMetadata"]["callback_index"] < 2 + + +def test_non_express_call_is_skipped(tmp_path): + """myCache.get('foo', () => {}) must not be claimed as a route.""" + file_path = 
_write_fixture( + tmp_path, + "non_express", + """ +const myCache = makeCache(); +myCache.get('foo', () => { return 1; }); +const queryBuilder = makeBuilder(); +queryBuilder.post('users', () => {}); +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert express_funcs == {}, ( + f"non-Express receivers must not be extracted; got {list(express_funcs)}" + ) + + +def test_synthetic_handlers_have_call_graph_entries(tmp_path): + """Synthetic Express handlers must also appear as callGraph keys. + + Regression for the invariant `len(callGraph) == len(functions)` that + other tests (e.g. test_js_parser.test_builds_call_graph) rely on. + """ + file_path = _write_fixture( + tmp_path, + "callgraph_invariant", + """ +const express = require('express'); +const router = express.Router(); + +function authenticateToken(req, res, next) { next(); } + +router.post('/orders', authenticateToken, async (req, res) => { + const { productId, quantity } = req.body; + res.json({ productId, quantity }); +}); + +module.exports = router; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1 + + # Every synthetic Express function must have a callGraph entry. + for fid in express_funcs: + assert fid in out["callGraph"], ( + f"synthetic function {fid} missing from callGraph; " + f"callGraph keys={list(out['callGraph'])}" + ) + + # Global invariant: callGraph keys ≡ functions keys. + assert len(out["callGraph"]) == len(out["functions"]), ( + f"callGraph/functions size mismatch: " + f"{len(out['callGraph'])} vs {len(out['functions'])}" + ) + + +def test_typescript_typed_callback(tmp_path): + """TS callback with type annotations: + `(req: Request, res: Response, next: NextFunction) => {...}`. 
+ + Type annotations on the parameters and return type must not prevent + the AST walk from recognising the callback as an ArrowFunction. + """ + repo = tmp_path / "ts_typed" + repo.mkdir(parents=True, exist_ok=True) + file_path = repo / "server.ts" + file_path.write_text( + """ +import express, { Request, Response, NextFunction } from 'express'; +const app = express(); + +function authenticateToken(req: Request, res: Response, next: NextFunction): void { next(); } + +app.post('/orders', authenticateToken, async (req: Request, res: Response): Promise<void> => { + const { productId, quantity } = req.body; + res.json({ productId, quantity }); +}); + +export default app; +""" + ) + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1, ( + f"expected 1 anon TS handler, got {express_funcs}" + ) + fid, fdata = next(iter(express_funcs.items())) + assert fdata["unitType"] == "route_handler" + assert fdata["isEntryPoint"] is True + meta = fdata["routeMetadata"] + assert meta["http_method"] == "POST" + assert meta["http_path"] == "/orders" + assert meta["named_middleware"] == ["authenticateToken"] + + +def test_dynamic_path_does_not_crash(tmp_path): + """`app.get('/' + prefix, handler)` — first arg isn't a string literal. + + The extractor should skip such calls without throwing. We can't + reliably extract a path from a runtime-built expression. 
+ """ + file_path = _write_fixture( + tmp_path, + "dynamic_path", + """ +const express = require('express'); +const app = express(); +const prefix = 'foo'; +app.get('/' + prefix, (req, res) => res.send('ok')); +module.exports = app; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert express_funcs == {}, ( + f"dynamic path should be skipped, got {list(express_funcs)}" + ) + + +def test_use_no_path_anonymous_middleware(tmp_path): + """`app.use((req, res, next) => {...})` — middleware with no path. + + The synthetic unit should be emitted with http_path=null and + http_method='USE'. + """ + file_path = _write_fixture( + tmp_path, + "use_no_path", + """ +const express = require('express'); +const app = express(); +app.use((req, res, next) => { req.start = Date.now(); next(); }); +module.exports = app; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1, ( + f"expected 1 anon middleware unit, got {list(express_funcs)}" + ) + fid, fdata = next(iter(express_funcs.items())) + meta = fdata["routeMetadata"] + assert meta["http_method"] == "USE" + assert meta["http_path"] is None + + +def test_anon_middleware_named_handler_mixed(tmp_path): + """`app.get(path, anonMw, namedHandler)` — anon middleware before + named handler. 
Anon gets a route_middleware unit; the named handler + is left to the regular extractor (no synthetic unit for it).""" + file_path = _write_fixture( + tmp_path, + "mixed", + """ +const express = require('express'); +const app = express(); +function namedHandler(req, res) { res.send('ok'); } +app.get('/x', (req, res, next) => { console.log('mw'); next(); }, namedHandler); +module.exports = app; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert len(express_funcs) == 1, ( + f"expected 1 anon middleware unit, got {list(express_funcs)}" + ) + fid, fdata = next(iter(express_funcs.items())) + assert fdata["unitType"] == "route_middleware" + # named_middleware should include the namedHandler sibling + assert fdata["routeMetadata"]["named_middleware"] == ["namedHandler"] + # namedHandler must still be extracted normally + assert any( + f.get("name") == "namedHandler" for f in out["functions"].values() + ) + + +def test_named_handler_no_anonymous_unit(tmp_path): + """router.get('/x', namedHandler) — no anon unit synthesised.""" + file_path = _write_fixture( + tmp_path, + "named", + """ +const express = require('express'); +const router = express.Router(); + +function namedHandler(req, res) { res.send('ok'); } + +router.get('/x', namedHandler); + +module.exports = router; +""", + ) + repo = file_path.parent + out = _analyze(repo, file_path) + express_funcs = {k: v for k, v in out["functions"].items() if "express(" in k} + assert express_funcs == {}, ( + f"named-only callbacks must not synthesise anon units; got {list(express_funcs)}" + ) + # namedHandler should still be picked up by the regular extractor. 
+ assert any( + f.get("name") == "namedHandler" for f in out["functions"].values() + ) diff --git a/libs/openant-core/tests/test_entry_point_detector.py b/libs/openant-core/tests/test_entry_point_detector.py new file mode 100644 index 0000000..8250485 --- /dev/null +++ b/libs/openant-core/tests/test_entry_point_detector.py @@ -0,0 +1,56 @@ +"""Tests for EntryPointDetector — specifically that Express unit types +produced by the JS analyzer are recognised as entry points and therefore +survive the reachability filter. +""" +import pytest + +from utilities.agentic_enhancer.entry_point_detector import ( + ENTRY_POINT_TYPES, + EntryPointDetector, +) + + +def _make_detector(unit_type: str) -> EntryPointDetector: + functions = { + "server.js:fn": { + "name": "fn", + "unit_type": unit_type, + "code": "async (req, res, next) => { next(); }", + } + } + return EntryPointDetector(functions, call_graph={}) + + +def test_route_handler_is_entry_point(): + detector = _make_detector("route_handler") + entry_points = detector.detect_entry_points() + assert "server.js:fn" in entry_points + + +def test_route_middleware_is_entry_point(): + """route_middleware units must be detected as entry points so they are not + silently dropped by the reachability filter. + + Regression for the gap where `route_middleware` was missing from + ENTRY_POINT_TYPES: Express anonymous middleware bodies (which receive req + directly and can be doing anything dangerous) were filtered out before the + LLM ever saw them. 
+ """ + assert "route_middleware" in ENTRY_POINT_TYPES, ( + "route_middleware must be in ENTRY_POINT_TYPES so the reachability " + "filter treats anonymous Express middleware as entry points" + ) + + detector = _make_detector("route_middleware") + entry_points = detector.detect_entry_points() + assert "server.js:fn" in entry_points, ( + "route_middleware unit was filtered out — it must survive as an entry point" + ) + + +def test_unknown_unit_type_is_not_entry_point(): + """A unit with an unrecognised unit_type is not an entry point unless it + matches a decorator or user-input pattern.""" + detector = _make_detector("utility") + entry_points = detector.detect_entry_points() + assert "server.js:fn" not in entry_points diff --git a/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py b/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py index 22aab91..16df5b5 100644 --- a/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py +++ b/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py @@ -25,6 +25,7 @@ # Entry point patterns by unit_type (from function extractor classification) ENTRY_POINT_TYPES = { 'route_handler', # Flask/FastAPI/Express routes + 'route_middleware', # Express anonymous middleware callbacks (req, res, next) 'view_function', # Django views 'websocket_handler', # WebSocket endpoints 'cli_handler', # CLI commands