From 0d67ff9aba8fdb4d44fa10ab56aeae51fc3df821 Mon Sep 17 00:00:00 2001 From: jhovanny linares Date: Fri, 29 May 2026 12:21:03 -0600 Subject: [PATCH] feat: add Oracle PL/SQL parser with broad Oracle object support Adds a regex-based PL/SQL parser (_parse_plsql) that extracts Oracle database objects without requiring tree-sitter grammar support. Supported constructs: - PACKAGE spec and PACKAGE BODY with member PROCEDURE/FUNCTION extraction - Standalone PROCEDURE and FUNCTION (with or without CREATE OR REPLACE) - TRIGGER with event and target table metadata - TYPE definitions (AS OBJECT, TABLE OF, UNDER, etc.) - IMPORTS_FROM edges for FROM/JOIN table references - CALLS edges for inter-package calls; Oracle system packages suppressed New Oracle file extensions: .plsql, .pks, .pkb, .prc, .fnc, .trg, .pls Auto-detection: .sql files with Oracle headers routed to PL/SQL parser Wrapped (obfuscated) Oracle files are silently skipped 24 new tests; tests/fixtures/sample.plsql fixture covers all constructs. README: language list updated + Oracle PL/SQL usage collapsible section. CHANGELOG: entry added under [Unreleased]. --- CHANGELOG.md | 15 ++ README.md | 16 +- code_review_graph/parser.py | 416 +++++++++++++++++++++++++++++++++++- tests/fixtures/sample.plsql | 85 ++++++++ tests/test_multilang.py | 167 +++++++++++++++ 5 files changed, 695 insertions(+), 4 deletions(-) create mode 100644 tests/fixtures/sample.plsql diff --git a/CHANGELOG.md b/CHANGELOG.md index 38aa3610..e588c3ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ ## [Unreleased] +### Added + +- **Oracle PL/SQL support**: `.pls`, `.pks`, `.pkb`, `.prc`, `.fnc`, `.trg`, and `.plsql` + file extensions are parsed via a dedicated regex parser (no tree-sitter grammar exists for + Oracle PL/SQL). Extracts PACKAGE specs, PACKAGE BODYs with member PROCEDURE/FUNCTION nodes, + standalone PROCEDUREs and FUNCTIONs, TRIGGERs with event and table metadata, and TYPE + definitions. 
Emits CALLS edges for inter-package calls (Oracle built-in system packages such + as `DBMS_*` and `UTL_*` are suppressed to reduce noise) and IMPORTS_FROM edges for FROM/JOIN + table references and trigger target tables. `.sql` files whose first 500 bytes match an Oracle + object keyword (`PACKAGE`, `TRIGGER`, `PROCEDURE`, `FUNCTION`, `TYPE`) are auto-routed to the + PL/SQL parser. Wrapped (obfuscated) Oracle files are silently skipped. Adds 24 tests and + `tests/fixtures/sample.plsql`. Files must be present on disk — no Oracle database connection + is required; export your objects from SQL Developer, TOAD, or version control before running + `code-review-graph build`. + ## [2.3.5] - 2026-05-25 **Real-time token savings, visible to humans.** The estimated context-savings diff --git a/README.md b/README.md index 6396f788..d0a0a543 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Large monorepos are where token waste is most painful. The graph cuts through th Language coverage organized by category: Web, Backend, Systems, Mobile, Scripting, Config, plus Jupyter and Databricks notebook support

-Parser support covers functions, classes, imports, call sites, inheritance, and test detection across the current parser surface, using Tree-sitter where available and targeted fallbacks where needed. Current support includes Python, JavaScript/TypeScript/TSX, Go, Rust, Java, C/C++, C#, Ruby, Kotlin, Swift, PHP, Scala, Solidity, Dart, R, Perl, Lua/Luau, Objective-C, shell scripts, Elixir, Zig, PowerShell, Julia, ReScript, GDScript, Nix, Verilog/SystemVerilog, SQL, Vue/Svelte SFCs, Astro files parsed through the TypeScript parser, Jupyter/Databricks notebooks (`.ipynb`), and Perl XS files (`.xs`). +Parser support covers functions, classes, imports, call sites, inheritance, and test detection across the current parser surface, using Tree-sitter where available and targeted fallbacks where needed. Current support includes Python, JavaScript/TypeScript/TSX, Go, Rust, Java, C/C++, C#, Ruby, Kotlin, Swift, PHP, Scala, Solidity, Dart, R, Perl, Lua/Luau, Objective-C, shell scripts, Elixir, Zig, PowerShell, Julia, ReScript, GDScript, Nix, Verilog/SystemVerilog, SQL, Oracle PL/SQL, Vue/Svelte SFCs, Astro files parsed through the TypeScript parser, Jupyter/Databricks notebooks (`.ipynb`), and Perl XS files (`.xs`). --- @@ -198,7 +198,7 @@ Blast-radius analysis reaches 100% recall on every one of the 13 evaluation comm | Feature | Details | |---------|---------| | **Incremental updates** | Re-parses only changed files. Subsequent updates complete in under 2 seconds. 
| -| **Broad language + notebook support** | Python, JavaScript/TypeScript/TSX, Go, Rust, Java, C/C++, C#, Ruby, Kotlin, Swift, PHP, Scala, Solidity, Dart, R, Perl, Lua/Luau, Objective-C, shell scripts, Elixir, Zig, PowerShell, Julia, ReScript, GDScript, Nix, Verilog/SystemVerilog, SQL, Vue/Svelte SFCs, Astro files parsed through the TypeScript parser, Jupyter/Databricks (.ipynb), and Perl XS (.xs) | +| **Broad language + notebook support** | Python, JavaScript/TypeScript/TSX, Go, Rust, Java, C/C++, C#, Ruby, Kotlin, Swift, PHP, Scala, Solidity, Dart, R, Perl, Lua/Luau, Objective-C, shell scripts, Elixir, Zig, PowerShell, Julia, ReScript, GDScript, Nix, Verilog/SystemVerilog, SQL, Oracle PL/SQL, Vue/Svelte SFCs, Astro files parsed through the TypeScript parser, Jupyter/Databricks (.ipynb), and Perl XS (.xs) | | **Blast-radius analysis** | Shows which functions, classes, and files are likely affected by a change | | **Auto-update hooks** | Hooks and watch mode can update the graph on file saves and supported commit hooks | | **Semantic search** | Optional vector embeddings via sentence-transformers, Google Gemini, MiniMax, or any OpenAI-compatible endpoint (real OpenAI, Azure, new-api, LiteLLM, vLLM, LocalAI) | @@ -562,6 +562,18 @@ pip install -e ".[dev]" pytest ``` +
+Oracle PL/SQL projects +
+ +Oracle PL/SQL objects live inside the database, not the filesystem. To use `code-review-graph` with an Oracle codebase you need the source files checked out locally first — export them from **SQL Developer** (Tools → Export DDL), **TOAD** (Schema Browser → Script), or pull them from your version-control repository. + +The parser recognises these Oracle-specific extensions automatically: `.pks` (package spec), `.pkb` (package body), `.prc` (procedure), `.fnc` (function), `.trg` (trigger), `.pls` / `.plsql` (generic PL/SQL). Plain `.sql` files are also detected when the file begins (ignoring leading whitespace) with an Oracle object keyword (`PACKAGE`, `TRIGGER`, `PROCEDURE`, `FUNCTION`, `TYPE`), optionally preceded by `CREATE [OR REPLACE]`. Only the first 500 bytes are inspected, so a `.sql` file that opens with a comment header is parsed as generic SQL; give such files one of the Oracle-specific extensions instead. + +Folder layout does not matter — any directory structure is scanned recursively. Wrapped (obfuscated) files are silently skipped. No Oracle database connection is required at any point. + +
+
Adding a new language
diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py index f94519db..a42abc76 100644 --- a/code_review_graph/parser.py +++ b/code_review_graph/parser.py @@ -142,6 +142,14 @@ class EdgeInfo: ".v": "verilog", ".vh": "verilog", ".sql": "sql", + # Oracle PL/SQL: projects using Oracle-specific file extensions + ".plsql": "plsql", # generic PL/SQL (also used as test fixture extension) + ".pls": "plsql", # generic PL/SQL source + ".pks": "plsql", # package specification + ".pkb": "plsql", # package body + ".prc": "plsql", # stored procedure + ".fnc": "plsql", # function + ".trg": "plsql", # trigger } # Shebang interpreter → language mapping for extension-less Unix scripts. @@ -235,6 +243,8 @@ class EdgeInfo: "gdscript": ["class_definition", "class_name_statement"], # SQL: CREATE TABLE / CREATE VIEW are handled via _parse_sql dispatch. "sql": [], + # PL/SQL (Oracle): all constructs handled via _parse_plsql dispatch. + "plsql": [], } _FUNCTION_TYPES: dict[str, list[str]] = { @@ -296,6 +306,8 @@ class EdgeInfo: "gdscript": ["function_definition"], # SQL: CREATE FUNCTION / CREATE PROCEDURE handled via _parse_sql dispatch. "sql": [], + # PL/SQL (Oracle): all constructs handled via _parse_plsql dispatch. + "plsql": [], } _IMPORT_TYPES: dict[str, list[str]] = { @@ -348,6 +360,8 @@ class EdgeInfo: "gdscript": ["extends_statement"], # SQL: table references extracted as IMPORTS_FROM via _parse_sql dispatch. "sql": [], + # PL/SQL (Oracle): all constructs handled via _parse_plsql dispatch. + "plsql": [], } _CALL_TYPES: dict[str, list[str]] = { @@ -406,6 +420,8 @@ class EdgeInfo: "gdscript": ["call", "attribute_call"], # SQL: no call edges extracted (grammar too unreliable for procedure calls). "sql": [], + # PL/SQL (Oracle): all constructs handled via _parse_plsql dispatch. 
+ "plsql": [], } # Patterns that indicate a test function @@ -753,6 +769,33 @@ def _is_test_file(path: str) -> bool: return any(p.search(path) for p in _TEST_FILE_PATTERNS) +# --------------------------------------------------------------------------- +# Oracle PL/SQL detection helpers (module-level, used by parse_bytes) +# --------------------------------------------------------------------------- + +# Matches the first keyword of any Oracle PL/SQL object definition, +# with or without a leading CREATE [OR REPLACE] prefix. +_ORACLE_HEADER_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?" + r"(?:PACKAGE(?:\s+BODY)?|TRIGGER|PROCEDURE|FUNCTION|TYPE(?:\s+BODY)?)\s+", + re.IGNORECASE, +) + +# Oracle wraps obfuscated package bodies with the literal word "wrapped" +# on the first line: e.g. "PACKAGE BODY blc_accounts wrapped". +_ORACLE_WRAPPED_RE = re.compile(r"\bwrapped\b", re.IGNORECASE) + + +def _is_oracle_plsql(source: bytes) -> bool: + """Return True if *source* looks like an Oracle PL/SQL object definition. + + Inspects the first 500 bytes only. Wrapped (obfuscated) files are NOT + considered Oracle PL/SQL here — the caller must skip them separately. + """ + head = source[:500].decode("utf-8", errors="replace").lstrip() + return bool(_ORACLE_HEADER_RE.match(head)) + + def _is_test_function( name: str, file_path: str, decorators: tuple[str, ...] = (), ) -> bool: @@ -934,11 +977,21 @@ def parse_bytes(self, path: Path, source: bytes) -> tuple[list[NodeInfo], list[E if language == "rescript": return self._parse_rescript(path, source) - # SQL: dedicated parser — tree-sitter for tables/views/functions + - # regex fallback for CREATE PROCEDURE (unsupported by the grammar). + # SQL / Oracle PL/SQL: route to the appropriate dedicated parser. + # Wrapped Oracle objects (obfuscated bytecode) are skipped entirely. 
if language == "sql": + if _is_oracle_plsql(source): + if _ORACLE_WRAPPED_RE.search(source[:300].decode("utf-8", errors="replace")): + return [], [] + return self._parse_plsql(path, source) return self._parse_sql(path, source) + # PL/SQL via Oracle-specific file extensions (.pks, .pkb, .prc, etc.) + if language == "plsql": + if _ORACLE_WRAPPED_RE.search(source[:300].decode("utf-8", errors="replace")): + return [], [] + return self._parse_plsql(path, source) + parser = self._get_parser(language) if not parser: return [], [] @@ -2110,6 +2163,365 @@ def _extract_sql_ddl( line=line_start, )) + # ------------------------------------------------------------------ + # PL/SQL parser (Oracle) — regex constants and parser method + # ------------------------------------------------------------------ + + # PACKAGE spec: "PACKAGE ["]name["]" with optional CREATE [OR REPLACE]. + # IS/AS may appear on a later line after comments, so we only capture name. + _PLSQL_PACKAGE_SPEC_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?PACKAGE\s+(?!BODY\b)" + r'"?([\w.]+)"?', + re.IGNORECASE | re.MULTILINE, + ) + + # PACKAGE BODY: IS/AS may appear on the same or next line (\s+ covers both). + _PLSQL_PACKAGE_BODY_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?PACKAGE\s+BODY\s+" + r'"?([\w.]+)"?\s+(?:IS|AS)\b', + re.IGNORECASE | re.MULTILINE, + ) + + # Standalone PROCEDURE (with or without CREATE OR REPLACE / schema prefix). + _PLSQL_PROC_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?PROCEDURE\s+" + r'"?([\w.]+)"?', + re.IGNORECASE | re.MULTILINE, + ) + + # Standalone FUNCTION. + _PLSQL_FUNC_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?FUNCTION\s+" + r'"?([\w.]+)"?', + re.IGNORECASE | re.MULTILINE, + ) + + # TRIGGER with event and table captured. + _PLSQL_TRIGGER_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?TRIGGER\s+" + r'"?([\w.]+)"?' 
+ r"\s+(BEFORE|AFTER|INSTEAD\s+OF)\s+" + r"((?:INSERT|UPDATE|DELETE)(?:\s+OR\s+(?:INSERT|UPDATE|DELETE))*)" + r"\s+ON\s+" + r'"?([\w.]+)"?', + re.IGNORECASE | re.MULTILINE, + ) + + # TYPE spec (TABLE OF, AS OBJECT, UNDER, etc.). + # FORCE and EDITIONABLE/NONEDITIONABLE are each optional and separated by + # their own \s+, so that "FORCE AS" doesn't consume the space before AS. + _PLSQL_TYPE_RE = re.compile( + r"^\s*(?:CREATE\s+(?:OR\s+REPLACE\s+)?)?TYPE\s+(?!BODY\b)" + r'"?([\w.]+)"?' + r"(?:\s+FORCE)?" + r"(?:\s+(?:EDITIONABLE|NONEDITIONABLE))?" + r"\s+(?:AS|IS|UNDER)\b", + re.IGNORECASE | re.MULTILINE, + ) + + # PROCEDURE/FUNCTION member inside a package body slice. + # No indentation requirement — some Oracle shops place members at column 0. + _PLSQL_MEMBER_RE = re.compile( + r"^(?:MEMBER\s+)?(PROCEDURE|FUNCTION)\s+(\w+)", + re.IGNORECASE | re.MULTILINE, + ) + + # Package-qualified call: PKG.PROC( or SCHEMA.PKG.PROC( (matches innermost pair) + _PLSQL_INTERCALL_RE = re.compile( + r"\b([A-Za-z_][A-Za-z0-9_$#]*)\.([A-Za-z_][A-Za-z0-9_$#]*)\s*\(", + re.IGNORECASE, + ) + + # Oracle built-in / system packages — suppress noisy but semantically + # uninteresting edges (infra calls rather than business-logic calls). + _ORACLE_SYSTEM_PKGS: frozenset[str] = frozenset({ + "DBMS_OUTPUT", "DBMS_SCHEDULER", "DBMS_SQL", "DBMS_LOB", "DBMS_CRYPTO", + "DBMS_UTILITY", "DBMS_METADATA", "DBMS_STATS", "DBMS_LOCK", "DBMS_PIPE", + "DBMS_ALERT", "DBMS_TRANSACTION", "DBMS_SESSION", "DBMS_APPLICATION_INFO", + "DBMS_RANDOM", "DBMS_XMLGEN", "DBMS_ROWID", "DBMS_TYPES", + "UTL_FILE", "UTL_HTTP", "UTL_SMTP", "UTL_TCP", "UTL_RAW", + "UTL_I18N", "UTL_URL", "UTL_ENCODE", + "SYS", "STANDARD", "APEX_UTIL", "HTP", "OWA_UTIL", + }) + + def _extract_plsql_calls( + self, + body_text: str, + body_offset: int, + source_qname: str, + file_path_str: str, + edges: list[EdgeInfo], + line_of_fn, + ) -> None: + """Emit CALLS edges for package-qualified calls found in *body_text*. 
+ + *body_offset* is the character offset of *body_text* within the full + source — used to compute correct line numbers. Each unique + PKG.PROC target produces at most one edge per source node. + """ + seen_calls: set[str] = set() + for cm in self._PLSQL_INTERCALL_RE.finditer(body_text): + pkg = cm.group(1) + proc = cm.group(2) + if pkg.upper() in self._ORACLE_SYSTEM_PKGS: + continue + call_target = f"{pkg}.{proc}" + if call_target not in seen_calls: + seen_calls.add(call_target) + edges.append(EdgeInfo( + kind="CALLS", + source=source_qname, + target=call_target, + file_path=file_path_str, + line=line_of_fn(body_offset + cm.start()), + )) + + def _parse_plsql( + self, path: Path, source: bytes, + ) -> tuple[list[NodeInfo], list[EdgeInfo]]: + """Parse an Oracle PL/SQL file using regex (no tree-sitter grammar). + + Extracts: + - PACKAGE specs → Class nodes, extra["plsql_kind"]="package" + - PACKAGE BODYs → Class nodes, extra["plsql_kind"]="package_body" + - member PROCEDURE/FUNCTION → Function nodes with parent_name=pkg + - Standalone PROCEDURE → Function nodes, extra["plsql_kind"]="procedure" + - Standalone FUNCTION → Function nodes, extra["plsql_kind"]="function" + - TRIGGER → Function nodes, extra["plsql_kind"]="trigger" + - TYPE → Class nodes, extra["plsql_kind"]="type" + - FROM/JOIN references → IMPORTS_FROM edges (reuses _SQL_TABLE_RE) + + Wrapped (obfuscated) files must be rejected by the caller before reaching + this method — they are never passed in. 
+ """ + text = source.decode("utf-8", errors="replace") + file_path_str = str(path) + test_file = _is_test_file(file_path_str) + + nodes: list[NodeInfo] = [] + edges: list[EdgeInfo] = [] + + nodes.append(NodeInfo( + kind="File", + name=file_path_str, + file_path=file_path_str, + line_start=1, + line_end=text.count("\n") + 1, + language="plsql", + is_test=test_file, + )) + + seen: set[str] = set() + # Bare member names added via the package body loop — used to avoid + # re-extracting their CALLS in the standalone proc/func loops below. + member_bare_names: set[str] = set() + + def _strip_schema(raw: str) -> str: + return raw.strip('"').split(".")[-1] + + def _qualified(name: str) -> str: + return f"{file_path_str}::{name}" + + def _line_of(offset: int) -> int: + return text[:offset].count("\n") + 1 + + # --- PACKAGE BODY --- + # Spec and body use distinct seen-keys so both nodes coexist when a + # file contains both (common in Oracle shops). + for m in self._PLSQL_PACKAGE_BODY_RE.finditer(text): + name = _strip_schema(m.group(1)) + body_key = _qualified(f"__body__{name}") + qname = _qualified(name) + line = _line_of(m.start()) + if body_key not in seen: + seen.add(body_key) + nodes.append(NodeInfo( + kind="Class", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={"plsql_kind": "package_body"}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + + # Find the body boundary using the package name specifically to + # avoid stopping at inner END proc_name; closers. 
+ body_start = m.end() + end_pat = re.compile( + rf"^\s*END\s+{re.escape(name)}\s*;", + re.IGNORECASE | re.MULTILINE, + ) + end_m = end_pat.search(text, body_start) + body_end = end_m.start() if end_m else len(text) + body_slice = text[body_start:body_end] + for mm in self._PLSQL_MEMBER_RE.finditer(body_slice): + kind_kw = mm.group(1).upper() + member_name = mm.group(2) + member_line = _line_of(body_start + mm.start()) + mq = _qualified(f"{name}.{member_name}") + if mq not in seen: + seen.add(mq) + nodes.append(NodeInfo( + kind="Function", name=member_name, + file_path=file_path_str, + line_start=member_line, line_end=member_line, + language="plsql", parent_name=name, + extra={"plsql_kind": kind_kw.lower()}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=_qualified(name), + target=mq, file_path=file_path_str, line=member_line, + )) + member_bare_names.add(member_name) + # Extract inter-package CALLS from this member's body. + mem_body_start = body_start + mm.end() + end_mem_pat = re.compile( + rf"^\s*END\s+{re.escape(member_name)}\s*;", + re.IGNORECASE | re.MULTILINE, + ) + end_mem_m = end_mem_pat.search(text, body_start + mm.start()) + mem_body_end = end_mem_m.start() if end_mem_m else body_end + self._extract_plsql_calls( + text[mem_body_start:mem_body_end], + mem_body_start, mq, file_path_str, edges, _line_of, + ) + + # --- PACKAGE SPEC --- + for m in self._PLSQL_PACKAGE_SPEC_RE.finditer(text): + name = _strip_schema(m.group(1)) + spec_key = _qualified(f"__spec__{name}") + qname = _qualified(name) + line = _line_of(m.start()) + if spec_key not in seen: + seen.add(spec_key) + nodes.append(NodeInfo( + kind="Class", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={"plsql_kind": "package"}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + + # --- TRIGGERS --- + for m in self._PLSQL_TRIGGER_RE.finditer(text): + name = 
_strip_schema(m.group(1)) + event = f"{m.group(2).upper()} {m.group(3).upper()}" + table = _strip_schema(m.group(4)) + qname = _qualified(name) + line = _line_of(m.start()) + if qname not in seen: + seen.add(qname) + nodes.append(NodeInfo( + kind="Function", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={ + "plsql_kind": "trigger", + "trigger_event": event, + "trigger_table": table, + }, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + edges.append(EdgeInfo( + kind="IMPORTS_FROM", source=file_path_str, + target=table, file_path=file_path_str, line=line, + )) + + # --- STANDALONE PROCEDURES --- + for m in self._PLSQL_PROC_RE.finditer(text): + name = _strip_schema(m.group(1)) + qname = _qualified(name) + line = _line_of(m.start()) + if qname not in seen: + seen.add(qname) + nodes.append(NodeInfo( + kind="Function", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={"plsql_kind": "procedure"}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + # Skip procedures already extracted as package body members to + # avoid emitting duplicate CALLS edges with a bare source name. 
+ if name not in member_bare_names: + proc_body_start = m.end() + end_proc_pat = re.compile( + rf"^\s*END\s+{re.escape(name)}\s*;", + re.IGNORECASE | re.MULTILINE, + ) + end_proc_m = end_proc_pat.search(text, m.start()) + proc_body_end = end_proc_m.start() if end_proc_m else len(text) + self._extract_plsql_calls( + text[proc_body_start:proc_body_end], + proc_body_start, qname, file_path_str, edges, _line_of, + ) + + # --- STANDALONE FUNCTIONS --- + for m in self._PLSQL_FUNC_RE.finditer(text): + name = _strip_schema(m.group(1)) + qname = _qualified(name) + line = _line_of(m.start()) + if qname not in seen: + seen.add(qname) + nodes.append(NodeInfo( + kind="Function", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={"plsql_kind": "function"}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + if name not in member_bare_names: + func_body_start = m.end() + end_func_pat = re.compile( + rf"^\s*END\s+{re.escape(name)}\s*;", + re.IGNORECASE | re.MULTILINE, + ) + end_func_m = end_func_pat.search(text, m.start()) + func_body_end = end_func_m.start() if end_func_m else len(text) + self._extract_plsql_calls( + text[func_body_start:func_body_end], + func_body_start, qname, file_path_str, edges, _line_of, + ) + + # --- TYPES --- + for m in self._PLSQL_TYPE_RE.finditer(text): + name = _strip_schema(m.group(1)) + qname = _qualified(name) + line = _line_of(m.start()) + if qname not in seen: + seen.add(qname) + nodes.append(NodeInfo( + kind="Class", name=name, file_path=file_path_str, + line_start=line, line_end=line, language="plsql", + extra={"plsql_kind": "type"}, + )) + edges.append(EdgeInfo( + kind="CONTAINS", source=file_path_str, + target=qname, file_path=file_path_str, line=line, + )) + + # --- TABLE REFERENCES (FROM / JOIN) — reuse SQL regex --- + seen_refs: set[str] = set() + for m in _SQL_TABLE_RE.finditer(text): + ref = 
m.group(1).strip('"`').split(".")[-1] + if ref and ref.upper() not in _SQL_KEYWORDS and ref not in seen_refs: + seen_refs.add(ref) + edges.append(EdgeInfo( + kind="IMPORTS_FROM", source=file_path_str, + target=ref, file_path=file_path_str, line=_line_of(m.start()), + )) + + return nodes, edges + def _resolve_call_targets( self, nodes: list[NodeInfo], diff --git a/tests/fixtures/sample.plsql b/tests/fixtures/sample.plsql new file mode 100644 index 00000000..d5304527 --- /dev/null +++ b/tests/fixtures/sample.plsql @@ -0,0 +1,85 @@ +-- Oracle PL/SQL fixture covering common real-world Oracle code patterns. +-- Some files start directly with the object keyword (no CREATE OR REPLACE prefix). + +-------------------------------------------------------------------------------- +-- Package specification: PACKAGE name ... IS +-------------------------------------------------------------------------------- +PACKAGE HR_PKG +-------------------------------------------------------------------------------- +-- PACKAGE DESCRIPTION: Human resources utilities. 
+-------------------------------------------------------------------------------- +IS + PROCEDURE hire_employee(p_name VARCHAR2, p_dept NUMBER, pio_Err IN OUT SrvErr); + FUNCTION get_salary(p_id NUMBER) RETURN NUMBER; +END HR_PKG; +/ + +-------------------------------------------------------------------------------- +-- Package body: PACKAGE BODY name AS +-------------------------------------------------------------------------------- +PACKAGE BODY HR_PKG AS + +PROCEDURE hire_employee(p_name VARCHAR2, p_dept NUMBER, pio_Err IN OUT SrvErr) IS +BEGIN + INSERT INTO employees (emp_name, dept_id) VALUES (p_name, p_dept); + AUDIT_PKG.log_change('HIRE', p_name); +END hire_employee; + +FUNCTION get_salary(p_id NUMBER) RETURN NUMBER IS + v_sal NUMBER; +BEGIN + SELECT salary INTO v_sal FROM employees WHERE emp_id = p_id; + NOTIF_PKG.send_alert(p_id, v_sal); + RETURN v_sal; +END get_salary; + +END HR_PKG; +/ + +-------------------------------------------------------------------------------- +-- Trigger: schema-qualified name and table (TRIGGER schema.name BEFORE ... 
ON schema.table) +-------------------------------------------------------------------------------- +TRIGGER CURRENCIES.AUDIT_EMP_TRG + BEFORE + INSERT OR UPDATE + ON CURRENCIES.EMPLOYEES + REFERENCING OLD AS OLD NEW AS NEW + FOR EACH ROW +BEGIN + :new.updated_at := SYSDATE; +END; +/ + +-------------------------------------------------------------------------------- +-- Standalone procedure +-------------------------------------------------------------------------------- +PROCEDURE standalone_proc(p_id IN NUMBER) IS +BEGIN + UPDATE employees SET active = 0 WHERE emp_id = p_id; +END; +/ + +-------------------------------------------------------------------------------- +-- Standalone function (with CREATE OR REPLACE — some files use this form) +-------------------------------------------------------------------------------- +create or replace PROCEDURE HR_SCHEMA.standalone_func_alt(pi_msg VARCHAR2) IS +BEGIN NULL; END; +/ + +FUNCTION standalone_func(p_id IN NUMBER) RETURN VARCHAR2 AS + v_name VARCHAR2(100); +BEGIN + SELECT emp_name INTO v_name FROM employees WHERE emp_id = p_id; + RETURN v_name; +END; +/ + +-------------------------------------------------------------------------------- +-- Type definition +-------------------------------------------------------------------------------- +TYPE "ADDRESS_T" FORCE AS OBJECT( + street VARCHAR2(100), + city VARCHAR2(50), + MEMBER FUNCTION to_string RETURN VARCHAR2 +); +/ diff --git a/tests/test_multilang.py b/tests/test_multilang.py index afda355e..0db09870 100644 --- a/tests/test_multilang.py +++ b/tests/test_multilang.py @@ -2548,3 +2548,170 @@ def test_table_reference_edges(self): targets = {e.target for e in imports} # active_orders view and archive procedure both reference orders/users assert "orders" in targets or "users" in targets + + +# --------------------------------------------------------------------------- +# Oracle PL/SQL +# --------------------------------------------------------------------------- + +class 
class TestPlsqlParsing:
    """End-to-end checks for the regex-based Oracle PL/SQL parser.

    Most tests inspect the nodes and edges produced from
    ``tests/fixtures/sample.plsql``; routing and suppression tests feed small
    inline sources through ``parse_bytes``.
    """

    def setup_method(self):
        self.parser = CodeParser()
        self.nodes, self.edges = self.parser.parse_file(FIXTURES / "sample.plsql")

    # --- helpers ---

    def _with_plsql_kind(self, plsql_kind):
        """Return fixture nodes whose extra["plsql_kind"] equals *plsql_kind*."""
        return [n for n in self.nodes if n.extra.get("plsql_kind") == plsql_kind]

    def _hr_pkg_members(self):
        """Return Function nodes whose parent is the HR_PKG package body."""
        return [
            n for n in self.nodes
            if n.kind == "Function" and n.parent_name == "HR_PKG"
        ]

    def _edge_targets(self, kind):
        """Return the set of targets of all fixture edges of *kind*."""
        return {e.target for e in self.edges if e.kind == kind}

    # --- language detection ---

    def test_detects_language_by_extension(self):
        # Every extension registered for PL/SQL, including the generic
        # ".plsql" used by the fixture itself.
        for ext in (".plsql", ".pls", ".pks", ".pkb", ".prc", ".fnc", ".trg"):
            assert self.parser.detect_language(Path(f"hr{ext}")) == "plsql"

    def test_sql_file_with_oracle_header_routes_to_plsql(self):
        """A .sql file starting with PACKAGE/TRIGGER/etc. is parsed as PL/SQL."""
        src = b"PACKAGE BODY hr_pkg AS\nPROCEDURE p IS BEGIN NULL; END;\nEND hr_pkg;\n"
        nodes, _ = self.parser.parse_bytes(Path("hr.sql"), src)
        assert "plsql" in {n.language for n in nodes}

    def test_wrapped_sql_returns_empty(self):
        """Wrapped (obfuscated) Oracle files are silently skipped."""
        src = b"PACKAGE BODY blc_accounts wrapped \na000000\n1\nabcd\n"
        nodes, edges = self.parser.parse_bytes(Path("blc.sql"), src)
        assert nodes == [] and edges == []

    def test_wrapped_plsql_extension_returns_empty(self):
        src = b"PACKAGE BODY blc_accounts wrapped \na000000\nabcd\n"
        nodes, edges = self.parser.parse_bytes(Path("blc.pkb"), src)
        assert nodes == [] and edges == []

    # --- file node ---

    def test_file_node_exists_and_language(self):
        file_nodes = [n for n in self.nodes if n.kind == "File"]
        assert len(file_nodes) == 1
        assert file_nodes[0].language == "plsql"

    # --- package spec ---

    def test_finds_package_spec(self):
        specs = [n for n in self._with_plsql_kind("package") if n.kind == "Class"]
        assert any(p.name == "HR_PKG" for p in specs)

    def test_package_spec_contains_edge(self):
        bare_targets = {t.split("::")[-1] for t in self._edge_targets("CONTAINS")}
        assert "HR_PKG" in bare_targets

    # --- package body ---

    def test_finds_package_body(self):
        assert any(b.name == "HR_PKG" for b in self._with_plsql_kind("package_body"))

    def test_finds_package_members(self):
        names = {m.name for m in self._hr_pkg_members()}
        assert "hire_employee" in names
        assert "get_salary" in names

    def test_member_plsql_kind(self):
        members = {n.name: n for n in self._hr_pkg_members()}
        assert members["hire_employee"].extra["plsql_kind"] == "procedure"
        assert members["get_salary"].extra["plsql_kind"] == "function"

    def test_package_member_contains_edge(self):
        # Edge source should reference the package name
        pkg_contains = [
            e for e in self.edges
            if e.kind == "CONTAINS" and "HR_PKG" in e.source
        ]
        assert len(pkg_contains) >= 2

    # --- trigger ---

    def test_finds_trigger(self):
        assert any(t.name == "AUDIT_EMP_TRG" for t in self._with_plsql_kind("trigger"))

    def test_trigger_event_metadata(self):
        trig = next(iter(self._with_plsql_kind("trigger")))
        assert "trigger_event" in trig.extra
        assert "trigger_table" in trig.extra
        assert "INSERT" in trig.extra["trigger_event"]

    def test_trigger_imports_table_edge(self):
        assert "EMPLOYEES" in self._edge_targets("IMPORTS_FROM")

    # --- standalone procedure / function ---

    def test_finds_standalone_procedure(self):
        procs = [
            n for n in self._with_plsql_kind("procedure")
            if n.kind == "Function" and n.parent_name is None
        ]
        assert any(p.name == "standalone_proc" for p in procs)

    def test_finds_standalone_function(self):
        funcs = [
            n for n in self._with_plsql_kind("function")
            if n.kind == "Function" and n.parent_name is None
        ]
        assert any(f.name == "standalone_func" for f in funcs)

    def test_create_or_replace_procedure_detected(self):
        """Procedures using CREATE OR REPLACE prefix are also captured."""
        names = {
            p.name for p in self._with_plsql_kind("procedure")
            if p.kind == "Function"
        }
        assert "standalone_func_alt" in names

    # --- type ---

    def test_finds_type(self):
        types = [n for n in self._with_plsql_kind("type") if n.kind == "Class"]
        assert any(t.name == "ADDRESS_T" for t in types)

    # --- table references ---

    def test_table_reference_edges(self):
        imports = [e for e in self.edges if e.kind == "IMPORTS_FROM"]
        assert len(imports) >= 1

    # --- inter-package CALLS edges ---

    def test_calls_edges_emitted(self):
        calls = [e for e in self.edges if e.kind == "CALLS"]
        assert len(calls) >= 1, "Expected at least one CALLS edge from package body members"

    def test_calls_edge_from_hire_employee_to_audit_pkg(self):
        assert "AUDIT_PKG.log_change" in self._edge_targets("CALLS")

    def test_calls_edge_from_get_salary_to_notif_pkg(self):
        assert "NOTIF_PKG.send_alert" in self._edge_targets("CALLS")

    def test_calls_edge_source_is_qualified_member(self):
        """CALLS edge source must be the package-qualified member, not bare file."""
        hire_calls = [
            e for e in self.edges
            if e.kind == "CALLS" and e.target == "AUDIT_PKG.log_change"
        ]
        assert len(hire_calls) == 1
        assert "HR_PKG.hire_employee" in hire_calls[0].source

    def test_system_pkg_calls_not_emitted(self):
        """Calls to Oracle system packages (DBMS_*, UTL_*) are suppressed."""
        system_prefixes = ("DBMS_", "UTL_", "SYS.")
        assert not any(
            t.upper().startswith(system_prefixes)
            for t in self._edge_targets("CALLS")
        )

        # The fixture contains no system-package call, so the check above
        # cannot fail on its own. Parse a body that mixes a system call with
        # a business call and require that only the business call survives.
        src = (
            b"PACKAGE BODY log_pkg AS\n"
            b"PROCEDURE run IS\n"
            b"BEGIN\n"
            b"  DBMS_OUTPUT.put_line('start');\n"
            b"  UTL_FILE.fclose_all();\n"
            b"  AUDIT_PKG.log_change('RUN', 'x');\n"
            b"END run;\n"
            b"END log_pkg;\n"
        )
        _, edges = self.parser.parse_bytes(Path("log_pkg.pkb"), src)
        targets = {e.target for e in edges if e.kind == "CALLS"}
        assert "AUDIT_PKG.log_change" in targets
        assert not any(t.upper().startswith(system_prefixes) for t in targets)