Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions apps/openant-cli/cmd/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var (
parseDiffBase string
parsePR int
parseDiffScope string
parseFresh bool
)

func init() {
Expand All @@ -38,6 +39,30 @@ func init() {
parseCmd.Flags().StringVar(&parseDiffBase, "diff-base", "", "Incremental mode: tag units overlapping diff vs this ref")
parseCmd.Flags().IntVar(&parsePR, "pr", 0, "Incremental mode against a GitHub PR number (mutex with --diff-base)")
parseCmd.Flags().StringVar(&parseDiffScope, "diff-scope", "changed_functions", "Diff scope: changed_files, changed_functions, callers")
parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset.json and reparse from scratch (other artifacts preserved)")
}

// buildParsePyArgs assembles the argument vector handed to the Python "parse"
// subcommand. It is a standalone function so tests can check flag
// pass-through without launching the Python runtime.
//
// The "parse" verb, the repo path and --output are always emitted. --name and
// --diff-manifest are added only when their values are non-empty, --language
// only when it is not "auto", --level only when it is not "all", and --fresh
// only when fresh is true.
func buildParsePyArgs(repoPath, outputDir, datasetName, language, level, manifestPath string, fresh bool) []string {
	argv := []string{"parse", repoPath, "--output", outputDir}

	// Optional value-carrying flags, listed in the order they are emitted.
	valued := []struct {
		emit  bool
		flag  string
		value string
	}{
		{datasetName != "", "--name", datasetName},
		{language != "auto", "--language", language},
		{level != "all", "--level", level},
		{manifestPath != "", "--diff-manifest", manifestPath},
	}
	for _, opt := range valued {
		if !opt.emit {
			continue
		}
		argv = append(argv, opt.flag, opt.value)
	}

	// --fresh is a bare switch with no value.
	if fresh {
		argv = append(argv, "--fresh")
	}
	return argv
}

func runParse(cmd *cobra.Command, args []string) {
Expand Down Expand Up @@ -92,19 +117,7 @@ func runParse(cmd *cobra.Command, args []string) {
}
}

pyArgs := []string{"parse", repoPath, "--output", parseOutput}
if datasetName != "" {
pyArgs = append(pyArgs, "--name", datasetName)
}
if parseLanguage != "auto" {
pyArgs = append(pyArgs, "--language", parseLanguage)
}
if parseLevel != "all" {
pyArgs = append(pyArgs, "--level", parseLevel)
}
if manifestPath != "" {
pyArgs = append(pyArgs, "--diff-manifest", manifestPath)
}
pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath, parseFresh)

result, err := python.Invoke(rt.Path, pyArgs, "", quiet, resolvedAPIKey())
if err != nil {
Expand Down
114 changes: 114 additions & 0 deletions apps/openant-cli/cmd/parse_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package cmd

import (
"testing"

"github.com/spf13/cobra"
)

// TestParseCmdHasFreshFlag checks the registration of --fresh on parseCmd:
// the flag must exist, be of type bool, default to "false", and carry a
// non-empty usage string.
func TestParseCmdHasFreshFlag(t *testing.T) {
	fresh := parseCmd.Flags().Lookup("fresh")
	if fresh == nil {
		// Nothing else can be inspected without the flag itself.
		t.Fatal("parseCmd is missing the --fresh flag")
	}

	if kind := fresh.Value.Type(); kind != "bool" {
		t.Errorf("--fresh should be a bool flag, got type %q", kind)
	}
	if def := fresh.DefValue; def != "false" {
		t.Errorf("--fresh default should be false, got %q", def)
	}
	if len(fresh.Usage) == 0 {
		t.Error("--fresh flag is missing a usage/help string")
	}
}

// TestParseCmdFreshFlagInitialState verifies the package-level parseFresh
// variable starts as false (its zero value), so an unset flag won't add
// --fresh to the Python args.
func TestParseCmdFreshFlagInitialState(t *testing.T) {
// Save and restore so we don't leak state into other tests in this package.
orig := parseFresh
defer func() { parseFresh = orig }()

// Reset to default (zero value).
parseFresh = false
if parseFresh {
t.Errorf("parseFresh should default to false, got true")
}
}

// TestParseCmdFreshFlagParses checks that setting --fresh through parseCmd's
// flag set is reflected in the package-level parseFresh variable, first for
// "true" and then for "false".
func TestParseCmdFreshFlagParses(t *testing.T) {
	saved := parseFresh
	defer func() {
		parseFresh = saved
		// Reset the flag's underlying value so subsequent tests start clean.
		_ = parseCmd.Flags().Set("fresh", "false")
	}()

	flags := parseCmd.Flags()
	parseFresh = false

	// Each step sets the flag to a value and states what parseFresh must be
	// afterwards, together with the messages reported on failure.
	steps := []struct {
		value    string
		want     bool
		setErr   string
		mismatch string
	}{
		{"true", true, "failed to set --fresh: %v", "setting --fresh=true should make parseFresh true"},
		{"false", false, "failed to set --fresh=false: %v", "setting --fresh=false should make parseFresh false"},
	}
	for _, step := range steps {
		if err := flags.Set("fresh", step.value); err != nil {
			t.Fatalf(step.setErr, err)
		}
		if parseFresh != step.want {
			t.Error(step.mismatch)
		}
	}
}

// TestParsePyArgsIncludesFreshWhenSet checks that buildParsePyArgs emits
// --fresh somewhere in the Python argv when called with fresh=true.
func TestParsePyArgsIncludesFreshWhenSet(t *testing.T) {
	argv := buildParsePyArgs("/some/repo", "/out", "", "auto", "all", "", true)

	for i := range argv {
		if argv[i] == "--fresh" {
			return // flag is present; nothing further to check
		}
	}
	t.Errorf("expected --fresh in pyArgs when fresh=true, got %v", argv)
}

// TestParsePyArgsOmitsFreshWhenUnset checks that buildParsePyArgs leaves
// --fresh out of the Python argv when called with fresh=false. If it were
// emitted unconditionally, --fresh would be a hidden always-on default and
// any incremental-reuse behavior would be lost.
func TestParsePyArgsOmitsFreshWhenUnset(t *testing.T) {
	argv := buildParsePyArgs("/some/repo", "/out", "", "auto", "all", "", false)

	// Every occurrence is reported, so the scan does not stop at the first hit.
	for i := 0; i < len(argv); i++ {
		if argv[i] != "--fresh" {
			continue
		}
		t.Errorf("did not expect --fresh in pyArgs when fresh=false, got %v", argv)
	}
}

// TestParseCmdIsRegisteredOnRoot checks that rootCmd has a subcommand named
// "parse" and that this subcommand exposes the --fresh flag, so
// `openant parse --fresh` is reachable from the command line.
func TestParseCmdIsRegisteredOnRoot(t *testing.T) {
	var parse *cobra.Command
	subcommands := rootCmd.Commands()
	// Stop at the first subcommand whose name matches.
	for i := 0; i < len(subcommands) && parse == nil; i++ {
		if subcommands[i].Name() == "parse" {
			parse = subcommands[i]
		}
	}

	if parse == nil {
		t.Fatal("parse command not registered on rootCmd")
	}
	if parse.Flags().Lookup("fresh") == nil {
		t.Error("parse subcommand resolved from root is missing --fresh flag")
	}
}
16 changes: 16 additions & 0 deletions libs/openant-core/core/parser_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def parse_repository(
skip_tests: bool = True,
name: str = None,
diff_manifest: str | None = None,
fresh: bool = False,
) -> ParseResult:
"""Parse a repository into an OpenAnt dataset.

Expand All @@ -89,6 +90,9 @@ def parse_repository(
processing_level: "all", "reachable", "codeql", or "exploitable".
skip_tests: If True, exclude test files from parsing (default: True).
name: Dataset name override (default: derived from repo path basename).
fresh: If True, delete existing dataset.json before parsing so all
units are regenerated from scratch. Only dataset.json is deleted;
other artifacts in output_dir (e.g. analyzer outputs) are preserved.

Returns:
ParseResult with paths to generated files and stats.
Expand All @@ -101,6 +105,18 @@ def parse_repository(
output_dir = os.path.abspath(output_dir)
os.makedirs(output_dir, exist_ok=True)

if fresh:
dataset_path = os.path.join(output_dir, "dataset.json")
# Use try/except instead of exists()+remove() to avoid a TOCTOU race
# if a concurrent --fresh run removes the file between the two calls.
# Only dataset.json is deleted; other artifacts (analyzer outputs, etc.)
# in output_dir are preserved.
try:
os.remove(dataset_path)
print("[Parser] --fresh: deleted existing dataset.json", file=sys.stderr)
except FileNotFoundError:
pass

# Detect language if auto
if language == "auto":
language = detect_language(repo_path)
Expand Down
3 changes: 3 additions & 0 deletions libs/openant-core/openant/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def cmd_parse(args):
skip_tests=not args.no_skip_tests,
name=getattr(args, "name", None),
diff_manifest=getattr(args, "diff_manifest", None),
fresh=getattr(args, "fresh", False),
)

ctx.summary = {
Expand Down Expand Up @@ -1017,6 +1018,8 @@ def main():
parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)")
parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)")
parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected")
parse_p.add_argument("--fresh", action="store_true",
help="Delete existing dataset.json and reparse from scratch (default: reuse existing units; other artifacts preserved)")
parse_p.set_defaults(func=cmd_parse)

# ---------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions libs/openant-core/parsers/javascript/unit_generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,9 @@ if (require.main === module) {
console.error(` Existing units: ${existingUnits.length}`);
console.error(` New units to add: ${newUnits.length}`);
console.error(` Duplicates skipped: ${duplicateCount}`);
if (duplicateCount > 0) {
console.error(` Note: ${duplicateCount} existing units kept as-is (use --fresh to regenerate all units)`);
}

// Append new units to existing
finalResult = {
Expand Down
149 changes: 149 additions & 0 deletions libs/openant-core/tests/test_parse_fresh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""Tests for the `--fresh` flag plumbing in core.parser_adapter.parse_repository.

These tests stub out the language-specific parsers so we can verify the
pre-parse cleanup behavior of `fresh=True` in isolation, without relying
on the real Python/JS/Go parsers.
"""
import json
import os
from pathlib import Path

import pytest

from core import parser_adapter
from core.schemas import ParseResult


def _make_stub_parser(record):
    """Return a stand-in for `_parse_python` that reports what it found on disk.

    When invoked, the stand-in stores in
    `record["dataset_existed_when_parser_ran"]` whether `dataset.json` was
    present in `output_dir` at that moment. It then writes a one-unit dataset
    to that path and returns a `ParseResult` pointing at it, so the caller
    has a real file to work with.
    """
    def _stub(repo_path, output_dir, processing_level, skip_tests=True, name=None):
        target = os.path.join(output_dir, "dataset.json")
        # Observe the on-disk state before anything is written.
        record["dataset_existed_when_parser_ran"] = os.path.exists(target)

        # Mimic real parser output
        payload = {"units": [{"id": "u1", "code": "def f(): pass"}]}
        with open(target, "w") as handle:
            json.dump(payload, handle)

        return ParseResult(
            dataset_path=target,
            analyzer_output_path=None,
            units_count=1,
            language="python",
            processing_level=processing_level,
        )

    return _stub


class TestParseFreshFlag:
    """Checks how `parse_repository` treats an existing dataset.json per `fresh` setting.

    Every test replaces `parser_adapter._parse_python` with the recording stub
    from `_make_stub_parser`, so the only thing observed is whether
    dataset.json was on disk when the parser was dispatched.
    """

    @staticmethod
    def _seed_stale_dataset(out_dir):
        """Create `out_dir` and place a stale dataset.json inside it."""
        out_dir.mkdir()
        stale = out_dir / "dataset.json"
        stale.write_text(json.dumps({"units": [{"id": "stale"}]}))

    @staticmethod
    def _parse_with_stub(monkeypatch, repo_dir, out_dir, **extra):
        """Install the stub parser, run `parse_repository`, return (result, record).

        `extra` carries optional keyword arguments (such as `fresh`) so a test
        can also call `parse_repository` without passing `fresh` at all.
        """
        seen = {}
        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(seen))
        outcome = parser_adapter.parse_repository(
            repo_path=str(repo_dir),  # repo path not actually used by stub
            output_dir=str(out_dir),
            language="python",
            processing_level="all",
            **extra,
        )
        return outcome, seen

    def test_fresh_true_deletes_existing_dataset_before_parser_runs(
        self, tmp_path, monkeypatch
    ):
        out_dir = tmp_path / "output"
        self._seed_stale_dataset(out_dir)

        _, seen = self._parse_with_stub(monkeypatch, tmp_path, out_dir, fresh=True)

        # The pre-existing dataset.json must be gone by the time the
        # parser runs, proving --fresh removed it before dispatch.
        assert seen["dataset_existed_when_parser_ran"] is False

    def test_fresh_false_leaves_existing_dataset_in_place(
        self, tmp_path, monkeypatch
    ):
        out_dir = tmp_path / "output"
        self._seed_stale_dataset(out_dir)

        _, seen = self._parse_with_stub(monkeypatch, tmp_path, out_dir, fresh=False)

        # Without --fresh the existing dataset must still be present when
        # the parser is invoked (so the parser can decide whether to
        # incrementally reuse it).
        assert seen["dataset_existed_when_parser_ran"] is True

    def test_fresh_default_is_false(self, tmp_path, monkeypatch):
        """`fresh` must default to False so existing scans aren't wiped."""
        out_dir = tmp_path / "output"
        self._seed_stale_dataset(out_dir)

        # Note: no `fresh=` kwarg.
        _, seen = self._parse_with_stub(monkeypatch, tmp_path, out_dir)

        assert seen["dataset_existed_when_parser_ran"] is True

    def test_fresh_true_with_no_existing_dataset_is_noop(
        self, tmp_path, monkeypatch
    ):
        """Passing --fresh when no dataset.json exists must not error."""
        out_dir = tmp_path / "output"
        out_dir.mkdir()
        # Note: no pre-existing dataset.json

        outcome, seen = self._parse_with_stub(
            monkeypatch, tmp_path, out_dir, fresh=True
        )

        # The parser still runs and produces a dataset
        assert Path(outcome.dataset_path).exists()
        assert seen["dataset_existed_when_parser_ran"] is False

    def test_fresh_creates_output_dir_if_missing(
        self, tmp_path, monkeypatch
    ):
        """`fresh=True` must not crash when output_dir doesn't yet exist."""
        out_dir = tmp_path / "does_not_exist_yet"

        outcome, _ = self._parse_with_stub(
            monkeypatch, tmp_path, out_dir, fresh=True
        )

        assert out_dir.exists()
        assert Path(outcome.dataset_path).exists()
Loading