knostic · joshbouncesecurity · Mar 22, 2026 · May 4, 2026 · May 4, 2026
@@ -29,6 +29,7 @@ var (
 	parseDiffBase  string
 	parsePR        int
 	parseDiffScope string
+	parseFresh     bool
 )
 
 func init() {
@@ -38,6 +39,30 @@ func init() {
 	parseCmd.Flags().StringVar(&parseDiffBase, "diff-base", "", "Incremental mode: tag units overlapping diff vs this ref")
 	parseCmd.Flags().IntVar(&parsePR, "pr", 0, "Incremental mode against a GitHub PR number (mutex with --diff-base)")
 	parseCmd.Flags().StringVar(&parseDiffScope, "diff-scope", "changed_functions", "Diff scope: changed_files, changed_functions, callers")
+	parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset.json and reparse from scratch (other artifacts preserved)")
+}
+
+// buildParsePyArgs constructs the argv passed to the Python parse subcommand.
+// Optional flags are emitted only when they differ from the values skipped
+// below ("", "auto", "all", false). Extracted so tests can run without Python.
+func buildParsePyArgs(repoPath, outputDir, datasetName, language, level, manifestPath string, fresh bool) []string {
+	pyArgs := []string{"parse", repoPath, "--output", outputDir} // always present
+	if datasetName != "" { // empty name: no --name is passed
+		pyArgs = append(pyArgs, "--name", datasetName)
+	}
+	if language != "auto" { // "auto": no --language is passed
+		pyArgs = append(pyArgs, "--language", language)
+	}
+	if level != "all" { // "all": no --level is passed
+		pyArgs = append(pyArgs, "--level", level)
+	}
+	if manifestPath != "" { // empty path: no --diff-manifest is passed
+		pyArgs = append(pyArgs, "--diff-manifest", manifestPath)
+	}
+	if fresh { // boolean switch: forwarded without a value
+		pyArgs = append(pyArgs, "--fresh")
+	}
+	return pyArgs
 }
 
 func runParse(cmd *cobra.Command, args []string) {
@@ -92,19 +117,7 @@ func runParse(cmd *cobra.Command, args []string) {
 		}
 	}
 
-	pyArgs := []string{"parse", repoPath, "--output", parseOutput}
-	if datasetName != "" {
-		pyArgs = append(pyArgs, "--name", datasetName)
-	}
-	if parseLanguage != "auto" {
-		pyArgs = append(pyArgs, "--language", parseLanguage)
-	}
-	if parseLevel != "all" {
-		pyArgs = append(pyArgs, "--level", parseLevel)
-	}
-	if manifestPath != "" {
-		pyArgs = append(pyArgs, "--diff-manifest", manifestPath)
-	}
+	pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath, parseFresh)
 
 	result, err := python.Invoke(rt.Path, pyArgs, "", quiet, resolvedAPIKey())
 	if err != nil {

@@ -0,0 +1,114 @@
+package cmd
+
+import (
+	"testing"
+
+	"github.com/spf13/cobra"
+)
+
+// TestParseCmdHasFreshFlag verifies that the --fresh flag is registered on
+// parseCmd as a bool, defaults to false, and carries a non-empty usage string.
+func TestParseCmdHasFreshFlag(t *testing.T) {
+	flag := parseCmd.Flags().Lookup("fresh")
+	if flag == nil { // Fatal: the checks below would dereference a nil flag
+		t.Fatal("parseCmd is missing the --fresh flag")
+	}
+	if flag.Value.Type() != "bool" {
+		t.Errorf("--fresh should be a bool flag, got type %q", flag.Value.Type())
+	}
+	if flag.DefValue != "false" { // DefValue is the default in string form
+		t.Errorf("--fresh default should be false, got %q", flag.DefValue)
+	}
+	if flag.Usage == "" {
+		t.Error("--fresh flag is missing a usage/help string")
+	}
+}
+
+// TestParseCmdFreshFlagInitialState verifies that applying the flag's
+// registered default value leaves the package-level parseFresh variable
+// false, so an unset flag won't add --fresh to the Python args.
+func TestParseCmdFreshFlagInitialState(t *testing.T) {
+	flag := parseCmd.Flags().Lookup("fresh")
+	if flag == nil {
+		t.Fatal("parseCmd is missing the --fresh flag")
+	}
+
+	// Save and restore so we don't leak state into other tests in this package.
+	orig := parseFresh
+	defer func() { parseFresh = orig }()
+
+	// Start from true: assigning false and then asserting false could never
+	// fail. Writing the registered default back through the flag's value must
+	// clear the variable. Value.Set is used so the flag is not marked Changed.
+	parseFresh = true
+	if err := flag.Value.Set(flag.DefValue); err != nil {
+		t.Fatalf("failed to apply --fresh default %q: %v", flag.DefValue, err)
+	}
+	if parseFresh {
+		t.Errorf("parseFresh should default to false, got true")
+	}
+}
+
+// TestParseCmdFreshFlagParses verifies that setting --fresh through
+// parseCmd's flag set updates the parseFresh package-level variable.
+func TestParseCmdFreshFlagParses(t *testing.T) {
+	orig := parseFresh
+	defer func() {
+		// Reset the flag first so subsequent tests start clean. Set writes
+		// through to parseFresh, so running it after the restore would
+		// overwrite the saved value with false. Best-effort: error ignored.
+		_ = parseCmd.Flags().Set("fresh", "false")
+		parseFresh = orig
+	}()
+
+	parseFresh = false
+	if err := parseCmd.Flags().Set("fresh", "true"); err != nil {
+		t.Fatalf("failed to set --fresh: %v", err)
+	}
+	if !parseFresh {
+		t.Error("setting --fresh=true should make parseFresh true")
+	}
+
+	if err := parseCmd.Flags().Set("fresh", "false"); err != nil {
+		t.Fatalf("failed to set --fresh=false: %v", err)
+	}
+	if parseFresh {
+		t.Error("setting --fresh=false should make parseFresh false")
+	}
+}
+
+// TestParsePyArgsIncludesFreshWhenSet verifies that --fresh is present in the
+// python argv when fresh is true (see TestParsePyArgsOmitsFreshWhenUnset).
+func TestParsePyArgsIncludesFreshWhenSet(t *testing.T) {
+	args := buildParsePyArgs("/some/repo", "/out", "", "auto", "all", "", true) // only fresh is set
+
+	found := false
+	for _, a := range args {
+		if a == "--fresh" {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Errorf("expected --fresh in pyArgs when fresh=true, got %v", args)
+	}
+}
+
+// TestParsePyArgsOmitsFreshWhenUnset verifies --fresh is absent from the
+// python argv when fresh is false. If it leaked through, every parse would
+// delete dataset.json and the reuse of existing units would be lost.
+func TestParsePyArgsOmitsFreshWhenUnset(t *testing.T) {
+	args := buildParsePyArgs("/some/repo", "/out", "", "auto", "all", "", false) // nothing optional set
+
+	for _, a := range args {
+		if a == "--fresh" { // reported once per occurrence; the scan continues
+			t.Errorf("did not expect --fresh in pyArgs when fresh=false, got %v", args)
+		}
+	}
+}
+
+// TestParseCmdIsRegisteredOnRoot ensures a "parse" subcommand is registered on
+// rootCmd and exposes --fresh, so `openant parse --fresh` is reachable.
+func TestParseCmdIsRegisteredOnRoot(t *testing.T) {
+	var found *cobra.Command
+	for _, c := range rootCmd.Commands() {
+		if c.Name() == "parse" { // matched by name, not by identity with parseCmd
+			found = c
+			break
+		}
+	}
+	if found == nil { // Fatal: the flag lookup below needs a non-nil command
+		t.Fatal("parse command not registered on rootCmd")
+	}
+	if found.Flags().Lookup("fresh") == nil {
+		t.Error("parse subcommand resolved from root is missing --fresh flag")
+	}
+}
@@ -76,6 +76,7 @@ def parse_repository(
     skip_tests: bool = True,
     name: str = None,
     diff_manifest: str | None = None,
+    fresh: bool = False,
 ) -> ParseResult:
     """Parse a repository into an OpenAnt dataset.
 
@@ -89,6 +90,9 @@ def parse_repository(
         processing_level: "all", "reachable", "codeql", or "exploitable".
         skip_tests: If True, exclude test files from parsing (default: True).
         name: Dataset name override (default: derived from repo path basename).
+        fresh: If True, delete existing dataset.json before parsing so all
+            units are regenerated from scratch. Only dataset.json is deleted;
+            other artifacts in output_dir (e.g. analyzer outputs) are preserved.
 
     Returns:
         ParseResult with paths to generated files and stats.
@@ -101,6 +105,18 @@ def parse_repository(
     output_dir = os.path.abspath(output_dir)
     os.makedirs(output_dir, exist_ok=True)
 
+    if fresh:
+        dataset_path = os.path.join(output_dir, "dataset.json")
+        # Use try/except instead of exists()+remove() to avoid a TOCTOU race
+        # if a concurrent --fresh run removes the file between the two calls.
+        # Only dataset.json is deleted; other artifacts (analyzer outputs, etc.)
+        # in output_dir are preserved.
+        try:
+            os.remove(dataset_path)
+            print("[Parser] --fresh: deleted existing dataset.json", file=sys.stderr)
+        except FileNotFoundError:
+            pass
+
     # Detect language if auto
     if language == "auto":
         language = detect_language(repo_path)

@@ -124,6 +124,7 @@ def cmd_parse(args):
                 skip_tests=not args.no_skip_tests,
                 name=getattr(args, "name", None),
                 diff_manifest=getattr(args, "diff_manifest", None),
+                fresh=getattr(args, "fresh", False),
             )
 
             ctx.summary = {
@@ -1017,6 +1018,8 @@ def main():
     parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)")
     parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)")
     parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected")
+    parse_p.add_argument("--fresh", action="store_true",
+                         help="Delete existing dataset.json and reparse from scratch (default: reuse existing units; other artifacts preserved)")
     parse_p.set_defaults(func=cmd_parse)
 
     # ---------------------------------------------------------------

@@ -416,6 +416,9 @@ if (require.main === module) {
                 console.error(`  Existing units: ${existingUnits.length}`);
                 console.error(`  New units to add: ${newUnits.length}`);
                 console.error(`  Duplicates skipped: ${duplicateCount}`);
+                if (duplicateCount > 0) {
+                    console.error(`  Note: ${duplicateCount} existing units kept as-is (use --fresh to regenerate all units)`);
+                }
 
                 // Append new units to existing
                 finalResult = {

@@ -0,0 +1,149 @@
+"""Tests for the `--fresh` flag plumbing in core.parser_adapter.parse_repository.
+
+These tests stub out the language-specific parsers so we can verify the
+pre-parse cleanup behavior of `fresh=True` in isolation, without relying
+on the real Python/JS/Go parsers.
+"""
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+from core import parser_adapter
+from core.schemas import ParseResult
+
+
+def _make_stub_parser(record):
+    """Build a fake `_parse_python` that records what it sees on disk.
+
+    The stub stores under `record["dataset_existed_when_parser_ran"]` whether
+    `dataset.json` existed in `output_dir` when it was called, then writes a
+    one-unit dataset there and returns a `ParseResult` pointing at it.
+    """
+    def _stub(repo_path, output_dir, processing_level, skip_tests=True, name=None):  # presumably mirrors _parse_python's signature; verify
+        dataset_path = os.path.join(output_dir, "dataset.json")
+        record["dataset_existed_when_parser_ran"] = os.path.exists(dataset_path)
+        # Write a one-unit dataset; mode "w" replaces any dataset.json already there
+        with open(dataset_path, "w") as f:
+            json.dump({"units": [{"id": "u1", "code": "def f(): pass"}]}, f)
+        return ParseResult(
+            dataset_path=dataset_path,
+            analyzer_output_path=None,
+            units_count=1,  # matches the single unit written above
+            language="python",
+            processing_level=processing_level,
+        )
+    return _stub
+
+
+class TestParseFreshFlag:  # pre-parse cleanup behavior of parse_repository(fresh=...)
+    def test_fresh_true_deletes_existing_dataset_before_parser_runs(
+        self, tmp_path, monkeypatch
+    ):
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+        existing = output_dir / "dataset.json"
+        existing.write_text(json.dumps({"units": [{"id": "stale"}]}))
+
+        record = {}
+        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record))
+
+        parser_adapter.parse_repository(
+            repo_path=str(tmp_path),  # the stub never reads repo_path
+            output_dir=str(output_dir),
+            language="python",
+            processing_level="all",
+            fresh=True,
+        )
+
+        # The stale dataset.json must already be gone when the stubbed
+        # parser is called, proving --fresh removed it before dispatch.
+        assert record["dataset_existed_when_parser_ran"] is False
+
+    def test_fresh_false_leaves_existing_dataset_in_place(
+        self, tmp_path, monkeypatch
+    ):
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+        existing = output_dir / "dataset.json"
+        existing.write_text(json.dumps({"units": [{"id": "stale"}]}))
+
+        record = {}
+        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record))
+
+        parser_adapter.parse_repository(
+            repo_path=str(tmp_path),
+            output_dir=str(output_dir),
+            language="python",
+            processing_level="all",
+            fresh=False,
+        )
+
+        # Without --fresh the existing dataset must still be on disk when
+        # the parser is invoked (so the parser can decide whether to
+        # incrementally reuse it).
+        assert record["dataset_existed_when_parser_ran"] is True
+
+    def test_fresh_default_is_false(self, tmp_path, monkeypatch):
+        """Omitting `fresh` must behave like `fresh=False`: the dataset is kept."""
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+        existing = output_dir / "dataset.json"
+        existing.write_text(json.dumps({"units": [{"id": "stale"}]}))
+
+        record = {}
+        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record))
+
+        # Deliberately no `fresh=` kwarg, so the parameter default is used.
+        parser_adapter.parse_repository(
+            repo_path=str(tmp_path),
+            output_dir=str(output_dir),
+            language="python",
+            processing_level="all",
+        )
+
+        assert record["dataset_existed_when_parser_ran"] is True
+
+    def test_fresh_true_with_no_existing_dataset_is_noop(
+        self, tmp_path, monkeypatch
+    ):
+        """Passing --fresh when no dataset.json exists must not error."""
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+        # Deliberately no pre-existing dataset.json in output_dir
+
+        record = {}
+        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record))
+
+        result = parser_adapter.parse_repository(
+            repo_path=str(tmp_path),
+            output_dir=str(output_dir),
+            language="python",
+            processing_level="all",
+            fresh=True,
+        )
+
+        # The stubbed parser still runs and writes the dataset it reports
+        assert Path(result.dataset_path).exists()
+        assert record["dataset_existed_when_parser_ran"] is False
+
+    def test_fresh_creates_output_dir_if_missing(
+        self, tmp_path, monkeypatch
+    ):
+        """`fresh=True` must not crash when output_dir doesn't yet exist."""
+        output_dir = tmp_path / "does_not_exist_yet"  # never created by the test
+
+        record = {}
+        monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record))
+
+        result = parser_adapter.parse_repository(
+            repo_path=str(tmp_path),
+            output_dir=str(output_dir),
+            language="python",
+            processing_level="all",
+            fresh=True,
+        )
+
+        assert output_dir.exists()  # must have been created during the call
+        assert Path(result.dataset_path).exists()