diff --git a/README.md b/README.md index e5e5c8e..acd1957 100644 --- a/README.md +++ b/README.md @@ -392,8 +392,10 @@ and binary version. Different models or Lumen versions automatically get separate indexes. No files are added to your repo, no `.gitignore` modifications needed. -You can safely delete the entire `lumen` directory to clear all indexes, or use -`lumen purge` to do it automatically. +You can safely delete the entire `lumen` directory to clear all indexes. Or use +`lumen purge` (current project only), `lumen purge --all` (every index), or +`lumen purge --missing` (drop indexes whose source folder was deleted; add +`--dry-run` to preview). **Git worktrees** are detected automatically. When you create a new worktree (`git worktree add` or `claude --worktree`), Lumen finds a sibling worktree's diff --git a/cmd/purge.go b/cmd/purge.go index 37ec3cc..47c9be4 100644 --- a/cmd/purge.go +++ b/cmd/purge.go @@ -27,25 +27,57 @@ import ( "github.com/spf13/cobra" ) +const ( + flagAll = "all" + flagMissing = "missing" + flagLegacy = "legacy" + flagDryRun = "dry-run" +) + func init() { + registerPurgeFlags(purgeCmd) rootCmd.AddCommand(purgeCmd) } +// registerPurgeFlags declares the purge command's flags. Shared by init and +// the test helper so tests exercise the real flag set. +func registerPurgeFlags(cmd *cobra.Command) { + cmd.Flags().Bool(flagAll, false, "Remove every index under the data directory") + cmd.Flags().Bool(flagMissing, false, "Remove indexes whose project folder no longer exists") + cmd.Flags().Bool(flagLegacy, false, "Remove only legacy/unreadable indexes lacking project_path metadata") + cmd.Flags().Bool(flagDryRun, false, "With --missing, list what would be removed without deleting") +} + +// lumenDataDir returns the directory holding all lumen index databases. 
+func lumenDataDir() string { + return filepath.Join(config.XDGDataDir(), "lumen") +} + var purgeCmd = &cobra.Command{ Use: "purge [path...]", Short: "Remove lumen index data", Long: `Deletes lumen index databases under ~/.local/share/lumen/. -With no arguments, removes every index (irreversible — all indexes will be -rebuilt on the next search). +With no arguments, removes only the index for the current working directory's +project (the path is normalized to its git root first). -With one or more paths, removes only the index directories associated with -those projects. Each path is normalized to its git root first, then matched -against the project_path recorded inside each index database, so switching -embedding models or using custom models never leaves orphan indexes. +With one or more paths, removes the index directories associated with those +projects. Each path is normalized to its git root, then matched against the +project_path recorded inside each index database, so switching embedding models +or using custom models never leaves orphan indexes. -Indexes created by older binaries that did not record project_path cannot be -matched by path; run "lumen purge" with no arguments to wipe those. + --all Remove every index (irreversible — all indexes will be rebuilt on + the next search). Also clears legacy indexes created by older + binaries that did not record project_path. + --missing Remove every index whose recorded project folder no longer exists + on disk (only deletes a project index when its folder is confirmed + missing), plus any unreadable/corrupt index directories. + --legacy Remove only legacy indexes created by older binaries that did not + record project_path, plus any unreadable/corrupt index directories. + Legacy indexes are still usable by the system (located by path + hash) but invisible to path- and --missing-based purge. Cannot be + combined with other flags or paths. + --dry-run With --missing, list what would be removed without deleting. 
Note: a concurrently running indexer for a purged project may log a write error and exit; re-run "lumen index" afterwards to rebuild.`, @@ -54,14 +86,66 @@ error and exit; re-run "lumen index" afterwards to rebuild.`, } func runPurge(cmd *cobra.Command, args []string) error { - if len(args) == 0 { - return purgeAll(cmd.ErrOrStderr()) + all, _ := cmd.Flags().GetBool(flagAll) + missing, _ := cmd.Flags().GetBool(flagMissing) + legacy, _ := cmd.Flags().GetBool(flagLegacy) + dryRun, _ := cmd.Flags().GetBool(flagDryRun) + + if err := validatePurgeFlags(all, missing, legacy, dryRun, len(args)); err != nil { + return err + } + + stderr := cmd.ErrOrStderr() + stdout := cmd.OutOrStdout() + + switch { + case all: + return purgeAll(stderr) + case legacy: + return purgeLegacy(stderr, stdout) + case missing: + return purgeMissing(stderr, stdout, dryRun) + default: + if len(args) == 0 { + cwd, err := os.Getwd() + if err != nil { + return fmt.Errorf("determine working directory: %w", err) + } + args = []string{cwd} + } + return purgeProjects(stderr, stdout, args) } - return purgeProjects(cmd.ErrOrStderr(), cmd.OutOrStdout(), args) } +// validatePurgeFlags enforces mutual exclusivity of the purge modes. --all, +// --missing, and --legacy are the three exclusive whole-dataset modes; explicit +// paths select the default per-project mode and combine with none of them. +func validatePurgeFlags(all, missing, legacy, dryRun bool, nArgs int) error { + modes := 0 + for _, set := range []bool{all, missing, legacy} { + if set { + modes++ + } + } + if modes > 1 { + return fmt.Errorf("--all, --missing, and --legacy are mutually exclusive") + } + if modes > 0 && nArgs > 0 { + return fmt.Errorf("--all, --missing, and --legacy cannot be combined with explicit paths") + } + if dryRun && !missing { + return fmt.Errorf("--dry-run is only valid with --missing") + } + return nil +} + +// purgeAll removes the entire lumen data directory. 
This is unconditional by +// design: --all must wipe everything regardless of whether individual indexes +// can be scanned or read. The pre-delete scan is best-effort and used only for +// per-index logging. Its error (only a failure to list the data directory; +// corrupt or unreadable indexes are classified by the scan, not reported as +// errors) is deliberately ignored so that it can never block the wipe. func purgeAll(stderr io.Writer) error { - dataDir := filepath.Join(config.XDGDataDir(), "lumen") + dataDir := lumenDataDir() info, err := os.Stat(dataDir) if err != nil { @@ -75,6 +159,21 @@ func purgeAll(stderr io.Writer) error { return fmt.Errorf("%s is not a directory", dataDir) } + // Log each index directory before wiping, matching the per-index logging + // used by the other purge modes. Dirs without project_path metadata and + // unreadable dirs are logged by path alone. + indexMap, noMeta, unreadable, _ := scanIndexes(dataDir) + for projectPath, hashDirs := range indexMap { + for _, hashDir := range hashDirs { + _, _ = fmt.Fprintf(stderr, "Removed %s (%s)\n", hashDir, projectPath) + } + } + for _, dirs := range [][]string{noMeta, unreadable} { + for _, hashDir := range dirs { + _, _ = fmt.Fprintf(stderr, "Removed %s\n", hashDir) + } + } + if err := os.RemoveAll(dataDir); err != nil { return fmt.Errorf("remove index data: %w", err) } @@ -83,8 +182,7 @@ func purgeAll(stderr io.Writer) error { } func purgeProjects(stderr, stdout io.Writer, args []string) error { - dataDir := filepath.Join(config.XDGDataDir(), "lumen") - indexMap, err := scanIndexes(dataDir) + indexMap, _, _, err := scanIndexes(lumenDataDir()) if err != nil { return err } @@ -102,18 +200,105 @@ func purgeProjects(stderr, stdout io.Writer, args []string) error { return nil } -// scanIndexes walks dataDir (one level deep) and returns a map of stored -// project_path → list of hash directories for that project. Hash directories -// that can't be read or lack project_path metadata are silently skipped so a -// single broken index never blocks purging of others.
-func scanIndexes(dataDir string) (map[string][]string, error) { - result := make(map[string][]string) +func purgeMissing(stderr, stdout io.Writer, dryRun bool) error { + indexMap, _, unreadable, err := scanIndexes(lumenDataDir()) + if err != nil { + return err + } + + verb := "Removed" + if dryRun { + verb = "Would remove" + } + + remove := func(hashDir, reason string) error { + if !dryRun { + if err := os.RemoveAll(hashDir); err != nil { + return fmt.Errorf("remove %s: %w", hashDir, err) + } + } + _, _ = fmt.Fprintf(stderr, "%s %s (%s)\n", verb, hashDir, reason) + return nil + } + + removed := 0 + for projectPath, hashDirs := range indexMap { + if _, statErr := os.Stat(projectPath); statErr == nil { + continue // folder still exists — keep the index + } else if !os.IsNotExist(statErr) { + // Conservative: any error other than "not exist" must never delete. + _, _ = fmt.Fprintf(stderr, "Skipping %s: cannot stat (%v)\n", projectPath, statErr) + continue + } + for _, hashDir := range hashDirs { + if err := remove(hashDir, projectPath); err != nil { + return err + } + removed++ + } + } + + // Unreadable/corrupt index dirs have no folder mapping and can never be + // served or rebuilt in place, so --missing clears them too. + for _, hashDir := range unreadable { + if err := remove(hashDir, "unreadable"); err != nil { + return err + } + removed++ + } + + _, _ = fmt.Fprintf(stdout, "%s %d index director%s (missing folders and unreadable indexes).\n", verb, removed, pluralY(removed)) + return nil +} + +// purgeLegacy removes legacy hash directories: readable DBs that do not record +// project_path (still usable by the system, located by path hash, but invisible +// to path- and --missing-based purge) plus unreadable/corrupt directories. +// --legacy is the explicit way to clear both. 
+func purgeLegacy(stderr, stdout io.Writer) error { + _, noMeta, unreadable, err := scanIndexes(lumenDataDir()) + if err != nil { + return err + } + + removed := 0 + for _, dirs := range [][]string{noMeta, unreadable} { + for _, hashDir := range dirs { + if err := os.RemoveAll(hashDir); err != nil { + return fmt.Errorf("remove %s: %w", hashDir, err) + } + _, _ = fmt.Fprintf(stderr, "Removed %s\n", hashDir) + removed++ + } + } + + _, _ = fmt.Fprintf(stdout, "Removed %d legacy index director%s.\n", removed, pluralY(removed)) + return nil +} + +// scanIndexes walks dataDir (one level deep) and classifies each hash directory +// into three buckets: +// +// - indexMap: stored project_path → hash directories (readable DB with +// project_path metadata). +// - noMeta: readable DBs that do not record project_path (created by older +// binaries). These remain usable by the system — they are located by path +// hash, not by metadata — so path- and --missing-based purge leave them +// alone; only --all and --legacy remove them. +// - unreadable: directories whose index.db is missing or corrupt. These can +// never be served or rebuilt in place, so --missing and --legacy clear them +// (as does --all, which wipes the whole data directory). +// +// Path-based purge acts only on indexMap and leaves the noMeta and unreadable +// buckets untouched, so a single broken index never blocks purging of others.
+func scanIndexes(dataDir string) (indexMap map[string][]string, noMeta, unreadable []string, err error) { + indexMap = make(map[string][]string) entries, err := os.ReadDir(dataDir) if err != nil { if os.IsNotExist(err) { - return result, nil + return indexMap, nil, nil, nil } - return nil, fmt.Errorf("read data dir: %w", err) + return nil, nil, nil, fmt.Errorf("read data dir: %w", err) } for _, entry := range entries { if !entry.IsDir() { @@ -121,13 +306,17 @@ func scanIndexes(dataDir string) (map[string][]string, error) { } hashDir := filepath.Join(dataDir, entry.Name()) dbPath := filepath.Join(hashDir, "index.db") - stored, err := store.ReadMetaAt(dbPath, "project_path") - if err != nil || stored == "" { - continue + stored, readErr := store.ReadMetaAt(dbPath, "project_path") + switch { + case readErr != nil: + unreadable = append(unreadable, hashDir) + case stored == "": + noMeta = append(noMeta, hashDir) + default: + indexMap[stored] = append(indexMap[stored], hashDir) } - result[stored] = append(result[stored], hashDir) } - return result, nil + return indexMap, noMeta, unreadable, nil } // purgeOneTarget resolves arg to a project root and removes every hash diff --git a/cmd/purge_test.go b/cmd/purge_test.go index 1a1badf..b475e56 100644 --- a/cmd/purge_test.go +++ b/cmd/purge_test.go @@ -29,60 +29,121 @@ import ( "github.com/stretchr/testify/require" ) -// seedIndex creates a real SQLite index DB at the hash-named directory for -// (projectPath, model) with project_path recorded in project_meta, so that -// tests exercise the same metadata-scan code path as production. -func seedIndex(t *testing.T, projectPath, model string) string { +// seedIndexWithMeta creates a real SQLite index DB at the hash-named directory +// for (projectPath, model). When recordPath is true it records project_path in +// project_meta (the modern layout exercised by the metadata-scan code path); +// when false it omits it, mimicking indexes written by older binaries. 
+func seedIndexWithMeta(t *testing.T, projectPath, model string, recordPath bool) string { t.Helper() dbPath := config.DBPathForProject(projectPath, model) require.NoError(t, os.MkdirAll(filepath.Dir(dbPath), 0o755)) s, err := store.New(dbPath, 4) require.NoError(t, err) - require.NoError(t, s.SetMeta("project_path", projectPath)) + if recordPath { + require.NoError(t, s.SetMeta("project_path", projectPath)) + } require.NoError(t, s.Close()) return filepath.Dir(dbPath) } -// runPurgeCmd invokes runPurge with the provided args and returns captured -// stdout, stderr, and the error (if any). -func runPurgeCmd(t *testing.T, args []string) (stdout, stderr string, err error) { +// seedIndex creates an index DB that records project_path. +func seedIndex(t *testing.T, projectPath, model string) string { + t.Helper() + return seedIndexWithMeta(t, projectPath, model, true) +} + +// seedLegacyIndex creates an index DB with NO project_path metadata. Such DBs +// are fully usable by the system (located by path hash) but invisible to +// project_path-based purge, so scanIndexes classifies them as no-metadata legacy. +func seedLegacyIndex(t *testing.T, projectPath, model string) string { + t.Helper() + return seedIndexWithMeta(t, projectPath, model, false) +} + +// seedCorruptDir creates a hash directory whose index.db is missing, mimicking +// an unreadable/corrupt index. scanIndexes classifies it as unreadable. +func seedCorruptDir(t *testing.T, name string) string { + t.Helper() + dir := filepath.Join(lumenDataDir(), name) + require.NoError(t, os.MkdirAll(dir, 0o755)) + return dir +} + +// runPurgeCmd invokes runPurge with the provided args and optional flag names +// (each set to "true") and returns captured stdout, stderr, and the error. 
+func runPurgeCmd(t *testing.T, args []string, flags ...string) (stdout, stderr string, err error) { t.Helper() outBuf := new(bytes.Buffer) errBuf := new(bytes.Buffer) cmd := &cobra.Command{} + registerPurgeFlags(cmd) + for _, f := range flags { + require.NoError(t, cmd.Flags().Set(f, "true")) + } cmd.SetOut(outBuf) cmd.SetErr(errBuf) err = runPurge(cmd, args) return outBuf.String(), errBuf.String(), err } -func TestPurge_NoArgs_RemovesEverything(t *testing.T) { +func TestPurge_NoArgs_PurgesCwdOnly(t *testing.T) { tmp := resolvedTempDir(t) t.Setenv("XDG_DATA_HOME", tmp) - seedIndex(t, "/project/a", embedder.DefaultModel) - seedIndex(t, "/project/b", embedder.DefaultModel) + projectA := filepath.Join(tmp, "projectA") + projectB := filepath.Join(tmp, "projectB") + require.NoError(t, os.MkdirAll(projectA, 0o755)) + require.NoError(t, os.MkdirAll(projectB, 0o755)) + runGit(t, projectA, "init") + runGit(t, projectB, "init") - lumenRoot := filepath.Join(tmp, "lumen") - entries, err := os.ReadDir(lumenRoot) - require.NoError(t, err) - require.Len(t, entries, 2, "should have seeded two hash dirs") + hashDirA := seedIndex(t, projectA, embedder.DefaultModel) + hashDirB := seedIndex(t, projectB, embedder.DefaultModel) - _, stderrOut, err := runPurgeCmd(t, nil) + // No-args default operates on the current working directory. 
+ t.Chdir(projectA) + + _, _, err := runPurgeCmd(t, nil) require.NoError(t, err) - assert.Contains(t, stderrOut, "Removed all index data") - _, err = os.Stat(lumenRoot) - assert.True(t, os.IsNotExist(err), "lumen data dir should be gone, got err=%v", err) + _, err = os.Stat(hashDirA) + assert.True(t, os.IsNotExist(err), "cwd project A hash dir should be gone") + _, err = os.Stat(hashDirB) + assert.NoError(t, err, "project B hash dir should be untouched") } -func TestPurge_NoArgs_NothingToPurge(t *testing.T) { +func TestPurge_NoArgs_CwdWithoutIndex_ReportsNone(t *testing.T) { tmp := resolvedTempDir(t) t.Setenv("XDG_DATA_HOME", tmp) + empty := filepath.Join(tmp, "empty") + require.NoError(t, os.MkdirAll(empty, 0o755)) + t.Chdir(empty) + _, stderrOut, err := runPurgeCmd(t, nil) require.NoError(t, err) - assert.Contains(t, stderrOut, "No index data found") + assert.Contains(t, strings.ToLower(stderrOut), "no index found") +} + +func TestPurge_All_RemovesEverything(t *testing.T) { + tmp := resolvedTempDir(t) + t.Setenv("XDG_DATA_HOME", tmp) + + seedIndex(t, "/project/a", embedder.DefaultModel) + hashDirB := seedIndex(t, "/project/b", embedder.DefaultModel) + + // A bare hash dir with no index.db (unreadable, hence no project_path metadata) must also be wiped.
+ lumenRoot := filepath.Join(tmp, "lumen") + legacyDir := filepath.Join(lumenRoot, "legacyhash") + require.NoError(t, os.MkdirAll(legacyDir, 0o755)) + + _, stderrOut, err := runPurgeCmd(t, nil, flagAll) + require.NoError(t, err) + assert.Contains(t, stderrOut, "Removed all index data") + assert.Contains(t, stderrOut, hashDirB, "should log each removed index dir") + + _, err = os.Stat(lumenRoot) + assert.True(t, os.IsNotExist(err), "lumen data dir should be gone, got err=%v", err) } func TestPurge_SinglePath_RemovesOnlyThatProject(t *testing.T) { @@ -220,3 +281,164 @@ func TestPurge_MultiplePaths_MixedHitsAndMisses(t *testing.T) { assert.True(t, os.IsNotExist(err), "project A hash dir should be gone") assert.Contains(t, strings.ToLower(stderrOut), "no index found", "miss should be reported") } + +// TestPurge_FlagBehavior consolidates the flag-validation and --missing/--legacy +// behaviors into one table-driven test. Each case sets up its own data dir state +// via setup (returning named paths for postCheck) and asserts error, stderr, and +// resulting filesystem state. 
+func TestPurge_FlagBehavior(t *testing.T) { + cases := []struct { + name string + flags []string + args []string + setup func(t *testing.T, tmp string) map[string]string + wantErr bool + wantErrContains []string + wantStderrContains string + postCheck func(t *testing.T, paths map[string]string, stderrOut string) + }{ + { + name: "all with paths errors", + flags: []string{flagAll}, + args: []string{"/some/path"}, + wantErr: true, + wantErrContains: []string{"--all"}, + }, + { + name: "all and missing errors", + flags: []string{flagAll, flagMissing}, + wantErr: true, + wantErrContains: []string{"--all", "--missing"}, + }, + { + name: "missing with paths errors", + flags: []string{flagMissing}, + args: []string{"/some/path"}, + wantErr: true, + wantErrContains: []string{"--missing"}, + }, + { + name: "dry-run without missing errors", + flags: []string{flagDryRun}, + wantErr: true, + wantErrContains: []string{"--dry-run is only valid with --missing"}, + }, + { + name: "legacy with all errors", + flags: []string{flagLegacy, flagAll}, + wantErr: true, + wantErrContains: []string{"--legacy"}, + }, + { + name: "legacy with paths errors", + flags: []string{flagLegacy}, + args: []string{"/some/path"}, + wantErr: true, + wantErrContains: []string{"--legacy"}, + }, + { + name: "missing removes deleted folders and unreadable dirs", + flags: []string{flagMissing}, + setup: func(t *testing.T, tmp string) map[string]string { + gone := filepath.Join(tmp, "gone") + alive := filepath.Join(tmp, "alive") + require.NoError(t, os.MkdirAll(gone, 0o755)) + require.NoError(t, os.MkdirAll(alive, 0o755)) + hashGone := seedIndex(t, gone, embedder.DefaultModel) + hashAlive := seedIndex(t, alive, embedder.DefaultModel) + hashCorrupt := seedCorruptDir(t, "corrupthash") + require.NoError(t, os.RemoveAll(gone)) // delete one project's folder + return map[string]string{"gone": hashGone, "alive": hashAlive, "corrupt": hashCorrupt} + }, + postCheck: func(t *testing.T, paths map[string]string, 
stderrOut string) { + assert.Contains(t, stderrOut, paths["gone"], "should log the removed index dir") + _, err := os.Stat(paths["gone"]) + assert.True(t, os.IsNotExist(err), "index for deleted folder should be removed") + _, err = os.Stat(paths["corrupt"]) + assert.True(t, os.IsNotExist(err), "unreadable index dir should be removed") + _, err = os.Stat(paths["alive"]) + assert.NoError(t, err, "index for existing folder should be kept") + }, + }, + { + name: "missing keeps existing folders", + flags: []string{flagMissing}, + setup: func(t *testing.T, tmp string) map[string]string { + alive := filepath.Join(tmp, "alive") + require.NoError(t, os.MkdirAll(alive, 0o755)) + return map[string]string{"alive": seedIndex(t, alive, embedder.DefaultModel)} + }, + postCheck: func(t *testing.T, paths map[string]string, stderrOut string) { + _, err := os.Stat(paths["alive"]) + assert.NoError(t, err, "index for existing folder should be kept") + }, + }, + { + name: "dry-run missing deletes nothing", + flags: []string{flagMissing, flagDryRun}, + wantStderrContains: "Would remove", + setup: func(t *testing.T, tmp string) map[string]string { + gone := filepath.Join(tmp, "gone") + require.NoError(t, os.MkdirAll(gone, 0o755)) + hashGone := seedIndex(t, gone, embedder.DefaultModel) + require.NoError(t, os.RemoveAll(gone)) + return map[string]string{"gone": hashGone} + }, + postCheck: func(t *testing.T, paths map[string]string, stderrOut string) { + assert.Contains(t, stderrOut, paths["gone"]) + _, err := os.Stat(paths["gone"]) + assert.NoError(t, err, "dry-run must not delete the index dir") + }, + }, + { + name: "legacy removes metadata-less and unreadable indexes", + flags: []string{flagLegacy}, + setup: func(t *testing.T, tmp string) map[string]string { + alive := filepath.Join(tmp, "alive") + require.NoError(t, os.MkdirAll(alive, 0o755)) + hashAlive := seedIndex(t, alive, embedder.DefaultModel) + hashLegacy := seedLegacyIndex(t, filepath.Join(tmp, "legacyproj"), 
embedder.DefaultModel) + hashCorrupt := seedCorruptDir(t, "corrupthash") + return map[string]string{"alive": hashAlive, "legacy": hashLegacy, "corrupt": hashCorrupt} + }, + postCheck: func(t *testing.T, paths map[string]string, stderrOut string) { + assert.Contains(t, stderrOut, paths["legacy"], "should log the removed legacy dir") + _, err := os.Stat(paths["legacy"]) + assert.True(t, os.IsNotExist(err), "legacy (no-metadata) index dir should be removed") + _, err = os.Stat(paths["corrupt"]) + assert.True(t, os.IsNotExist(err), "unreadable index dir should be removed") + _, err = os.Stat(paths["alive"]) + assert.NoError(t, err, "index with project_path metadata should be kept") + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + tmp := resolvedTempDir(t) + t.Setenv("XDG_DATA_HOME", tmp) + + var paths map[string]string + if tc.setup != nil { + paths = tc.setup(t, tmp) + } + + _, stderrOut, err := runPurgeCmd(t, tc.args, tc.flags...) + + if tc.wantErr { + require.Error(t, err) + for _, want := range tc.wantErrContains { + assert.Contains(t, err.Error(), want) + } + return + } + require.NoError(t, err) + if tc.wantStderrContains != "" { + assert.Contains(t, stderrOut, tc.wantStderrContains) + } + if tc.postCheck != nil { + tc.postCheck(t, paths, stderrOut) + } + }) + } +}