diff --git a/.gitignore b/.gitignore index ea6cf36bef..171a1f601d 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ mise.local.toml # Binary output (only in root) /entire +/migrate-v2-checkpoints /vogon /testreport /bin diff --git a/.golangci.yaml b/.golangci.yaml index cfe0ec0924..fe2a72394c 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -140,6 +140,7 @@ linters: rules: - path: _test\.go linters: + - goconst - gosec - wrapcheck - forbidigo diff --git a/cmd/entire/cli/checkpoint/checkpoint.go b/cmd/entire/cli/checkpoint/checkpoint.go index dc8752749f..3973f06626 100644 --- a/cmd/entire/cli/checkpoint/checkpoint.go +++ b/cmd/entire/cli/checkpoint/checkpoint.go @@ -212,6 +212,10 @@ type WriteCommittedOptions struct { // the original v1 checkpoint time in v2 metadata. CreatedAt time.Time + // CommitTime is the optional git author/committer timestamp for the + // metadata-branch commit. When zero, writers use the current time. + CommitTime time.Time + // Strategy is the name of the strategy that created this checkpoint Strategy string diff --git a/cmd/entire/cli/checkpoint/committed.go b/cmd/entire/cli/checkpoint/committed.go index 964d016854..e5c8b9ba5e 100644 --- a/cmd/entire/cli/checkpoint/committed.go +++ b/cmd/entire/cli/checkpoint/committed.go @@ -43,12 +43,26 @@ import ( // errStopIteration is used to stop commit iteration early in GetCheckpointAuthor. var errStopIteration = errors.New("stop iteration") +type commitSigningDisabledContextKey struct{} + // chunkTranscript is an indirection over agent.ChunkTranscript so tests can // count or intercept chunking calls (e.g., to verify the short-circuit avoids // re-chunking identical content). Production code paths always use the // unwrapped function. var chunkTranscript = agent.ChunkTranscript +// WithCommitSigningDisabled returns a context that prevents metadata branch +// commit signing. 
Use for replay/migration writes whose author line is sourced +// from historical data rather than the local operator. +func WithCommitSigningDisabled(ctx context.Context) context.Context { + return context.WithValue(ctx, commitSigningDisabledContextKey{}, true) +} + +func commitSigningDisabled(ctx context.Context) bool { + disabled, ok := ctx.Value(commitSigningDisabledContextKey{}).(bool) + return ok && disabled +} + // WriteCommitted writes a committed checkpoint to the entire/checkpoints/v1 branch. // Checkpoints are stored at sharded paths: // // @@ -118,7 +132,11 @@ func (s *GitStore) WriteCommitted(ctx context.Context, opts WriteCommittedOption } commitMsg := s.buildCommitMessage(opts, taskMetadataPath) - newCommitHash, err := s.createCommit(ctx, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail) + commitTime := opts.CommitTime + if commitTime.IsZero() { + commitTime = time.Now() + } + newCommitHash, err := s.createCommitAt(ctx, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail, commitTime) if err != nil { return err } @@ -363,12 +381,16 @@ func (s *GitStore) writeStandardCheckpointEntries(ctx context.Context, opts Writ // Build the sessions array var sessions []SessionFilePaths if existingSummary != nil { - sessions = make([]SessionFilePaths, max(len(existingSummary.Sessions), sessionIndex+1)) - copy(sessions, existingSummary.Sessions) + sessions = append([]SessionFilePaths(nil), existingSummary.Sessions...) } else { - sessions = make([]SessionFilePaths, 1) + sessions = []SessionFilePaths{} } - sessions[sessionIndex] = sessionFilePaths + if position := sessionFilePathsPosition(basePath, sessions, sessionIndex); position >= 0 { + sessions[position] = sessionFilePaths + } else { + sessions = append(sessions, sessionFilePaths) + } + sortSessionFilePaths(basePath, sessions) // Tripwire: an unreproduced production report had session 0 silently // replaced with a different sessionID's data. 
The symptom was @@ -484,7 +506,7 @@ func (s *GitStore) writeSessionToSubdirectory(ctx context.Context, opts WriteCom // writeCheckpointSummary writes the root-level CheckpointSummary with aggregated statistics. // sessions is the complete sessions array (already built by the caller). func (s *GitStore) writeCheckpointSummary(opts WriteCommittedOptions, basePath string, entries map[string]object.TreeEntry, sessions []SessionFilePaths) error { - checkpointsCount, filesTouched, tokenUsage, err := s.reaggregateFromEntries(basePath, len(sessions), entries) + checkpointsCount, filesTouched, tokenUsage, err := s.reaggregateFromEntries(basePath, sessions, entries) if err != nil { return fmt.Errorf("failed to aggregate session stats: %w", err) } @@ -614,8 +636,15 @@ func (s *GitStore) findSessionIndex(ctx context.Context, basePath string, existi if existingSummary == nil { return 0 } - for i := range len(existingSummary.Sessions) { - path := fmt.Sprintf("%s%d/%s", basePath, i, paths.MetadataFileName) + usedIndexes := make(map[int]struct{}, len(existingSummary.Sessions)) + for summaryIndex, sessionPaths := range existingSummary.Sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if !ok { + sessionIndex = summaryIndex + } + usedIndexes[sessionIndex] = struct{}{} + + path := fmt.Sprintf("%s%d/%s", basePath, sessionIndex, paths.MetadataFileName) entry, exists := entries[path] if !exists { continue @@ -623,35 +652,47 @@ func (s *GitStore) findSessionIndex(ctx context.Context, basePath string, existi meta, err := s.readMetadataFromBlob(entry.Hash) if err != nil { logging.Warn(ctx, "failed to read session metadata during dedup check", - slog.Int("session_index", i), + slog.Int("session_index", sessionIndex), slog.String("session_id", sessionID), slog.String("error", err.Error()), ) continue } if meta.SessionID == sessionID { - return i + return sessionIndex + } + } + for sessionIndex := 0; ; sessionIndex++ { + if _, used := 
usedIndexes[sessionIndex]; used { + continue + } + if sessionPathHasEntries(basePath, sessionIndex, entries) { + continue } + return sessionIndex } - return len(existingSummary.Sessions) } // reaggregateFromEntries reads all session metadata from the entries map and // reaggregates CheckpointsCount, FilesTouched, and TokenUsage. -func (s *GitStore) reaggregateFromEntries(basePath string, sessionCount int, entries map[string]object.TreeEntry) (int, []string, *agent.TokenUsage, error) { +func (s *GitStore) reaggregateFromEntries(basePath string, sessions []SessionFilePaths, entries map[string]object.TreeEntry) (int, []string, *agent.TokenUsage, error) { var totalCount int var allFiles []string var totalTokens *agent.TokenUsage - for i := range sessionCount { - path := fmt.Sprintf("%s%d/%s", basePath, i, paths.MetadataFileName) + for summaryIndex, sessionPaths := range sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if !ok { + return 0, nil, nil, fmt.Errorf("session %d metadata path %q is invalid", summaryIndex, sessionPaths.Metadata) + } + path := fmt.Sprintf("%s%d/%s", basePath, sessionIndex, paths.MetadataFileName) entry, exists := entries[path] if !exists { - return 0, nil, nil, fmt.Errorf("session %d metadata not found at %s", i, path) + return 0, nil, nil, fmt.Errorf("session %d metadata not found at %s", summaryIndex, path) } meta, err := s.readMetadataFromBlob(entry.Hash) if err != nil { - return 0, nil, nil, fmt.Errorf("failed to read session %d metadata: %w", i, err) + return 0, nil, nil, fmt.Errorf("failed to read session %d metadata: %w", summaryIndex, err) } totalCount += meta.CheckpointsCount allFiles = mergeFilesTouched(allFiles, meta.FilesTouched) @@ -661,6 +702,57 @@ func (s *GitStore) reaggregateFromEntries(basePath string, sessionCount int, ent return totalCount, allFiles, totalTokens, nil } +func sessionFilePathsPosition(basePath string, sessions []SessionFilePaths, targetIndex int) int { + for i, sessionPaths 
:= range sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if ok && sessionIndex == targetIndex { + return i + } + } + return -1 +} + +func sortSessionFilePaths(basePath string, sessions []SessionFilePaths) { + sort.SliceStable(sessions, func(i, j int) bool { + left, leftOK := sessionIndexFromFilePaths(basePath, sessions[i]) + right, rightOK := sessionIndexFromFilePaths(basePath, sessions[j]) + if !leftOK || !rightOK { + return leftOK + } + return left < right + }) +} + +func sessionIndexFromFilePaths(basePath string, sessionPaths SessionFilePaths) (int, bool) { + if sessionPaths.Metadata == "" { + return 0, false + } + metadataPath := strings.TrimPrefix(sessionPaths.Metadata, "/") + relativePath, ok := strings.CutPrefix(metadataPath, basePath) + if !ok { + return 0, false + } + sessionDir, fileName, ok := strings.Cut(relativePath, "/") + if !ok || fileName != paths.MetadataFileName { + return 0, false + } + sessionIndex, err := strconv.Atoi(sessionDir) + if err != nil || sessionIndex < 0 { + return 0, false + } + return sessionIndex, true +} + +func sessionPathHasEntries(basePath string, sessionIndex int, entries map[string]object.TreeEntry) bool { + prefix := fmt.Sprintf("%s%d/", basePath, sessionIndex) + for path := range entries { + if strings.HasPrefix(path, prefix) { + return true + } + } + return false +} + func checkpointCreatedAt(opts WriteCommittedOptions) time.Time { if opts.CreatedAt.IsZero() { return time.Now().UTC() @@ -1941,11 +2033,14 @@ func GetGitAuthorFromRepo(repo *git.Repository) (name, email string) { // CreateCommit creates a git commit object with the given tree, parent, message, and author. // If parentHash is ZeroHash, the commit is created without a parent (orphan commit). 
func CreateCommit(ctx context.Context, repo *git.Repository, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string) (plumbing.Hash, error) { - now := time.Now() + return createCommitObject(ctx, repo, treeHash, parentHash, message, authorName, authorEmail, time.Now()) +} + +func createCommitObject(ctx context.Context, repo *git.Repository, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) (plumbing.Hash, error) { sig := object.Signature{ Name: authorName, Email: authorEmail, - When: now, + When: commitTime, } commit := &object.Commit{ @@ -1978,6 +2073,9 @@ func CreateCommit(ctx context.Context, repo *git.Repository, treeHash, parentHas // If signing is disabled, no signer can be created, or signing fails, the commit // is left unsigned and the error is logged. func SignCommitBestEffort(ctx context.Context, commit *object.Commit) { + if commitSigningDisabled(ctx) { + return + } if !settings.IsSignCheckpointCommitsEnabled(ctx) { return } diff --git a/cmd/entire/cli/checkpoint/committed_commit_time_test.go b/cmd/entire/cli/checkpoint/committed_commit_time_test.go new file mode 100644 index 0000000000..91c5546c21 --- /dev/null +++ b/cmd/entire/cli/checkpoint/committed_commit_time_test.go @@ -0,0 +1,100 @@ +package checkpoint + +import ( + "context" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/stretchr/testify/require" +) + +const ( + commitTimeStrategy = "manual-commit" + commitTimeTestAuthor = "Test" + commitTimeTestEmail = "test@example.com" +) + +func TestWriteCommitted_CommitTime(t *testing.T) { + t.Parallel() + + repo, store := setupCommittedCommitTimeRepo(t) + commitTime := time.Date(2024, 3, 
2, 1, 2, 3, 0, time.UTC) + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: id.MustCheckpointID("a1b2c3d4e5f6"), + SessionID: "session-commit-time", + CreatedAt: time.Date(2024, 3, 1, 1, 2, 3, 0, time.UTC), + CommitTime: commitTime, + Strategy: commitTimeStrategy, + Transcript: redact.AlreadyRedacted([]byte("transcript line\n")), + AuthorName: "Migration", + AuthorEmail: "migration@example.com", + }) + require.NoError(t, err) + + commit := metadataHeadCommit(t, repo) + require.True(t, commit.Author.When.Equal(commitTime), "author time = %s, want %s", commit.Author.When, commitTime) + require.True(t, commit.Committer.When.Equal(commitTime), "committer time = %s, want %s", commit.Committer.When, commitTime) +} + +func TestWriteCommitted_ZeroCommitTimeUsesCurrentTime(t *testing.T) { + t.Parallel() + + repo, store := setupCommittedCommitTimeRepo(t) + createdAt := time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC) + before := time.Now().Add(-time.Second) + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: id.MustCheckpointID("b2c3d4e5f6a1"), + SessionID: "session-current-time", + CreatedAt: createdAt, + Strategy: commitTimeStrategy, + Transcript: redact.AlreadyRedacted([]byte("transcript line\n")), + AuthorName: commitTimeTestAuthor, + AuthorEmail: commitTimeTestEmail, + }) + require.NoError(t, err) + after := time.Now().Add(time.Second) + + commit := metadataHeadCommit(t, repo) + require.False(t, commit.Author.When.Equal(createdAt), "zero CommitTime should not reuse CreatedAt as the commit timestamp") + require.False(t, commit.Author.When.Before(before), "author time = %s, want no earlier than %s", commit.Author.When, before) + require.False(t, commit.Author.When.After(after), "author time = %s, want no later than %s", commit.Author.When, after) + require.True(t, commit.Committer.When.Equal(commit.Author.When), "committer time = %s, want author time %s", commit.Committer.When, 
commit.Author.When) +} + +func setupCommittedCommitTimeRepo(t *testing.T) (*git.Repository, *GitStore) { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + + testutil.WriteFile(t, dir, "README.md", "# Test\n") + testutil.GitAdd(t, dir, "README.md") + testutil.GitCommit(t, dir, "initial commit") + + return repo, NewGitStore(repo) +} + +func metadataHeadCommit(t *testing.T, repo *git.Repository) *object.Commit { + t.Helper() + + ref, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + + commit, err := repo.CommitObject(ref.Hash()) + require.NoError(t, err) + + return commit +} diff --git a/cmd/entire/cli/checkpoint/committed_signing_test.go b/cmd/entire/cli/checkpoint/committed_signing_test.go index 6cc7e82782..927537402c 100644 --- a/cmd/entire/cli/checkpoint/committed_signing_test.go +++ b/cmd/entire/cli/checkpoint/committed_signing_test.go @@ -100,6 +100,22 @@ func TestSignCommitBestEffort_SkipsWhenDisabled(t *testing.T) { //nolint:paralle } } +func TestSignCommitBestEffort_SkipsWhenContextDisabled(t *testing.T) { //nolint:paralleltest // t.Chdir requires non-parallel + setupSigningEnv(t, false) + + objectSignerLoader = func(context.Context) (plugin.Signer, bool) { + t.Fatal("signer should not be called when commit signing is disabled by context") + return nil, true + } + + commit := newTestCommit() + SignCommitBestEffort(WithCommitSigningDisabled(context.Background()), commit) + + if commit.Signature != "" { + t.Errorf("expected empty signature, got %q", commit.Signature) + } +} + func TestSignCommitBestEffort_ErrorIsBestEffort(t *testing.T) { //nolint:paralleltest // t.Chdir requires non-parallel setupSigningEnv(t, false) diff --git a/cmd/entire/cli/checkpoint/temporary.go b/cmd/entire/cli/checkpoint/temporary.go index ec0cd6a8a3..bbe5c76246 100644 --- a/cmd/entire/cli/checkpoint/temporary.go +++ 
b/cmd/entire/cli/checkpoint/temporary.go @@ -849,6 +849,10 @@ func (s *GitStore) createCommit(ctx context.Context, treeHash, parentHash plumbi return CreateCommit(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail) } +func (s *GitStore) createCommitAt(ctx context.Context, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) (plumbing.Hash, error) { + return createCommitObject(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail, commitTime) +} + // Helper functions extracted from strategy/common.go // These are exported for use by strategy package (push_common.go, session_test.go) diff --git a/cmd/entire/cli/checkpoint/v2_read.go b/cmd/entire/cli/checkpoint/v2_read.go index 1a85413ffa..76b9e6a198 100644 --- a/cmd/entire/cli/checkpoint/v2_read.go +++ b/cmd/entire/cli/checkpoint/v2_read.go @@ -266,7 +266,7 @@ func (s *V2GitStore) GetCheckpointAuthor(ctx context.Context, checkpointID id.Ch } // ReadSessionContent reads a session's metadata and prompts from the v2 /main ref, -// and the raw transcript (raw_transcript) from /full/current. +// and the raw transcript (raw_transcript) from local or remote /full refs. // This is the v2 equivalent of GitStore.ReadSessionContent — it reads the raw agent // transcript, not the compact transcript.jsonl. Used by resume and RestoreLogsOnly. // Returns ErrNoTranscript if the session exists but no raw transcript is available. diff --git a/cmd/entire/cli/checkpoint/v2_store.go b/cmd/entire/cli/checkpoint/v2_store.go index 7d44bf933d..c177005173 100644 --- a/cmd/entire/cli/checkpoint/v2_store.go +++ b/cmd/entire/cli/checkpoint/v2_store.go @@ -23,8 +23,9 @@ import ( // V2GitStore is separate from GitStore (v1) to keep concerns isolated // and simplify future v1 removal. 
type V2GitStore struct { - repo *git.Repository - gs *GitStore // shared entry-building helpers (same package) + repo *git.Repository + repoRoot string + gs *GitStore // shared entry-building helpers (same package) // blobFetcher fetches missing blobs by hash. When set, read paths wrap // trees with FetchingTree so missing blobs are auto-recovered (and the @@ -35,10 +36,14 @@ type V2GitStore struct { // NewV2GitStore creates a new v2 checkpoint store backed by the given git repository. func NewV2GitStore(repo *git.Repository) *V2GitStore { - return &V2GitStore{ + store := &V2GitStore{ repo: repo, gs: &GitStore{repo: repo}, } + if worktree, err := repo.Worktree(); err == nil { + store.repoRoot = worktree.Filesystem().Root() + } + return store } // SetBlobFetcher configures the store to automatically fetch missing blobs @@ -69,11 +74,11 @@ func (s *V2GitStore) GetRefState(refName plumbing.ReferenceName) (parentHash, tr commit, err := s.repo.CommitObject(ref.Hash()) if err != nil { - cliTreeHash, cliErr := commitTreeHashViaCLI(context.Background(), ref.Hash()) + cliTreeHash, cliErr := commitTreeHashViaCLI(context.Background(), s.repoRoot, ref.Hash()) if cliErr != nil { return plumbing.ZeroHash, plumbing.ZeroHash, fmt.Errorf("failed to get commit for ref %s: %w", refName, errors.Join(err, cliErr)) } - logging.Warn(context.Background(), "GetRefState: go-git commit read failed, used git rev-parse fallback", + logging.Debug(context.Background(), "GetRefState: go-git commit read failed, used git rev-parse fallback", slog.String("ref", refName.String()), slog.String("commit", ref.Hash().String()[:12]), slog.String("gogit_error", err.Error()), @@ -86,8 +91,9 @@ func (s *V2GitStore) GetRefState(refName plumbing.ReferenceName) (parentHash, tr // commitTreeHashViaCLI resolves the tree hash of a commit via // `git rev-parse ^{tree}`. See GetRefState for the rationale. 
-func commitTreeHashViaCLI(ctx context.Context, commitHash plumbing.Hash) (plumbing.Hash, error) { +func commitTreeHashViaCLI(ctx context.Context, repoRoot string, commitHash plumbing.Hash) (plumbing.Hash, error) { cmd := exec.CommandContext(ctx, "git", "rev-parse", commitHash.String()+"^{tree}") + cmd.Dir = repoRoot output, err := cmd.Output() if err != nil { return plumbing.ZeroHash, fmt.Errorf("git rev-parse %s^{tree}: %w", commitHash.String()[:12], err) diff --git a/cmd/entire/cli/checkpoint/v2_store_test.go b/cmd/entire/cli/checkpoint/v2_store_test.go index b07983dc49..712996dc98 100644 --- a/cmd/entire/cli/checkpoint/v2_store_test.go +++ b/cmd/entire/cli/checkpoint/v2_store_test.go @@ -5,9 +5,13 @@ import ( "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" "github.com/stretchr/testify/require" + "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/storage" ) func TestNewV2GitStore(t *testing.T) { @@ -53,3 +57,52 @@ func TestV2GitStore_GetRefState_ErrorsOnMissingRef(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "ref refs/entire/checkpoints/v2/main not found") } + +func TestV2GitStore_GetRefState_FallsBackToGitCLIWhenCommitObjectMissing(t *testing.T) { + dir := t.TempDir() + testutil.InitRepo(t, dir) + testutil.WriteFile(t, dir, "README.md", "init") + testutil.GitAdd(t, dir, "README.md") + testutil.GitCommit(t, dir, "initial") + t.Chdir(t.TempDir()) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + writeV2TestCheckpoint(t, repo, v2TestCheckpointOptions{ + CheckpointID: id.MustCheckpointID("b1b2b3b4b5b6"), + SessionID: "session-fallback", + Strategy: "manual-commit", + Transcript: redact.AlreadyRedacted([]byte("transcript\n")), + }) + + refName := plumbing.ReferenceName(paths.V2FullCurrentRefName) + ref, err := repo.Reference(refName, true) 
+ require.NoError(t, err) + commit, err := repo.CommitObject(ref.Hash()) + require.NoError(t, err) + + store := NewV2GitStore(repo) + store.repo = &git.Repository{ + Storer: commitObjectMissingStorer{ + Storer: repo.Storer, + missing: ref.Hash(), + }, + } + parentHash, treeHash, err := store.GetRefState(refName) + require.NoError(t, err) + require.Equal(t, ref.Hash(), parentHash) + require.Equal(t, commit.TreeHash, treeHash) +} + +type commitObjectMissingStorer struct { + storage.Storer + + missing plumbing.Hash +} + +func (s commitObjectMissingStorer) EncodedObject(objectType plumbing.ObjectType, hash plumbing.Hash) (plumbing.EncodedObject, error) { + if hash == s.missing && objectType == plumbing.CommitObject { + return nil, plumbing.ErrObjectNotFound + } + return s.Storer.EncodedObject(objectType, hash) +} diff --git a/cmd/entire/cli/logging/logger_test.go b/cmd/entire/cli/logging/logger_test.go index 7ef0dbd387..ff6c2172c9 100644 --- a/cmd/entire/cli/logging/logger_test.go +++ b/cmd/entire/cli/logging/logger_test.go @@ -14,7 +14,7 @@ import ( "time" ) -// Test constants to avoid goconst warnings +// Shared test literals. 
const ( testSessionID = "2025-01-15-test-session" testComponent = "hooks" diff --git a/cmd/entire/cli/settings/settings_test.go b/cmd/entire/cli/settings/settings_test.go index 2782cd9df1..54ee214861 100644 --- a/cmd/entire/cli/settings/settings_test.go +++ b/cmd/entire/cli/settings/settings_test.go @@ -172,7 +172,7 @@ func TestLoad_AcceptsValidKeys(t *testing.T) { if settings.SummaryGeneration.Provider != "claude-code" { t.Errorf("expected summary_generation.provider 'claude-code', got %q", settings.SummaryGeneration.Provider) } - if settings.SummaryGeneration.Model != "sonnet" { //nolint:goconst // test literal + if settings.SummaryGeneration.Model != "sonnet" { t.Errorf("expected summary_generation.model 'sonnet', got %q", settings.SummaryGeneration.Model) } if settings.Redaction == nil { diff --git a/cmd/entire/cli/strategy/hook_managers_test.go b/cmd/entire/cli/strategy/hook_managers_test.go index 877bb62e56..b60fa25c2f 100644 --- a/cmd/entire/cli/strategy/hook_managers_test.go +++ b/cmd/entire/cli/strategy/hook_managers_test.go @@ -55,7 +55,7 @@ func TestDetectHookManagers_Lefthook(t *testing.T) { if len(managers) != 1 { t.Fatalf("expected 1 manager, got %d", len(managers)) } - if managers[0].Name != "Lefthook" { //nolint:goconst // test assertion, not a magic string + if managers[0].Name != "Lefthook" { t.Errorf("expected Lefthook, got %s", managers[0].Name) } if managers[0].ConfigPath != "lefthook.yml" { diff --git a/cmd/entire/cli/strategy/manual_commit_test.go b/cmd/entire/cli/strategy/manual_commit_test.go index 39d92ef097..65af4f0ee4 100644 --- a/cmd/entire/cli/strategy/manual_commit_test.go +++ b/cmd/entire/cli/strategy/manual_commit_test.go @@ -893,7 +893,7 @@ func TestShadowStrategy_PrepareCommitMsg_SkipsSessionWhenContentCheckFails(t *te func TestAddCheckpointTrailer_NoComment(t *testing.T) { // Test that addCheckpointTrailer adds trailer without any comment lines - message := "Test commit message\n" //nolint:goconst // already present in codebase + 
message := "Test commit message\n" result := addCheckpointTrailer(message, testTrailerCheckpointID) diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 42908e8003..798572a8c9 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -231,7 +231,7 @@ func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) ( // git_push step (git_push~1) in the trace. A rejected first push records an // error flag, which signals the recovery path was taken. _, pushSpan := perf.Start(ctx, "git_push") - result, err := remote.Push(ctx, remoteName, branchName) + result, err := remote.Push(ctx, remoteName, branchPushRefSpec(branchName)) pushSpan.RecordError(err) pushSpan.End() @@ -243,6 +243,11 @@ func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) ( return parsePushResult(outputStr), nil } +func branchPushRefSpec(branchName string) string { + branchRef := plumbing.NewBranchReferenceName(branchName).String() + return branchRef + ":" + branchRef +} + // protectedRefError means the remote is blocking writes to the ref itself. type protectedRefError struct { output string diff --git a/cmd/entire/cli/strategy/push_common_test.go b/cmd/entire/cli/strategy/push_common_test.go index f962768cc1..86cda0c18f 100644 --- a/cmd/entire/cli/strategy/push_common_test.go +++ b/cmd/entire/cli/strategy/push_common_test.go @@ -1424,6 +1424,49 @@ func TestDoPushBranch_NewContent_SaysDone(t *testing.T) { assert.NotContains(t, output, "already up-to-date", "should not say 'already up-to-date' when content was pushed") } +// TestDoPushBranch_AmbiguousLocalRefs verifies that checkpoint pushes qualify +// the branch refspec. A stale refs/entire/checkpoints/v1 ref can otherwise make +// git reject the unqualified source ref as ambiguous. +// +// Not parallel: uses t.Chdir() and os.Stderr redirection. 
+func TestDoPushBranch_AmbiguousLocalRefs(t *testing.T) { + workDir := setupRepoWithCheckpointBranch(t) + + headCmd := exec.CommandContext(context.Background(), "git", "rev-parse", "HEAD") + headCmd.Dir = workDir + headCmd.Env = testutil.GitIsolatedEnv() + headOut, err := headCmd.Output() + require.NoError(t, err) + + staleRefCmd := exec.CommandContext( + context.Background(), + "git", + "update-ref", + "refs/entire/checkpoints/v1", + strings.TrimSpace(string(headOut)), + ) + staleRefCmd.Dir = workDir + staleRefCmd.Env = testutil.GitIsolatedEnv() + out, err := staleRefCmd.CombinedOutput() + require.NoError(t, err, "stale ref setup failed: %s", out) + + bareDir := t.TempDir() + initCmd := exec.CommandContext(context.Background(), "git", "init", "--bare") + initCmd.Dir = bareDir + initCmd.Env = testutil.GitIsolatedEnv() + out, err = initCmd.CombinedOutput() + require.NoError(t, err, "git init --bare failed: %s", out) + + t.Chdir(workDir) + + restore := captureStderr(t) + err = doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err) + assert.Contains(t, output, " done", "should push despite ambiguous local refs") +} + func TestIsProtectedRefRejection(t *testing.T) { t.Parallel() diff --git a/cmd/entire/cli/versioncheck/versioncheck_test.go b/cmd/entire/cli/versioncheck/versioncheck_test.go index 33afe4c609..85fa5f7fd8 100644 --- a/cmd/entire/cli/versioncheck/versioncheck_test.go +++ b/cmd/entire/cli/versioncheck/versioncheck_test.go @@ -341,8 +341,7 @@ func TestParseGitHubRelease(t *testing.T) { } // brewUpgradeCmd is the install command produced for any brew-installed -// binary on a stable channel. Hoisted to a const so tests can reference -// it without tripping goconst on repeated string literals. +// binary on a stable channel. 
const brewUpgradeCmd = "brew upgrade entire" func TestUpdateCommand(t *testing.T) { diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md new file mode 100644 index 0000000000..1994c8a393 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -0,0 +1,1374 @@ +# Validating `migrate-v2-checkpoints` + +Reusable runbook for verifying that `migrate-v2-checkpoints` (read-only or applied) +identifies the correct checkpoints, attributes the correct sessions, and — once +applied — writes complete, hash-consistent data to the v1 branch. + +Tested against the `tmp-migrate-v2-script-go` branch of the CLI at +`~/entire/cli/.worktrees/review`. The binary lives there as +`migrate-v2-checkpoints`. + +> Background: the project is rolling **back** checkpoints v2. v2 stores live +> under `refs/entire/checkpoints/v2/*` and are no longer being written. The +> v1 branch `entire/checkpoints/v1` is the surviving format. This tool reads +> v2 metadata + raw transcripts and replays them as v1 writes via +> `checkpoint.GitStore.WriteCommitted`. + +> ⛔ **DO NOT push `entire/checkpoints/v1` to any remote at any point while +> following this runbook.** The migration writes new commits to the local +> v1 branch and nothing else. Publishing those commits is a separate, +> manual decision the operator makes **only after** §5 validation has +> fully passed and they are satisfied with the result. Pushing early +> propagates any bad migration to every consumer (other clones, +> `checkpoint_remote`, the API) and makes rollback significantly more +> expensive than a local `update-ref`. If you are not sure whether you +> are about to push, you are not ready to push. + +## 1. What the tool does + +### 1.1 Discovery (`cmd/migrate-v2-checkpoints/history.go`) + +- Walks every history tip (branches under `refs/heads/*` and `refs/remotes/*/*`, + excluding `entire/checkpoints/v1`, `entire/trails/v1`, and any `*/HEAD` + symbolic ref). 
Falls back to `HEAD` if no other tips qualify. +- For each commit on those tips, parses `Entire-Checkpoint: <id>` trailers + (`trailers.ParseAllCheckpoints`, key constant + `trailers/trailers.go:41`). One commit can carry many trailers (squash + merges). +- After the trailer walk, lists every checkpoint ID on + `refs/entire/checkpoints/v2/main` (`addV2OrphanCheckpoints`). Any v2 /main + ID not already discovered through a commit trailer is appended as an + **orphan** — a `discoveredCheckpoint{ID, Commits: nil}` with no commit + attribution. Orphans flow through the migration filter the same way as + commit-attributed candidates; only their reporting label differs. +- Produces a list of `discoveredCheckpoint{ID, Commits}` — every checkpoint + ID ever referenced in commit history plus every v2 /main ID, sorted by ID. +- `--since <SHA>`/positional commit narrows to commits not reachable from + the named commit. `--head <SHA>` restricts to a single tip. **Either + flag suppresses the v2 /main orphan augmentation**: when commit scope is + set the tool re-runs the trailer walk unscoped, counts how many v2 /main + IDs would have been newly discovered as orphans, and prints + `warning: N v2 orphans skipped; re-run without --since/--head to include + them` to stdout before the report. Those IDs are **not** added to the + migration plan in the scoped run. +- Discovery is **not** v2-specific by default, but the orphan augmentation + reaches into v2 /main, so v2 refs (or at least the local copy) influence + the candidate set. + +### 1.2 Migration filter (`cmd/migrate-v2-checkpoints/migration.go`) + +For each discovered checkpoint: + +1. Read v1 summary from `entire/checkpoints/v1`. If present, collect existing + v1 session IDs by reading each session's `metadata.json` (`session_id` + field). v1 session paths are recovered from + `summary.Sessions[*].Metadata` via `v1SessionIndexFromSummary`, so sparse + or non-contiguous v1 indices are handled correctly. +2.
Read v2 summary from `refs/entire/checkpoints/v2/main`. If absent or has + no sessions → `missing v2 checkpoint metadata` and skip. +3. For every session index in the v2 summary: + - Read v2 session metadata + prompts from `/main`. Missing or empty + `checkpoint_id` / `session_id` → `missing required v2 session metadata`. + - If that session ID already exists in v1 → `already present v1 sessions`. + - Read v2 raw transcript from `/full/current`, falling back to archived + `/full/<13-digit-suffix>` refs. `ErrNoTranscript` → + `missing raw transcripts`. + - Otherwise: count `sessions eligible for migration`, and on `--apply` + resolve the v2 `/main` commit that last touched that session's + `metadata.json`, then write to v1 via `GitStore.WriteCommitted` using + v2-sourced fields and that original v2 commit author line. The transcript + is wrapped in `redact.AlreadyRedacted(...)` so the v1 writer does not + re-redact bytes that were already redacted on v2. + +A checkpoint is **eligible** if at least one v2 session is missing from v1 and +fully readable from v2. The candidate's `sessions=N` is that net count, not +the v2 session count. + +Additionally, the report tracks how many eligible checkpoints were orphans +(discovered through v2 /main alone, with no commit trailer attribution). An +eligible checkpoint with `len(discovered.Commits) == 0` increments the +`v2 orphan checkpoints eligible for migration` counter; this is a subset of +`checkpoints eligible for migration`, never larger than `EC`. 
+ +### 1.3 What ends up on v1 after `--apply` + +For each migrated session, the v1 tree at `<id[:2]>/<id[2:]>/<n>/` gains: + +| file | source | constant in `paths/paths.go` | +|--------------------|------------------------------|-------------------------------------| +| `metadata.json` | v2 session `metadata.json` | `MetadataFileName` (line 36) | +| `prompt.txt` | v2 session prompts (joined) | `PromptFileName` (line 29) | +| `full.jsonl[.NNN]` | reassembled v2 `raw_transcript[.NNN]` | `TranscriptFileName` (line 30) | +| `content_hash.txt` | `sha256:<hex>` of v1 bytes | `ContentHashFileName` (line 38) | + +Plus the root `<id[:2]>/<id[2:]>/metadata.json` gets rewritten to add the new +session to `sessions[]` and recompute aggregate fields (see §5.1). +Each migrated v1 metadata-branch commit uses the author name, email, and author +timestamp from the v2 `/main` commit that wrote the corresponding v2 session +`metadata.json`; the session metadata's own `created_at` remains the v2 JSON +value. + +`<n>` is the v1 slot. New sessions append (`findSessionIndex` in +`committed.go:610`); if v1 already had session 0 and v2 contributes one new +session, it lands in v1 slot 1. v1 indices and v2 indices for the **same** +checkpoint can differ; `session_id` is the stable cross-store identifier. + +Chunking note: `full.jsonl` is chunked via `agent.ChunkTranscript`. Chunks +are `full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … +(`agent/chunking.go:126` `ChunkFileName`, with +`ChunkSuffix = ".%03d"` at line 19). Index 0 has no suffix. + +Codex caveat: for sessions whose agent is `codex`, `writeTranscript` applies +`codex.SanitizePortableTranscript` before chunking and hashing +(`committed.go:746`). The bytes written to v1 may differ from the bytes +read out of v2's `/full/*`, but they are still self-consistent against the +new v1 `content_hash.txt`. + +## 2.
Run modes & expected report shape + +```text +$ migrate-v2-checkpoints [--repo PATH] [--since SHA | SHA] [--head SHA] \ + (--list | --dry-run | --apply) +``` + +Default mode is `plan` (same output as `--dry-run`). + +For `plan`, `--dry-run`, and `--apply` (but not `--list`), the tool resolves +the checkpoint fetch remote and refreshes the local v1 branch plus v2 refs +before discovery: + +- `refs/heads/entire/checkpoints/v1` via `ensureLatestV1Ref` +- `refs/entire/checkpoints/v2/main` plus every + `refs/entire/checkpoints/v2/full/*` ref via `ensureLatestV2Refs` + +These modes intentionally write local refs even when no migration data is +written. If the remote resolves, it must advertise both v1 and v2 /main; stale +local refs do not bypass a missing remote v1 or v2 /main. If no fetch target +can be resolved, the tool only proceeds when the required local refs already +exist. Otherwise it errors out before doing any analysis. + +`--list` produces one line per checkpoint: +```text +<id> <commit> [<commit> ...] +<id> (orphan) +``` +The first form is for commit-attributed IDs; the second is for orphans +(IDs present on v2 /main with no commit trailer in history). This is the +**universe** discovered — NOT the eligible set. + +`plan`, `--dry-run`, and `--apply` produce: +```text +Migration plan: (or "Migration result:" on --apply) + discovered checkpoints: D + already present v1 sessions: A + missing v2 checkpoint metadata: M1 + missing required v2 session metadata: M2 + missing raw transcripts: M3 + checkpoints eligible for migration: EC + v2 orphan checkpoints eligible for migration: V2O + sessions eligible for migration: ES + migrated checkpoints: ... (--apply only) + migrated sessions: ... (--apply only) + checkpoints to migrate: (or "migrated checkpoint details:" on --apply) + <id> sessions=N commits=<sha>[,<sha>...]
+ <id> sessions=N commits=(orphan) +``` + +If `--since` or `--head` is set and the v2 /main ref carries IDs the scoped +trailer walk wouldn't have found, the tool prints a single line **before** +the report: +```text +warning: N v2 orphans skipped; re-run without --since/--head to include them +``` + +Checks that should always hold on the report: + +- `EC ≤ D`. +- `V2O ≤ EC` (orphan-eligible is a subset of eligible). +- `ES ≥ EC` (each eligible checkpoint contributes ≥ 1 eligible session). +- `ES = Σ candidate.SessionCount`. The candidate list is exhaustive. +- The candidate list is sorted by `<id>` ascending; commit SHAs within + a candidate are sorted by commit date descending (most recent first), + ties broken by hash. Orphan candidates print `commits=(orphan)` instead + of a SHA list. +- A `<id> sessions=N commits=(orphan)` line corresponds to one of the `V2O` + checkpoints; its trailer never appears on any history tip included in the + discovery walk. +- On `--apply`: `migrated checkpoints = EC` and `migrated sessions = ES` if + no write errors. Anything less means a partial write failure — re-run + the tool and the remainder should re-appear as eligible. +- Do not try to balance `D` with + `eligible non-orphan + V2O + already-present + M1 + M3`. `D` is a + checkpoint discovery count and includes both trailer-discovered and + v2-orphan IDs; `A`, `M2`, and `M3` are session counters. +- `D = EC + (checkpoints with v2 summary but eligibleSessions==0) + M1`. + The middle term covers both "all v2 sessions already in v1" and "every + v2 session was unreadable (missing metadata or transcript)" — those land + in the per-session counters `A`, `M2`, `M3` rather than dropping the + checkpoint at the summary level. +- Counter sums for skipped sessions: + `A + M2 + M3 = (Σ over all v2 sessions in checkpoints whose v2 summary + exists) − ES`. Useful for spot-checking after `--apply`: if `A` is + large and `EC` is small, most v2 checkpoints are already mirrored.
If + `V2O` is close to `EC` and `A` is small, this repo skipped v2 entirely + and the migration is largely "import from v2 /main." + +## 3. Validation procedure + +The procedure below is the same regardless of repo. Substitute `$REPO` and +`$TOOL` per environment: + +```sh +REPO=/path/to/some-repo # e.g. ~/entire/marvin +TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints +cd "$REPO" + +sha256_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print $1}' + else + shasum -a 256 | awk '{print $1}' + fi +} + +sha256_file() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum "$1" | awk '{print $1}' + else + shasum -a 256 "$1" | awk '{print $1}' + fi +} +``` + +### 3.1 Pre-flight: confirm both stores exist + +```sh +git -C "$REPO" show-ref refs/heads/entire/checkpoints/v1 \ + || git -C "$REPO" show-ref refs/remotes/origin/entire/checkpoints/v1 +git -C "$REPO" show-ref refs/entire/checkpoints/v2/main +git -C "$REPO" show-ref refs/entire/checkpoints/v2/full/current +git -C "$REPO" for-each-ref 'refs/entire/checkpoints/v2/full/*' \ + --format='%(refname)' +``` + +If `entire/checkpoints/v1` is missing locally but present on the remote, the +migration tool will fetch it and create the local branch before planning or +applying. If both local and remote v1 are missing, the tool aborts; it will +not synthesize a fresh orphan v1 baseline for this rollback migration. + +`plan`, `--dry-run`, and `--apply` auto-fetch checkpoint refs from the repo's +checkpoint remote before discovery (`ensureLatestV1Ref` and +`ensureLatestV2Refs`), so the local state *after* those modes run will +reflect the remote. This is intentional even for `--dry-run`: the tool refuses +to analyze a stale checkpoint snapshot. If the remote lacks v1 or v2 /main, or +rejects a fetch of `refs/entire/checkpoints/v2/full/*`, the tool exits +non-zero before analysis. Ensure the repo can reach a checkpoint remote with +v1 and v2 refs before running.
To work against a strictly local copy, +temporarily remove or disable the checkpoint fetch remote and keep local v1 +and v2 refs present. + +Pre-flight is still useful to sanity-check that the local v2 /main looks like +it's frozen rather than actively advancing. `--list` does **not** auto-fetch; +if you intend to inspect the universe via `--list`, pre-fetch manually (see +§9). + +Also sanity-check the head of v2 isn't surprising — a recent commit +means v2 was being dual-written; a long-stale v2 head matches the +rollback narrative: + +```sh +git -C "$REPO" log -1 --format='%h %ci %s' refs/entire/checkpoints/v2/main +``` + +### 3.2 Step A — sanity check the dry-run report + +```sh +"$TOOL" --repo "$REPO" --dry-run | tee /tmp/migrate.plan +``` + +Spot-check the counter math against §2: + +```sh +grep -E "^ (discovered|already|missing|checkpoints eligible|v2 orphan|sessions eligible)" \ + /tmp/migrate.plan +``` + +- `EC ≤ D` and `V2O ≤ EC` and `ES ≥ EC`. +- For each candidate line, parse `sessions=N` and sum — must equal `ES`. +- The number of candidate lines with `commits=(orphan)` must equal `V2O`. + +```sh +awk '/^ [0-9a-f]{12} sessions=/ {sub(/sessions=/,"",$2); s+=$2} END {print s}' \ + /tmp/migrate.plan +# Should equal the "sessions eligible for migration" value. + +grep -cE '^ [0-9a-f]{12} sessions=[0-9]+ commits=\(orphan\)$' /tmp/migrate.plan +# Should equal the "v2 orphan checkpoints eligible for migration" value. 
+``` + +If the run was launched with `--since` or `--head`, also confirm the +orphan-skip warning matches expectations: + +```sh +grep "^warning: " /tmp/migrate.plan || echo "(no scope-orphan warning)" +``` + +### 3.3 Step B — confirm every candidate is genuinely v2-only-or-partial + +For every candidate ``: + +```sh +ID=02d9783342a2 # example +SHARD=${ID:0:2}/${ID:2} +if git -C "$REPO" rev-parse --verify --quiet \ + refs/heads/entire/checkpoints/v1 >/dev/null; then + V1_REF=refs/heads/entire/checkpoints/v1 +else + V1_REF=refs/remotes/origin/entire/checkpoints/v1 +fi + +# Does v2 /main carry this checkpoint? +git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, sessions: [.sessions[].metadata]}' + +# Does v1 already carry it? (Either the path doesn't exist, or the session +# IDs differ.) +git -C "$REPO" cat-file -p \ + "$V1_REF:$SHARD/metadata.json" 2>/dev/null \ + | jq '{checkpoint_id, sessions: [.sessions[].metadata]}' \ + || echo "(absent in v1)" +``` + +Use the **effective** v1 baseline the binary reads, not only the local branch. +The v1 store reads `refs/heads/entire/checkpoints/v1` first and falls back to +`refs/remotes/origin/entire/checkpoints/v1` if the local branch is missing. +This distinction matters before the tool has run in a fresh clone; after +`plan`/`--dry-run` or `--apply`, the v1 preflight should have created the +local branch from the remote baseline. + +The candidate must satisfy at least one of: + +1. `/metadata.json` doesn't exist on `entire/checkpoints/v1` → + **fully v2-only**, all v2 sessions are eligible. +2. It exists on v1, but the v2 summary lists session IDs not present in v1 → + **partial migration** to fill in missing sessions. + +The reverse check — every v2 /main checkpoint should appear in the report +unless it's `already present` / `missing metadata` / `missing transcript`: + +```sh +# Enumerate every checkpoint ID present on v2 /main (sharded layout). 
+git -C "$REPO" ls-tree -r refs/entire/checkpoints/v2/main \ + | awk '$4 ~ /metadata\.json$/ && $4 !~ /\// {next} \ + $4 ~ /^[0-9a-f]{2}\/[0-9a-f]{10}\/metadata\.json$/ { + split($4, p, "/"); print p[1] p[2] + }' \ + | sort -u > /tmp/v2_ids.txt +wc -l /tmp/v2_ids.txt + +# IDs already in effective v1 (any session present). +if git -C "$REPO" rev-parse --verify --quiet \ + refs/heads/entire/checkpoints/v1 >/dev/null; then + V1_REF=refs/heads/entire/checkpoints/v1 +else + V1_REF=refs/remotes/origin/entire/checkpoints/v1 +fi +git -C "$REPO" ls-tree -r "$V1_REF" 2>/dev/null \ + | awk '$4 ~ /^[0-9a-f]{2}\/[0-9a-f]{10}\/metadata\.json$/ { \ + split($4, p, "/"); print p[1] p[2] \ + }' \ + | sort -u > /tmp/v1_ids.txt +comm -23 /tmp/v2_ids.txt /tmp/v1_ids.txt > /tmp/v2_only_ids.txt +wc -l /tmp/v2_only_ids.txt +``` + +Every ID in `v2_only_ids.txt` should be either a candidate or accounted for +by missing v2 checkpoint metadata, missing required v2 session metadata, or +missing raw transcript skips. Orphan candidates also live in this set: they +are exactly v2 /main IDs with no commit attribution but with intact v2 +metadata + transcripts. + +A quick predicate: the eligible candidate count plus the missing summary, +required-metadata, and missing-raw counters should equal or exceed the +v2-only set. If it's less, something is being silently dropped. 
+ +```sh +EC=$(grep "checkpoints eligible for migration" /tmp/migrate.plan | awk '{print $NF}') +V2O=$(grep "v2 orphan checkpoints" /tmp/migrate.plan | awk '{print $NF}') +M1=$(grep "missing v2 checkpoint metadata" /tmp/migrate.plan | awk '{print $NF}') +M2=$(grep "missing required v2 session metadata" /tmp/migrate.plan | awk '{print $NF}') +M3=$(grep "missing raw transcripts" /tmp/migrate.plan | awk '{print $NF}') +echo "v2-only on disk: $(wc -l < /tmp/v2_only_ids.txt)" +echo "EC=$EC V2O=$V2O M1=$M1 M2=$M2 M3=$M3" +echo " EC + M1 + M2 + M3 must be >= v2-only count" +echo " V2O <= EC must hold (orphan is a subset of eligible)" +``` + +(`>=` rather than `=` because `M1`, `M2`, and `M3` are counted over the +entire discovered universe, not only the v2-only set; `M2` and `M3` are +also per-session counters. `V2O` is exactly the subset of `EC` whose +discovery came from v2 /main alone.) + +### 3.4 Step C — confirm commit-list accuracy + +The report's `commits=...` are short SHAs of commits in history whose +message carries `Entire-Checkpoint: `. Verify directly: + +```sh +ID=02d9783342a2 +git -C "$REPO" log --all --format='%h %s' --grep "Entire-Checkpoint: $ID" +``` + +The set of short SHAs that this prints should match the report's +`commits=…` for that ID. If they differ: + +- `commits=(orphan)` in the report means the ID is on v2 /main but no + reachable commit message carries its trailer. `git log --grep` should + produce **no** output for that ID. If it does produce output, something + is wrong — either the trailer walk dropped the commit or the orphan + pass mislabelled the candidate. +- Extra in the report but absent here: the discovery walk picked up a tip + this `--all` view doesn't include (rare). +- Extra here but absent in the report: a tip was filtered out + (`entire/checkpoints/v1`, `entire/trails/v1`, or `*/HEAD` symbolic refs + — see `isInternalHistoryRefName` / `isHistoryRef` in `history.go`). 
+ +A commit may also appear under multiple candidate IDs if it's a squash +merge with multiple trailers; that's expected. + +### 3.5 Step D — DRY-RUN INSPECTION of session count + +For each candidate, the report claims `sessions=N`. Confirm by counting v2 +sessions that are not already in v1 **and** are eligible by the same filters +the migration applies: required metadata present and raw transcript present on +`/full/current` or an archived `/full/*` ref. + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} +EXPECTED_SESSIONS=1 # report's sessions=N for this checkpoint + +# Sessions advertised by the v2 summary (from /main). +V2_SESSION_COUNT=$(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq -r '.sessions | length') + +# Session IDs already in v1 for this checkpoint. +if git -C "$REPO" cat-file -e \ + "entire/checkpoints/v1:$SHARD/metadata.json" 2>/dev/null; then + V1_SESSION_COUNT=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/metadata.json" \ + | jq -r '.sessions | length') + for i in $(seq 0 $((V1_SESSION_COUNT-1))); do + git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$i/metadata.json" \ + | jq -r '.session_id' + done | sort -u > /tmp/v1_sids.txt +else + : > /tmp/v1_sids.txt +fi + +FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') + +eligible=0 +for i in $(seq 0 $((V2_SESSION_COUNT-1))); do + META=$(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/$i/metadata.json" 2>/dev/null) \ + || continue + + SID=$(echo "$META" | jq -r '.session_id // ""') + CPID=$(echo "$META" | jq -r '.checkpoint_id // ""') + if [ -z "$SID" ] || [ -z "$CPID" ]; then + continue + fi + if grep -qxF "$SID" /tmp/v1_sids.txt; then + continue + fi + + has_raw=0 + for r in $FULL_REFS; do + if git -C "$REPO" cat-file -e \ + 
"$r:$SHARD/$i/raw_transcript" 2>/dev/null || + git -C "$REPO" ls-tree --name-only "$r:$SHARD/$i" 2>/dev/null \ + | grep -qE '^raw_transcript\.[0-9]{3}$'; then + has_raw=1 + break + fi + done + if [ "$has_raw" = 1 ]; then + eligible=$((eligible+1)) + fi +done + +echo "eligible sessions from v2: $eligible" +echo "report sessions=N: $EXPECTED_SESSIONS" +[ "$eligible" -eq "$EXPECTED_SESSIONS" ] && echo OK || echo MISMATCH +``` + +Repeat for a random sample (5–10) across the candidate list. If your +sample matches 1:1, the report's accounting is trustworthy. + +## 4. Apply the migration + +> ⛔ **Human operator only. Agents must not run `--apply`.** If an agent is +> helping with this runbook, it may prepare commands, inspect dry-run output, +> update documentation, and analyze validation results, but it must stop before +> executing any command that includes `--apply`. The repository owner/operator +> runs the apply command manually in their own terminal and then shares the +> resulting report/output for follow-up validation. + +> ⛔ **No `git push` for `entire/checkpoints/v1` from this point until §5 +> has fully passed and the operator has consciously decided to publish.** +> The migration itself never pushes — but the v1 branch is the same ref +> any other tooling on the repo might push as part of its normal flow. +> Before running `--apply`: +> +> - confirm no automatic push hook, scheduler, or CI job will push +> `refs/heads/entire/checkpoints/v1` in the background; +> - if `entire`'s own push path runs in this repo (e.g. on the next +> `entire`-driven commit), pause it until §5 is done; +> - if the repo has `checkpoint_remote` configured, treat that as another +> push target that must stay quiet. +> +> Pushing before §5 passes means a bad migration is now everyone else's +> problem. Pushing after §5 passes is a separate, manual procedure that +> lives outside this runbook. + +**This is the destructive (local) step.** Up to here everything was +read-only. 
Now we write new commits to the local +`entire/checkpoints/v1` branch. Nothing is pushed to any remote — that's +a separate, explicit decision once the post-apply checks in §5 pass. + +### Preconditions + +- §3 ran clean: the candidate list looks plausible, counter math adds up, + and a spot sample (Steps C and D) confirmed the candidates really are + v2-only / partial migrations. +- The repo has either a checkpoint fetch remote that advertises both + `refs/heads/entire/checkpoints/v1` and `refs/entire/checkpoints/v2/main`, + or no resolvable fetch remote but already-present local v1 and v2 /main + refs. `--apply` calls `ensureLatestV1Ref` and `ensureLatestV2Refs` first; + it refreshes the local v1 branch, v2 /main, and every + `refs/entire/checkpoints/v2/full/*` ref from the remote (forced fetch of + v2 `/full/current`, fast-forward fetch of archives). If the fetch target + resolves but lacks v1 or v2 /main, the tool errors out even if a stale + local ref exists; that prevents silently using stale rollback data. A + manual pre-fetch is no longer required, but remains a safe no-op: + + ```sh + git -C "$REPO" fetch origin \ + 'refs/heads/entire/checkpoints/v1:refs/heads/entire/checkpoints/v1' \ + --no-tags + git -C "$REPO" fetch origin \ + 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' \ + --no-tags + ``` + +- Working tree is clean OR you don't mind running with uncommitted changes + in `$REPO`. The tool only touches refs, not the working tree, but a clean + tree makes it easier to roll back if needed. + +### Recommended invocation + +```sh +REPO=/path/to/some-repo +REPO_NAME=$(basename "$REPO") +TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints +APPLIED_REPORT="/tmp/migrate-${REPO_NAME}.applied" + +# Snapshot the v1 branch tip so you can roll back deterministically. 
+PRE_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1 2>/dev/null || echo "none") +echo "pre-apply v1 tip: $PRE_APPLY_TIP" + +# USER ONLY: an agent must not execute this command. +# Apply. Tee the report into /tmp/migrate-${REPO_NAME}.applied — §5 reads it back. +"$TOOL" --repo "$REPO" --apply | tee "$APPLIED_REPORT" + +# Sanity-check the report. +grep -E "^ (checkpoints eligible|v2 orphan|sessions eligible|migrated)" "$APPLIED_REPORT" +# migrated checkpoints == checkpoints eligible +# migrated sessions == sessions eligible +# v2 orphan ... == subset of EC (informational; not a pass/fail gate) +# Anything less means at least one write failed silently — re-run --apply +# (idempotent) and inspect logs. + +# Confirm the v1 branch actually advanced. +POST_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1) +echo "post-apply v1 tip: $POST_APPLY_TIP" +git -C "$REPO" log --format='%h %ci %s' \ + "$PRE_APPLY_TIP".."$POST_APPLY_TIP" 2>/dev/null \ + | head -20 +``` + +### Behavior notes + +- **Idempotent.** Re-running `--apply` after a successful apply yields + `checkpoints eligible for migration: 0` (and re-runs are cheap). Safe + to retry on partial failure. +- **Local only — and stays local for the rest of this runbook.** No + remotes are touched by `--apply` itself. The new v1 commits live on + `refs/heads/entire/checkpoints/v1` locally. **Do not** `git push` this + branch, do not let `entire`'s push path publish it, do not let any + CI/hook/scheduler publish it, and do not let a configured + `checkpoint_remote` mirror it. Push is a separate manual procedure + that is explicitly out of scope here, and is only safe **after** every + step in §5 passes and the operator is satisfied. +- **Per-session atomicity, not transactional.** Each migrated session is + written as its own commit on v1. If `--apply` errors out partway + through a checkpoint with multiple eligible sessions, earlier sessions + remain written and later sessions reappear on the next run. 
+- **v1 commit author matches v2.** Each new v1 commit is authored with + the same name, email, and author timestamp as the v2 `/main` non-merge + commit that actually changed the migrated session's `metadata.json`. + Later commits that merely carry that path through their tree do not + count. §5.6 treats the `author` header in `entire explain` as a required + check; a mismatch is a regression, not an accepted divergence. +- **Migration-run commits are unsigned.** The tool disables checkpoint commit + signing for the whole migration run, including the v1/v2 preflight ref + refresh and the migrated writes, even if normal checkpoint signing is enabled + in the repo. The v1 author line is replayed from v2 history; adding a local + operator signature to that replayed author would be misleading. Any signed + commit created by this tool is a bug. +- **Roll back** by resetting v1 back to `$PRE_APPLY_TIP`: + + ```sh + # Only if you need to undo — this discards the new commits locally. + if [ "$PRE_APPLY_TIP" = "none" ]; then + git -C "$REPO" update-ref -d refs/heads/entire/checkpoints/v1 + else + git -C "$REPO" update-ref refs/heads/entire/checkpoints/v1 "$PRE_APPLY_TIP" + fi + ``` + + Safe before any push. Destructive after push. + +### Operator checkpoint + +**Stop here. If you are an agent, do not run `--apply`. The human operator +must run the apply command themselves and confirm:** + +1. `migrated checkpoints` equals `checkpoints eligible for migration` from + the dry-run. +2. `migrated sessions` equals `sessions eligible for migration` from the + dry-run. +3. `git rev-parse entire/checkpoints/v1` advanced. +4. `/tmp/migrate-${REPO_NAME}.applied` contains the full report for §5 to reference. +5. 
**You have NOT pushed `entire/checkpoints/v1`.** Confirm by checking + that no remote tracking ref has advanced: + + ```sh + git -C "$REPO" for-each-ref \ + --format='%(refname) %(objectname:short)' \ + 'refs/remotes/*/entire/checkpoints/v1' + ``` + + Each remote ref should still point at the pre-apply tip (or be + absent). If a remote ref has already moved to the new local tip, + pause and figure out who pushed — do not proceed to §5 until you've + understood the source of the push and decided whether to roll back. + +Then proceed to §5. Do not push between §4 and §5; do not push during +§5; do not push without the operator's explicit go-ahead after §5 +passes. + +## 5. Post-apply validation + +This section assumes `--apply` has been run and +`/tmp/migrate-${REPO_NAME}.applied` holds the report. The +`migrated sessions=...` count is the population you will validate below. +Extract the migrated checkpoint IDs once and reuse that list for every +bulk check: + +```sh +MIGRATED_IDS="/tmp/migrate-${REPO_NAME}.migrated-checkpoints" +awk '/^ [0-9a-f]{12} sessions=/ {print $1}' "$APPLIED_REPORT" \ + > "$MIGRATED_IDS" +wc -l "$MIGRATED_IDS" +``` + +### 5.1 Step E — root `metadata.json` (CheckpointSummary) on v1 + +For each migrated checkpoint, decode the v1 root metadata and confirm: + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} + +git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json" | jq . 
+``` + +Expected shape (schema lives at +`cmd/entire/cli/checkpoint/checkpoint.go:545-563`): + +```jsonc +{ + "cli_version": "…", // optional + "checkpoint_id": "02d9783342a2", + "strategy": "manual-commit", + "branch": "main", // optional + "checkpoints_count": 1, + "files_touched": ["…"], + "sessions": [ + { + "metadata": "/02/d9783342a2/0/metadata.json", + "transcript": "/02/d9783342a2/0/full.jsonl", // omitempty + "content_hash": "/02/d9783342a2/0/content_hash.txt", // omitempty + "prompt": "/02/d9783342a2/0/prompt.txt" + } + ], + "token_usage": { … }, // omitempty fields + "combined_attribution": { … }, + "has_review": true // omitempty +} +``` + +For checkpoints that were fully v2-only and whose v2 sessions all migrated, +the root summary should match the v2 summary for the stable fields below: + +```sh +diff <(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, strategy, branch, checkpoints_count, + files_touched, combined_attribution, has_review, + token_usage}') \ + <(git -C "$REPO" cat-file -p \ + entire/checkpoints/v1:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, strategy, branch, checkpoints_count, + files_touched, combined_attribution, has_review, + token_usage}') +``` + +Acceptable differences: + +- `sessions[]` entries differ — paths point to v1 file names + (`full.jsonl`, `content_hash.txt`), not v2's compact format. +- If v1 already had sessions, `sessions[]` length on v1 may exceed v2's; + the candidate's contributions are appended. +- If only some v2 sessions migrated (because others were already present, + lacked required metadata, or lacked raw transcripts), aggregate fields + such as `checkpoints_count`, `files_touched`, `token_usage`, and + `has_review` may differ. The v1 writer reaggregates those fields from + the sessions actually present in v1, not from every session in v2. +- `combined_attribution` may also differ when v1 already had sessions. 
For + purely v2-only checkpoints with all v2 sessions migrated, it should match + the v2 summary exactly because the migration uses + `summary.CombinedAttribution` from v2 verbatim (`migration.go:242`). + +Hard requirements: + +- `checkpoint_id` equals the directory shard. +- `sessions[].metadata`, `sessions[].transcript` (if non-empty), + `sessions[].content_hash` (if non-empty), `sessions[].prompt` all start + with `/<id[0:2]>/<id[2:]>/<session-index>/` (for example `/02/d9783342a2/0/`) and end with the correct filename constants. + +### 5.2 Step F — per-session `metadata.json` + +For each migrated session, locate it by `session_id` rather than by index: + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} +WANT_SID=… # session_id from the v2 side + +V1_SUM=$(git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json") +V1_LEN=$(echo "$V1_SUM" | jq '.sessions | length') +V1_SLOT= +for n in $(seq 0 $((V1_LEN-1))); do + SID=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$n/metadata.json" \ + | jq -r '.session_id') + if [ "$SID" = "$WANT_SID" ]; then + V1_SLOT=$n; break + fi +done +if [ -n "$V1_SLOT" ]; then + echo "session $WANT_SID lives in v1 slot $V1_SLOT" +else + echo "session $WANT_SID is absent from v1" +fi +``` + +If a v2 `session_id` is present in v2 /main but absent from v1 after apply, +check that it was not skipped because v2 no longer has a raw transcript. Those +sessions are counted in `missing raw transcripts` and are intentionally not +written to v1. 
+ +```sh +V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) +if [ -z "$V1_SLOT" ]; then + V2_FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') + + RAW_FOUND= + for r in $V2_FULL_REFS; do + if git -C "$REPO" cat-file -e \ + "$r:$SHARD/$V2_SLOT/raw_transcript" 2>/dev/null || + git -C "$REPO" ls-tree --name-only "$r:$SHARD/$V2_SLOT" 2>/dev/null \ + | grep -qE '^raw_transcript\.[0-9]{3}$'; then + RAW_FOUND=1 + break + fi + done + + if [ -z "$RAW_FOUND" ]; then + echo "session $WANT_SID absent in v1: M3 skip, expected" + else + echo "MISMATCH: session $WANT_SID has raw v2 transcript but is absent in v1" + fi +fi +``` + +When `V1_SLOT` is non-empty, diff the per-session metadata, comparing +**fields that are expected to survive migration** (`migration.go:216-248` +lists them explicitly): + +```sh +V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) + +diff <(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/$V2_SLOT/metadata.json" \ + | jq '{checkpoint_id, session_id, strategy, branch, + files_touched, checkpoints_count, agent, model, + turn_id, is_task, tool_use_id, + transcript_identifier_at_start, + checkpoint_transcript_start, + token_usage, session_metrics, + initial_attribution, prompt_attributions, + summary, kind, review_skills, review_prompt}') \ + <(git -C "$REPO" cat-file -p \ + entire/checkpoints/v1:"$SHARD/$V1_SLOT/metadata.json" \ + | jq '{checkpoint_id, session_id, strategy, branch, + files_touched, checkpoints_count, agent, model, + turn_id, is_task, tool_use_id, + transcript_identifier_at_start, + checkpoint_transcript_start, + token_usage, session_metrics, + initial_attribution, prompt_attributions, + summary, kind, review_skills, review_prompt}') +``` + +Expected: no diff. 
Special cases: + +- `created_at` is replayed from v2's `created_at` into the v1 metadata JSON. + The v1 metadata-branch commit timestamp is a separate git author timestamp + copied from the v2 `/main` commit that last touched this session's + `metadata.json`. +- The migration sets `HasReview = session.Kind(meta.Kind).IsReview()` + (`migration.go:247`). For non-review kinds this is `false` and may + have been absent (omitempty) in v2; that's still a match. +- `cli_version` on the v1 session may differ from v2's. The migration + doesn't pass `CLIVersion`, so v1 inherits whatever default the writer + applies — generally an empty value or the current binary's version. Not + a correctness issue. +- Root summary aggregation is covered in §5.1. Session-level comparison here + should ignore root-only fields such as `combined_attribution`; per-session + `token_usage` is copied from v2 and then folded into the root summary by + the v1 writer. + +Schema sanity per session: + +```sh +git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/$V1_SLOT/metadata.json" \ + | jq -e 'has("checkpoint_id") and has("session_id") and has("created_at")' \ + > /dev/null && echo OK +``` + +Author and signature status for the metadata-branch commit: + +```sh +V2_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ + refs/entire/checkpoints/v2/main -- "$SHARD/$V2_SLOT/metadata.json") +V1_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ + entire/checkpoints/v1 -- "$SHARD/$V1_SLOT/metadata.json") +V1_SIGNATURE_STATUS=$(git -C "$REPO" log -1 --format='%G?' \ + entire/checkpoints/v1 -- "$SHARD/$V1_SLOT/metadata.json") + +echo "v2: $V2_AUTHOR" +echo "v1: $V1_AUTHOR" +[ "$V1_AUTHOR" = "$V2_AUTHOR" ] && echo OK || echo MISMATCH + +echo "v1 signature status: $V1_SIGNATURE_STATUS" +[ "$V1_SIGNATURE_STATUS" = "N" ] && echo OK || echo MISMATCH +``` + +Expected: exact author match, and v1 signature status `N` (`%G? = N` means +no signature). 
For orphan candidates the author check is still valid: the v2 +`/main` path history is the source of the author line even though no user +commit trailer exists. + +### 5.3 Step G — `prompt.txt` content + +The migration joins v2 prompts (split form on disk) back into a single +`prompt.txt` via `SplitPromptContent` round-trip. If `prompt.txt` exists on +v2, the v1 bytes should match. If it is absent on v2, it should also be +absent on v1. + +```sh +if git -C "$REPO" cat-file -e \ + "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" 2>/dev/null; then + V2_PROMPT_HASH=$(git -C "$REPO" cat-file -p \ + "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" \ + | sha256_stdin) + V1_PROMPT_HASH=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" \ + | sha256_stdin) + echo "v2 prompt: $V2_PROMPT_HASH" + echo "v1 prompt: $V1_PROMPT_HASH" + [ "$V1_PROMPT_HASH" = "$V2_PROMPT_HASH" ] && echo OK || echo MISMATCH +else + git -C "$REPO" cat-file -e \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" 2>/dev/null \ + && echo "MISMATCH: v1 prompt exists but v2 prompt is absent" \ + || echo "OK: prompt absent in both stores" +fi +``` + +If the digests don't match, inspect with a `diff -u` between the two +`cat-file -p` outputs to see whether it's an ordering / separator issue. + +### 5.4 Step H — raw transcript & `content_hash.txt` + +This is the most important check. Two layers: + +1. **Self-consistency on v1**: the value in `content_hash.txt` must equal + `sha256:<hex>`, where `<hex>` is the SHA-256 digest of the reassembled `full.jsonl[.NNN]` content. +2. **Cross-store match (non-Codex agents)**: reassembled v1 bytes should + equal reassembled v2 `raw_transcript[.NNN]` bytes, and v1's + `content_hash.txt` should equal v2's `raw_transcript_hash.txt`. 
+ +Reassemble logic: ordered list `full.jsonl`, `full.jsonl.001`, +`full.jsonl.002`, … For most agents this is JSONL with `\n` separators +between chunks (`agent.ReassembleJSONL` in `agent/chunking.go:109-118`); +for `vogon`, OpenCode etc. the agent's own `ReassembleTranscript` is +used at read time. For validation, byte-concatenation in chunk order is +what the v1 writer hashed (`committed.go:784` — the hash is over +`transcriptBytes` BEFORE chunking), so the easier check is to read the +original v1 input bytes back via the v1 store API, OR to validate that +each chunk blob is what the v1 writer would have produced. + +The simplest robust shell check: reconstruct via ordered concat and +compute the digest, then compare to `content_hash.txt`. This is exact for +agents whose `ChunkTranscript` is a byte-preserving JSONL chunker +(Claude Code, Gemini CLI, Cursor, Copilot CLI, Codex except for the pre- +chunk sanitization step, and the generic case). It's slightly fuzzy for +agents whose chunking strips/reflows bytes — but in practice the round +trip is byte-exact for the supported set. + +```sh +ID=02d9783342a2; SHARD=${ID:0:2}/${ID:2}; V1_SLOT=0 + +# Enumerate transcript chunks in order. +git -C "$REPO" ls-tree --name-only \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT" \ + | grep -E '^full\.jsonl(\.[0-9]{3})?$' \ + | sort > /tmp/chunks.txt +cat /tmp/chunks.txt + +# Concatenate chunks (no extra separator — chunk files are written as +# they will be read by the agent's reassembler). For JSONL agents, +# the writer already trimmed the trailing newline per chunk; the +# reassembler joins with "\n". Reproduce that here. +tmp=$(mktemp) +first=1 +while IFS= read -r f; do + if [ $first -eq 0 ]; then printf '\n' >> "$tmp"; fi + git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/$f" >> "$tmp" + first=0 +done < /tmp/chunks.txt + +# Recompute and compare. 
+COMPUTED="sha256:$(sha256_file "$tmp")" +STORED=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/content_hash.txt") +echo "stored: $STORED" +echo "computed: $COMPUTED" +[ "$STORED" = "$COMPUTED" ] && echo OK || echo MISMATCH +``` + +If `STORED ≠ COMPUTED` for **JSONL-based agents** (Claude Code, Gemini +CLI, etc.), something is wrong with the migration — flag it. For agents +with custom chunkers the shell heuristic above can produce a false +mismatch; in those cases fall back to using the CLI's own reader by +running `entire checkpoint explain <checkpoint-id>` or, more directly, by writing +a small Go probe that calls `agent.ReassembleTranscript(chunks, agent)` +and re-hashes the result. + +Cross-store comparison (non-Codex): + +```sh +# Same /full ref resolution as the migration (current first, then archives). +FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') +RAW_HASH="" +for r in $FULL_REFS; do + if git -C "$REPO" cat-file -e \ + "$r:$SHARD/$V2_SLOT/raw_transcript_hash.txt" 2>/dev/null; then + RAW_HASH=$(git -C "$REPO" cat-file -p \ + "$r:$SHARD/$V2_SLOT/raw_transcript_hash.txt") + echo "raw transcript found on $r: $RAW_HASH" + break + fi +done +echo "v1 content_hash: $STORED" +echo "v2 raw_transcript: $RAW_HASH" +``` + +For non-Codex agents, the two hashes should match. For Codex (agent +field on the session metadata is `codex`), they are allowed to differ — +v1 sanitizes via `codex.SanitizePortableTranscript` before hashing +(`committed.go:746`). The v1 self-consistency check above is still +required in that case. + +### 5.5 Step I — bulk sweep + +Once the per-checkpoint procedure is established, sweep every migrated +checkpoint: + +```sh +wc -l "$MIGRATED_IDS" +``` + +Then for each ID in `$MIGRATED_IDS`, run: + +- §5.1 root metadata diff (`grep -q` for errors). 
+- §5.2 per-session field diff for every session ID that the checkpoint + brought in. +- §5.3 prompt presence/content check for every migrated session. +- §5.4 hash check on every transcript chunk set. +- §5.6 `entire explain` comparison against the dual-reads-removed binary. + +A single shell loop is fine, and the validation completes in seconds per +checkpoint. Surface any non-empty diffs or any `MISMATCH` lines. + +### 5.6 Step J — `entire explain` parity after removing v2 dual reads + +Run a current build from the branch that removes v2-first dual reads and +compare it to the Homebrew-installed `entire` for every migrated checkpoint. +On a real repo the two binaries will **not** produce identical output — +some divergence is structural and expected. The gate here is that every +diff falls into a known, bounded category and the required checks hold: the +file list, displayed session id, author header, and exit status agree on +every checkpoint. + +#### Expected divergences + +1. **Codex sessions: sanitized transcript on v1** (§1.3). + `codex.SanitizePortableTranscript` runs at write time, so the v1 + transcript bytes are not equal to v2's raw transcript bytes for any + session with `agent == "Codex"`. Self-consistency on each side still + holds — verified in §5.4. +2. **v2 compact transcript not migrated** (§7). The v2 store holds two + transcripts per session: the raw form on `/full/*` (migrated to v1) and + the compact form on `/main/transcript.jsonl` (not migrated). BREW + renders explain output from the compact form when available; FIX has to + parse the raw JSONL on v1 and pick fields ad hoc. Two visible + consequences: + - For Claude Code multi-argument tools (`Glob`, `Grep`, …), the tool + summary line picks different arguments. BREW tends to surface + `path`; FIX tends to surface `pattern`. Both arguments are present + in the raw JSONL. 
+ - For the **Intent** block, BREW shows a prompt derived from the + compact transcript; FIX picks a user message from the raw + transcript. The underlying `prompt.txt` blobs are byte-identical + between v1 and v2 — only the renderer's selection differs. + +#### Required checks that must still hold + +- Exit status of both binaries matches per checkpoint. +- `## Files` (the touched-files list) is byte-identical. +- The displayed `session <id>` header line is identical (both binaries + must select the same session id). +- The `author` header is identical (the migration preserves the v2 + author identity onto v1 — see §4 Behavior notes). + +Anything that violates one of those is **unaccounted for** — flag it. + +Build the comparison binary immediately before the sweep: + +```sh +FIX_WORKTREE=/Users/pfleidi/entire/cli/.worktrees/fix/checkpoints-v2-remove-dual-reads +FIX_ENTIRE="/tmp/entire-${REPO_NAME}-remove-dual-reads" +BREW_ENTIRE="$(brew --prefix)/bin/entire" + +git -C "$FIX_WORKTREE" status --short --branch +(cd "$FIX_WORKTREE" && go build -o "$FIX_ENTIRE" ./cmd/entire) + +"$FIX_ENTIRE" version +"$BREW_ENTIRE" version +``` + +Run both binaries from the migrated repo for every migrated checkpoint and +audit each diff against the required checks above: + +```sh +EXPLAIN_DIR="/tmp/migrate-${REPO_NAME}-explain" +mkdir -p "$EXPLAIN_DIR" +: > "$EXPLAIN_DIR/unaccounted.txt" +set +e + +while IFS= read -r ID; do + FIX_RESULT="$EXPLAIN_DIR/$ID.fix" + BREW_RESULT="$EXPLAIN_DIR/$ID.brew" + DIFF_FILE="$EXPLAIN_DIR/$ID.diff" + + (cd "$REPO" && "$FIX_ENTIRE" explain "$ID") \ + > "$FIX_RESULT.out" 2> "$FIX_RESULT.err" + FIX_STATUS=$? + (cd "$REPO" && "$BREW_ENTIRE" explain "$ID") \ + > "$BREW_RESULT.out" 2> "$BREW_RESULT.err" + BREW_STATUS=$? 
+ + { + echo "status=$FIX_STATUS" + cat "$FIX_RESULT.out" + printf '\n--- stderr ---\n' + cat "$FIX_RESULT.err" + } > "$FIX_RESULT" + { + echo "status=$BREW_STATUS" + cat "$BREW_RESULT.out" + printf '\n--- stderr ---\n' + cat "$BREW_RESULT.err" + } > "$BREW_RESULT" + + if diff -u "$BREW_RESULT" "$FIX_RESULT" > "$DIFF_FILE" 2>&1; then + rm -f "$DIFF_FILE" + continue + fi + + # --- Required checks --------------------------------------------------- + if [ "$FIX_STATUS" != "$BREW_STATUS" ]; then + echo "$ID reason=exit-status brew=$BREW_STATUS fix=$FIX_STATUS" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + BREW_FILES=$(awk '/^## Files/{f=1} /^── Transcript/{f=0} f' \ + "$BREW_RESULT" | sha256_stdin) + FIX_FILES=$(awk '/^## Files/{f=1} /^── Transcript/{f=0} f' \ + "$FIX_RESULT" | sha256_stdin) + if [ "$BREW_FILES" != "$FIX_FILES" ]; then + echo "$ID reason=files-list-diverges" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + BREW_SID=$(awk '/^ session /{print $2; exit}' "$BREW_RESULT") + FIX_SID=$(awk '/^ session /{print $2; exit}' "$FIX_RESULT") + if [ "$BREW_SID" != "$FIX_SID" ]; then + echo "$ID reason=session-id-mismatch brew=$BREW_SID fix=$FIX_SID" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + BREW_AUTHOR=$(grep -m1 '^ author ' "$BREW_RESULT") + FIX_AUTHOR=$(grep -m1 '^ author ' "$FIX_RESULT") + if [ "$BREW_AUTHOR" != "$FIX_AUTHOR" ]; then + echo "$ID reason=author-mismatch" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + # Remaining diffs fall in the expected buckets above. Leave $DIFF_FILE + # on disk for spot-checking. +done < "$MIGRATED_IDS" + +if [ -s "$EXPLAIN_DIR/unaccounted.txt" ]; then + echo "unaccounted-for explain divergences:" + cat "$EXPLAIN_DIR/unaccounted.txt" + exit 1 +fi + +echo "all explain divergences fall in accepted buckets" +``` + +#### Optional: divergence distribution + +Useful for spotting a sudden shift in divergence shape between releases. 
+Bucket each checkpoint as `identical`, `body-with-codex` (sanitization + +compact-rendering), or `body-without-codex` (compact-rendering only): + +```sh +identical=0; codex_body=0; non_codex_body=0 +while IFS= read -r ID; do + DIFF="$EXPLAIN_DIR/$ID.diff" + if [ ! -f "$DIFF" ]; then + identical=$((identical+1)); continue + fi + SHARD=${ID:0:2}/${ID:2} + has_codex=0 + V1_LEN=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/metadata.json" \ + | jq -r '.sessions | length') + for i in $(seq 0 $((V1_LEN-1))); do + AGENT=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$i/metadata.json" \ + | jq -r '.agent // ""') + if [ "$AGENT" = "Codex" ] || [ "$AGENT" = "codex" ]; then + has_codex=1; break + fi + done + if [ "$has_codex" = 1 ]; then + codex_body=$((codex_body+1)) + else + non_codex_body=$((non_codex_body+1)) + fi +done < "$MIGRATED_IDS" + +printf 'identical: %4d\n' "$identical" +printf 'body diff, has codex: %4d (expected: §1.3 sanitization + §7 compact-not-migrated)\n' \ + "$codex_body" +printf 'body diff, no codex: %4d (expected: §7 compact-not-migrated)\n' \ + "$non_codex_body" +``` + +`$EXPLAIN_DIR/*.diff`, `*.fix`, and `*.brew` are kept on disk for +inspection. If a body diff in the no-codex bucket touches anything other +than transcript-tool-argument rendering or Intent text, file a bug — that +would be a real read-path regression. + +### 5.7 After validation passes + +You're done with this runbook only after every step in §5 produced the +expected result on every migrated checkpoint. Publishing the migration is **out of +scope for this runbook** and explicitly a manual decision. + +When the operator is satisfied and ready to publish: + +1. Re-read §4's push warning. Nothing about it has changed. +2. Decide deliberately, out-of-band, that you want the new v1 commits on + the remote. Coordinate with anyone else who has the repo cloned — + they will pick up the new commits on their next fetch. +3. Use your repo's normal push path. 
The runbook does not prescribe one + because publishing semantics vary per repo (some use `entire`'s push + integration, some use `checkpoint_remote`, some do a plain + `git push`). Pick the right one explicitly. + +Until that conscious decision is made, `entire/checkpoints/v1` stays +local. If §5 surfaces a problem, roll back with the `update-ref` snippet +from §4's "Behavior notes" — cheap and local, because you did not push. + +## 6. Failure modes and what they mean + +| Symptom in dry-run | Meaning | Action | +|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------| +| `missing v2 checkpoint metadata: N (large)` | v2 `/main` is missing or its tree lacks summaries for many discovered IDs. | Confirm `refs/entire/checkpoints/v2/main` exists, was fetched, and is reasonably recent. | +| `missing required v2 session metadata: > 0` | v2 session `metadata.json` lacks `checkpoint_id` or `session_id`. Could indicate corruption or a partial v2 write. | Inspect the affected sessions manually; they will be skipped, not failed. | +| `missing raw transcripts: > 0` | v2 `/main` has a session but `/full/current` and archived `/full/*` don't carry its `raw_transcript*` data. | Confirm archived `/full/*` refs are present locally (or accessible via remote fetch). | +| Candidate `commits=(orphan)` | The ID is on v2 /main with no commit-trailer attribution in history. Expected and benign; counted by `V2O`. | None — verify against `git log --grep` in §3.4 to confirm there's no missed trailer. | +| `warning: N v2 orphans skipped` on a `--since`/`--head` run | Commit-scoped run found N v2 /main IDs that an unscoped walk would have surfaced as orphan candidates. | Re-run without `--since`/`--head` to include them, or accept the scope deliberately. 
| +| `v2 orphan checkpoints eligible for migration > checkpoints eligible for migration` | Should be impossible (`V2O ⊆ EC` by construction). | File a bug. | +| `sessions=N` for a candidate doesn't match the §3.5 expected | Either v1 already has the session (so report should have lower N), or session IDs are non-unique within v2. | Inspect; non-unique session IDs are a v2 corruption. | +| Post-apply, `content_hash.txt` ≠ recomputed SHA-256 | Codex agent + ours-vs-original sanitization difference, OR a bug. Confirm `agent` field on the session. | If non-Codex, file a bug with chunk listing + bytes. | +| Post-apply, `content_hash.txt` matches but v2's `raw_transcript_hash.txt` doesn't | Codex sanitization (expected) OR transcript was rewritten in transit. Confirm agent first. | If non-Codex, file a bug. | +| Post-apply, migrated v1 commit has signature status other than `N` | The migration signed a replayed-author commit. This should not happen. | File a bug and do not publish the migrated v1 branch until re-run with an unsigned tool. | +| Re-running `--dry-run` after `--apply` still lists the same candidates | Apply failed silently or didn't get pushed before re-fetch. Look at the `migrated sessions` count. | Re-run with verbose logging; check that v1 branch actually advanced. | + +The report does not enumerate the exact checkpoint/session IDs behind `M1`, +`M2`, or `M3`. Manual inspection requires re-walking v2 /main and v2 /full +refs as shown in §3.3 and §3.5. + +## 7. 
Quick reference: file & ref constants + +| Concept | Constant | Value | Source | +|--------------------------|------------------------------------------------|----------------------------------------------------|---------------------------------------| +| v1 branch | `paths.MetadataBranchName` | `entire/checkpoints/v1` (under `refs/heads/`) | `paths/paths.go:43` | +| v2 main ref | `paths.V2MainRefName` | `refs/entire/checkpoints/v2/main` | `paths/paths.go:49` | +| v2 full current ref | `paths.V2FullCurrentRefName` | `refs/entire/checkpoints/v2/full/current` | `paths/paths.go:52` | +| v2 archived full ref | (pattern) | `refs/entire/checkpoints/v2/full/<13-digit-suffix>`| `v2_read.go:523-533` | +| Root summary | `paths.MetadataFileName` | `metadata.json` | `paths/paths.go:36` | +| Session metadata | `paths.MetadataFileName` | `metadata.json` | `paths/paths.go:36` | +| Session prompt | `paths.PromptFileName` | `prompt.txt` | `paths/paths.go:29` | +| v1 transcript | `paths.TranscriptFileName` | `full.jsonl` (+ `.001`, `.002`, …) | `paths/paths.go:30` | +| v1 transcript hash | `paths.ContentHashFileName` | `content_hash.txt` (format `sha256:<hex>`) | `paths/paths.go:38`, `committed.go:784` | +| v2 compact transcript | `paths.CompactTranscriptFileName` | `transcript.jsonl` (on `/main`, not migrated) | `paths/paths.go:32` | +| v2 compact hash | `paths.CompactTranscriptHashFileName` | `transcript_hash.txt` (on `/main`, not migrated) | `paths/paths.go:33` | +| v2 raw transcript | `paths.V2RawTranscriptFileName` | `raw_transcript` (+ `.001`, …) on `/full/*` | `paths/paths.go:34` | +| v2 raw hash | `paths.V2RawTranscriptHashFileName` | `raw_transcript_hash.txt` on `/full/*` | `paths/paths.go:35` | +| Sharded path | `id.Path()` | `<id[0:2]>/<id[2:]>` (`id` is 12-char lowercase hex) | `checkpoint/id/id.go` | +| Trailer key | `trailers.CheckpointTrailerKey` | `Entire-Checkpoint` | `trailers/trailers.go:41` | +| Chunk filename suffix | `agent.ChunkSuffix` | `.%03d` | `agent/chunking.go:19` | + +## 8. 
Source map + +- Tool entry: `cmd/migrate-v2-checkpoints/main.go` +- History walk: `cmd/migrate-v2-checkpoints/history.go` — + `discoverCheckpointHistoryWithSkippedOrphans` (line 55), + `addV2OrphanCheckpoints` (line 364), `listV2MainCheckpointIDs` + (line 408), `writeCheckpointList` (line 484, includes the `(orphan)` + label), `writeDiscoveryWarnings` (line 497, prints the scope-skip + warning). +- v2 ref auto-fetch: `cmd/migrate-v2-checkpoints/v2_preflight.go` — + `ensureLatestV2Refs` (line 24), `fetchV2MainRef` (line 75), + `fetchV2FullRefs` (line 101). +- v1 ref auto-fetch: `cmd/migrate-v2-checkpoints/v1_preflight.go` — + `ensureLatestV1Ref` (line 23), `remoteRefExists` (line 55), + `fetchV1Ref` (line 70). +- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` — + `migrateDiscoveredCheckpoints` (line 53), `migrateCheckpoint` + (line 96, disables commit signing before v1 writes), + `writeOptionsFromV2Content` (line 217), + `writeMigrationReport` (line 252), `candidateCommitLabel` (line 291, + emits `(orphan)`). +- v2 session author lookup: `cmd/migrate-v2-checkpoints/v2_author.go` — + `buildV2SessionAuthorIndex` (line 30), + `changedV2SessionMetadataPaths` (line 79), + `v2SessionMetadataPath` (line 152). +- v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` + (line 72), `WithCommitSigningDisabled` (line 57), + `writeStandardCheckpointEntries` (line 324), + `writeSessionToSubdirectory` (line 418), `writeTranscript` (line 741), + `findSessionIndex` (line 631), `SignCommitBestEffort` (line 2001). +- v2 read: `cmd/entire/cli/checkpoint/v2_read.go` — `ReadCommitted` + (line 26), `ReadSessionMetadataAndPrompts` (line 205), + `ReadSessionContent` (line 274), `readTranscriptFromFullRefs` + (line 342), `readTranscriptFromRef` (line 540), + `isV2ArchivedFullRefSuffix` (line 523). +- Schemas: `cmd/entire/cli/checkpoint/checkpoint.go` — `CheckpointSummary` + (line 545), `CommittedMetadata` (line 444), `SessionFilePaths` + (line 520). 
+- Trailer parsing: `cmd/entire/cli/trailers/trailers.go`. +- Chunking: `cmd/entire/cli/agent/chunking.go`. +- Sanitization (Codex only): `cmd/entire/cli/agent/codex/` + (`SanitizePortableTranscript`). +- ID + sharded path: `cmd/entire/cli/checkpoint/id/id.go`. + +## 9. Notes for re-use on other repos + +- `--repo PATH` works from anywhere; you do not need to `cd`. Bear in mind + the tool walks `refs/remotes/*/*` too, so if the local repo has stale + remote refs the candidate list may include IDs whose underlying commits + are only reachable via those remotes. That's still correct — those + commits really did reference the IDs. +- `plan`, `--dry-run`, and `--apply` auto-fetch checkpoint refs from the repo's + checkpoint remote (`ensureLatestV1Ref`, `ensureLatestV2Refs`). If the + remote resolves, you get an up-to-date local copy of + `refs/heads/entire/checkpoints/v1`, `refs/entire/checkpoints/v2/main`, and + every `refs/entire/checkpoints/v2/full/*`; the tool errors if that remote + does not advertise v1 or v2 /main. If the remote cannot be resolved at all, + the tool only proceeds when local v1 and v2 /main refs are already present. + `--list` does **not** auto-fetch — if you want a candidate universe that + reflects the remote, refresh manually first: + ```sh + git -C "$REPO" fetch origin \ + 'refs/heads/entire/checkpoints/v1:refs/heads/entire/checkpoints/v1' + git -C "$REPO" fetch origin \ + 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' + ``` +- Orphan augmentation is enabled by default. Pass `--since` or `--head` + if you intentionally want to exclude v2-only IDs from migration; the + tool will still print a single-line warning summarising how many were + skipped. +- The tool is **idempotent** in `--apply` mode. Re-running after a + successful apply should produce `checkpoints eligible for migration: 0` + modulo any new v2 data that landed in the meantime. +- The tool only writes to the local repo. 
After `--apply`, push the + updated v1 branch yourself when ready (and only after §5 passes). diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go new file mode 100644 index 0000000000..7b2c432a70 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/history.go @@ -0,0 +1,510 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "sort" + "strings" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/trailers" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/plumbing/storer" +) + +type discoveryOptions struct { + since string + head string +} + +type discoveredCheckpoint struct { + ID checkpointID.CheckpointID + Commits []discoveredCommit +} + +type discoveredCommit struct { + Hash plumbing.Hash + ShortSHA string + Date time.Time +} + +type historyTip struct { + name string + hash plumbing.Hash +} + +type discoveryScope struct { + excluded map[plumbing.Hash]bool + sinceHash plumbing.Hash + hasSince bool +} + +func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, error) { + checkpoints, _, err := discoverCheckpointHistoryWithSkippedOrphans(ctx, repo, opts) + return checkpoints, err +} + +func discoverCheckpointHistoryWithSkippedOrphans(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, int, error) { + checkpoints, checkpointIndexes, err := discoverTrailerCheckpointHistory(ctx, repo, opts) + if err != nil { + return nil, 0, err + } + + v2OrphansSkipped, err := addV2OrphanCheckpoints(ctx, repo, opts, checkpointIndexes, &checkpoints) + if err != nil { + return nil, 0, err + } + + sortDiscoveredCheckpoints(checkpoints) + return checkpoints, 
v2OrphansSkipped, nil +} + +func discoverTrailerCheckpointHistory(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, map[string]int, error) { + scope, err := newDiscoveryScope(ctx, repo, opts.since) + if err != nil { + return nil, nil, err + } + + tips, err := historyTips(ctx, repo, opts.head, scope) + if err != nil { + return nil, nil, err + } + + seenCommits := make(map[plumbing.Hash]bool) + checkpointIndexes := make(map[string]int) + checkpoints := make([]discoveredCheckpoint, 0) + + for _, tip := range tips { + if err := scanTip(ctx, repo, tip, scope.excluded, seenCommits, checkpointIndexes, &checkpoints); err != nil { + return nil, nil, err + } + } + + return checkpoints, checkpointIndexes, nil +} + +func newDiscoveryScope(ctx context.Context, repo *git.Repository, since string) (discoveryScope, error) { + if since == "" { + return discoveryScope{excluded: make(map[plumbing.Hash]bool)}, nil + } + + sinceHash, err := resolveRevision(repo, since) + if err != nil { + return discoveryScope{}, fmt.Errorf("resolve --since %q: %w", since, err) + } + excluded, err := reachableCommits(ctx, repo, sinceHash) + if err != nil { + return discoveryScope{}, err + } + return discoveryScope{ + excluded: excluded, + sinceHash: sinceHash, + hasSince: true, + }, nil +} + +func historyTips(ctx context.Context, repo *git.Repository, head string, scope discoveryScope) ([]historyTip, error) { + if head != "" { + hash, err := resolveRevision(repo, head) + if err != nil { + return nil, fmt.Errorf("resolve --head %q: %w", head, err) + } + if err := requireTipContainsSince(ctx, repo, hash, head, scope); err != nil { + return nil, err + } + return []historyTip{{name: head, hash: hash}}, nil + } + + iter, err := repo.References() + if err != nil { + return nil, fmt.Errorf("list refs: %w", err) + } + defer iter.Close() + + var tips []historyTip + seenHashes := make(map[plumbing.Hash]bool) + err = iter.ForEach(func(ref *plumbing.Reference) error { + if 
!isHistoryRef(ref) { + return nil + } + + hash := ref.Hash() + if seenHashes[hash] { + return nil + } + include, includeErr := tipContainsSince(ctx, repo, hash, scope) + if includeErr != nil { + return fmt.Errorf("check whether %s contains --since: %w", ref.Name(), includeErr) + } + if !include { + return nil + } + seenHashes[hash] = true + tips = append(tips, historyTip{name: ref.Name().String(), hash: hash}) + return nil + }) + if err != nil { + return nil, fmt.Errorf("iterate refs: %w", err) + } + + if len(tips) == 0 { + headRef, headErr := repo.Head() + if headErr != nil { + return nil, fmt.Errorf("find HEAD: %w", headErr) + } + include, includeErr := tipContainsSince(ctx, repo, headRef.Hash(), scope) + if includeErr != nil { + return nil, fmt.Errorf("check whether HEAD contains --since: %w", includeErr) + } + if include { + tips = append(tips, historyTip{name: headRef.Name().String(), hash: headRef.Hash()}) + } + } + + sort.Slice(tips, func(i, j int) bool { + return tips[i].name < tips[j].name + }) + return tips, nil +} + +func requireTipContainsSince(ctx context.Context, repo *git.Repository, tipHash plumbing.Hash, tipName string, scope discoveryScope) error { + contains, err := tipContainsSince(ctx, repo, tipHash, scope) + if err != nil { + return fmt.Errorf("check whether --head %q contains --since: %w", tipName, err) + } + if !contains { + return fmt.Errorf("%s is not an ancestor of --head %q", scope.sinceHash, tipName) + } + return nil +} + +func tipContainsSince(ctx context.Context, repo *git.Repository, tipHash plumbing.Hash, scope discoveryScope) (bool, error) { + if !scope.hasSince { + return true, nil + } + return commitReachableFrom(ctx, repo, tipHash, scope.sinceHash) +} + +func isHistoryRef(ref *plumbing.Reference) bool { + if ref.Type() != plumbing.HashReference { + return false + } + name := ref.Name() + if !name.IsBranch() && !name.IsRemote() { + return false + } + if isInternalHistoryRefName(name) { + return false + } + return 
!strings.HasSuffix(name.String(), "/HEAD") +} + +func isInternalHistoryRefName(name plumbing.ReferenceName) bool { + if name == plumbing.NewBranchReferenceName(paths.MetadataBranchName) || + name == plumbing.NewBranchReferenceName(paths.TrailsBranchName) { + return true + } + + remotePrefix := "refs/remotes/" + nameString := name.String() + if !strings.HasPrefix(nameString, remotePrefix) { + return false + } + remoteAndBranch := strings.TrimPrefix(nameString, remotePrefix) + _, branchName, ok := strings.Cut(remoteAndBranch, "/") + if !ok { + return false + } + return branchName == paths.MetadataBranchName || branchName == paths.TrailsBranchName +} + +func resolveRevision(repo *git.Repository, revision string) (plumbing.Hash, error) { + if isShortHexRevision(revision) { + if err := rejectAmbiguousCommitPrefix(repo, revision); err != nil { + return plumbing.ZeroHash, err + } + } + + hash, err := repo.ResolveRevision(plumbing.Revision(revision)) + if err != nil { + return plumbing.ZeroHash, err //nolint:wrapcheck // callers add flag-specific context + } + if hash == nil { + return plumbing.ZeroHash, fmt.Errorf("revision %q resolved to no commit", revision) + } + return *hash, nil +} + +func isShortHexRevision(revision string) bool { + if revision == "" || len(revision) >= len(plumbing.ZeroHash.String()) { + return false + } + for _, r := range revision { + switch { + case r >= '0' && r <= '9': + case r >= 'a' && r <= 'f': + case r >= 'A' && r <= 'F': + default: + return false + } + } + return true +} + +func rejectAmbiguousCommitPrefix(repo *git.Repository, revision string) error { + prefix := strings.ToLower(revision) + iter, err := repo.CommitObjects() + if err != nil { + return fmt.Errorf("list commit objects for revision %q: %w", revision, err) + } + defer iter.Close() + + var matches []plumbing.Hash + err = iter.ForEach(func(commit *object.Commit) error { + if strings.HasPrefix(commit.Hash.String(), prefix) { + matches = append(matches, commit.Hash) + if 
len(matches) == 2 { + return storer.ErrStop + } + } + return nil + }) + if err != nil && !errors.Is(err, storer.ErrStop) { + return fmt.Errorf("scan commit objects for revision %q: %w", revision, err) + } + if len(matches) < 2 { + return nil + } + sort.Slice(matches, func(i, j int) bool { + return matches[i].String() < matches[j].String() + }) + return fmt.Errorf("ambiguous revision %q matches commit prefixes %s and %s", revision, matches[0], matches[1]) +} + +func reachableCommits(ctx context.Context, repo *git.Repository, from plumbing.Hash) (map[plumbing.Hash]bool, error) { + iter, err := repo.Log(&git.LogOptions{From: from, Order: git.LogOrderCommitterTime}) + if err != nil { + return nil, fmt.Errorf("get log from %s: %w", from, err) + } + defer iter.Close() + + commits := make(map[plumbing.Hash]bool) + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while excluding commits: %w", err) + } + commits[commit.Hash] = true + return nil + }) + if err != nil { + return nil, fmt.Errorf("iterate commits reachable from %s: %w", from, err) + } + return commits, nil +} + +func commitReachableFrom(ctx context.Context, repo *git.Repository, from, target plumbing.Hash) (bool, error) { + iter, err := repo.Log(&git.LogOptions{From: from, Order: git.LogOrderCommitterTime}) + if err != nil { + return false, fmt.Errorf("get log from %s: %w", from, err) + } + defer iter.Close() + + found := false + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while checking ancestry: %w", err) + } + if commit.Hash == target { + found = true + return storer.ErrStop + } + return nil + }) + if errors.Is(err, storer.ErrStop) { + return true, nil + } + if err != nil { + return false, fmt.Errorf("iterate commits from %s: %w", from, err) + } + return found, nil +} + +func scanTip(ctx context.Context, repo *git.Repository, tip historyTip, 
excluded, seenCommits map[plumbing.Hash]bool, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) error { + iter, err := repo.Log(&git.LogOptions{From: tip.hash, Order: git.LogOrderCommitterTime}) + if err != nil { + return fmt.Errorf("get log from %s: %w", tip.name, err) + } + defer iter.Close() + + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while scanning commits: %w", err) + } + if excluded[commit.Hash] || seenCommits[commit.Hash] { + return nil + } + seenCommits[commit.Hash] = true + addCheckpointCommit(commit, checkpointIndexes, checkpoints) + return nil + }) + if err != nil { + return fmt.Errorf("iterate commits from %s: %w", tip.name, err) + } + return nil +} + +func addV2OrphanCheckpoints(ctx context.Context, repo *git.Repository, opts discoveryOptions, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) (int, error) { + v2CheckpointIDs, err := listV2MainCheckpointIDs(ctx, repo) + if err != nil { + return 0, err + } + if len(v2CheckpointIDs) == 0 { + return 0, nil + } + + if hasCommitScope(opts) { + _, unscopedIndexes, err := discoverTrailerCheckpointHistory(ctx, repo, discoveryOptions{}) + if err != nil { + return 0, err + } + + return countMissingCheckpointIDs(v2CheckpointIDs, unscopedIndexes), nil + } + + for _, cpID := range v2CheckpointIDs { + key := cpID.String() + if _, exists := checkpointIndexes[key]; exists { + continue + } + checkpointIndexes[key] = len(*checkpoints) + *checkpoints = append(*checkpoints, discoveredCheckpoint{ID: cpID}) + } + + return 0, nil +} + +func hasCommitScope(opts discoveryOptions) bool { + return opts.since != "" || opts.head != "" +} + +func countMissingCheckpointIDs(ids []checkpointID.CheckpointID, indexes map[string]int) int { + missing := 0 + for _, cpID := range ids { + if _, exists := indexes[cpID.String()]; !exists { + missing++ + } + } + return missing +} + +func listV2MainCheckpointIDs(ctx 
context.Context, repo *git.Repository) ([]checkpointID.CheckpointID, error) { + v2Store := checkpoint.NewV2GitStore(repo) + _, rootTreeHash, err := v2Store.GetRefState(plumbing.ReferenceName(paths.V2MainRefName)) + if errors.Is(err, plumbing.ErrReferenceNotFound) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("read %s ref state: %w", paths.V2MainRefName, err) + } + + rootTree, err := repo.TreeObject(rootTreeHash) + if err != nil { + return nil, fmt.Errorf("read %s root tree: %w", paths.V2MainRefName, err) + } + + var ids []checkpointID.CheckpointID + err = checkpoint.WalkCheckpointShards(ctx, repo, rootTree, func(cpID checkpointID.CheckpointID, cpTreeHash plumbing.Hash) error { + cpTree, cpTreeErr := repo.TreeObject(cpTreeHash) + if cpTreeErr != nil { + return fmt.Errorf("read v2 checkpoint %s tree: %w", cpID, cpTreeErr) + } + if _, fileErr := cpTree.File(paths.MetadataFileName); fileErr == nil { + ids = append(ids, cpID) + } + return nil + }) + if err != nil { + return nil, fmt.Errorf("walk %s checkpoints: %w", paths.V2MainRefName, err) + } + + sort.Slice(ids, func(i, j int) bool { + return ids[i].String() < ids[j].String() + }) + return ids, nil +} + +func addCheckpointCommit(commit *object.Commit, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) { + ids := trailers.ParseAllCheckpoints(commit.Message) + if len(ids) == 0 { + return + } + + discovered := discoveredCommit{ + Hash: commit.Hash, + ShortSHA: shortHash(commit.Hash), + Date: commit.Committer.When, + } + + for _, id := range ids { + key := id.String() + index, ok := checkpointIndexes[key] + if !ok { + index = len(*checkpoints) + checkpointIndexes[key] = index + *checkpoints = append(*checkpoints, discoveredCheckpoint{ID: id}) + } + (*checkpoints)[index].Commits = append((*checkpoints)[index].Commits, discovered) + } +} + +func sortDiscoveredCheckpoints(checkpoints []discoveredCheckpoint) { + sort.Slice(checkpoints, func(i, j int) bool { + return 
checkpoints[i].ID.String() < checkpoints[j].ID.String() + }) + for i := range checkpoints { + sort.Slice(checkpoints[i].Commits, func(j, k int) bool { + left := checkpoints[i].Commits[j] + right := checkpoints[i].Commits[k] + if !left.Date.Equal(right.Date) { + return left.Date.After(right.Date) + } + return left.Hash.String() < right.Hash.String() + }) + } +} + +func writeCheckpointList(w io.Writer, checkpoints []discoveredCheckpoint) { + for _, checkpoint := range checkpoints { + fmt.Fprint(w, checkpoint.ID) + if len(checkpoint.Commits) == 0 { + fmt.Fprint(w, " (orphan)") + } + for _, commit := range checkpoint.Commits { + fmt.Fprintf(w, " %s", commit.ShortSHA) + } + fmt.Fprintln(w) + } +} + +func writeDiscoveryWarnings(w io.Writer, v2OrphansSkipped int) { + if v2OrphansSkipped == 0 { + return + } + fmt.Fprintf(w, "warning: %d v2 orphans skipped; re-run without --since/--head to include them\n", v2OrphansSkipped) +} + +func shortHash(hash plumbing.Hash) string { + full := hash.String() + if len(full) <= checkpointID.ShortIDLength { + return full + } + return full[:checkpointID.ShortIDLength] +} diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go new file mode 100644 index 0000000000..ac246a23ce --- /dev/null +++ b/cmd/migrate-v2-checkpoints/main.go @@ -0,0 +1,202 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "os" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + "github.com/entireio/cli/cmd/entire/cli/gitrepo" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/settings" + + "github.com/go-git/go-git/v6" + "github.com/spf13/pflag" +) + +type runMode string + +const ( + modePlan runMode = "plan" + modeList runMode = "list" + modeDryRun runMode = "dry-run" + modeApply runMode = "apply" +) + +type options struct { + repoPath string + since string + head string + mode runMode + help bool +} + +func main() { + if err := run(context.Background(), os.Args[1:], 
os.Stdout); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run(ctx context.Context, args []string, stdout io.Writer) error { + opts, err := parseOptions(args) + if err != nil { + return err + } + if opts.help { + printUsage(stdout) + return nil + } + + repoRoot, repo, err := openRepository(ctx, opts.repoPath) + if err != nil { + return err + } + ctx = settings.WithWorktreeRoot(ctx, repoRoot) + ctx = checkpoint.WithCommitSigningDisabled(ctx) + + if shouldEnsureCheckpointRefs(opts) { + if err := ensureLatestV1Ref(ctx, repoRoot, repo); err != nil { + return err + } + if err := ensureLatestV2Refs(ctx, repoRoot, repo); err != nil { + return err + } + } + + checkpoints, v2OrphansSkipped, err := discoverCheckpointHistoryWithSkippedOrphans(ctx, repo, discoveryOptions{ + since: opts.since, + head: opts.head, + }) + if err != nil { + return err + } + writeDiscoveryWarnings(stdout, v2OrphansSkipped) + + switch opts.mode { + case modeList: + writeCheckpointList(stdout, checkpoints) + return nil + case modePlan, modeDryRun: + report, err := migrateDiscoveredCheckpoints(ctx, repo, checkpoints, migrationOptions{apply: false}) + if err != nil { + return err + } + writeMigrationReport(stdout, report, false) + return nil + case modeApply: + report, err := migrateDiscoveredCheckpoints(ctx, repo, checkpoints, migrationOptions{apply: true}) + if err != nil { + return err + } + writeMigrationReport(stdout, report, true) + return nil + default: + return fmt.Errorf("unknown mode %q", opts.mode) + } +} + +func shouldEnsureCheckpointRefs(opts options) bool { + return opts.mode == modePlan || opts.mode == modeDryRun || opts.mode == modeApply +} + +func parseOptions(args []string) (options, error) { + var opts options + opts.mode = modePlan + + flags := pflag.NewFlagSet("migrate-v2-checkpoints", pflag.ContinueOnError) + flags.SetOutput(io.Discard) + + var listMode bool + var dryRun bool + var apply bool + flags.BoolVarP(&opts.help, "help", "h", false, 
// printUsage writes the command's help text to w.
func printUsage(w io.Writer) {
	const usage = `migrate-v2-checkpoints migrates legacy v2 checkpoint data back to v1.

Usage:
  migrate-v2-checkpoints [OPTIONS] [SINCE_COMMIT]

Options:
  -h, --help      Show this help message
      --list      Print checkpoint IDs and associated commit IDs only
      --dry-run   Print the migration plan without writing refs
      --apply     Write migration commits
      --repo      Local repository path to inspect
      --since     Commit before the checkpoints to inspect
      --head      Limit scan to one history tip
`
	fmt.Fprint(w, usage)
}
+ +Usage: + migrate-v2-checkpoints [OPTIONS] [SINCE_COMMIT] + +Options: + -h, --help Show this help message + --list Print checkpoint IDs and associated commit IDs only + --dry-run Print the migration plan without writing refs + --apply Write migration commits + --repo Local repository path to inspect + --since Commit before the checkpoints to inspect + --head Limit scan to one history tip +`) +} + +func openRepository(ctx context.Context, repoPath string) (string, *git.Repository, error) { + if repoPath == "" { + root, err := paths.WorktreeRoot(ctx) + if err != nil { + return "", nil, fmt.Errorf("find git worktree root: %w", err) + } + repoPath = root + } + + // DetectDotGit walks up from a subdir to find the worktree root; then + // re-open via gitrepo.OpenPath so shared clones with object alternates + // resolve correctly. + detector, err := git.PlainOpenWithOptions(repoPath, &git.PlainOpenOptions{DetectDotGit: true}) + if err != nil { + return "", nil, fmt.Errorf("open repository %q: %w", repoPath, err) + } + defer detector.Close() + repoRoot := repoPath + if worktree, worktreeErr := detector.Worktree(); worktreeErr == nil { + repoRoot = worktree.Filesystem().Root() + } + + repo, err := gitrepo.OpenPath(repoRoot) + if err != nil { + return "", nil, fmt.Errorf("open repository %q: %w", repoRoot, err) + } + return repoRoot, repo, nil +} diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go new file mode 100644 index 0000000000..b4c272566e --- /dev/null +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -0,0 +1,1136 @@ +package main + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/agent" + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/jsonutil" + "github.com/entireio/cli/cmd/entire/cli/paths" + 
"github.com/entireio/cli/cmd/entire/cli/session" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/stretchr/testify/require" +) + +const ( + oldCheckpointID = "000000000001" + mainCheckpointID = "111111111111" + featureCheckpointID = "222222222222" + featureCheckpointID2 = "333333333333" + unrelatedCheckpointID = "444444444444" + testSinceRevision = "abc123" + testHeadRevision = "HEAD" + testRepoPath = "/tmp/repo" + testBaseFilename = "base.txt" + testMainFilename = "main.txt" + testFeatureFilename = "feature.txt" + testFeatureBranchName = "feature" + testStrategy = "manual-commit" + testAuthorName = "Test" + testAuthorEmail = "test@example.com" + testBranchName = "main" + testReviewSkill = "review-skill" + testToolUseID = "toolu_test123" +) + +func TestParseOptions(t *testing.T) { + t.Parallel() + + opts, err := parseOptions([]string{ + "--repo", testRepoPath, + "--since", testSinceRevision, + "--head", testHeadRevision, + "--list", + }) + require.NoError(t, err) + require.Equal(t, testRepoPath, opts.repoPath) + require.Equal(t, testSinceRevision, opts.since) + require.Equal(t, testHeadRevision, opts.head) + require.Equal(t, modeList, opts.mode) + + opts, err = parseOptions([]string{"--dry-run", testSinceRevision}) + require.NoError(t, err) + require.Equal(t, testSinceRevision, opts.since) + require.Equal(t, modeDryRun, opts.mode) + + _, err = parseOptions([]string{"--since", testSinceRevision, "def456"}) + require.ErrorContains(t, err, "use either --since or positional since commit") + + _, err = parseOptions([]string{"--list", "--apply"}) + require.ErrorContains(t, err, "use only one") +} + +func TestDiscoverCheckpointHistory_AllRefsNewerThanSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + + checkpoints, err := discoverCheckpointHistory(context.Background(), 
fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) + require.Equal(t, []string{shortHash(fixture.mainHash)}, discoveredCommitShortSHAs(t, checkpoints, mainCheckpointID)) + require.Equal(t, []string{shortHash(fixture.featureHash)}, discoveredCommitShortSHAs(t, checkpoints, featureCheckpointID)) + require.Equal(t, []string{shortHash(fixture.featureHash)}, discoveredCommitShortSHAs(t, checkpoints, featureCheckpointID2)) +} + +func TestDiscoverCheckpointHistory_HeadLimitsScan(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + head: fixture.mainHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID}, discoveredCheckpointIDs(checkpoints)) +} + +func TestDiscoverCheckpointHistory_SkipsRefsThatDoNotContainSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + commitUnrelatedMigrationTestFile(t, fixture.dir) + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) +} + +func TestDiscoverCheckpointHistory_HeadMustContainSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + unrelatedHash := commitUnrelatedMigrationTestFile(t, fixture.dir) + + _, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + head: unrelatedHash.String(), + }) + require.ErrorContains(t, err, "is not an ancestor of --head") +} + +func 
TestDiscoverCheckpointHistory_ExcludesInternalRefs(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + runMigrationGit(t, fixture.dir, "checkout", paths.MetadataBranchName) + commitMigrationTestFile(t, fixture.dir, "internal.txt", "internal\n", + "internal checkpoint\n\nEntire-Checkpoint: "+unrelatedCheckpointID) + + repo, err := git.PlainOpen(fixture.dir) + require.NoError(t, err) + checkpoints, err := discoverCheckpointHistory(context.Background(), repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) +} + +func TestDiscoverCheckpointHistory_IncludesV2OrphansWithoutScope(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, "555555555555") + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{}) + require.NoError(t, err) + + require.Equal(t, []string{"555555555555"}, discoveredCheckpointIDs(checkpoints)) + require.Empty(t, checkpoints[0].Commits) +} + +func TestResolveRevisionRejectsAmbiguousShortCommitPrefix(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + prefixes := map[string]struct{}{} + ambiguousPrefix := "" + for i := range 17 { + hash := commitMigrationTestFile(t, fixture.dir, fmt.Sprintf("ambiguous-%02d.txt", i), fmt.Sprintf("%d\n", i), fmt.Sprintf("ambiguous %d", i)) + prefix := hash.String()[:1] + if _, exists := prefixes[prefix]; exists { + ambiguousPrefix = prefix + break + } + prefixes[prefix] = struct{}{} + } + require.NotEmpty(t, ambiguousPrefix) + + repo, err := git.PlainOpen(fixture.dir) + require.NoError(t, err) + _, err = resolveRevision(repo, ambiguousPrefix) + require.ErrorContains(t, err, "ambiguous revision") +} + +func TestAddCheckpointCommitUsesCommitterTime(t *testing.T) { + t.Parallel() + + authorTime := time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC) + 
committerTime := time.Date(2024, 2, 3, 4, 5, 6, 0, time.UTC) + commit := &object.Commit{ + Hash: plumbing.NewHash("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), + Author: object.Signature{When: authorTime}, + Committer: object.Signature{When: committerTime}, + Message: "commit\n\nEntire-Checkpoint: " + mainCheckpointID, + } + checkpointIndexes := map[string]int{} + checkpoints := []discoveredCheckpoint{} + + addCheckpointCommit(commit, checkpointIndexes, &checkpoints) + + require.Len(t, checkpoints, 1) + require.Len(t, checkpoints[0].Commits, 1) + require.Equal(t, committerTime, checkpoints[0].Commits[0].Date) +} + +func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + subdir := filepath.Join(fixture.dir, "nested") + require.NoError(t, os.MkdirAll(subdir, 0o755)) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", subdir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--list", + }, &stdout) + require.NoError(t, err) + + require.Equal(t, mainCheckpointID+" "+shortHash(fixture.mainHash)+"\n", stdout.String()) +} + +func TestRunListModePrintsV2Orphans(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, "666666666666") + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--list", + }, &stdout) + require.NoError(t, err) + + require.Equal(t, "666666666666 (orphan)\n", stdout.String()) +} + +func TestRunDryRunFetchesRemoteV2RefsBeforePlanning(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-remote-v2", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"remote v2\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + 
require.NoError(t, err) + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "sessions eligible for migration: 1") + + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.NoError(t, err) + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2FullCurrentRefName), true) + require.NoError(t, err) +} + +func TestRunDryRunFailsWhenRemoteV2MainIsUnavailable(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + require.ErrorContains(t, err, paths.V2MainRefName+" not found on remote") +} + +func TestRunDryRunFailsWhenRemoteV1RefIsUnavailable(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepoWithoutV1(t) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: id.MustCheckpointID(mainCheckpointID), + SessionID: "session-remote-v2", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"remote v2\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + + require.ErrorContains(t, err, plumbing.NewBranchReferenceName(paths.MetadataBranchName).String()+" not found on 
remote") +} + +func TestRunApplySeedsLocalV1FromRemoteBeforeMigrating(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-seeded-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"seeded v1\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + require.NoError(t, err) + + localV1RefName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + _, err = cloneRepo.Reference(localV1RefName, true) + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + remoteV1Ref, err := cloneRepo.Reference(plumbing.NewRemoteReferenceName("origin", paths.MetadataBranchName), true) + require.NoError(t, err) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), "migrated sessions: 1") + + cloneRepo, err = git.PlainOpen(cloneDir) + require.NoError(t, err) + localV1Ref, err := cloneRepo.Reference(localV1RefName, true) + require.NoError(t, err) + commit, err := cloneRepo.CommitObject(localV1Ref.Hash()) + require.NoError(t, err) + require.Equal(t, []plumbing.Hash{remoteV1Ref.Hash()}, commit.ParentHashes) + + summary, err := checkpoint.NewGitStore(cloneRepo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) +} + +func TestRunApplyDisablesSigningDuringPreflightRefReplay(t *testing.T) { //nolint:paralleltest // mutates git config env + markerPath := configureFailingCheckpointSigner(t) + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + 
writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-signing-disabled", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"signing disabled\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + require.NoError(t, err) + + localV1RefName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + localOnlyBlob, err := checkpoint.CreateBlobFromContent(cloneRepo, []byte("local-only\n")) + require.NoError(t, err) + writeTestV2RefEntriesWithAuthor(t, cloneRepo, localV1RefName, plumbing.ZeroHash, map[string]object.TreeEntry{ + "local-only.txt": { + Name: "local-only.txt", + Mode: 0o100644, + Hash: localOnlyBlob, + }, + }, "local v1 commit", object.Signature{ + Name: testAuthorName, + Email: testAuthorEmail, + When: time.Date(2024, 8, 9, 10, 11, 12, 0, time.UTC), + }) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated sessions: 1") + require.NoFileExists(t, markerPath) + + cloneRepo, err = git.PlainOpen(cloneDir) + require.NoError(t, err) + ref, err := cloneRepo.Reference(localV1RefName, true) + require.NoError(t, err) + migrationCommit, err := cloneRepo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.Empty(t, migrationCommit.Signature) + require.NotEmpty(t, migrationCommit.ParentHashes) + replayedCommit, err := cloneRepo.CommitObject(migrationCommit.ParentHashes[0]) + require.NoError(t, err) + require.Empty(t, replayedCommit.Signature) +} + +func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + createdAt := time.Date(2024, 5, 6, 7, 8, 9, 0, time.UTC) + v2AuthorWhen := time.Date(2024, 5, 6, 8, 9, 10, 0, 
time.UTC) + transcript := []byte("{\"type\":\"assistant\",\"message\":\"migrated\"}\n") + v2AuthorName := "Original V2 Author" + v2AuthorEmail := "original-v2@example.com" + + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-to-migrate", + CreatedAt: createdAt, + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted(transcript), + Prompts: []string{"first prompt", "second prompt"}, + FilesTouched: []string{"main.go"}, + CheckpointsCount: 2, + AuthorName: v2AuthorName, + AuthorEmail: v2AuthorEmail, + AuthorWhen: v2AuthorWhen, + Agent: agent.AgentTypeClaudeCode, + Model: "claude-test-model", + TurnID: "turn-1", + CheckpointTranscriptStart: 42, + CompactTranscriptStart: 9, + Kind: string(session.KindAgentReview), + ReviewSkills: []string{testReviewSkill}, + ReviewPrompt: "review this", + HasReview: true, + }) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) + require.Equal(t, 2, summary.CheckpointsCount) + require.Equal(t, []string{"main.go"}, summary.FilesTouched) + require.True(t, summary.HasReview) + + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, transcript, content.Transcript) + require.Equal(t, strings.Join([]string{"first prompt", "second prompt"}, checkpoint.PromptSeparator), content.Prompts) + require.Equal(t, createdAt, content.Metadata.CreatedAt) + require.Equal(t, testStrategy, 
content.Metadata.Strategy) + require.Equal(t, testBranchName, content.Metadata.Branch) + require.Equal(t, agent.AgentTypeClaudeCode, content.Metadata.Agent) + require.Equal(t, "claude-test-model", content.Metadata.Model) + require.Equal(t, "turn-1", content.Metadata.TurnID) + require.Equal(t, 9, content.Metadata.CheckpointTranscriptStart) + require.Equal(t, string(session.KindAgentReview), content.Metadata.Kind) + require.Equal(t, []string{testReviewSkill}, content.Metadata.ReviewSkills) + require.Equal(t, "review this", content.Metadata.ReviewPrompt) + + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.Equal(t, v2AuthorName, commit.Author.Name) + require.Equal(t, v2AuthorEmail, commit.Author.Email) + require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) + require.Empty(t, commit.Signature) +} + +func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { + t.Parallel() + + cpID := id.MustCheckpointID("777777777777") + v2AuthorWhen := time.Date(2024, 7, 8, 9, 10, 11, 0, time.UTC) + v2AuthorName := "Original Orphan Author" + v2AuthorEmail := "original-orphan@example.com" + fixture := setupMigrationOrphanRepoWithOptions(t, cpID.String(), testV2CheckpointOptions{ + AuthorName: v2AuthorName, + AuthorEmail: v2AuthorEmail, + AuthorWhen: v2AuthorWhen, + }) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "v2 orphan checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), cpID.String()+" sessions=1 commits=(orphan)") + + 
v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, "orphan-session", content.Metadata.SessionID) + require.JSONEq(t, `{"message":"orphan"}`, string(content.Transcript)) + + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.Equal(t, v2AuthorName, commit.Author.Name) + require.Equal(t, v2AuthorEmail, commit.Author.Email) + require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) + require.Empty(t, commit.Signature) + + stdout.Reset() + err = run(context.Background(), []string{ + "--repo", fixture.dir, + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "already present v1 sessions: 1") + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 0") + require.Contains(t, stdout.String(), "v2 orphan checkpoints eligible for migration: 0") + require.NotContains(t, stdout.String(), cpID.String()+" sessions=1 commits=(orphan)") +} + +func TestRunApplyMigratesInvestigationMetadata(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "investigation-session", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"investigate\"}\n")), + Kind: string(session.KindAgentInvestigate), + InvestigateRunID: "0123456789ab", + InvestigateTopic: "Why is checkout flaky?", + HasInvestigation: true, + }) + + stdout := runMigrationCommand(t, fixture, 
fixture.mainHash, "--apply") + require.Contains(t, stdout, "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.True(t, summary.HasInvestigation) + + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, string(session.KindAgentInvestigate), content.Metadata.Kind) + require.Equal(t, "0123456789ab", content.Metadata.InvestigateRunID) + require.Equal(t, "Why is checkout flaky?", content.Metadata.InvestigateTopic) +} + +func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-dry-run", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"dry run\"}\n")), + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--dry-run") + require.Contains(t, stdout, "Migration plan:") + require.Contains(t, stdout, "checkpoints eligible for migration: 1") + require.Contains(t, stdout, "sessions eligible for migration: 1") + require.Contains(t, stdout, "checkpoints to migrate:") + require.Contains(t, stdout, mainCheckpointID+" sessions=1 commits="+shortHash(fixture.mainHash)) + + summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Nil(t, summary) +} + +func TestRunDryRunSkipsV2OrphansWhenScoped(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + flag string + id string + }{ + {name: "since", flag: "--since", id: "888888888888"}, + {name: "head", flag: "--head", id: "999999999999"}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, tc.id) + args := []string{ + "--repo", fixture.dir, + 
tc.flag, fixture.baseHash.String(), + "--dry-run", + } + var stdout bytes.Buffer + err := run(context.Background(), args, &stdout) + require.NoError(t, err) + + require.Contains(t, stdout.String(), "warning: 1 v2 orphans skipped; re-run without --since/--head to include them") + require.Contains(t, stdout.String(), "discovered checkpoints: 0") + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 0") + require.NotContains(t, stdout.String(), tc.id+" sessions=1 commits=(orphan)") + }) + } +} + +func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-existing-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-v2-missing-from-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 new\"}\n")), + }) + + existingTranscript := []byte("{\"message\":\"already v1\"}\n") + err := checkpoint.NewGitStore(fixture.repo).WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-v1", + CreatedAt: time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC), + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted(existingTranscript), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + }) + require.NoError(t, err) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "migrated checkpoints: 1") + require.Contains(t, stdout, "migrated sessions: 1") + require.Contains(t, stdout, "migrated checkpoint details:") + require.Contains(t, stdout, 
mainCheckpointID+" sessions=1 commits="+shortHash(fixture.mainHash)) + + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Len(t, summary.Sessions, 2) + content, err := v1Store.ReadSessionContentByID(context.Background(), cpID, "session-existing-v1") + require.NoError(t, err) + require.Equal(t, existingTranscript, content.Transcript) + require.Equal(t, "session-existing-v1", content.Metadata.SessionID) + content, err = v1Store.ReadSessionContentByID(context.Background(), cpID, "session-v2-missing-from-v1") + require.NoError(t, err) + require.JSONEq(t, `{"message":"from v2 new"}`, string(content.Transcript)) +} + +func TestRunDryRunReadsSparseExistingV1SessionPaths(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + + v1Store := checkpoint.NewGitStore(fixture.repo) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 zero\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-two", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 two\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + rewriteV1SecondSessionToSparseSlot(t, fixture.repo, cpID) + + stdout := runMigrationCommand(t, 
fixture, fixture.mainHash, "--dry-run") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "checkpoints eligible for migration: 0") +} + +func TestRunApplyAppendsSparseExistingV1SessionWithoutOverwriting(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-v2-new", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 new\"}\n")), + }) + + v1Store := checkpoint.NewGitStore(fixture.repo) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 zero\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-two", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 two\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + rewriteV1SecondSessionToSparseSlot(t, fixture.repo, cpID) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "migrated sessions: 1") + + preserved, err := v1Store.ReadSessionContent(context.Background(), cpID, 2) + require.NoError(t, err) + require.Equal(t, "session-existing-two", 
preserved.Metadata.SessionID) + require.JSONEq(t, `{"message":"already v1 two"}`, string(preserved.Transcript)) + + migrated, err := v1Store.ReadSessionContent(context.Background(), cpID, 1) + require.NoError(t, err) + require.Equal(t, "session-v2-new", migrated.Metadata.SessionID) + require.JSONEq(t, `{"message":"from v2 new"}`, string(migrated.Transcript)) + + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Len(t, summary.Sessions, 3) + require.Equal(t, "/"+cpID.Path()+"/0/metadata.json", summary.Sessions[0].Metadata) + require.Equal(t, "/"+cpID.Path()+"/1/metadata.json", summary.Sessions[1].Metadata) + require.Equal(t, "/"+cpID.Path()+"/2/metadata.json", summary.Sessions[2].Metadata) +} + +func TestRunApplyMigratesTaskMetadata(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "task-session", + IsTask: true, + ToolUseID: testToolUseID, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"task\"}\n")), + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.True(t, content.Metadata.IsTask) + require.Equal(t, testToolUseID, content.Metadata.ToolUseID) + + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + tree, err := commit.Tree() + require.NoError(t, err) + _, err = tree.File(cpID.Path() + "/tasks/" + testToolUseID + "/checkpoint.json") + require.NoError(t, err) +} + +func TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { + 
t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "normal-session", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"normal\"}\n")), + CombinedAttribution: &checkpoint.InitialAttribution{ + AgentLines: 12, + TotalLinesChanged: 12, + AgentPercentage: 100, + }, + }) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "review-session-without-raw-transcript", + Kind: string(session.KindAgentReview), + ReviewSkills: []string{testReviewSkill}, + ReviewPrompt: "review this", + HasReview: true, + CompactTranscript: []byte("{\"message\":\"compact review only\"}\n"), + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "missing raw transcripts: 1") + require.Contains(t, stdout, "migrated checkpoints: 1") + require.Contains(t, stdout, "migrated sessions: 1") + + summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.False(t, summary.HasReview) + require.Nil(t, summary.CombinedAttribution) + require.Len(t, summary.Sessions, 1) +} + +func TestRunDryRunReportsMissingV2MetadataAndRawTranscripts(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: id.MustCheckpointID(featureCheckpointID), + SessionID: "session-missing-raw", + }) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--dry-run", + }, &stdout) + require.NoError(t, err) + + require.Contains(t, stdout.String(), "missing v2 checkpoint metadata: 2") + require.Contains(t, stdout.String(), "missing required v2 session metadata: 0") + require.Contains(t, 
stdout.String(), "missing raw transcripts: 1") + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 0") + require.Contains(t, stdout.String(), "sessions eligible for migration: 0") +} + +type migrationHistoryFixture struct { + dir string + repo *git.Repository + baseHash plumbing.Hash + mainHash plumbing.Hash + featureHash plumbing.Hash +} + +func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { + t.Helper() + + return setupMigrationHistoryRepoWithV1(t, true) +} + +func setupMigrationHistoryRepoWithoutV1(t *testing.T) migrationHistoryFixture { + t.Helper() + + return setupMigrationHistoryRepoWithV1(t, false) +} + +func setupMigrationHistoryRepoWithV1(t *testing.T, seedV1 bool) migrationHistoryFixture { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + + baseHash := commitMigrationTestFile(t, dir, testBaseFilename, "base\n", + "base checkpoint\n\nEntire-Checkpoint: "+oldCheckpointID) + mainHash := commitMigrationTestFile(t, dir, testMainFilename, "main\n", + "main checkpoint\n\nEntire-Checkpoint: "+mainCheckpointID) + + testutil.GitCheckoutNewBranch(t, dir, testFeatureBranchName) + featureHash := commitMigrationTestFile(t, dir, testFeatureFilename, "feature\n", + "feature checkpoint\n\nEntire-Checkpoint: "+featureCheckpointID+"\nEntire-Checkpoint: "+featureCheckpointID2) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + if seedV1 { + writeTestV1Baseline(t, repo) + } + + return migrationHistoryFixture{ + dir: dir, + repo: repo, + baseHash: baseHash, + mainHash: mainHash, + featureHash: featureHash, + } +} + +func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistoryFixture { + t.Helper() + + return setupMigrationOrphanRepoWithOptions(t, checkpointID, testV2CheckpointOptions{}) +} + +func setupMigrationOrphanRepoWithOptions(t *testing.T, checkpointID string, opts testV2CheckpointOptions) migrationHistoryFixture { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) 
+ + baseHash := commitMigrationTestFile(t, dir, "initial.txt", "initial\n", "initial commit") + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + writeTestV1Baseline(t, repo) + + opts.CheckpointID = id.MustCheckpointID(checkpointID) + if opts.SessionID == "" { + opts.SessionID = "orphan-session" + } + if opts.Transcript.Len() == 0 { + opts.Transcript = redact.AlreadyRedacted([]byte("{\"message\":\"orphan\"}\n")) + } + writeTestV2Checkpoint(t, repo, opts) + + return migrationHistoryFixture{ + dir: dir, + repo: repo, + baseHash: baseHash, + } +} + +func cloneMigrationRepoWithOrigin(t *testing.T, fixture migrationHistoryFixture) string { + t.Helper() + + remoteDir := filepath.Join(t.TempDir(), "origin.git") + runMigrationGit(t, "", "init", "--bare", remoteDir) + runMigrationGit(t, remoteDir, "symbolic-ref", "HEAD", "refs/heads/main") + runMigrationGit(t, fixture.dir, "remote", "add", "origin", remoteDir) + + refspecs := []string{ + fixture.mainHash.String() + ":refs/heads/main", + fixture.featureHash.String() + ":refs/heads/" + testFeatureBranchName, + } + v1RefName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + if refExists(t, fixture.repo, v1RefName) { + refspecs = append(refspecs, v1RefName.String()+":"+v1RefName.String()) + } + if refExists(t, fixture.repo, plumbing.ReferenceName(paths.V2MainRefName)) { + refspecs = append(refspecs, paths.V2MainRefName+":"+paths.V2MainRefName) + } + if refExists(t, fixture.repo, plumbing.ReferenceName(paths.V2FullCurrentRefName)) { + refspecs = append(refspecs, paths.V2FullCurrentRefName+":"+paths.V2FullCurrentRefName) + } + runMigrationGit(t, fixture.dir, append([]string{"push", "origin"}, refspecs...)...) 
+ + cloneDir := filepath.Join(t.TempDir(), "clone") + runMigrationGit(t, "", "clone", remoteDir, cloneDir) + return cloneDir +} + +func writeTestV1Baseline(t *testing.T, repo *git.Repository) { + t.Helper() + + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + if refExists(t, repo, refName) { + return + } + writeTestV2RefEntries(t, repo, refName, plumbing.ZeroHash, map[string]object.TreeEntry{}, "test v1 baseline") +} + +func refExists(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) bool { + t.Helper() + + _, err := repo.Reference(refName, true) + if err == nil { + return true + } + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + return false +} + +func rewriteV1SecondSessionToSparseSlot(t *testing.T, repo *git.Repository, cpID id.CheckpointID) { + t.Helper() + + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + parentHash, entries := readTestV2RefEntries(t, repo, refName) + basePath := cpID.Path() + "/" + rootMetadataPath := basePath + paths.MetadataFileName + rootEntry := entries[rootMetadataPath] + summary := readTestJSONFromBlob[checkpoint.CheckpointSummary](t, repo, rootEntry.Hash) + require.Len(t, summary.Sessions, 2) + summary.Sessions[1] = rewriteSessionFilePathSlot(summary.Sessions[1], "/1/", "/2/") + + summaryJSON, err := jsonutil.MarshalIndentWithNewline(summary, "", " ") + require.NoError(t, err) + summaryBlob, err := checkpoint.CreateBlobFromContent(repo, summaryJSON) + require.NoError(t, err) + entries[rootMetadataPath] = object.TreeEntry{ + Name: rootMetadataPath, + Mode: rootEntry.Mode, + Hash: summaryBlob, + } + + oldPrefix := basePath + "1/" + newPrefix := basePath + "2/" + for entryPath, entry := range entries { + if !strings.HasPrefix(entryPath, oldPrefix) { + continue + } + newPath := newPrefix + strings.TrimPrefix(entryPath, oldPrefix) + entry.Name = newPath + entries[newPath] = entry + delete(entries, entryPath) + } + writeTestV2RefEntries(t, repo, refName, parentHash, 
entries, "test sparse v1 fixture") +} + +func rewriteSessionFilePathSlot(sessionPaths checkpoint.SessionFilePaths, oldSlot, newSlot string) checkpoint.SessionFilePaths { + sessionPaths.Metadata = strings.Replace(sessionPaths.Metadata, oldSlot, newSlot, 1) + sessionPaths.Transcript = strings.Replace(sessionPaths.Transcript, oldSlot, newSlot, 1) + sessionPaths.ContentHash = strings.Replace(sessionPaths.ContentHash, oldSlot, newSlot, 1) + sessionPaths.Prompt = strings.Replace(sessionPaths.Prompt, oldSlot, newSlot, 1) + return sessionPaths +} + +func commitMigrationTestFile(t *testing.T, dir, name, content, message string) plumbing.Hash { + t.Helper() + + testutil.WriteFile(t, dir, name, content) + testutil.GitAdd(t, dir, name) + testutil.GitCommit(t, dir, message) + return plumbing.NewHash(testutil.GetHeadHash(t, dir)) +} + +func commitUnrelatedMigrationTestFile(t *testing.T, dir string) plumbing.Hash { + t.Helper() + + runMigrationGit(t, dir, "checkout", "--orphan", "unrelated") + runMigrationGit(t, dir, "rm", "-rf", ".") + return commitMigrationTestFile(t, dir, "unrelated.txt", "unrelated\n", + "unrelated checkpoint\n\nEntire-Checkpoint: "+unrelatedCheckpointID) +} + +func runMigrationGit(t *testing.T, dir string, args ...string) { + t.Helper() + + cmd := exec.CommandContext(context.Background(), "git", args...) 
+ cmd.Dir = dir + cmd.Env = testutil.GitIsolatedEnv() + output, err := cmd.CombinedOutput() + require.NoError(t, err, "git %s failed: %s", strings.Join(args, " "), output) +} + +func configureFailingCheckpointSigner(t *testing.T) string { + t.Helper() + + testutil.IsolateGitConfigEnv(t) + dir := t.TempDir() + markerPath := filepath.Join(dir, "signer-called") + signerPath := filepath.Join(dir, "fake-gpg") + script := fmt.Sprintf("#!/bin/sh\nprintf called > %q\nexit 1\n", markerPath) + require.NoError(t, os.WriteFile(signerPath, []byte(script), 0o755)) + + globalConfig := fmt.Sprintf(`[commit] + gpgsign = true +[user] + signingkey = TESTKEY +[gpg] + program = %s +`, signerPath) + globalConfigPath := filepath.Join(dir, "gitconfig") + require.NoError(t, os.WriteFile(globalConfigPath, []byte(globalConfig), 0o644)) + t.Setenv("GIT_CONFIG_GLOBAL", globalConfigPath) + return markerPath +} + +func runMigrationCommand(t *testing.T, fixture migrationHistoryFixture, head plumbing.Hash, mode string) string { + t.Helper() + + args := []string{ + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--head", head.String(), + mode, + } + var stdout bytes.Buffer + err := run(context.Background(), args, &stdout) + require.NoError(t, err) + return stdout.String() +} + +func discoveredCheckpointIDs(checkpoints []discoveredCheckpoint) []string { + ids := make([]string, len(checkpoints)) + for i, checkpoint := range checkpoints { + ids[i] = checkpoint.ID.String() + } + return ids +} + +func discoveredCommitShortSHAs(t *testing.T, checkpoints []discoveredCheckpoint, checkpointID string) []string { + t.Helper() + + for _, checkpoint := range checkpoints { + if checkpoint.ID.String() != checkpointID { + continue + } + commits := make([]string, len(checkpoint.Commits)) + for i, commit := range checkpoint.Commits { + commits[i] = commit.ShortSHA + } + return commits + } + t.Fatalf("checkpoint %s not found in %#v", checkpointID, checkpoints) + return nil +} diff --git 
a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go new file mode 100644 index 0000000000..f3d696573c --- /dev/null +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -0,0 +1,329 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "strconv" + "strings" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/session" + "github.com/entireio/cli/redact" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing/object" +) + +type migrationOptions struct { + apply bool +} + +type migrationReport struct { + DiscoveredCheckpoints int + ExistingV1Sessions int + MissingV2CheckpointMetadata int + MissingV2SessionMetadata int + MissingRawTranscripts int + EligibleCheckpoints int + V2OrphanCheckpoints int + EligibleSessions int + MigratedCheckpoints int + MigratedSessions int + Candidates []migrationCandidate +} + +type migrationCandidate struct { + CheckpointID string + SessionCount int + CommitSHAs []string +} + +type checkpointMigrator struct { + repo *git.Repository + v1Store *checkpoint.GitStore + v2Store *checkpoint.V2GitStore + authorIndex *v2SessionAuthorIndex + opts migrationOptions + report *migrationReport +} + +func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, discovered []discoveredCheckpoint, opts migrationOptions) (migrationReport, error) { + v2Store := checkpoint.NewV2GitStore(repo) + report := migrationReport{DiscoveredCheckpoints: len(discovered)} + migrator := checkpointMigrator{ + repo: repo, + v1Store: checkpoint.NewGitStore(repo), + v2Store: v2Store, + opts: opts, + report: &report, + } + + for _, discoveredCheckpoint := range discovered { + eligibleSessions, err := migrator.migrateCheckpoint(ctx, discoveredCheckpoint) + if err != nil { + return report, err + } + if eligibleSessions == 0 { + continue + } + 
report.EligibleCheckpoints++ + if len(discoveredCheckpoint.Commits) == 0 { + report.V2OrphanCheckpoints++ + } + report.Candidates = append(report.Candidates, migrationCandidateFromDiscovered(discoveredCheckpoint, eligibleSessions)) + if opts.apply { + report.MigratedCheckpoints++ + } + } + return report, nil +} + +func migrationCandidateFromDiscovered(discovered discoveredCheckpoint, sessionCount int) migrationCandidate { + commitSHAs := make([]string, len(discovered.Commits)) + for i, commit := range discovered.Commits { + commitSHAs[i] = commit.ShortSHA + } + return migrationCandidate{ + CheckpointID: discovered.ID.String(), + SessionCount: sessionCount, + CommitSHAs: commitSHAs, + } +} + +func (m *checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered discoveredCheckpoint) (int, error) { + existing, err := m.v1Store.ReadCommitted(ctx, discovered.ID) + if err != nil { + return 0, fmt.Errorf("read v1 checkpoint %s: %w", discovered.ID, err) + } + existingSessionIDs, err := m.existingV1SessionIDs(ctx, discovered, existing) + if err != nil { + return 0, err + } + + summary, err := m.v2Store.ReadCommitted(ctx, discovered.ID) + if err != nil { + return 0, fmt.Errorf("read v2 checkpoint %s: %w", discovered.ID, err) + } + if summary == nil || len(summary.Sessions) == 0 { + m.report.MissingV2CheckpointMetadata++ + return 0, nil + } + + eligibleSessions := 0 + canPreserveCombinedAttribution := true + var eligible []eligibleV2Session + for sessionIndex := range summary.Sessions { + metadataContent, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) + if err != nil { + if errors.Is(err, checkpoint.ErrCheckpointNotFound) { + m.report.MissingV2SessionMetadata++ + canPreserveCombinedAttribution = false + continue + } + return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) + } + if !hasRequiredV2Metadata(metadataContent) { + m.report.MissingV2SessionMetadata++ + 
canPreserveCombinedAttribution = false + continue + } + if _, exists := existingSessionIDs[metadataContent.Metadata.SessionID]; exists { + m.report.ExistingV1Sessions++ + continue + } + + content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) + if err != nil { + if errors.Is(err, checkpoint.ErrNoTranscript) { + m.report.MissingRawTranscripts++ + canPreserveCombinedAttribution = false + continue + } + return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + } + + m.report.EligibleSessions++ + eligible = append(eligible, eligibleV2Session{sessionIndex: sessionIndex, content: content}) + eligibleSessions++ + } + + if m.opts.apply { + combinedAttribution := summary.CombinedAttribution + if !canPreserveCombinedAttribution { + combinedAttribution = nil + } + for _, entry := range eligible { + author, err := m.findV2SessionAuthor(ctx, discovered.ID, entry.sessionIndex) + if err != nil { + return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, entry.sessionIndex, err) + } + writeOpts := writeOptionsFromV2Content(entry.content, combinedAttribution, author) + writeCtx := checkpoint.WithCommitSigningDisabled(ctx) + if err := m.v1Store.WriteCommitted(writeCtx, writeOpts); err != nil { + return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, entry.sessionIndex, err) + } + m.report.MigratedSessions++ + } + } + return eligibleSessions, nil +} + +func (m *checkpointMigrator) findV2SessionAuthor(ctx context.Context, cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + if m.authorIndex == nil { + authorIndex, err := buildV2SessionAuthorIndex(ctx, m.repo) + if err != nil { + return object.Signature{}, err + } + m.authorIndex = authorIndex + } + return m.authorIndex.find(cpID, sessionIndex) +} + +type eligibleV2Session struct { + sessionIndex int + content *checkpoint.SessionContent +} + +func (m 
*checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { + existing := make(map[string]struct{}) + if summary == nil { + return existing, nil + } + for summaryIndex, sessionPaths := range summary.Sessions { + sessionIndex, ok, err := v1SessionIndexFromSummary(discovered.ID, sessionPaths) + if err != nil { + return nil, fmt.Errorf("resolve v1 checkpoint %s session %d metadata path: %w", discovered.ID, summaryIndex, err) + } + if !ok { + continue + } + content, err := m.v1Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) + if err != nil { + if errors.Is(err, checkpoint.ErrCheckpointNotFound) { + continue + } + return nil, fmt.Errorf("read v1 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) + } + if content.Metadata.SessionID == "" { + continue + } + existing[content.Metadata.SessionID] = struct{}{} + } + return existing, nil +} + +func v1SessionIndexFromSummary(cpID checkpointID.CheckpointID, sessionPaths checkpoint.SessionFilePaths) (int, bool, error) { + if sessionPaths.Metadata == "" { + return 0, false, nil + } + + metadataPath := strings.TrimPrefix(sessionPaths.Metadata, "/") + expectedPrefix := cpID.Path() + "/" + relativePath, ok := strings.CutPrefix(metadataPath, expectedPrefix) + if !ok { + return 0, false, fmt.Errorf("metadata path %q is outside checkpoint path %q", sessionPaths.Metadata, cpID.Path()) + } + + sessionDir, fileName, ok := strings.Cut(relativePath, "/") + if !ok || fileName != paths.MetadataFileName { + return 0, false, fmt.Errorf("metadata path %q does not point to a session metadata file", sessionPaths.Metadata) + } + + sessionIndex, err := strconv.Atoi(sessionDir) + if err != nil || sessionIndex < 0 { + return 0, false, fmt.Errorf("metadata path %q has invalid session index %q", sessionPaths.Metadata, sessionDir) + } + return sessionIndex, true, nil +} + +func hasRequiredV2Metadata(content 
*checkpoint.SessionContent) bool { + return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" +} + +func writeOptionsFromV2Content(content *checkpoint.SessionContent, combinedAttribution *checkpoint.InitialAttribution, author object.Signature) checkpoint.WriteCommittedOptions { + meta := content.Metadata + return checkpoint.WriteCommittedOptions{ + CheckpointID: meta.CheckpointID, + SessionID: meta.SessionID, + CreatedAt: meta.CreatedAt, + CommitTime: author.When, + Strategy: meta.Strategy, + Branch: meta.Branch, + Transcript: redact.AlreadyRedacted(content.Transcript), + Prompts: checkpoint.SplitPromptContent(content.Prompts), + FilesTouched: meta.FilesTouched, + CheckpointsCount: meta.CheckpointsCount, + AuthorName: author.Name, + AuthorEmail: author.Email, + Agent: meta.Agent, + Model: meta.Model, + TurnID: meta.TurnID, + IsTask: meta.IsTask, + ToolUseID: meta.ToolUseID, + TranscriptIdentifierAtStart: meta.TranscriptIdentifierAtStart, + CheckpointTranscriptStart: meta.GetTranscriptStart(), + TokenUsage: meta.TokenUsage, + SessionMetrics: meta.SessionMetrics, + InitialAttribution: meta.InitialAttribution, + PromptAttributionsJSON: meta.PromptAttributions, + CombinedAttribution: combinedAttribution, + Summary: meta.Summary, + Kind: meta.Kind, + ReviewSkills: meta.ReviewSkills, + ReviewPrompt: meta.ReviewPrompt, + HasReview: session.Kind(meta.Kind).IsReview(), + InvestigateRunID: meta.InvestigateRunID, + InvestigateTopic: meta.InvestigateTopic, + HasInvestigation: session.Kind(meta.Kind).IsInvestigate(), + } +} + +func writeMigrationReport(w io.Writer, report migrationReport, applied bool) { + if applied { + fmt.Fprintln(w, "Migration result:") + } else { + fmt.Fprintln(w, "Migration plan:") + } + fmt.Fprintf(w, " discovered checkpoints: %d\n", report.DiscoveredCheckpoints) + fmt.Fprintf(w, " already present v1 sessions: %d\n", report.ExistingV1Sessions) + fmt.Fprintf(w, " missing v2 checkpoint metadata: %d\n", 
report.MissingV2CheckpointMetadata) + fmt.Fprintf(w, " missing required v2 session metadata: %d\n", report.MissingV2SessionMetadata) + fmt.Fprintf(w, " missing raw transcripts: %d\n", report.MissingRawTranscripts) + fmt.Fprintf(w, " checkpoints eligible for migration: %d\n", report.EligibleCheckpoints) + fmt.Fprintf(w, " v2 orphan checkpoints eligible for migration: %d\n", report.V2OrphanCheckpoints) + fmt.Fprintf(w, " sessions eligible for migration: %d\n", report.EligibleSessions) + if applied { + fmt.Fprintf(w, " migrated checkpoints: %d\n", report.MigratedCheckpoints) + fmt.Fprintf(w, " migrated sessions: %d\n", report.MigratedSessions) + } + writeMigrationCandidates(w, report.Candidates, applied) +} + +func writeMigrationCandidates(w io.Writer, candidates []migrationCandidate, applied bool) { + if len(candidates) == 0 { + return + } + if applied { + fmt.Fprintln(w, " migrated checkpoint details:") + } else { + fmt.Fprintln(w, " checkpoints to migrate:") + } + for _, candidate := range candidates { + fmt.Fprintf(w, " %s sessions=%d commits=%s\n", + candidate.CheckpointID, + candidate.SessionCount, + candidateCommitLabel(candidate), + ) + } +} + +func candidateCommitLabel(candidate migrationCandidate) string { + if len(candidate.CommitSHAs) == 0 { + return "(orphan)" + } + return strings.Join(candidate.CommitSHAs, ",") +} diff --git a/cmd/migrate-v2-checkpoints/v1_preflight.go b/cmd/migrate-v2-checkpoints/v1_preflight.go new file mode 100644 index 0000000000..fa138b4f8b --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v1_preflight.go @@ -0,0 +1,108 @@ +package main + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint/remote" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/strategy" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" +) + +const ( + v1RefFetchTimeout = 2 * time.Minute + v1FetchTmpRef = strategy.FetchTmpRefPrefix + 
"migrate-v1" +) + +func ensureLatestV1Ref(ctx context.Context, repoRoot string, repo *git.Repository) error { + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + fetchTarget, err := remote.FetchURL(ctx, remote.FetchURLOptions{WorktreeRoot: repoRoot}) + if err != nil { + if localV1RefExists(repo) { + return nil + } + return fmt.Errorf("resolve v1 checkpoint fetch target: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, v1RefFetchTimeout) + defer cancel() + + remoteHasV1, err := remoteRefExists(ctx, repoRoot, fetchTarget, refName.String()) + if err != nil { + return err + } + if !remoteHasV1 { + return fmt.Errorf("%s not found on remote %s", refName, remote.RedactURL(fetchTarget)) + } + + if err := fetchV1Ref(ctx, repoRoot, repo, fetchTarget); err != nil { + return err + } + return nil +} + +func localV1RefExists(repo *git.Repository) bool { + _, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + return err == nil +} + +func remoteRefExists(ctx context.Context, repoRoot, fetchTarget, refName string) (bool, error) { + output, err := remote.LsRemoteInDir(ctx, repoRoot, fetchTarget, refName) + if err != nil { + return false, fmt.Errorf("list remote %s from %s: %w", refName, remote.RedactURL(fetchTarget), err) + } + + for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") { + fields := strings.Fields(line) + if len(fields) >= 2 && fields[1] == refName { + return true, nil + } + } + return false, nil +} + +func fetchV1Ref(ctx context.Context, repoRoot string, repo *git.Repository, fetchTarget string) error { + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + tmpRefName := plumbing.ReferenceName(v1FetchTmpRef) + refSpec := fmt.Sprintf("+%s:%s", refName, tmpRefName) + output, err := remote.Fetch(ctx, remote.FetchOptions{ + Remote: fetchTarget, + RefSpecs: []string{refSpec}, + NoTags: true, + NoFilter: true, + Dir: repoRoot, + }) + if err != nil { + return 
fetchV1RefError("fetch v1 checkpoint ref", fetchTarget, output, err) + } + + defer func() { _ = repo.Storer.RemoveReference(tmpRefName) }() //nolint:errcheck // cleanup is best-effort + + tmpRef, err := repo.Reference(tmpRefName, true) + if err != nil { + return fmt.Errorf("v1 checkpoint ref not found after fetch (tmp ref %s missing): %w", tmpRefName, err) + } + if err := strategy.SafelyAdvanceLocalRef(ctx, repo, refName, tmpRef.Hash()); err != nil { + return fmt.Errorf("advance local %s: %w", refName, err) + } + return nil +} + +func fetchV1RefError(action, fetchTarget string, output []byte, err error) error { + if errors.Is(err, context.DeadlineExceeded) { + return fmt.Errorf("%s timed out after %s", action, v1RefFetchTimeout) + } + + redactedTarget := remote.RedactURL(fetchTarget) + msg := strings.TrimSpace(strings.ReplaceAll(string(output), fetchTarget, redactedTarget)) + if msg != "" { + return fmt.Errorf("%s from %s failed: %s: %w", action, redactedTarget, msg, err) + } + return fmt.Errorf("%s from %s failed: %w", action, redactedTarget, err) +} diff --git a/cmd/migrate-v2-checkpoints/v2_author.go b/cmd/migrate-v2-checkpoints/v2_author.go new file mode 100644 index 0000000000..8fe3d43d5e --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_author.go @@ -0,0 +1,154 @@ +package main + +import ( + "context" + "fmt" + "strconv" + "strings" + + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/utils/merkletrie" +) + +type v2SessionAuthorIndex struct { + authors map[string]object.Signature +} + +func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + index, err := buildV2SessionAuthorIndex(ctx, repo) + if err != nil { + return object.Signature{}, err + } + 
return index.find(cpID, sessionIndex) +} + +func buildV2SessionAuthorIndex(ctx context.Context, repo *git.Repository) (*v2SessionAuthorIndex, error) { + ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + if err != nil { + return nil, fmt.Errorf("resolve %s: %w", paths.V2MainRefName, err) + } + + iter, err := repo.Log(&git.LogOptions{ + From: ref.Hash(), + Order: git.LogOrderCommitterTime, + }) + if err != nil { + return nil, fmt.Errorf("read %s history: %w", paths.V2MainRefName, err) + } + defer iter.Close() + + index := &v2SessionAuthorIndex{authors: make(map[string]object.Signature)} + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return err //nolint:wrapcheck // Propagating context cancellation + } + if commit.NumParents() > 1 { + return nil + } + paths, err := changedV2SessionMetadataPaths(ctx, commit) + if err != nil { + return fmt.Errorf("read changed v2 session metadata paths in %s: %w", commit.Hash, err) + } + for _, path := range paths { + if _, exists := index.authors[path]; !exists { + index.authors[path] = commit.Author + } + } + return nil + }) + if err != nil { + return nil, fmt.Errorf("walk %s history: %w", paths.V2MainRefName, err) + } + return index, nil +} + +func (index *v2SessionAuthorIndex) find(cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + metadataPath := v2SessionMetadataPath(cpID, sessionIndex) + author, ok := index.authors[metadataPath] + if !ok { + return object.Signature{}, fmt.Errorf("%s not found in %s history", metadataPath, paths.V2MainRefName) + } + return author, nil +} + +func changedV2SessionMetadataPaths(ctx context.Context, commit *object.Commit) ([]string, error) { + commitTree, err := commit.Tree() + if err != nil { + return nil, fmt.Errorf("read commit tree: %w", err) + } + + var parentTree *object.Tree + if commit.NumParents() > 0 { + parent, err := commit.Parent(0) + if err != nil { + return nil, fmt.Errorf("read 
parent: %w", err) + } + parentTree, err = parent.Tree() + if err != nil { + return nil, fmt.Errorf("read parent tree: %w", err) + } + } + + changes, err := object.DiffTreeContext(ctx, parentTree, commitTree) + if err != nil { + return nil, fmt.Errorf("diff commit tree: %w", err) + } + + var paths []string + for _, change := range changes { + path, ok, err := v2SessionMetadataPathFromChange(change) + if err != nil { + return nil, err + } + if ok { + paths = append(paths, path) + } + } + return paths, nil +} + +func v2SessionMetadataPathFromChange(change *object.Change) (string, bool, error) { + action, err := change.Action() + if err != nil { + return "", false, fmt.Errorf("read change action: %w", err) + } + if action != merkletrie.Insert && action != merkletrie.Modify { + return "", false, nil + } + if !isV2SessionMetadataPath(change.To.Name) { + return "", false, nil + } + return change.To.Name, true, nil +} + +func isV2SessionMetadataPath(path string) bool { + shard, rest, ok := strings.Cut(path, "/") + if !ok || len(shard) != 2 { + return false + } + suffix, rest, ok := strings.Cut(rest, "/") + if !ok || len(suffix) != 10 { + return false + } + if _, err := checkpointID.NewCheckpointID(shard + suffix); err != nil { + return false + } + sessionDir, fileName, ok := strings.Cut(rest, "/") + if !ok || fileName != paths.MetadataFileName { + return false + } + sessionIndex, err := strconv.Atoi(sessionDir) + if err != nil { + return false + } + return sessionIndex >= 0 +} + +func v2SessionMetadataPath(cpID checkpointID.CheckpointID, sessionIndex int) string { + return cpID.Path() + "/" + strconv.Itoa(sessionIndex) + "/" + paths.MetadataFileName +} diff --git a/cmd/migrate-v2-checkpoints/v2_author_test.go b/cmd/migrate-v2-checkpoints/v2_author_test.go new file mode 100644 index 0000000000..70cb5a5730 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_author_test.go @@ -0,0 +1,273 @@ +package main + +import ( + "context" + "testing" + "time" + + 
"github.com/entireio/cli/cmd/entire/cli/checkpoint" + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/stretchr/testify/require" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/filemode" + "github.com/go-git/go-git/v6/plumbing/object" +) + +func TestFindV2SessionAuthorSkipsV2MainMergeCommits(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("aaaaaaaaaaaa") + checkpointAuthor := object.Signature{ + Name: "Checkpoint Author", + Email: "checkpoint@example.com", + When: time.Date(2024, 5, 11, 17, 19, 31, 0, time.UTC), + } + baseAuthor := object.Signature{ + Name: "Base Author", + Email: "base@example.com", + When: checkpointAuthor.When.Add(-48 * time.Hour), + } + baseHash := writeTestEmptyV2MainCommit(t, repo, nil, baseAuthor, "base v2/main") + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: time.Date(2024, 5, 20, 16, 0, 6, 0, time.UTC), + } + writeTestV2Checkpoint(t, repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-with-merge-history", + AuthorName: checkpointAuthor.Name, + AuthorEmail: checkpointAuthor.Email, + AuthorWhen: checkpointAuthor.When, + }) + ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.NoError(t, err) + writeTestV2MergeCommitWithCheckpointParent(t, repo, baseHash, ref.Hash(), mergeAuthor) + + author, err := findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.NoError(t, err) + require.Equal(t, checkpointAuthor.Name, author.Name) + require.Equal(t, checkpointAuthor.Email, author.Email) + require.True(t, author.When.Equal(checkpointAuthor.When), "author time = %s, want %s", author.When, checkpointAuthor.When) +} + +func 
TestFindV2SessionAuthorSkipsLaterCheckpointCommitsThatOnlyCarryPath(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("0b0206eed178") + metadataPath := v2SessionMetadataPath(cpID, 0) + metadataBlob, err := checkpoint.CreateBlobFromContent(repo, []byte(`{"session_id":"original"}`+"\n")) + require.NoError(t, err) + metadataEntries := map[string]object.TreeEntry{ + metadataPath: { + Name: metadataPath, + Mode: filemode.Regular, + Hash: metadataBlob, + }, + } + metadataTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, metadataEntries) + require.NoError(t, err) + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{}) + require.NoError(t, err) + + checkpointAuthor := object.Signature{ + Name: "Checkpoint Author", + Email: "checkpoint@example.com", + When: time.Date(2024, 5, 11, 17, 19, 31, 0, time.UTC), + } + baseAuthor := object.Signature{ + Name: "Base Author", + Email: "base@example.com", + When: checkpointAuthor.When.Add(-24 * time.Hour), + } + baseHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: baseAuthor, + Committer: baseAuthor, + Message: "base v2/main", + }) + checkpointHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: metadataTree, + ParentHashes: []plumbing.Hash{baseHash}, + Author: checkpointAuthor, + Committer: checkpointAuthor, + Message: "Checkpoint: 0b0206eed178", + }) + + sideAuthor := object.Signature{ + Name: "Side Author", + Email: "side@example.com", + When: checkpointAuthor.When.Add(47 * time.Hour), + } + sideHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + ParentHashes: []plumbing.Hash{baseHash}, + Author: sideAuthor, + Committer: sideAuthor, + Message: "side commit without metadata", + }) + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: checkpointAuthor.When.Add(24 * time.Hour), + } + 
mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: metadataTree, + ParentHashes: []plumbing.Hash{checkpointHash, sideHash}, + Author: mergeAuthor, + Committer: mergeAuthor, + Message: "Merge remote v2/main", + }) + laterAuthor := object.Signature{ + Name: "Later Checkpoint Author", + Email: "later@example.com", + When: checkpointAuthor.When.Add(48 * time.Hour), + } + laterBlob, err := checkpoint.CreateBlobFromContent(repo, []byte(`{"session_id":"later"}`+"\n")) + require.NoError(t, err) + laterEntries := map[string]object.TreeEntry{ + metadataPath: metadataEntries[metadataPath], + "68/0da8552908/0/metadata.json": { + Name: "68/0da8552908/0/metadata.json", + Mode: filemode.Regular, + Hash: laterBlob, + }, + } + laterTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, laterEntries) + require.NoError(t, err) + laterHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: laterTree, + ParentHashes: []plumbing.Hash{mergeHash}, + Author: laterAuthor, + Committer: laterAuthor, + Message: "Checkpoint: 680da8552908", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), laterHash))) + + author, err := findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.NoError(t, err) + require.Equal(t, checkpointAuthor.Name, author.Name) + require.Equal(t, checkpointAuthor.Email, author.Email) + require.True(t, author.When.Equal(checkpointAuthor.When), "author time = %s, want %s", author.When, checkpointAuthor.When) +} + +func TestFindV2SessionAuthorReturnsNotFoundWhenOnlyMergeTouchedPath(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("bbbbbbbbbbbb") + metadataPath := v2SessionMetadataPath(cpID, 0) + metadataBlob, err := checkpoint.CreateBlobFromContent(repo, []byte("{}\n")) + require.NoError(t, err) + treeWithMetadata, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, 
map[string]object.TreeEntry{ + metadataPath: { + Name: metadataPath, + Mode: filemode.Regular, + Hash: metadataBlob, + }, + }) + require.NoError(t, err) + + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{}) + require.NoError(t, err) + parentAuthor := object.Signature{ + Name: "Parent Author", + Email: "parent@example.com", + When: time.Date(2024, 5, 10, 10, 0, 0, 0, time.UTC), + } + firstParent := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: parentAuthor, + Committer: parentAuthor, + Message: "parent one", + }) + secondParent := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: parentAuthor, + Committer: parentAuthor, + Message: "parent two", + }) + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: time.Date(2024, 5, 20, 16, 0, 6, 0, time.UTC), + } + mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: treeWithMetadata, + ParentHashes: []plumbing.Hash{firstParent, secondParent}, + Author: mergeAuthor, + Committer: mergeAuthor, + Message: "Merge remote v2/main", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), mergeHash))) + + _, err = findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.ErrorContains(t, err, metadataPath+" not found in "+paths.V2MainRefName+" history") +} + +func setupV2AuthorRepo(t *testing.T) *git.Repository { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + return repo +} + +func writeTestEmptyV2MainCommit(t *testing.T, repo *git.Repository, parentHashes []plumbing.Hash, author object.Signature, message string) plumbing.Hash { + t.Helper() + + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{}) + require.NoError(t, err) + hash := 
writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + ParentHashes: parentHashes, + Author: author, + Committer: author, + Message: message, + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), hash))) + return hash +} + +func writeTestV2MergeCommitWithCheckpointParent(t *testing.T, repo *git.Repository, baseHash, checkpointParent plumbing.Hash, author object.Signature) plumbing.Hash { + t.Helper() + + checkpointCommit, err := repo.CommitObject(checkpointParent) + require.NoError(t, err) + mainParentAuthor := object.Signature{ + Name: "Main Parent", + Email: "main-parent@example.com", + When: author.When.Add(-24 * time.Hour), + } + mainParent := writeTestEmptyV2MainCommit(t, repo, []plumbing.Hash{baseHash}, mainParentAuthor, "main-side parent") + mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: checkpointCommit.TreeHash, + ParentHashes: []plumbing.Hash{mainParent, checkpointParent}, + Author: author, + Committer: author, + Message: "Merge remote v2/main", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), mergeHash))) + return mergeHash +} + +func writeTestCommitObject(t *testing.T, repo *git.Repository, commit *object.Commit) plumbing.Hash { + t.Helper() + + encoded := repo.Storer.NewEncodedObject() + require.NoError(t, commit.Encode(encoded)) + hash, err := repo.Storer.SetEncodedObject(encoded) + require.NoError(t, err) + return hash +} diff --git a/cmd/migrate-v2-checkpoints/v2_fixture_test.go b/cmd/migrate-v2-checkpoints/v2_fixture_test.go new file mode 100644 index 0000000000..a23de0bef8 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_fixture_test.go @@ -0,0 +1,313 @@ +package main + +import ( + "context" + "crypto/sha256" + "encoding/json" + "errors" + "fmt" + "io" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/agent" + 
// testV2CheckpointOptions describes one v2 checkpoint session to be written
// into a test repository by writeTestV2Checkpoint.
//
// writeTestV2Checkpoint fills in CreatedAt, Strategy, Branch, AuthorName and
// AuthorEmail when they are left at their zero values, and defaults
// AuthorWhen to CreatedAt.
type testV2CheckpointOptions struct {
	CheckpointID id.CheckpointID
	SessionID    string
	CreatedAt    time.Time
	Strategy     string
	Branch       string
	// Transcript is the raw transcript; writeTestV2Checkpoint writes it to
	// the v2 full ref only when it is non-empty.
	Transcript redact.RedactedBytes
	// CompactTranscript, when non-empty, is stored next to the session
	// metadata together with a sha256 content-hash file.
	CompactTranscript []byte
	Prompts           []string
	FilesTouched      []string
	CheckpointsCount  int
	// AuthorName, AuthorEmail and AuthorWhen form the signature of the
	// fixture commit written to v2 main.
	AuthorName                string
	AuthorEmail               string
	AuthorWhen                time.Time
	Agent                     types.AgentType
	Model                     string
	TurnID                    string
	IsTask                    bool
	ToolUseID                 string
	CheckpointTranscriptStart int
	// CompactTranscriptStart, when greater than zero, is written to the
	// metadata in place of CheckpointTranscriptStart.
	CompactTranscriptStart int
	TokenUsage             *agent.TokenUsage
	SessionMetrics         *checkpoint.SessionMetrics
	Summary                *checkpoint.Summary
	InitialAttribution     *checkpoint.InitialAttribution
	PromptAttributions     json.RawMessage
	CombinedAttribution    *checkpoint.InitialAttribution
	Kind                   string
	ReviewSkills           []string
	ReviewPrompt           string
	HasReview              bool
	InvestigateRunID       string
	InvestigateTopic       string
	HasInvestigation       bool
}
+ } + + sessionIndex := writeTestV2MainCheckpoint(t, repo, opts) + if opts.Transcript.Len() > 0 { + writeTestV2FullTranscript(t, repo, opts.CheckpointID, sessionIndex, opts.Transcript.Bytes()) + } +} + +func writeTestV2MainCheckpoint(t *testing.T, repo *git.Repository, opts testV2CheckpointOptions) int { + t.Helper() + + refName := plumbing.ReferenceName(paths.V2MainRefName) + parentHash, entries := readTestV2RefEntries(t, repo, refName) + basePath := opts.CheckpointID.Path() + "/" + + summary := checkpoint.CheckpointSummary{ + CLIVersion: versioninfo.Version, + CheckpointID: opts.CheckpointID, + Strategy: opts.Strategy, + Branch: opts.Branch, + CheckpointsCount: opts.CheckpointsCount, + FilesTouched: opts.FilesTouched, + TokenUsage: opts.TokenUsage, + CombinedAttribution: opts.CombinedAttribution, + HasReview: opts.HasReview, + HasInvestigation: opts.HasInvestigation, + } + if entry, ok := entries[basePath+paths.MetadataFileName]; ok { + existing := readTestJSONFromBlob[checkpoint.CheckpointSummary](t, repo, entry.Hash) + summary = *existing + if opts.HasReview { + summary.HasReview = true + } + } + + sessionIndex := len(summary.Sessions) + sessionPath := fmt.Sprintf("%s%d/", basePath, sessionIndex) + filePaths := checkpoint.SessionFilePaths{ + Metadata: "/" + sessionPath + paths.MetadataFileName, + } + + if len(opts.Prompts) > 0 { + promptBlob, err := checkpoint.CreateBlobFromContent(repo, []byte(checkpoint.JoinPrompts(opts.Prompts))) + require.NoError(t, err) + entries[sessionPath+paths.PromptFileName] = object.TreeEntry{ + Name: sessionPath + paths.PromptFileName, + Mode: filemode.Regular, + Hash: promptBlob, + } + filePaths.Prompt = "/" + sessionPath + paths.PromptFileName + } + + if len(opts.CompactTranscript) > 0 { + compactBlob, err := checkpoint.CreateBlobFromContent(repo, opts.CompactTranscript) + require.NoError(t, err) + entries[sessionPath+paths.CompactTranscriptFileName] = object.TreeEntry{ + Name: sessionPath + paths.CompactTranscriptFileName, + 
Mode: filemode.Regular, + Hash: compactBlob, + } + filePaths.Transcript = "/" + sessionPath + paths.CompactTranscriptFileName + + compactHash := []byte(fmt.Sprintf("sha256:%x", sha256.Sum256(opts.CompactTranscript))) + compactHashBlob, err := checkpoint.CreateBlobFromContent(repo, compactHash) + require.NoError(t, err) + entries[sessionPath+paths.CompactTranscriptHashFileName] = object.TreeEntry{ + Name: sessionPath + paths.CompactTranscriptHashFileName, + Mode: filemode.Regular, + Hash: compactHashBlob, + } + filePaths.ContentHash = "/" + sessionPath + paths.CompactTranscriptHashFileName + } + + transcriptStart := opts.CheckpointTranscriptStart + if opts.CompactTranscriptStart > 0 { + transcriptStart = opts.CompactTranscriptStart + } + metadata := checkpoint.CommittedMetadata{ + CLIVersion: versioninfo.Version, + CheckpointID: opts.CheckpointID, + SessionID: opts.SessionID, + Strategy: opts.Strategy, + CreatedAt: opts.CreatedAt, + Branch: opts.Branch, + CheckpointsCount: opts.CheckpointsCount, + FilesTouched: opts.FilesTouched, + Agent: opts.Agent, + Model: opts.Model, + TurnID: opts.TurnID, + IsTask: opts.IsTask, + ToolUseID: opts.ToolUseID, + CheckpointTranscriptStart: transcriptStart, + TranscriptLinesAtStart: transcriptStart, + TokenUsage: opts.TokenUsage, + SessionMetrics: opts.SessionMetrics, + Summary: opts.Summary, + InitialAttribution: opts.InitialAttribution, + PromptAttributions: opts.PromptAttributions, + Kind: opts.Kind, + ReviewSkills: opts.ReviewSkills, + ReviewPrompt: opts.ReviewPrompt, + InvestigateRunID: opts.InvestigateRunID, + InvestigateTopic: opts.InvestigateTopic, + } + metadataJSON, err := jsonutil.MarshalIndentWithNewline(metadata, "", " ") + require.NoError(t, err) + metadataBlob, err := checkpoint.CreateBlobFromContent(repo, metadataJSON) + require.NoError(t, err) + entries[sessionPath+paths.MetadataFileName] = object.TreeEntry{ + Name: sessionPath + paths.MetadataFileName, + Mode: filemode.Regular, + Hash: metadataBlob, + } + + 
summary.Sessions = append(summary.Sessions, filePaths) + summaryJSON, err := jsonutil.MarshalIndentWithNewline(summary, "", " ") + require.NoError(t, err) + summaryBlob, err := checkpoint.CreateBlobFromContent(repo, summaryJSON) + require.NoError(t, err) + entries[basePath+paths.MetadataFileName] = object.TreeEntry{ + Name: basePath + paths.MetadataFileName, + Mode: filemode.Regular, + Hash: summaryBlob, + } + + writeTestV2RefEntriesWithAuthor(t, repo, refName, parentHash, entries, "test v2 main fixture", object.Signature{ + Name: opts.AuthorName, + Email: opts.AuthorEmail, + When: opts.AuthorWhen, + }) + return sessionIndex +} + +func writeTestV2FullTranscript(t *testing.T, repo *git.Repository, cpID id.CheckpointID, sessionIndex int, transcript []byte) { + t.Helper() + + sessionPath := fmt.Sprintf("%s/%d/", cpID.Path(), sessionIndex) + contentHash := []byte(fmt.Sprintf("sha256:%x", sha256.Sum256(transcript))) + writeTestV2FullSessionFiles(t, repo, map[string][]byte{ + sessionPath + paths.V2RawTranscriptFileName: transcript, + sessionPath + paths.V2RawTranscriptHashFileName: contentHash, + }) +} + +func writeTestV2FullSessionFiles(t *testing.T, repo *git.Repository, files map[string][]byte) { + t.Helper() + + refName := plumbing.ReferenceName(paths.V2FullCurrentRefName) + parentHash, entries := readTestV2RefEntries(t, repo, refName) + for path, content := range files { + blobHash, err := checkpoint.CreateBlobFromContent(repo, content) + require.NoError(t, err) + entries[path] = object.TreeEntry{Name: path, Mode: filemode.Regular, Hash: blobHash} + } + writeTestV2RefEntries(t, repo, refName, parentHash, entries, "test v2 full fixture") +} + +func readTestV2RefEntries(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) (plumbing.Hash, map[string]object.TreeEntry) { + t.Helper() + + entries := make(map[string]object.TreeEntry) + ref, err := repo.Reference(refName, true) + if errors.Is(err, plumbing.ErrReferenceNotFound) { + return plumbing.ZeroHash, 
entries + } + require.NoError(t, err) + commit, err := repo.CommitObject(ref.Hash()) + require.NoError(t, err) + tree, err := commit.Tree() + require.NoError(t, err) + + files := tree.Files() + err = files.ForEach(func(file *object.File) error { + entries[file.Name] = object.TreeEntry{Name: file.Name, Mode: file.Mode, Hash: file.Hash} + return nil + }) + require.NoError(t, err) + return ref.Hash(), entries +} + +func writeTestV2RefEntries(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName, parentHash plumbing.Hash, entries map[string]object.TreeEntry, message string) { + t.Helper() + + authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) + writeTestV2RefEntriesWithAuthor(t, repo, refName, parentHash, entries, message, object.Signature{ + Name: authorName, + Email: authorEmail, + When: time.Now(), + }) +} + +func writeTestV2RefEntriesWithAuthor(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName, parentHash plumbing.Hash, entries map[string]object.TreeEntry, message string, author object.Signature) { + t.Helper() + + treeHash, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, entries) + require.NoError(t, err) + commit := &object.Commit{ + TreeHash: treeHash, + Author: author, + Committer: author, + Message: message, + } + if parentHash != plumbing.ZeroHash { + commit.ParentHashes = []plumbing.Hash{parentHash} + } + encoded := repo.Storer.NewEncodedObject() + require.NoError(t, commit.Encode(encoded)) + commitHash, err := repo.Storer.SetEncodedObject(encoded) + require.NoError(t, err) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(refName, commitHash))) +} + +func readTestJSONFromBlob[T any](t *testing.T, repo *git.Repository, hash plumbing.Hash) *T { + t.Helper() + + blob, err := repo.BlobObject(hash) + require.NoError(t, err) + reader, err := blob.Reader() + require.NoError(t, err) + defer func() { + require.NoError(t, reader.Close()) + }() + content, err := io.ReadAll(reader) + 
require.NoError(t, err) + + var result T + require.NoError(t, json.Unmarshal(content, &result)) + return &result +}
diff --git a/cmd/migrate-v2-checkpoints/v2_preflight.go b/cmd/migrate-v2-checkpoints/v2_preflight.go
new file mode 100644
index 0000000000..c300da3054
--- /dev/null
+++ b/cmd/migrate-v2-checkpoints/v2_preflight.go
@@ -0,0 +1,207 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/entireio/cli/cmd/entire/cli/checkpoint/remote"
+	"github.com/entireio/cli/cmd/entire/cli/paths"
+	"github.com/entireio/cli/cmd/entire/cli/strategy"
+
+	"github.com/go-git/go-git/v6"
+	"github.com/go-git/go-git/v6/plumbing"
+)
+
+const (
+	// v2RefFetchTimeout is the single deadline shared by the remote ref
+	// listing and both fetches in ensureLatestV2Refs.
+	v2RefFetchTimeout = 2 * time.Minute
+	// v2MainFetchTmpRef is the scratch ref v2 /main is fetched into before the
+	// local ref is advanced.
+	v2MainFetchTmpRef = strategy.FetchTmpRefPrefix + "migrate-v2-main"
+)
+
+// ensureLatestV2Refs refreshes the local v2 checkpoint refs from the resolved
+// fetch target.
+//
+// When no fetch target can be resolved, an already-present local v2 /main ref
+// is accepted as-is and nil is returned; otherwise the resolution error is
+// returned. With a target, v2 /main must be advertised by the remote: it is
+// fetched and the local ref advanced, then every v2 /full ref the remote
+// advertises is fetched. All remote calls share one v2RefFetchTimeout deadline.
+func ensureLatestV2Refs(ctx context.Context, repoRoot string, repo *git.Repository) error {
+	fetchTarget, err := remote.FetchURL(ctx, remote.FetchURLOptions{WorktreeRoot: repoRoot})
+	if err != nil {
+		// No usable remote: fall back to whatever v2 /main is already local.
+		if localV2MainRefExists(repo) {
+			return nil
+		}
+		return fmt.Errorf("resolve v2 checkpoint fetch target: %w", err)
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, v2RefFetchTimeout)
+	defer cancel()
+
+	remoteRefs, err := listRemoteV2Refs(ctx, repoRoot, fetchTarget)
+	if err != nil {
+		return err
+	}
+	if _, ok := remoteRefs[paths.V2MainRefName]; !ok {
+		return fmt.Errorf("%s not found on remote %s", paths.V2MainRefName, remote.RedactURL(fetchTarget))
+	}
+
+	if err := fetchV2MainRef(ctx, repoRoot, repo, fetchTarget); err != nil {
+		return err
+	}
+	return fetchV2FullRefs(ctx, repoRoot, fetchTarget, remoteRefs)
+}
+
+// localV2MainRefExists reports whether the v2 /main ref resolves in repo. Any
+// lookup error is treated as "does not exist".
+func localV2MainRefExists(repo *git.Repository) bool {
+	_, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true)
+	return err == nil
+}
+
+// listRemoteV2Refs lists the refs under refs/entire/checkpoints/v2/ on
+// fetchTarget and returns their names as a set. Output lines with fewer than
+// two whitespace-separated fields are skipped; the second field is taken as
+// the ref name.
+func listRemoteV2Refs(ctx context.Context, repoRoot, fetchTarget string) (map[string]struct{}, error) {
+	output, err := remote.LsRemoteInDir(ctx, repoRoot, fetchTarget, "refs/entire/checkpoints/v2/*")
+	if err != nil {
+		return nil, fmt.Errorf("list remote v2 checkpoint refs from %s: %w", remote.RedactURL(fetchTarget), err)
+	}
+
+	refs := make(map[string]struct{})
+	for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") {
+		fields := strings.Fields(line)
+		if len(fields) < 2 {
+			continue
+		}
+		refs[fields[1]] = struct{}{}
+	}
+	return refs, nil
+}
+
+// fetchV2MainRef force-fetches v2 /main from fetchTarget into a temporary ref
+// and advances the local v2 /main ref to it via strategy.SafelyAdvanceLocalRef.
+// Once the fetch has succeeded the temporary ref is removed on return.
+func fetchV2MainRef(ctx context.Context, repoRoot string, repo *git.Repository, fetchTarget string) error {
+	refSpec := fmt.Sprintf("+%s:%s", paths.V2MainRefName, v2MainFetchTmpRef)
+	output, err := remote.Fetch(ctx, remote.FetchOptions{
+		Remote:   fetchTarget,
+		RefSpecs: []string{refSpec},
+		NoTags:   true,
+		NoFilter: true,
+		Dir:      repoRoot,
+	})
+	if err != nil {
+		return fetchV2RefsError("fetch v2 /main", fetchTarget, output, err)
+	}
+
+	tmpRefName := plumbing.ReferenceName(v2MainFetchTmpRef)
+	defer func() { _ = repo.Storer.RemoveReference(tmpRefName) }() //nolint:errcheck // cleanup is best-effort
+
+	tmpRef, err := repo.Reference(tmpRefName, true)
+	if err != nil {
+		return fmt.Errorf("v2 /main not found after fetch (tmp ref %s missing): %w", tmpRefName, err)
+	}
+	if err := strategy.SafelyAdvanceLocalRef(ctx, repo, plumbing.ReferenceName(paths.V2MainRefName), tmpRef.Hash()); err != nil {
+		return fmt.Errorf("advance local %s: %w", paths.V2MainRefName, err)
+	}
+	return nil
+}
+
+// fetchV2FullRefs fetches every v2 /full ref present in remoteRefs from
+// fetchTarget. It is a no-op when the remote advertises none.
+func fetchV2FullRefs(ctx context.Context, repoRoot, fetchTarget string, remoteRefs map[string]struct{}) error {
+	refSpecs := v2FullRefSpecs(remoteRefs)
+	if len(refSpecs) == 0 {
+		return nil
+	}
+
+	output, err := remote.Fetch(ctx, remote.FetchOptions{
+		Remote:   fetchTarget,
+		RefSpecs: refSpecs,
+		NoTags:   true,
+		NoFilter: true,
+		Dir:      repoRoot,
+	})
+	if err != nil {
+		return fetchV2RefsError("fetch v2 /full refs", fetchTarget, output, err)
+	}
+	return nil
+}
+
+// v2FullRefSpecs returns one "ref:ref" refspec per v2 /full ref in remoteRefs,
+// sorted so the result does not depend on map iteration order. Only the
+// /full/current refspec is prefixed with "+" to force the update.
+func v2FullRefSpecs(remoteRefs map[string]struct{}) []string {
+	refSpecs := make([]string, 0, len(remoteRefs))
+	for refName := range remoteRefs {
+		if !isV2FullRefName(refName) {
+			continue
+		}
+		refSpec := refName + ":" + refName
+		if refName == paths.V2FullCurrentRefName {
+			refSpec = "+" + refSpec
+		}
+		refSpecs = append(refSpecs, refSpec)
+	}
+	sort.Strings(refSpecs)
+	return refSpecs
+}
+
+// isV2FullRefName reports whether refName is a v2 /full ref: the shared /full
+// prefix followed by either "current" or exactly 13 ASCII digits.
+func isV2FullRefName(refName string) bool {
+	const numberedSuffixLen = 13
+
+	prefix := strings.TrimSuffix(paths.V2FullCurrentRefName, "current")
+	suffix, ok := strings.CutPrefix(refName, prefix)
+	if !ok {
+		return false
+	}
+	if suffix == "current" {
+		return true
+	}
+	if len(suffix) != numberedSuffixLen {
+		return false
+	}
+	for _, r := range suffix {
+		if r < '0' || r > '9' {
+			return false
+		}
+	}
+	return true
+}
+
+// fetchV2RefsError builds the error returned for a failed v2 ref fetch.
+//
+// A deadline expiry is reported as a timeout of v2RefFetchTimeout; any other
+// failure includes the command output when it is non-empty, with occurrences
+// of fetchTarget replaced by its redacted form. err is wrapped in every case
+// so callers can still match it with errors.Is.
+func fetchV2RefsError(action, fetchTarget string, output []byte, err error) error {
+	if errors.Is(err, context.DeadlineExceeded) {
+		return fmt.Errorf("%s timed out after %s: %w", action, v2RefFetchTimeout, err)
+	}
+
+	redactedTarget := remote.RedactURL(fetchTarget)
+	msg := string(output)
+	// strings.ReplaceAll with an empty old string inserts the replacement
+	// between every character, so only redact when there is a target to find.
+	if fetchTarget != "" {
+		msg = strings.ReplaceAll(msg, fetchTarget, redactedTarget)
+	}
+	msg = strings.TrimSpace(msg)
+	if msg != "" {
+		return fmt.Errorf("%s from %s failed: %s: %w", action, redactedTarget, msg, err)
+	}
+	return fmt.Errorf("%s from %s failed: %w", action, redactedTarget, err)
+}
diff --git a/mise-tasks/build b/mise-tasks/build
index 4f48f47c3d..2ad5a3295f 100755
--- a/mise-tasks/build
+++ b/mise-tasks/build
@@ -1,4 +1,19 @@
 #!/bin/sh
 #MISE description="Build the CLI"
 
-go build ./cmd/entire
+# Stop at the first failing command so a failed build of one binary is not
+# masked by a successful build of the next.
+set -e
+
+TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+COMMIT=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
+VERSION="${TAG}-dev-${COMMIT}"
+LDFLAGS="-X github.com/entireio/cli/cmd/entire/cli/versioninfo.Version=${VERSION} -X github.com/entireio/cli/cmd/entire/cli/versioninfo.Commit=${COMMIT}"
+
+# go build -o uses the given name verbatim, so Windows-like hosts need an
+# explicit .exe suffix on every binary.
+EXE_SUFFIX=""
+case "$(uname -s)" in MINGW*|MSYS*|CYGWIN*|Windows_NT) EXE_SUFFIX=".exe" ;; esac
+
+go build -ldflags "$LDFLAGS" -o "entire${EXE_SUFFIX}" ./cmd/entire
+go build -ldflags "$LDFLAGS" -o "migrate-v2-checkpoints${EXE_SUFFIX}" ./cmd/migrate-v2-checkpoints/