[autorevert] Breaks down dry_run logic for revert and restart (#7179)

jeanschmidt · izaitsevfb · web-flow · commit d8451aee70db · 2025-09-17T16:29:04.000-07:00
*PLEASE NOTE*: this PR is intended to sit on top of #7179; consider merging that one before reviewing this one, to make reviewing easier. Breaking down the dry_run logic for revert and restart is required so that we can continue to work safely towards improving the autorevert. Not adding the logic is not the best option, as I hope we'll be able to run it locally a few times and iterate on that before publishing, and relying on code commenting and other not-so-great approaches is not ideal. --------- Co-authored-by: Ivan Zaitsev <ivanzaitsev@fb.com>
diff --git a/aws/lambda/pytorch-auto-revert/Makefile b/aws/lambda/pytorch-auto-revert/Makefile
@@ -21,13 +21,9 @@ venv/bin/lintrunner: venv/bin/python
 run-local: venv/bin/python
 	venv/bin/python -m pytorch_auto_revert
 
-.PHONY: run-local-dry
-run-local-dry: venv/bin/python
-	venv/bin/python -m pytorch_auto_revert --dry-run
-
 .PHONY: run-local-workflows
 run-local-workflows: venv/bin/python
-	venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 4380 --ignore-common-errors --verbose
+	venv/bin/python -m pytorch_auto_revert autorevert-checker Lint trunk pull inductor linux-binary-manywheel --hours 8 --revert-action log
 
 deployment.zip:
 	mkdir -p deployment
diff --git a/aws/lambda/pytorch-auto-revert/SIGNAL_ACTIONS.md b/aws/lambda/pytorch-auto-revert/SIGNAL_ACTIONS.md
@@ -5,7 +5,7 @@ This document specifies the Actions layer that consumes extracted Signals with t
 ## Overview
 
 - Inputs (provided by integration code):
-  - Run parameters: `repo_full_name`, `workflows`, `lookback_hours`, `dry_run`.
+  - Run parameters: `repo_full_name`, `workflows`, `lookback_hours`, `restart_action`, `revert_action`.
   - A list of pairs: `List[Tuple[Signal, SignalProcOutcome]]`, where `SignalProcOutcome = Union[AutorevertPattern, RestartCommits, Ineligible]`.
 - Decisions: per-signal outcome mapped to a concrete action:
   - `AutorevertPattern` → record a global revert intent for the suspected commit
@@ -22,7 +22,7 @@ Immutable run-scoped metadata shared by all actions in the same run:
 - `repo_full_name`: e.g., `pytorch/pytorch`
 - `workflows`: list of workflow display names
 - `lookback_hours`: window used for extraction
-- `dry_run`: bool
+- `restart_action`, `revert_action`: independent enums controlling behavior
 
 ## Action Semantics
 
@@ -40,9 +40,7 @@ Immutable run-scoped metadata shared by all actions in the same run:
   - Not logged in `autorevert_events_v2` (only actions taken are logged)
 
 - Multiple signals targeting same workflow/commit are coalesced in-memory, then deduped again via ClickHouse checks.
-- Dry-run behavior:
-  - Simulate restarts (no dispatch), log actions with `dry_run=1`
-  - Dry-run rows do not count toward caps/pacing or revert dedup criteria
+- Event `dry_run` is per-action and derived from the mode’s side effects: `dry_run = 1` means “no side effects”.
 
 ## ClickHouse Logging
 
@@ -59,6 +57,7 @@ Two tables, sharing the same `ts` per CLI/lambda run.
   - `source_signal_keys` Array(String) — signal keys that contributed to this action
   - `failed` UInt8 DEFAULT 0 — marks a failed attempt (e.g., restart dispatch failed)
   - `notes` String DEFAULT '' — optional free-form metadata
+  - `dry_run` UInt8 — per-event; 1 means “no side effects” for this logged action
 
 ### `autorevert_state` (separate module)
 
@@ -67,6 +66,9 @@ Two tables, sharing the same `ts` per CLI/lambda run.
   - `ts` DateTime — run timestamp (matches `autorevert_events_v2.ts`)
   - `state` String — JSON-encoded model of the HUD grid and outcomes
   - `params` String DEFAULT '' — optional, free-form
+  - `dry_run` UInt8 — run-level convenience flag; 1 when the run performed no side effects
+
+State JSON `meta` contains: `repo`, `workflows`, `lookback_hours`, `ts`, `restart_action`, `revert_action`.
 
 ## Processing Flow
 
@@ -78,8 +80,8 @@ Two tables, sharing the same `ts` per CLI/lambda run.
 4. For each group, consult `autorevert_events_v2` (non-dry-run rows) to enforce dedup rules:
    - Reverts: skip if any prior recorded `revert` exists for `commit_sha`
    - Restarts: skip if ≥2 prior restarts exist for `(workflow_target, commit_sha)`; skip if the latest is within 15 minutes of `ts`
-5. Execute eligible actions:
-   - Restart: if not `dry_run`, dispatch and capture success/failure in `notes`
-   - Revert: record only
-6. Insert one `autorevert_events_v2` row per executed group with aggregated `workflows` and `source_signal_keys` (dry-run rows use `dry_run=1`).
+5. Execute eligible actions using the per-action mode:
+   - Restart: `run` → dispatch and log; `log` → log only; `skip` → no logging
+   - Revert: currently record intent only; `run-notify`/`run-revert` modes are allowed but still log intent (no GH revert yet)
+6. Insert one `autorevert_events_v2` row per executed group with aggregated `workflows` and `source_signal_keys`.
 7. Separately (integration), build the full run state and call the run‑state logger to write a single `autorevert_state` row with the same `ts`.
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py
@@ -12,6 +12,10 @@
 from .testers.autorevert_v2 import autorevert_v2
 from .testers.hud import run_hud
 from .testers.restart_checker import workflow_restart_checker
+from .utils import RestartAction, RevertAction
+
+
+DEFAULT_WORKFLOWS = ["Lint", "trunk", "pull", "inductor"]
 
 
 def setup_logging(log_level: str) -> None:
@@ -71,14 +75,15 @@ def get_opts() -> argparse.Namespace:
     # no subcommand runs the lambda flow
     subparsers = parser.add_subparsers(dest="subcommand")
 
-    # autorevert-checker subcommand (new default; legacy behind a flag)
+    # autorevert-checker subcommand
     workflow_parser = subparsers.add_parser(
         "autorevert-checker",
         help="Analyze workflows for autorevert using Signals (default), or legacy via flag",
     )
     workflow_parser.add_argument(
         "workflows",
         nargs="+",
+        default=DEFAULT_WORKFLOWS,
         help="Workflow name(s) to analyze - single name or comma/space separated"
         + ' list (e.g., "pull" or "pull,trunk,inductor")',
     )
@@ -91,25 +96,22 @@ def get_opts() -> argparse.Namespace:
         help="Full repo name to filter by (owner/repo).",
     )
     workflow_parser.add_argument(
-        "--verbose",
-        "-v",
-        action="store_true",
-        help="Show detailed output including commit summaries",
-    )
-    workflow_parser.add_argument(
-        "--do-restart",
-        action="store_true",
-        help="Actually restart workflows for detected autorevert patterns",
-    )
-    workflow_parser.add_argument(
-        "--do-revert",
-        action="store_true",
-        help="When restarts complete and secondary pattern matches, log REVERT",
+        "--restart-action",
+        type=RestartAction,
+        default=RestartAction.RUN,
+        choices=list(RestartAction),
+        help=(
+            "Restart mode: skip (no logging), log (no side effects), or run (dispatch)."
+        ),
     )
     workflow_parser.add_argument(
-        "--ignore-common-errors",
-        action="store_true",
-        help="Ignore common errors in autorevert patterns (e.g., 'No tests found')",
+        "--revert-action",
+        type=RevertAction,
+        default=RevertAction.LOG,
+        choices=list(RevertAction),
+        help=(
+            "Revert mode: skip, log (no side effects), run-notify (side effect), or run-revert (side effect)."
+        ),
     )
 
     # workflow-restart-checker subcommand
@@ -200,19 +202,17 @@ def main(*args, **kwargs) -> None:
             os.environ.get("WORKFLOWS", "Lint,trunk,pull,inductor").split(","),
             hours=int(os.environ.get("HOURS", 16)),
             repo_full_name=os.environ.get("REPO_FULL_NAME", "pytorch/pytorch"),
-            dry_run=opts.dry_run,
-            do_restart=True,
-            do_revert=True,
+            restart_action=(RestartAction.LOG if opts.dry_run else RestartAction.RUN),
+            revert_action=RevertAction.LOG,
         )
     elif opts.subcommand == "autorevert-checker":
         # New default behavior under the same subcommand
         autorevert_v2(
             opts.workflows,
             hours=opts.hours,
             repo_full_name=opts.repo_full_name,
-            dry_run=opts.dry_run,
-            do_restart=opts.do_restart,
-            do_revert=opts.do_revert,
+            restart_action=(RestartAction.LOG if opts.dry_run else opts.restart_action),
+            revert_action=(RevertAction.LOG if opts.dry_run else opts.revert_action),
         )
     elif opts.subcommand == "workflow-restart-checker":
         workflow_restart_checker(opts.workflow, commit=opts.commit, days=opts.days)
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/run_state_logger.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/run_state_logger.py
@@ -121,7 +121,8 @@ def _build_state_json(
                 "workflows": ctx.workflows,
                 "lookback_hours": ctx.lookback_hours,
                 "ts": ctx.ts.isoformat(),
-                "dry_run": ctx.dry_run,
+                "restart_action": str(ctx.restart_action),
+                "revert_action": str(ctx.revert_action),
             },
         }
         return json.dumps(doc, separators=(",", ":"))
@@ -151,7 +152,11 @@ def insert_state(
                 ctx.ts,
                 ctx.repo_full_name,
                 state_json,
-                1 if ctx.dry_run else 0,
+                1
+                if not (
+                    ctx.restart_action.side_effects or ctx.revert_action.side_effects
+                )
+                else 0,
                 ctx.workflows,
                 int(ctx.lookback_hours),
                 params or "",
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_actions.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_actions.py
@@ -8,6 +8,7 @@
 from .clickhouse_client_helper import CHCliFactory, ensure_utc_datetime
 from .signal import AutorevertPattern, Ineligible, RestartCommits, Signal
 from .signal_extraction_types import RunContext
+from .utils import RestartAction, RevertAction
 from .workflow_checker import WorkflowRestartChecker
 
 
@@ -199,10 +200,20 @@ def execute_revert(
         self, *, commit_sha: str, sources: List[SignalMetadata], ctx: RunContext
     ) -> bool:
         """Record a revert intent if not previously logged for the commit."""
+        if ctx.revert_action == RevertAction.SKIP:
+            logging.debug(
+                "[v2][action] revert for sha %s: skipping (ignored)", commit_sha[:8]
+            )
+            return False
+
+        dry_run = not ctx.revert_action.side_effects
+
         if self._logger.prior_revert_exists(
             repo=ctx.repo_full_name, commit_sha=commit_sha
         ):
-            logging.info("[v2][action] revert: skipping existing")
+            logging.info(
+                "[v2][action] revert for sha %s: skipping existing", commit_sha[:8]
+            )
             return False
         self._logger.insert_event(
             repo=ctx.repo_full_name,
@@ -211,12 +222,14 @@ def execute_revert(
             commit_sha=commit_sha,
             workflows=sorted({s.workflow_name for s in sources}),
             source_signal_keys=[s.key for s in sources],
-            dry_run=ctx.dry_run,
+            dry_run=dry_run,
             failed=False,
             notes="",
         )
         logging.info(
-            "[v2][action] revert: logged%s", " (dry_run)" if ctx.dry_run else ""
+            "[v2][action] revert for sha %s: logged%s",
+            commit_sha[:8],
+            " (dry_run)" if dry_run else "",
         )
         return True
 
@@ -229,22 +242,36 @@ def execute_restart(
         ctx: RunContext,
     ) -> bool:
         """Dispatch a workflow restart if under cap and outside pacing window; always logs the event."""
+        if ctx.restart_action == RestartAction.SKIP:
+            logging.info(
+                "[v2][action] restart for sha %s: skipping (ignored)", commit_sha[:8]
+            )
+            return False
+
+        dry_run = not ctx.restart_action.side_effects
+
         recent = self._logger.recent_restarts(
             repo=ctx.repo_full_name, workflow=workflow_target, commit_sha=commit_sha
         )
         if len(recent) >= 2:
-            logging.info("[v2][action] restart: skipping cap (recent=%d)", len(recent))
+            logging.info(
+                "[v2][action] restart for sha %s: skipping cap (recent=%d)",
+                commit_sha[:8],
+                len(recent),
+            )
             return False
         if recent and (ctx.ts - recent[0]) < timedelta(minutes=15):
             delta = (ctx.ts - recent[0]).total_seconds()
             logging.info(
-                "[v2][action] restart: skipping pacing (delta_sec=%d)", int(delta)
+                "[v2][action] restart for sha %s: skipping pacing (delta_sec=%d)",
+                commit_sha[:8],
+                int(delta),
             )
             return False
 
         notes = ""
         ok = True
-        if not ctx.dry_run:
+        if not dry_run:
             ok = self._restart.restart_workflow(workflow_target, commit_sha)
             if not ok:
                 notes = "dispatch_failed"
@@ -255,14 +282,20 @@ def execute_restart(
             commit_sha=commit_sha,
             workflows=[workflow_target],
             source_signal_keys=[s.key for s in sources],
-            dry_run=ctx.dry_run,
+            dry_run=dry_run,
             failed=not ok,
             notes=notes,
         )
-        if not ctx.dry_run and notes == "":
-            logging.info("[v2][action] restart: dispatched")
-        elif notes:
-            logging.info("[v2][action] restart: dispatch_failed: %s", notes)
+        if not dry_run and ok:
+            logging.info("[v2][action] restart for sha %s: dispatched", commit_sha[:8])
+        elif not ok:
+            logging.info(
+                "[v2][action] restart for sha %s: dispatch_failed: %s",
+                commit_sha[:8],
+                notes,
+            )
         else:
-            logging.info("[v2][action] restart: logged (dry_run)")
+            logging.info(
+                "[v2][action] restart for sha %s: logged (dry_run)", commit_sha[:8]
+            )
         return True
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction_types.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction_types.py
@@ -6,6 +6,8 @@
 from functools import cached_property
 from typing import List, NewType, Set
 
+from .utils import RestartAction, RevertAction
+
 
 # Default classification rules that indicate test failures.
 DEFAULT_TEST_RULES: Set[str] = {
@@ -29,11 +31,12 @@
 
 @dataclass(frozen=True)
 class RunContext:
-    ts: datetime
+    lookback_hours: int
     repo_full_name: str
+    restart_action: RestartAction
+    revert_action: RevertAction
+    ts: datetime
     workflows: List[str]
-    lookback_hours: int
-    dry_run: bool = False
 
 
 # Represents a job row from the jobs table in ClickHouse
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert_v2.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert_v2.py
@@ -7,16 +7,16 @@
 from ..signal_actions import SignalActionProcessor, SignalProcOutcome
 from ..signal_extraction import SignalExtractor
 from ..signal_extraction_types import RunContext
+from ..utils import RestartAction, RevertAction
 
 
 def autorevert_v2(
     workflows: Iterable[str],
     *,
     hours: int = 24,
     repo_full_name: str = "pytorch/pytorch",
-    dry_run: bool = False,
-    do_restart: bool = True,
-    do_revert: bool = True,
+    restart_action: RestartAction = RestartAction.RUN,
+    revert_action: RevertAction = RevertAction.LOG,
 ) -> Tuple[List[Signal], List[Tuple[Signal, SignalProcOutcome]]]:
     """Run the Signals-based autorevert flow end-to-end.
 
@@ -32,11 +32,12 @@ def autorevert_v2(
     ts = datetime.now(timezone.utc)
 
     logging.info(
-        "[v2] Start: workflows=%s hours=%s repo=%s dry_run=%s",
+        "[v2] Start: workflows=%s hours=%s repo=%s restart_action=%s revert_action=%s",
         ",".join(workflows),
         hours,
         repo_full_name,
-        dry_run,
+        restart_action,
+        revert_action,
     )
     logging.info("[v2] Run timestamp (CH log ts) = %s", ts.isoformat())
 
@@ -57,24 +58,19 @@ def autorevert_v2(
 
     # Build run context
     run_ctx = RunContext(
-        ts=ts,
+        lookback_hours=hours,
         repo_full_name=repo_full_name,
+        restart_action=restart_action,
+        revert_action=revert_action,
+        ts=ts,
         workflows=workflows,
-        lookback_hours=hours,
-        dry_run=dry_run,
     )
 
     # Group and execute actions
     proc = SignalActionProcessor()
     groups = proc.group_actions(pairs)
     logging.info("[v2] Candidate action groups: %d", len(groups))
 
-    # Support toggling specific kinds of actions via flags
-    if not do_revert:
-        groups = [g for g in groups if g.type != "revert"]
-    if not do_restart:
-        groups = [g for g in groups if g.type != "restart"]
-
     executed_count = sum(1 for g in groups if proc.execute(g, run_ctx))
     logging.info("[v2] Executed action groups: %d", executed_count)
 
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_actions.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_actions.py
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/utils.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/utils.py