Skip to content

Commit 9489aad

Browse files
authored
[autorevert] update failure threshold to 3 for autorevert eligibility (#7224)
Changes: - Failure side now requires 3+ failure events; if less than that, it schedules restarts on the first (oldest) failed commit sequentially (allowing pending events to resolve) - Success and failed sides now issue restarts independently (should speed up the whole process a bit)
1 parent 742f25f commit 9489aad

File tree

2 files changed

+64
-19
lines changed

2 files changed

+64
-19
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class IneligibleReason(Enum):
5050
NO_SUCCESSES = "no_successes"
5151
NO_PARTITION = "no_partition" # insufficient commit history to form partitions
5252
INFRA_NOT_CONFIRMED = "infra_not_confirmed" # infra check not confirmed
53+
INSUFFICIENT_FAILURES = "insufficient_failures" # not enough failures to make call
5354
PENDING_GAP = "pending_gap" # unknown/pending commits present
5455

5556

@@ -366,21 +367,29 @@ def process_valid_autorevert_pattern(
366367
restart_commits.add(c.head_sha)
367368

368369
infra_check_result = partition.confirm_not_an_infra_issue()
369-
# note re: event_count < 2:
370+
# note re: event_count < 3:
370371
# this is a confidence heuristic to detect flakiness, can adjust as needed
371372
if (
372373
infra_check_result == InfraCheckResult.RESTART_FAILURE
373-
or partition.failure_events_count() < 2
374+
or partition.failure_events_count() < 3
374375
):
375376
if not partition.failed[-1].has_pending:
376377
# restarting oldest failed
377378
restart_commits.add(partition.failed[-1].head_sha)
378379
else:
379-
return Ineligible(
380-
IneligibleReason.INFRA_NOT_CONFIRMED,
381-
f"waiting on pending events on suspected failure side: {partition.failed[-1].head_sha}",
382-
)
383-
elif (
380+
if infra_check_result == InfraCheckResult.RESTART_FAILURE:
381+
return Ineligible(
382+
IneligibleReason.INFRA_NOT_CONFIRMED,
383+
f"waiting on pending events on suspected failure side: {partition.failed[-1].head_sha}",
384+
)
385+
else:
386+
return Ineligible(
387+
IneligibleReason.INSUFFICIENT_FAILURES,
388+
f"insufficient failures to make call, "
389+
f"pending events on suspected failure side: {partition.failed[-1].head_sha}",
390+
)
391+
392+
if (
384393
infra_check_result == InfraCheckResult.RESTART_SUCCESS
385394
or partition.success_events_count() < 2
386395
):

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal.py

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,11 @@ def test_confirm_not_an_infra_issue_false_no_bracketing_success(self):
171171
)
172172

173173
def test_detect_autorevert_pattern_basic(self):
174-
# Commits newest -> older
174+
# Commits newest -> older. With 3 failures total, we meet the new 3+ threshold
175+
c_newest = SignalCommit(
176+
head_sha="sha_newest",
177+
events=[self._ev("job", SignalStatus.FAILURE, 7)],
178+
)
175179
c_newer = SignalCommit(
176180
head_sha="sha_newer",
177181
events=[self._ev("job", SignalStatus.FAILURE, 5)],
@@ -189,16 +193,16 @@ def test_detect_autorevert_pattern_basic(self):
189193
],
190194
)
191195
s = Signal(
192-
key="job", workflow_name="wf", commits=[c_newer, c_suspected, c_base]
196+
key="job",
197+
workflow_name="wf",
198+
commits=[c_newest, c_newer, c_suspected, c_base],
193199
)
194200
res = s.process_valid_autorevert_pattern()
195201
self.assertIsNotNone(res)
196202
self.assertIsInstance(res, AutorevertPattern)
197203
self.assertEqual(res.workflow_name, "wf")
198204
# newer failing commits are those after the suspected one (newest->older)
199-
self.assertEqual(
200-
res.newer_failing_commits, ["sha_newer"]
201-
) # only newer of the two
205+
self.assertEqual(res.newer_failing_commits, ["sha_newest", "sha_newer"])
202206
self.assertEqual(res.suspected_commit, "sha_mid")
203207
self.assertEqual(res.older_successful_commit, "sha_old")
204208

@@ -245,8 +249,10 @@ def test_insufficient_failures_returns_restart_oldest_failed_when_no_pending(sel
245249
self.assertTrue(hasattr(res, "commit_shas"))
246250
self.assertIn("sha_failed", res.commit_shas)
247251

248-
def test_insufficient_failures_infra_not_confirmed_when_pending_on_failed(self):
249-
# Only one failure overall, but the failed commit also has pending → ineligible awaiting confirmation
252+
def test_insufficient_failures_returns_insufficient_failures_when_pending_on_failed(
253+
self,
254+
):
255+
# Only one failure overall, but the failed commit also has pending → ineligible due to insufficient failures
250256
c_failed_pending = SignalCommit(
251257
head_sha="sha_failed_pend",
252258
events=[
@@ -264,13 +270,17 @@ def test_insufficient_failures_infra_not_confirmed_when_pending_on_failed(self):
264270
s = Signal(key="job", workflow_name="wf", commits=[c_failed_pending, c_base])
265271
res = s.process_valid_autorevert_pattern()
266272
self.assertIsInstance(res, Ineligible)
267-
self.assertEqual(res.reason, IneligibleReason.INFRA_NOT_CONFIRMED)
273+
self.assertEqual(res.reason, IneligibleReason.INSUFFICIENT_FAILURES)
268274
self.assertIn("sha_failed_pend", res.message)
269275

270276
def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
271277
self,
272278
):
273-
# Ensure we have at least two failure events to avoid the failure-side heuristic
279+
# Ensure we have at least three failure events to avoid the failure-side heuristic
280+
c_fail_newest = SignalCommit(
281+
head_sha="sha_fail_newest",
282+
events=[self._ev("job", SignalStatus.FAILURE, 11)],
283+
)
274284
c_fail_new = SignalCommit(
275285
head_sha="sha_fail_new",
276286
events=[self._ev("job", SignalStatus.FAILURE, 10)],
@@ -287,7 +297,7 @@ def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
287297
s = Signal(
288298
key="job",
289299
workflow_name="wf",
290-
commits=[c_fail_new, c_fail_old, c_success],
300+
commits=[c_fail_newest, c_fail_new, c_fail_old, c_success],
291301
)
292302
res = s.process_valid_autorevert_pattern()
293303
self.assertIsNotNone(res)
@@ -298,7 +308,11 @@ def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
298308
self.assertIn("sha_success", res.commit_shas)
299309

300310
def test_insufficient_successes_infra_not_confirmed_when_pending_on_success(self):
301-
# Two failures present, but newest successful has pending → ineligible awaiting confirmation
311+
# Three failures present, but newest successful has pending → ineligible awaiting confirmation
312+
c_fail_newest = SignalCommit(
313+
head_sha="sha_fail_newest",
314+
events=[self._ev("job", SignalStatus.FAILURE, 11)],
315+
)
302316
c_fail_new = SignalCommit(
303317
head_sha="sha_fail_new",
304318
events=[self._ev("job", SignalStatus.FAILURE, 10)],
@@ -317,13 +331,35 @@ def test_insufficient_successes_infra_not_confirmed_when_pending_on_success(self
317331
s = Signal(
318332
key="job",
319333
workflow_name="wf",
320-
commits=[c_fail_new, c_fail_old, c_success_pending],
334+
commits=[c_fail_newest, c_fail_new, c_fail_old, c_success_pending],
321335
)
322336
res = s.process_valid_autorevert_pattern()
323337
self.assertIsInstance(res, Ineligible)
324338
self.assertEqual(res.reason, IneligibleReason.INFRA_NOT_CONFIRMED)
325339
self.assertIn("sha_success_pend", res.message)
326340

341+
def test_both_sides_restart_accumulate_when_below_thresholds(self):
342+
# One failure total (<3) and one success total (<2), neither pending.
343+
# Should propose restarts for both oldest failed and newest successful.
344+
c_failed = SignalCommit(
345+
head_sha="sha_failed",
346+
events=[self._ev("job", SignalStatus.FAILURE, 10)],
347+
)
348+
c_success = SignalCommit(
349+
head_sha="sha_success",
350+
events=[self._ev("job", SignalStatus.SUCCESS, 3)],
351+
)
352+
s = Signal(
353+
key="job",
354+
workflow_name="wf",
355+
commits=[c_failed, c_success],
356+
)
357+
res = s.process_valid_autorevert_pattern()
358+
# Should be a RestartCommits with both SHAs present
359+
self.assertTrue(hasattr(res, "commit_shas"))
360+
self.assertIn("sha_failed", res.commit_shas)
361+
self.assertIn("sha_success", res.commit_shas)
362+
327363

328364
if __name__ == "__main__":
329365
unittest.main()

0 commit comments

Comments
 (0)