Skip to content

Commit 8b591dc

Browse files
authored
Increase the retry timeout for scaleUp by using a stochastic overshoot to overcome SQSs limitations (#4051)
1 parent e7abc40 commit 8b591dc

File tree

6 files changed

+140
-15
lines changed

6 files changed

+140
-15
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,52 @@ describe('scaleUp', () => {
9696
expect(sqsDeleteMessageBatch).toBeCalledWith(metrics, evts);
9797
});
9898

99+
it('stochasticOvershoot when retryCount > 5', async () => {
100+
const config = {
101+
maxRetryScaleUpRecord: 1999,
102+
retryScaleUpRecordDelayS: 20,
103+
retryScaleUpRecordJitterPct: 0.2,
104+
retryScaleUpRecordQueueUrl: 'asdf',
105+
};
106+
jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => config as unknown as Config);
107+
const records = [
108+
{ eventSource: 'aws:sqs', body: '{"id":1}', eventSourceARN: '1:2:3:4:5:6', receiptHandle: 'xxx', messageId: 1 },
109+
{
110+
eventSource: 'aws:sqs',
111+
body: '{"id":2,"retryCount":3}',
112+
eventSourceARN: '1:2:3:4:5:6:7',
113+
receiptHandle: 'xxx',
114+
messageId: 2,
115+
},
116+
{
117+
eventSource: 'aws:sqs',
118+
body: '{"id":3,"retryCount":12}',
119+
eventSourceARN: '1:2:3:4:5:6:7',
120+
receiptHandle: 'xxx',
121+
messageId: 3,
122+
},
123+
];
124+
125+
const mockedScaleUp = mocked(scaleUp).mockResolvedValue(undefined);
126+
jest.spyOn(global.Math, 'random').mockReturnValue(0.5);
127+
128+
const callback = jest.fn();
129+
await scaleUpL({ Records: records } as unknown as SQSEvent, {} as unknown as Context, callback);
130+
expect(mockedScaleUp).toBeCalledTimes(2);
131+
132+
const expected = [
133+
{
134+
id: 3,
135+
retryCount: 12,
136+
delaySeconds: 900,
137+
},
138+
];
139+
expect(sqsSendMessages).toBeCalledTimes(1);
140+
expect(sqsSendMessages).toBeCalledWith(metrics, expected, 'asdf');
141+
142+
expect(sqsDeleteMessageBatch).toBeCalledTimes(0);
143+
});
144+
99145
it('RetryableScalingError', async () => {
100146
const config = {
101147
maxRetryScaleUpRecord: 12,

terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,35 @@ import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda';
33

44
import { Config } from './scale-runners/config';
55
import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
6-
import { getDelayWithJitterRetryCount } from './scale-runners/utils';
6+
import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
77
import { scaleDown as scaleDownR } from './scale-runners/scale-down';
88
import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';
99

10-
async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean]>, metrics: ScaleUpMetrics) {
10+
async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
1111
console.error(`Detected ${evtFailed.length} errors when processing messages, will retry relevant messages.`);
1212
metrics.exception();
1313
const messagesToSend: Array<ActionRequestMessage> = [];
1414
console.dir(evtFailed);
1515

16-
for (const [evt, retryable] of evtFailed) {
16+
for (const [evt, retryable, incRetry] of evtFailed) {
1717
const body: ActionRequestMessage = JSON.parse(evt.body);
1818
const retryCount = body?.retryCount ?? 0;
1919

2020
if (
2121
retryCount < Config.Instance.maxRetryScaleUpRecord &&
2222
(Config.Instance.retryScaleUpRecordQueueUrl?.length ?? 0) > 0
2323
) {
24-
if (retryable) {
25-
metrics.scaleUpFailureRetryable(retryCount);
24+
if (incRetry > 0) {
25+
if (retryable) {
26+
metrics.scaleUpFailureRetryable(retryCount);
27+
} else {
28+
metrics.scaleUpFailureNonRetryable(retryCount);
29+
}
2630
} else {
27-
metrics.scaleUpFailureNonRetryable(retryCount);
31+
metrics.stochasticOvershoot();
2832
}
2933

30-
body.retryCount = retryCount + 1;
34+
body.retryCount = retryCount + incRetry;
3135
body.delaySeconds = Math.min(
3236
900,
3337
getDelayWithJitterRetryCount(
@@ -64,24 +68,32 @@ export async function scaleUp(event: SQSEvent, context: Context, callback: any)
6468
(Config.Instance.lambdaTimeout - 10) * 1000,
6569
);
6670

67-
const evtFailed: Array<[SQSRecord, boolean]> = [];
71+
const evtFailed: Array<[SQSRecord, boolean, number]> = [];
6872

6973
try {
7074
recordsIterProcess: for (let i = 0; i < event.Records.length; i += 1) {
7175
const evt = event.Records[i];
7276

7377
try {
74-
await scaleUpR(evt.eventSource, JSON.parse(evt.body), metrics);
75-
metrics.scaleUpSuccess();
78+
const body = JSON.parse(evt.body) as ActionRequestMessage;
79+
if (
80+
!stochaticRunOvershoot(body?.retryCount ?? 0, 900, Math.max(Config.Instance.retryScaleUpRecordDelayS, 20))
81+
) {
82+
console.warn(`Skipping message due to stochatic run overshoot: ${evt.body}`);
83+
evtFailed.push([evt, true, 0]);
84+
} else {
85+
await scaleUpR(evt.eventSource, body, metrics);
86+
metrics.scaleUpSuccess();
87+
}
7688
} catch (e) {
7789
if (e instanceof RetryableScalingError) {
7890
console.error(`Retryable error thrown: "${e.message}"`);
79-
evtFailed.push([evt, true]);
91+
evtFailed.push([evt, true, 1]);
8092
} else {
8193
console.error(`Non-retryable error during request: "${e.message}"`);
8294
console.error(`All remaning '${event.Records.length - i}' messages will be scheduled to retry`);
8395
for (let ii = i; ii < event.Records.length; ii += 1) {
84-
evtFailed.push([event.Records[ii], false]);
96+
evtFailed.push([event.Records[ii], false, 1]);
8597
}
8698
break recordsIterProcess;
8799
}

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,11 @@ export class ScaleUpMetrics extends Metrics {
769769
this.countEntry('run.scaleup.success');
770770
}
771771

772+
/* istanbul ignore next */
773+
stochasticOvershoot() {
774+
this.countEntry('run.scaleup.stochasticOvershoot');
775+
}
776+
772777
/* istanbul ignore next */
773778
scaleUpFailureRetryable(retries: number) {
774779
this.countEntry('run.scaleup.failure.total.count');

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.test.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import {
22
expBackOff,
33
getBoolean,
4+
getDelay,
45
getDelayWithJitter,
56
getDelayWithJitterRetryCount,
67
getRepo,
78
getRepoKey,
89
groupBy,
910
shuffleArrayInPlace,
11+
stochaticRunOvershoot,
1012
} from './utils';
1113
import nock from 'nock';
1214

@@ -140,6 +142,20 @@ describe('./utils', () => {
140142
});
141143
});
142144

145+
describe('getDelay', () => {
146+
it('test some numbers', () => {
147+
expect(getDelay(0, 3)).toEqual(3);
148+
expect(getDelay(1, 3)).toEqual(6);
149+
expect(getDelay(2, 3)).toEqual(12);
150+
expect(getDelay(3, 3)).toEqual(24);
151+
152+
expect(getDelay(0, 5)).toEqual(5);
153+
expect(getDelay(1, 5)).toEqual(10);
154+
expect(getDelay(2, 5)).toEqual(20);
155+
expect(getDelay(3, 5)).toEqual(40);
156+
});
157+
});
158+
143159
describe('getDelayWithJitter', () => {
144160
it('have jitter == 0', () => {
145161
expect(getDelayWithJitter(20, 0.0)).toEqual(20);
@@ -259,4 +275,41 @@ describe('shuffleArrayInPlace', () => {
259275
expect(arr).toContain(number);
260276
}
261277
});
278+
279+
describe('stochaticRunOvershoot', () => {
280+
afterEach(() => {
281+
jest.spyOn(global.Math, 'random').mockRestore();
282+
});
283+
284+
it('test some values', () => {
285+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.0);
286+
expect(stochaticRunOvershoot(0, 100, 10)).toBe(true);
287+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.5);
288+
expect(stochaticRunOvershoot(0, 100, 10)).toBe(true);
289+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(1.0);
290+
expect(stochaticRunOvershoot(0, 100, 10)).toBe(true);
291+
292+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.0);
293+
expect(stochaticRunOvershoot(4, 100, 10)).toBe(true);
294+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.5);
295+
expect(stochaticRunOvershoot(4, 100, 10)).toBe(true);
296+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.6249);
297+
expect(stochaticRunOvershoot(4, 100, 10)).toBe(true);
298+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.6251);
299+
expect(stochaticRunOvershoot(4, 100, 10)).toBe(false);
300+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(1.0);
301+
expect(stochaticRunOvershoot(4, 100, 10)).toBe(false);
302+
303+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.0);
304+
expect(stochaticRunOvershoot(12, 100, 10)).toBe(true);
305+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.00244139);
306+
expect(stochaticRunOvershoot(12, 100, 10)).toBe(true);
307+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.00244141);
308+
expect(stochaticRunOvershoot(12, 100, 10)).toBe(false);
309+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.5);
310+
expect(stochaticRunOvershoot(12, 100, 10)).toBe(false);
311+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(1.0);
312+
expect(stochaticRunOvershoot(12, 100, 10)).toBe(false);
313+
});
314+
});
262315
});

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/utils.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,17 @@ export function getDelayWithJitter(delayBase: number, jitter: number) {
119119
return Math.max(0, delayBase) * (1 + Math.random() * Math.max(0, jitter));
120120
}
121121

122+
export function getDelay(retryCount: number, delayBase: number) {
123+
return Math.max(0, delayBase) * Math.pow(2, Math.max(0, retryCount));
124+
}
125+
122126
export function getDelayWithJitterRetryCount(retryCount: number, delayBase: number, jitter: number) {
123-
return getDelayWithJitter(Math.max(0, delayBase) * Math.pow(2, Math.max(0, retryCount)), jitter);
127+
return getDelayWithJitter(getDelay(retryCount, delayBase), jitter);
128+
}
129+
130+
export function stochaticRunOvershoot(retryCount: number, maxTime: number, delayBase: number) {
131+
const prob = maxTime / getDelay(retryCount, delayBase);
132+
return Math.random() < prob;
124133
}
125134

126135
/* eslint-disable-next-line @typescript-eslint/no-explicit-any */

terraform-aws-github-runner/modules/runners/scale-up.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@ resource "aws_lambda_function" "scale_up" {
4646
LAUNCH_TEMPLATE_VERSION_LINUX = aws_launch_template.linux_runner.latest_version
4747
LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = aws_launch_template.linux_runner_nvidia.latest_version
4848
LAUNCH_TEMPLATE_VERSION_WINDOWS = aws_launch_template.windows_runner.latest_version
49-
MAX_RETRY_SCALEUP_RECORD = "5"
49+
MAX_RETRY_SCALEUP_RECORD = "10"
5050
MUST_HAVE_ISSUES_LABELS = join(",", var.must_have_issues_labels)
5151
REDIS_ENDPOINT = var.redis_endpoint
5252
REDIS_LOGIN = var.redis_login
53-
RETRY_SCALE_UP_RECORD_DELAY_S = "110"
53+
RETRY_SCALE_UP_RECORD_DELAY_S = "60"
5454
RETRY_SCALE_UP_RECORD_JITTER_PCT = "0.5"
5555
RETRY_SCALE_UP_RECORD_QUEUE_URL = var.sqs_build_queue_retry.url
5656
RUNNER_EXTRA_LABELS = var.runner_extra_labels

0 commit comments

Comments
 (0)