Skip to content

Commit 7a7c4ea

Browse files
authored
FIX: Don't remove EC2 instance when fails to remove githubRunner (#904)
`removeGithubRunner[Org || Repo]` used to remove the EC2 instance, so no need to call `terminateRunner` again. This potentially could cause runners that failed to be unregistered from GHA to be terminated on EC2. As a fix, `removeGithubRunner` won't terminate the instance, nor generate logs. This will enable `scaleDown` to control when to call `terminateRunner` and generate the proper logs and metrics. Avoiding having this issue in the future. This bug also explains why we had in the past more EC2 instances being kept at its minimum time: instances with less than minimum time got unregistered and terminated without being tracked on main application metric. This is obvious when we compare the API calls to terminate and the count of app level termination. ![Screenshot 2022-10-18 at 09 21 47](https://user-images.githubusercontent.com/4520845/196364535-5aaab331-2080-44be-b6af-0702f99d50d9.png) ![Screenshot 2022-10-18 at 09 26 19](https://user-images.githubusercontent.com/4520845/196364542-376ff99f-617e-4e82-b459-dfc8364219ad.png) Bug initially flagged on [87134](pytorch/pytorch#87134)
1 parent da320a7 commit 7a7c4ea

File tree

5 files changed

+218
-141
lines changed

5 files changed

+218
-141
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.test.ts

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import {
1414
removeGithubRunnerRepo,
1515
resetGHRunnersCaches,
1616
} from './gh-runners';
17-
import { RunnerInfo } from './utils';
1817
import { createGithubAuth, createOctoClient } from './gh-auth';
1918
import { ScaleUpMetrics } from './metrics';
2019

@@ -390,12 +389,6 @@ describe('listGithubRunners', () => {
390389
});
391390

392391
describe('removeGithubRunnerRepo', () => {
393-
const repo = { owner: 'owner', repo: 'repo' };
394-
const irrelevantRunnerInfo: RunnerInfo = {
395-
...repo,
396-
instanceId: '113',
397-
};
398-
399392
it('succeeds', async () => {
400393
const runnerId = 33;
401394
const repo = { owner: 'owner', repo: 'repo' };
@@ -419,16 +412,13 @@ describe('removeGithubRunnerRepo', () => {
419412
mockCreateOctoClient.mockReturnValueOnce(mockedOctokit as unknown as Octokit);
420413

421414
resetGHRunnersCaches();
422-
await removeGithubRunnerRepo(irrelevantRunnerInfo, runnerId, repo, metrics);
415+
await removeGithubRunnerRepo(runnerId, repo, metrics);
423416

424417
expect(mockedOctokit.actions.deleteSelfHostedRunnerFromRepo).toBeCalledWith({
425418
...repo,
426419
runner_id: runnerId,
427420
});
428421
expect(getRepoInstallation).toBeCalled();
429-
expect(mockEC2.terminateInstances).toBeCalledWith({
430-
InstanceIds: [irrelevantRunnerInfo.instanceId],
431-
});
432422
});
433423

434424
it('fails', async () => {
@@ -454,23 +444,18 @@ describe('removeGithubRunnerRepo', () => {
454444
mockCreateOctoClient.mockReturnValueOnce(mockedOctokit as unknown as Octokit);
455445

456446
resetGHRunnersCaches();
457-
await removeGithubRunnerRepo(irrelevantRunnerInfo, runnerId, repo, metrics);
447+
await expect(removeGithubRunnerRepo(runnerId, repo, metrics)).rejects.toThrow();
458448

459449
expect(mockedOctokit.actions.deleteSelfHostedRunnerFromRepo).toBeCalledWith({
460450
...repo,
461451
runner_id: runnerId,
462452
});
463453
expect(getRepoInstallation).toBeCalled();
464-
expect(mockEC2.terminateInstances).not.toBeCalled();
465454
});
466455
});
467456

468457
describe('removeGithubRunnerOrg', () => {
469458
const org = 'mockedOrg';
470-
const irrelevantRunnerInfo: RunnerInfo = {
471-
org: org,
472-
instanceId: '113',
473-
};
474459

475460
it('succeeds', async () => {
476461
const runnerId = 33;
@@ -494,16 +479,13 @@ describe('removeGithubRunnerOrg', () => {
494479
mockCreateOctoClient.mockReturnValueOnce(mockedOctokit as unknown as Octokit);
495480

496481
resetGHRunnersCaches();
497-
await removeGithubRunnerOrg(irrelevantRunnerInfo, runnerId, org, metrics);
482+
await removeGithubRunnerOrg(runnerId, org, metrics);
498483

499484
expect(mockedOctokit.actions.deleteSelfHostedRunnerFromOrg).toBeCalledWith({
500485
org: org,
501486
runner_id: runnerId,
502487
});
503488
expect(getOrgInstallation).toBeCalled();
504-
expect(mockEC2.terminateInstances).toBeCalledWith({
505-
InstanceIds: [irrelevantRunnerInfo.instanceId],
506-
});
507489
});
508490

509491
it('fails', async () => {
@@ -528,14 +510,13 @@ describe('removeGithubRunnerOrg', () => {
528510
mockCreateOctoClient.mockReturnValueOnce(mockedOctokit as unknown as Octokit);
529511

530512
resetGHRunnersCaches();
531-
await removeGithubRunnerOrg(irrelevantRunnerInfo, runnerId, org, metrics);
513+
await expect(removeGithubRunnerOrg(runnerId, org, metrics)).rejects.toThrow();
532514

533515
expect(mockedOctokit.actions.deleteSelfHostedRunnerFromOrg).toBeCalledWith({
534516
org: org,
535517
runner_id: runnerId,
536518
});
537519
expect(getOrgInstallation).toBeCalled();
538-
expect(mockEC2.terminateInstances).not.toBeCalled();
539520
});
540521
});
541522

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.ts

Lines changed: 54 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { Repo, RunnerInfo, getRepoKey } from './utils';
2-
import { RunnerType, terminateRunner } from './runners';
1+
import { Repo, getRepoKey, expBackOff } from './utils';
2+
import { RunnerType } from './runners';
33
import { createGithubAuth, createOctoClient } from './gh-auth';
44

55
import { Config } from './config';
@@ -127,10 +127,10 @@ type UnboxPromise<T> = T extends Promise<infer U> ? U : T;
127127

128128
export type GhRunners = UnboxPromise<ReturnType<Octokit['actions']['listSelfHostedRunnersForRepo']>>['data']['runners'];
129129

130-
export async function removeGithubRunnerRepo(ec2runner: RunnerInfo, ghRunnerId: number, repo: Repo, metrics: Metrics) {
131-
try {
132-
const githubAppClient = await createGitHubClientForRunnerRepo(repo, metrics);
133-
const result = await metrics.trackRequest(
130+
export async function removeGithubRunnerRepo(ghRunnerId: number, repo: Repo, metrics: Metrics) {
131+
const githubAppClient = await createGitHubClientForRunnerRepo(repo, metrics);
132+
const result = await expBackOff(() => {
133+
return metrics.trackRequest(
134134
metrics.deleteSelfHostedRunnerFromRepoGHCallSuccess,
135135
metrics.deleteSelfHostedRunnerFromRepoGHCallFailure,
136136
() => {
@@ -140,26 +140,21 @@ export async function removeGithubRunnerRepo(ec2runner: RunnerInfo, ghRunnerId:
140140
});
141141
},
142142
);
143+
});
143144

144-
/* istanbul ignore next */
145-
if (result?.status == 204) {
146-
await terminateRunner(ec2runner, metrics);
147-
console.info(
148-
`AWS runner instance '${ec2runner.instanceId}' [${ec2runner.runnerType}] is terminated ` +
149-
`and GitHub runner is de-registered. (removeGithubRunnerRepo)`,
150-
);
151-
}
152-
} catch (e) {
153-
console.warn(
154-
`Error scaling down (removeGithubRunnerRepo) '${ec2runner.instanceId}' [${ec2runner.runnerType}]: ${e}`,
145+
/* istanbul ignore next */
146+
if ((result?.status ?? 0) != 204) {
147+
throw (
148+
`Request deleteSelfHostedRunnerFromRepoGHCallSuccess returned status code different than 204: ` +
149+
`${result?.status ?? 0} for ${repo} ${ghRunnerId}`
155150
);
156151
}
157152
}
158153

159-
export async function removeGithubRunnerOrg(ec2runner: RunnerInfo, ghRunnerId: number, org: string, metrics: Metrics) {
160-
try {
161-
const githubAppClient = await createGitHubClientForRunnerOrg(org, metrics);
162-
const result = await metrics.trackRequest(
154+
export async function removeGithubRunnerOrg(ghRunnerId: number, org: string, metrics: Metrics) {
155+
const githubAppClient = await createGitHubClientForRunnerOrg(org, metrics);
156+
const result = await expBackOff(() => {
157+
return metrics.trackRequest(
163158
metrics.deleteSelfHostedRunnerFromOrgGHCallSuccess,
164159
metrics.deleteSelfHostedRunnerFromOrgGHCallFailure,
165160
() => {
@@ -169,18 +164,13 @@ export async function removeGithubRunnerOrg(ec2runner: RunnerInfo, ghRunnerId: n
169164
});
170165
},
171166
);
167+
});
172168

173-
/* istanbul ignore next */
174-
if (result?.status == 204) {
175-
await terminateRunner(ec2runner, metrics);
176-
console.info(
177-
`AWS runner instance '${ec2runner.instanceId}' [${ec2runner.runnerType}] is terminated ` +
178-
`and GitHub runner is de-registered. (removeGithubRunnerOrg)`,
179-
);
180-
}
181-
} catch (e) {
182-
console.warn(
183-
`Error scaling down (removeGithubRunnerOrg) '${ec2runner.instanceId}' [${ec2runner.runnerType}]: ${e}`,
169+
/* istanbul ignore next */
170+
if ((result?.status ?? 0) != 204) {
171+
throw (
172+
`Request deleteSelfHostedRunnerFromRepoGHCallSuccess returned status code different than 204: ` +
173+
`${result?.status ?? 0} for ${org} ${ghRunnerId}`
184174
);
185175
}
186176
}
@@ -245,7 +235,7 @@ async function listGithubRunners(key: string, listCallback: () => Promise<GhRunn
245235
}
246236

247237
console.debug(`[listGithubRunners] Cache miss for ${key}`);
248-
const runners = await listCallback();
238+
const runners = await expBackOff(listCallback);
249239
ghRunnersCache.set(key, runners);
250240
return runners;
251241
} catch (e) {
@@ -261,16 +251,18 @@ export async function getRunnerRepo(repo: Repo, runnerID: string, metrics: Metri
261251

262252
try {
263253
return (
264-
await metrics.trackRequest(
265-
metrics.getSelfHostedRunnerForRepoGHCallSuccess,
266-
metrics.getSelfHostedRunnerForRepoGHCallFailure,
267-
() => {
268-
return client.actions.getSelfHostedRunnerForRepo({
269-
...repo,
270-
runner_id: runnerID as unknown as number,
271-
});
272-
},
273-
)
254+
await expBackOff(() => {
255+
return metrics.trackRequest(
256+
metrics.getSelfHostedRunnerForRepoGHCallSuccess,
257+
metrics.getSelfHostedRunnerForRepoGHCallFailure,
258+
() => {
259+
return client.actions.getSelfHostedRunnerForRepo({
260+
...repo,
261+
runner_id: runnerID as unknown as number,
262+
});
263+
},
264+
);
265+
})
274266
).data;
275267
} catch (e) {
276268
console.warn(`[getRunnerRepo <inner try>]: ${e}`);
@@ -283,16 +275,18 @@ export async function getRunnerOrg(org: string, runnerID: string, metrics: Metri
283275

284276
try {
285277
return (
286-
await metrics.trackRequest(
287-
metrics.getSelfHostedRunnerForOrgGHCallSuccess,
288-
metrics.getSelfHostedRunnerForOrgGHCallFailure,
289-
() => {
290-
return client.actions.getSelfHostedRunnerForOrg({
291-
org: org,
292-
runner_id: runnerID as unknown as number,
293-
});
294-
},
295-
)
278+
await expBackOff(() => {
279+
return metrics.trackRequest(
280+
metrics.getSelfHostedRunnerForOrgGHCallSuccess,
281+
metrics.getSelfHostedRunnerForOrgGHCallFailure,
282+
() => {
283+
return client.actions.getSelfHostedRunnerForOrg({
284+
org: org,
285+
runner_id: runnerID as unknown as number,
286+
});
287+
},
288+
);
289+
})
296290
).data;
297291
} catch (e) {
298292
console.warn(`[getRunnerOrg <inner try>]: ${e}`);
@@ -309,8 +303,9 @@ export async function getRunnerTypes(
309303
try {
310304
const runnerTypeKey = getRepoKey(repo);
311305

312-
if (runnerTypeCache.get(runnerTypeKey) !== undefined) {
313-
return runnerTypeCache.get(runnerTypeKey) as Map<string, RunnerType>;
306+
const runnerTypeCacheActualContent = runnerTypeCache.get(runnerTypeKey);
307+
if (runnerTypeCacheActualContent !== undefined) {
308+
return runnerTypeCacheActualContent as Map<string, RunnerType>;
314309
}
315310
console.debug(`[getRunnerTypes] cache miss for ${runnerTypeKey}`);
316311

@@ -320,16 +315,14 @@ export async function getRunnerTypes(
320315
? await createGitHubClientForRunnerOrg(repo.owner, metrics)
321316
: await createGitHubClientForRunnerRepo(repo, metrics);
322317

323-
const response = await metrics.trackRequest(
324-
metrics.reposGetContentGHCallSuccess,
325-
metrics.reposGetContentGHCallFailure,
326-
() => {
318+
const response = await expBackOff(() => {
319+
return metrics.trackRequest(metrics.reposGetContentGHCallSuccess, metrics.reposGetContentGHCallFailure, () => {
327320
return githubAppClient.repos.getContent({
328321
...repo,
329322
path: filepath,
330323
});
331-
},
332-
);
324+
});
325+
});
333326

334327
/* istanbul ignore next */
335328
const { content } = { ...(response?.data || {}) };

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,48 @@ export class ScaleDownMetrics extends Metrics {
691691
this.countEntry(`run.ec2runners.${org}.${ec2Runner.runnerType}.notFound`);
692692
}
693693

694+
/* istanbul ignore next */
695+
runnerGhTerminateSuccessOrg(org: string, ec2runner: RunnerInfo) {
696+
this.countEntry(`run.ghRunner.byOrg.${org}.terminate.success`);
697+
this.countEntry(`run.ghRunner.byOrg.${org}.byType.${ec2runner.runnerType}.terminate.success`);
698+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.success`);
699+
}
700+
701+
/* istanbul ignore next */
702+
runnerGhTerminateSuccessRepo(repo: Repo, ec2runner: RunnerInfo) {
703+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.terminate.success`);
704+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.byType.${ec2runner.runnerType}.terminate.success`);
705+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.success`);
706+
}
707+
708+
/* istanbul ignore next */
709+
runnerGhTerminateFailureOrg(org: string, ec2runner: RunnerInfo) {
710+
this.countEntry(`run.ghRunner.byOrg.${org}.terminate.failure`);
711+
this.countEntry(`run.ghRunner.byOrg.${org}.byType.${ec2runner.runnerType}.terminate.failure`);
712+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.failure`);
713+
}
714+
715+
/* istanbul ignore next */
716+
runnerGhTerminateFailureRepo(repo: Repo, ec2runner: RunnerInfo) {
717+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.terminate.failure`);
718+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.byType.${ec2runner.runnerType}.terminate.failure`);
719+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.failure`);
720+
}
721+
722+
/* istanbul ignore next */
723+
runnerGhTerminateNotFoundOrg(org: string, ec2runner: RunnerInfo) {
724+
this.countEntry(`run.ghRunner.byOrg.${org}.terminate.notfound`);
725+
this.countEntry(`run.ghRunner.byOrg.${org}.byType.${ec2runner.runnerType}.terminate.notfound`);
726+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.notfound`);
727+
}
728+
729+
/* istanbul ignore next */
730+
runnerGhTerminateNotFoundRepo(repo: Repo, ec2runner: RunnerInfo) {
731+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.terminate.notfound`);
732+
this.countEntry(`run.ghRunner.byRepo.${repo.owner}.${repo.repo}.byType.${ec2runner.runnerType}.terminate.notfound`);
733+
this.countEntry(`run.ghRunner.byType.${ec2runner.runnerType}.terminate.notfound`);
734+
}
735+
694736
/* istanbul ignore next */
695737
runnerTerminateSuccess(ec2Runner: RunnerInfo) {
696738
this.countEntry('run.ec2Runners.terminate.success');
@@ -724,6 +766,23 @@ export class ScaleDownMetrics extends Metrics {
724766
this.countEntry(`run.ec2runners.${repo.owner}.${repo.repo}.terminate.failure`);
725767
}
726768
}
769+
770+
/* istanbul ignore next */
771+
runnerTerminateSkipped(ec2Runner: RunnerInfo) {
772+
this.countEntry('run.ec2Runners.terminate.skipped');
773+
this.countEntry(`run.ec2runners.${ec2Runner.runnerType}.terminate.skipped`);
774+
775+
if (ec2Runner.org !== undefined) {
776+
this.countEntry(`run.ec2runners.${ec2Runner.org}.${ec2Runner.runnerType}.terminate.skipped`);
777+
this.countEntry(`run.ec2runners.${ec2Runner.org}.terminate.skipped`);
778+
}
779+
780+
if (ec2Runner.repo !== undefined) {
781+
const repo = getRepo(ec2Runner.repo as string);
782+
this.countEntry(`run.ec2runners.${repo.owner}.${repo.repo}.${ec2Runner.runnerType}.terminate.skipped`);
783+
this.countEntry(`run.ec2runners.${repo.owner}.${repo.repo}.terminate.skipped`);
784+
}
785+
}
727786
}
728787

729788
export interface sendMetricsTimeoutVars {

0 commit comments

Comments
 (0)