Skip to content

Commit e32b496

Browse files
authored
Adds a new LaunchTemplate for Linux instances with Nvidia GPUs, enabling GPU monitoring (#890)
Adds LaunchTemplate `${var.environment}-action-linux-runner-nvidia`, whose main startup script points to the SSM parameter where the CloudWatch agent config is stored (`aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0].name`). This template is then chosen when the runner type name contains `.nvidia.gpu`.
1 parent baaf077 commit e32b496

File tree

8 files changed

+192
-32
lines changed

8 files changed

+192
-32
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.test.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ describe('Config', () => {
2424
process.env.KMS_KEY_ID = 'KMS_KEY_ID';
2525
process.env.LAMBDA_TIMEOUT = '113';
2626
process.env.LAUNCH_TEMPLATE_NAME_LINUX = 'LAUNCH_TEMPLATE_NAME_LINUX';
27+
process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA';
2728
process.env.LAUNCH_TEMPLATE_NAME_WINDOWS = 'LAUNCH_TEMPLATE_NAME_WINDOWS';
2829
process.env.LAUNCH_TEMPLATE_VERSION_LINUX = 'LAUNCH_TEMPLATE_VERSION_LINUX';
30+
process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA';
2931
process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS = 'LAUNCH_TEMPLATE_VERSION_WINDOWS';
3032
process.env.MINIMUM_RUNNING_TIME_IN_MINUTES = '33';
3133
process.env.MIN_AVAILABLE_RUNNERS = '113';
@@ -55,8 +57,10 @@ describe('Config', () => {
5557
expect(Config.Instance.kmsKeyId).toBe('KMS_KEY_ID');
5658
expect(Config.Instance.lambdaTimeout).toBe(113);
5759
expect(Config.Instance.launchTemplateNameLinux).toBe('LAUNCH_TEMPLATE_NAME_LINUX');
60+
expect(Config.Instance.launchTemplateNameLinuxNvidia).toBe('LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA');
5861
expect(Config.Instance.launchTemplateNameWindows).toBe('LAUNCH_TEMPLATE_NAME_WINDOWS');
5962
expect(Config.Instance.launchTemplateVersionLinux).toBe('LAUNCH_TEMPLATE_VERSION_LINUX');
63+
expect(Config.Instance.launchTemplateVersionLinuxNvidia).toBe('LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA');
6064
expect(Config.Instance.launchTemplateVersionWindows).toBe('LAUNCH_TEMPLATE_VERSION_WINDOWS');
6165
expect(Config.Instance.minAvailableRunners).toBe(113);
6266
expect(Config.Instance.minimumRunningTimeInMinutes).toBe(33);
@@ -92,8 +96,10 @@ describe('Config', () => {
9296
delete process.env.KMS_KEY_ID;
9397
delete process.env.LAMBDA_TIMEOUT;
9498
process.env.LAUNCH_TEMPLATE_NAME_LINUX = 'LAUNCH_TEMPLATE_NAME_LINUX';
99+
process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA';
95100
process.env.LAUNCH_TEMPLATE_NAME_WINDOWS = 'LAUNCH_TEMPLATE_NAME_WINDOWS';
96101
process.env.LAUNCH_TEMPLATE_VERSION_LINUX = 'LAUNCH_TEMPLATE_VERSION_LINUX';
102+
process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA';
97103
process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS = 'LAUNCH_TEMPLATE_VERSION_WINDOWS';
98104
delete process.env.MIN_AVAILABLE_RUNNERS;
99105
delete process.env.MUST_HAVE_ISSUES_LABELS;
@@ -120,8 +126,10 @@ describe('Config', () => {
120126
expect(Config.Instance.kmsKeyId).toBeUndefined();
121127
expect(Config.Instance.lambdaTimeout).toEqual(600);
122128
expect(Config.Instance.launchTemplateNameLinux).toBe('LAUNCH_TEMPLATE_NAME_LINUX');
129+
expect(Config.Instance.launchTemplateNameLinuxNvidia).toBe('LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA');
123130
expect(Config.Instance.launchTemplateNameWindows).toBe('LAUNCH_TEMPLATE_NAME_WINDOWS');
124131
expect(Config.Instance.launchTemplateVersionLinux).toBe('LAUNCH_TEMPLATE_VERSION_LINUX');
132+
expect(Config.Instance.launchTemplateVersionLinuxNvidia).toBe('LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA');
125133
expect(Config.Instance.launchTemplateVersionWindows).toBe('LAUNCH_TEMPLATE_VERSION_WINDOWS');
126134
expect(Config.Instance.minAvailableRunners).toBe(10);
127135
expect(Config.Instance.minimumRunningTimeInMinutes).toBe(10);

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ export class Config {
1515
readonly kmsKeyId: string | undefined;
1616
readonly lambdaTimeout: number;
1717
readonly launchTemplateNameLinux: string | undefined;
18+
readonly launchTemplateNameLinuxNvidia: string | undefined;
1819
readonly launchTemplateNameWindows: string | undefined;
1920
readonly launchTemplateVersionLinux: string | undefined;
21+
readonly launchTemplateVersionLinuxNvidia: string | undefined;
2022
readonly launchTemplateVersionWindows: string | undefined;
2123
readonly minAvailableRunners: number;
2224
readonly minimumRunningTimeInMinutes: number;
@@ -46,8 +48,10 @@ export class Config {
4648
this.kmsKeyId = process.env.KMS_KEY_ID;
4749
this.lambdaTimeout = Number(process.env.LAMBDA_TIMEOUT || '600');
4850
this.launchTemplateNameLinux = process.env.LAUNCH_TEMPLATE_NAME_LINUX;
51+
this.launchTemplateNameLinuxNvidia = process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA;
4952
this.launchTemplateNameWindows = process.env.LAUNCH_TEMPLATE_NAME_WINDOWS;
5053
this.launchTemplateVersionLinux = process.env.LAUNCH_TEMPLATE_VERSION_LINUX;
54+
this.launchTemplateVersionLinuxNvidia = process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA;
5155
this.launchTemplateVersionWindows = process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS;
5256

5357
/* istanbul ignore next */

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,12 @@ function createExpectedRunInstancesLinux(runnerParameters: RunnerInputParameters
5252
MaxCount: 1,
5353
MinCount: 1,
5454
LaunchTemplate: {
55-
LaunchTemplateName: Config.Instance.launchTemplateNameLinux,
56-
Version: Config.Instance.launchTemplateVersionLinux,
55+
LaunchTemplateName: runnerParameters.runnerType.runnerTypeName.includes('.nvidia.gpu')
56+
? Config.Instance.launchTemplateNameLinuxNvidia
57+
: Config.Instance.launchTemplateNameLinux,
58+
Version: runnerParameters.runnerType.runnerTypeName.includes('.nvidia.gpu')
59+
? Config.Instance.launchTemplateVersionLinuxNvidia
60+
: Config.Instance.launchTemplateVersionLinux,
5761
},
5862
InstanceType: runnerParameters.runnerType.instance_type,
5963
BlockDeviceMappings: [
@@ -345,7 +349,7 @@ describe('create runner', () => {
345349
os: 'linux',
346350
max_available: 200,
347351
disk_size: 100,
348-
runnerTypeName: 'linuxCpu',
352+
runnerTypeName: 'linuxCpu.nvidia.gpu',
349353
is_ephemeral: true,
350354
},
351355
};

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,18 @@ async function addSSMParameterRunnerConfig(
221221
console.debug(`Created SSM Parameters(s): ${createdSSMParams.join(',')}`);
222222
}
223223

224+
/**
 * Selects the EC2 launch template (name and version) used to start a runner.
 *
 * - Linux runner types whose `runnerTypeName` contains `.nvidia.gpu` use the
 *   Nvidia-specific Linux launch template.
 * - All other Linux runner types use the default Linux launch template.
 * - Any `os` other than `'linux'` falls through to the Windows launch template.
 *
 * @param runnerParameters - parameters of the runner being created; only
 *   `runnerType.os` and `runnerType.runnerTypeName` are read.
 * @returns a `[name, version]` pair. Either element is `undefined` when the
 *   corresponding `Config` value is not set.
 */
function getLaunchTemplateName(
  runnerParameters: RunnerInputParameters,
): [string | undefined, string | undefined] {
  const { runnerType } = runnerParameters;

  if (runnerType.os !== 'linux') {
    return [Config.Instance.launchTemplateNameWindows, Config.Instance.launchTemplateVersionWindows];
  }

  // Substring match (not suffix): any Linux runner type with `.nvidia.gpu` in its name is a GPU runner.
  if (runnerType.runnerTypeName.includes('.nvidia.gpu')) {
    return [Config.Instance.launchTemplateNameLinuxNvidia, Config.Instance.launchTemplateVersionLinuxNvidia];
  }

  return [Config.Instance.launchTemplateNameLinux, Config.Instance.launchTemplateVersionLinux];
}
235+
224236
export async function createRunner(runnerParameters: RunnerInputParameters, metrics: Metrics): Promise<void> {
225237
try {
226238
console.debug('Runner configuration: ' + JSON.stringify(runnerParameters));
@@ -256,6 +268,8 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr
256268
});
257269
}
258270

271+
const [launchTemplateName, launchTemplateVersion] = getLaunchTemplateName(runnerParameters);
272+
259273
const runInstancesResponse = await expBackOff(() => {
260274
return metrics.trackRequest(
261275
metrics.ec2RunInstancesAWSCallSuccess,
@@ -266,14 +280,8 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr
266280
MaxCount: 1,
267281
MinCount: 1,
268282
LaunchTemplate: {
269-
LaunchTemplateName:
270-
runnerParameters.runnerType.os === 'linux'
271-
? Config.Instance.launchTemplateNameLinux
272-
: Config.Instance.launchTemplateNameWindows,
273-
Version:
274-
runnerParameters.runnerType.os === 'linux'
275-
? Config.Instance.launchTemplateVersionLinux
276-
: Config.Instance.launchTemplateVersionWindows,
283+
LaunchTemplateName: launchTemplateName,
284+
Version: launchTemplateVersion,
277285
},
278286
InstanceType: runnerParameters.runnerType.instance_type,
279287
BlockDeviceMappings: [

terraform-aws-github-runner/modules/runners/logging.tf

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,26 @@ resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_linux" {
7777
tags = local.tags
7878
}
7979

80+
# SSM parameter holding the CloudWatch agent configuration for Linux runners
# with Nvidia GPUs. Only created when var.enable_cloudwatch_agent is true, so
# every reference to element [0] must be guarded by that same flag.
resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_linux_nvidia" {
81+
count = var.enable_cloudwatch_agent ? 1 : 0
82+
name  = "${var.environment}-cloudwatch_agent_config_runner_linux_nvidia"
83+
type  = "String"
84+
# The rendered template is decoded and re-encoded, so the stored value is the
# JSON produced by jsonencode rather than the raw template text.
value = jsonencode(
85+
jsondecode(
86+
templatefile(
87+
"${path.module}/templates/cloudwatch_config.json",
88+
{
89+
aws_region        = var.aws_region
90+
environment       = var.environment
91+
logfiles          = jsonencode(local.logfiles_linux)
92+
# Nvidia-specific metrics section, rendered with no template variables.
metrics_collected = templatefile("${path.module}/templates/cloudwatch_config_linux_nvidia.json", {})
93+
}
94+
)
95+
)
96+
)
97+
tags = local.tags
98+
}
99+
80100
resource "aws_cloudwatch_log_group" "gh_runners_linux" {
81101
count = length(local.loggroups_names_linux)
82102
name = local.loggroups_names_linux[count.index]
@@ -95,6 +115,17 @@ resource "aws_iam_role_policy" "cloudwatch_linux" {
95115
)
96116
}
97117

118+
# Attaches the instance-cloudwatch-policy.json policy, rendered with the ARN of
# the Nvidia CloudWatch agent config SSM parameter, to the runner IAM role.
#
# The policy indexes aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0],
# which only exists when var.enable_cloudwatch_agent is true. Both flags therefore
# gate creation; gating on var.enable_ssm_on_runners alone makes the plan fail with
# an invalid index when the CloudWatch agent is disabled.
resource "aws_iam_role_policy" "cloudwatch_linux_nvidia" {
  count = (var.enable_ssm_on_runners && var.enable_cloudwatch_agent) ? 1 : 0
  name  = "CloudWatchLogginAndMetricsLinuxNvidia"
  role  = aws_iam_role.runner.name
  policy = templatefile("${path.module}/policies/instance-cloudwatch-policy.json",
    {
      ssm_parameter_arn = aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0].arn
    }
  )
}
128+
98129
resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_windows" {
99130
count = var.enable_cloudwatch_agent ? 1 : 0
100131
name = "${var.environment}-cloudwatch_agent_config_runner_windows"

terraform-aws-github-runner/modules/runners/main.tf

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,52 @@ resource "aws_launch_template" "linux_runner" {
102102
tags = local.tags
103103
}
104104

105+
# Launch template for Linux runners with Nvidia GPUs.
# NOTE(review): appears to mirror aws_launch_template.linux_runner except for the
# CloudWatch agent config parameter passed to user data — confirm against that resource.
resource "aws_launch_template" "linux_runner_nvidia" {
106+
name = "${var.environment}-action-linux-runner-nvidia"
107+
108+
iam_instance_profile {
109+
name = aws_iam_instance_profile.runner.name
110+
}
111+
112+
instance_initiated_shutdown_behavior = "terminate"
113+
114+
image_id      = data.aws_ami.runner_ami_linux.id
115+
instance_type = var.instance_type
116+
key_name      = var.key_name
117+
118+
# Instances launched from this template get the shared tags plus a Name tag.
tag_specifications {
119+
resource_type = "instance"
120+
tags = merge(
121+
local.tags,
122+
{
123+
"Name" = format("%s", local.name_runner)
124+
},
125+
)
126+
}
127+
128+
# Volumes get the same tag set as the instances.
tag_specifications {
129+
resource_type = "volume"
130+
tags = merge(
131+
local.tags,
132+
{
133+
"Name" = format("%s", local.name_runner)
134+
},
135+
)
136+
}
137+
138+
# User data is the rendered local.userdata_template, base64-encoded.
user_data = base64encode(templatefile(local.userdata_template, {
139+
environment                     = var.environment
140+
pre_install                     = var.userdata_pre_install
141+
post_install                    = var.userdata_post_install
142+
enable_cloudwatch_agent         = var.enable_cloudwatch_agent
143+
# Name of the Nvidia-specific CloudWatch agent config parameter. Falls back to ""
# when the agent is disabled, because the SSM parameter then has count 0.
ssm_key_cloudwatch_agent_config = var.enable_cloudwatch_agent ? aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0].name : ""
144+
ghes_url                        = var.ghes_url
145+
install_config_runner           = local.install_config_runner_linux
146+
}))
147+
148+
tags = local.tags
149+
}
150+
105151
resource "aws_launch_template" "windows_runner" {
106152
name = "${var.environment}-action-windows-runner"
107153

terraform-aws-github-runner/modules/runners/scale-up.tf

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,27 @@ resource "aws_lambda_function" "scale_up" {
3030

3131
environment {
3232
variables = {
33-
AWS_REGION_INSTANCES = join(",", var.aws_region_instances)
34-
CANT_HAVE_ISSUES_LABELS = join(",", var.cant_have_issues_labels)
35-
ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners
36-
ENVIRONMENT = var.environment
37-
GITHUB_APP_CLIENT_ID = var.github_app.client_id
38-
GITHUB_APP_CLIENT_SECRET = local.github_app_client_secret
39-
GITHUB_APP_ID = var.github_app.id
40-
GITHUB_APP_KEY_BASE64 = local.github_app_key_base64
41-
KMS_KEY_ID = var.encryption.kms_key_id
42-
LAMBDA_TIMEOUT = var.lambda_timeout_scale_up
43-
LAUNCH_TEMPLATE_NAME_LINUX = aws_launch_template.linux_runner.name
44-
LAUNCH_TEMPLATE_NAME_WINDOWS = aws_launch_template.windows_runner.name
45-
LAUNCH_TEMPLATE_VERSION_LINUX = aws_launch_template.linux_runner.latest_version
46-
LAUNCH_TEMPLATE_VERSION_WINDOWS = aws_launch_template.windows_runner.latest_version
47-
MUST_HAVE_ISSUES_LABELS = join(",", var.must_have_issues_labels)
48-
RUNNER_EXTRA_LABELS = var.runner_extra_labels
49-
SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id
50-
SECURITY_GROUP_IDS = join(",", concat([aws_security_group.runner_sg.id], var.runner_additional_security_group_ids))
51-
SUBNET_IDS = join(",", var.subnet_ids)
33+
AWS_REGION_INSTANCES = join(",", var.aws_region_instances)
34+
CANT_HAVE_ISSUES_LABELS = join(",", var.cant_have_issues_labels)
35+
ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners
36+
ENVIRONMENT = var.environment
37+
GITHUB_APP_CLIENT_ID = var.github_app.client_id
38+
GITHUB_APP_CLIENT_SECRET = local.github_app_client_secret
39+
GITHUB_APP_ID = var.github_app.id
40+
GITHUB_APP_KEY_BASE64 = local.github_app_key_base64
41+
KMS_KEY_ID = var.encryption.kms_key_id
42+
LAMBDA_TIMEOUT = var.lambda_timeout_scale_up
43+
LAUNCH_TEMPLATE_NAME_LINUX = aws_launch_template.linux_runner.name
44+
LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = aws_launch_template.linux_runner_nvidia.name
45+
LAUNCH_TEMPLATE_NAME_WINDOWS = aws_launch_template.windows_runner.name
46+
LAUNCH_TEMPLATE_VERSION_LINUX = aws_launch_template.linux_runner.latest_version
47+
LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = aws_launch_template.linux_runner_nvidia.latest_version
48+
LAUNCH_TEMPLATE_VERSION_WINDOWS = aws_launch_template.windows_runner.latest_version
49+
MUST_HAVE_ISSUES_LABELS = join(",", var.must_have_issues_labels)
50+
RUNNER_EXTRA_LABELS = var.runner_extra_labels
51+
SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id
52+
SECURITY_GROUP_IDS = join(",", concat([aws_security_group.runner_sg.id], var.runner_additional_security_group_ids))
53+
SUBNET_IDS = join(",", var.subnet_ids)
5254
}
5355
}
5456

@@ -62,7 +64,6 @@ resource "aws_lambda_function" "scale_up" {
6264
}
6365

6466
resource "aws_lambda_alias" "scale_up_lambda_alias" {
65-
count = var.scale_up_provisioned_concurrent_executions > 0 ? 1 : 0
6667
name = "provisioned-${aws_lambda_function.scale_up.function_name}"
6768
description = "Alias for provisioned instances of ${aws_lambda_function.scale_up.function_name}"
6869
function_name = aws_lambda_function.scale_up.function_name
@@ -73,7 +74,7 @@ resource "aws_lambda_provisioned_concurrency_config" "scale_up_provisioned_concu
7374
count = var.scale_up_provisioned_concurrent_executions > 0 ? 1 : 0
7475
function_name = aws_lambda_alias.scale_up_lambda_alias.function_name
7576
provisioned_concurrent_executions = var.scale_up_provisioned_concurrent_executions
76-
qualifier = aws_lambda_alias.scale_up_lambda_alias.version
77+
qualifier = aws_lambda_alias.scale_up_lambda_alias.name
7778
}
7879

7980
resource "aws_cloudwatch_log_group" "scale_up" {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"cpu": {
3+
"measurement": [
4+
"cpu_usage_idle",
5+
"cpu_usage_iowait",
6+
"cpu_usage_user",
7+
"cpu_usage_system"
8+
],
9+
"metrics_collection_interval": 10
10+
},
11+
"disk": {
12+
"measurement": [
13+
"free",
14+
"total",
15+
"used",
16+
"used_percent",
17+
"inodes_free",
18+
"inodes_total"
19+
],
20+
"metrics_collection_interval": 10,
21+
"resources": [
22+
"*"
23+
]
24+
},
25+
"diskio": {
26+
"measurement": [
27+
"io_time"
28+
],
29+
"metrics_collection_interval": 10,
30+
"resources": [
31+
"/"
32+
]
33+
},
34+
"mem": {
35+
"measurement": [
36+
"total",
37+
"used",
38+
"free",
39+
"used_percent"
40+
],
41+
"metrics_collection_interval": 10
42+
},
43+
"swap": {
44+
"measurement": [
45+
"swap_used_percent"
46+
],
47+
"metrics_collection_interval": 10
48+
},
49+
"nvidia_gpu": {
50+
"measurement": [
51+
"utilization_gpu",
52+
"utilization_memory",
53+
"memory_total",
54+
"memory_used",
55+
"memory_free"
56+
]
57+
}
58+
}

0 commit comments

Comments
 (0)