diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf
index 6202a95323..c60a0ba658 100644
--- a/terraform-aws-github-runner/main.tf
+++ b/terraform-aws-github-runner/main.tf
@@ -115,6 +115,10 @@ module "runners" {
 
   retry_scale_up_chron_hud_query_url = var.retry_scale_up_chron_hud_query_url
 
+  enable_scale_cycle              = var.enable_scale_cycle
+  scale_cycle_schedule_expression = var.scale_cycle_schedule_expression
+  lambda_timeout_scale_cycle      = var.lambda_timeout_scale_cycle
+
   must_have_issues_labels = var.must_have_issues_labels
   cant_have_issues_labels = var.cant_have_issues_labels
 
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
index 2c3cfaead2..71417ef571 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
@@ -7,11 +7,13 @@ import {
   ScaleUpChronMetrics,
   sendMetricsAtTimeout,
   sendMetricsTimeoutVars,
+  ScaleCycleMetrics,
 } from './scale-runners/metrics';
 import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
 import { scaleDown as scaleDownR } from './scale-runners/scale-down';
 import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron';
 import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';
+import { scaleCycle as scaleCycleR } from './scale-runners/scale-cycle';
 
 async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
   console.error(`Detected ${evtFailed.length} errors when processing messages, will retry relevant messages.`);
@@ -202,3 +204,54 @@ export async function scaleUpChron(event: ScheduledEvent, context: Context, call
   }
   callback(callbackOutput);
 }
+
+/**
+ * Scheduled Lambda entry point for the scale-cycle job.
+ *
+ * Runs the scale-cycle routine with a fresh `ScaleCycleMetrics` collector and reports the
+ * outcome through `callback`. Sending metrics is attempted in every case, also after a failure.
+ *
+ * @param event - Scheduled event that triggered the invocation; it is not inspected.
+ * @param context - Lambda context; only `callbackWaitsForEmptyEventLoop` is modified.
+ * @param callback - Called with `null` on success or with an error message on failure.
+ */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export async function scaleCycle(event: ScheduledEvent, context: Context, callback: any) {
+  // We maintain open connections to redis, so the event loop is only cleaned when the SIGTERM is sent
+  context.callbackWaitsForEmptyEventLoop = false;
+
+  const metrics = new ScaleCycleMetrics();
+  const sndMetricsTimout: sendMetricsTimeoutVars = {
+    metrics: metrics,
+  };
+  // Arm the timeout handler 10 seconds before the configured Lambda timeout.
+  sndMetricsTimout.setTimeout = setTimeout(
+    sendMetricsAtTimeout(sndMetricsTimout),
+    (Config.Instance.lambdaTimeout - 10) * 1000,
+  );
+
+  let callbackOutput: string | null = null;
+
+  try {
+    await scaleCycleR(metrics);
+  } catch (e) {
+    console.error(e);
+    callbackOutput = `Failed to scale cycle: ${e}`;
+  } finally {
+    try {
+      // Disarm the timeout handler before sending metrics through the regular path.
+      clearTimeout(sndMetricsTimout.setTimeout);
+      sndMetricsTimout.metrics = undefined;
+      sndMetricsTimout.setTimeout = undefined;
+      await metrics.sendMetrics();
+    } catch (e) {
+      console.error(e);
+      // Keep the scale-cycle failure as the reported error when there already is one.
+      if (callbackOutput === null) {
+        callbackOutput = `Error sending metrics: ${e}`;
+      }
+    }
+  }
+
+  callback(callbackOutput);
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
index d0cc938db3..a04a1fab7b 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
@@ -1813,3 +1813,37 @@ export function sendMetricsAtTimeout(metricsTimeouts: sendMetricsTimeoutVars) {
     }
   };
 }
+
+/**
+ * Metrics for the scale-cycle lambda. Every counter below records a single occurrence
+ * through the inherited `countEntry` method.
+ */
+export class ScaleCycleMetrics extends ScaleUpMetrics {
+  constructor() {
+    super('scaleCycle');
+  }
+
+  /** Counts one reusable runner found, dimensioned by runner type only. */
+  scaleCycleRunnerReuseFound(runnerType: string) {
+    const dimensions = new Map([['RunnerType', runnerType]]);
+    this.countEntry('run.scaleCycle.runnerReuse.found', 1, dimensions);
+  }
+
+  /** Counts one reusable runner found, dimensioned by organization and runner type. */
+  scaleCycleRunnerReuseFoundOrg(org: string, runnerType: string) {
+    const dimensions = new Map([
+      ['Org', org],
+      ['RunnerType', runnerType],
+    ]);
+    this.countEntry('run.scaleCycle.runnerReuse.found.org', 1, dimensions);
+  }
+
+  /** Counts one reusable runner found, dimensioned by repository and runner type. */
+  scaleCycleRunnerReuseFoundRepo(repo: string, runnerType: string) {
+    const dimensions = new Map([
+      ['Repo', repo],
+      ['RunnerType', runnerType],
+    ]);
+    this.countEntry('run.scaleCycle.runnerReuse.found.repo', 1, dimensions);
+  }
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-cycle.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-cycle.ts
new file mode 100644
index 0000000000..65c51feaf3
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-cycle.ts
@@ -0,0 +1,89 @@
+import { Config } from './config';
+import { listRunners, RunnerInputParameters, tryReuseRunner } from './runners';
+import { getRepo, getRepoKey } from './utils';
+import { ScaleCycleMetrics } from './metrics';
+import { getRunnerTypes } from './gh-runners';
+import { createRunnerConfigArgument } from './scale-up';
+
+/**
+ * Runs one scale cycle: lists runners for every configured runner type (passing the
+ * `GithubRunnerID`, `EphemeralRunnerFinished` and `RunnerType` tags as `containsTags`) and
+ * calls `tryReuseRunner` once per listed runner, one after another.
+ *
+ * Runners without a runner type, org or repo are skipped with a warning. An error thrown
+ * by any call propagates to the caller and ends the cycle.
+ *
+ * @param metrics - Collector passed to every helper and used for the reuse counters.
+ */
+export async function scaleCycle(metrics: ScaleCycleMetrics) {
+  // Get runner types configuration first
+  const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo);
+  const runnerTypes = await getRunnerTypes(scaleConfigRepo, metrics);
+
+  // Get all valid runner type names for filtering
+  const validRunnerTypeNames = Array.from(runnerTypes.keys());
+
+  // Make separate calls for each runner type to filter at EC2 level
+  const allRunners = await Promise.all(
+    validRunnerTypeNames.map((runnerTypeName) =>
+      listRunners(metrics, {
+        containsTags: ['GithubRunnerID', 'EphemeralRunnerFinished', 'RunnerType'],
+        runnerType: runnerTypeName,
+      }),
+    ),
+  );
+
+  // Flatten the results
+  const runners = allRunners.flat();
+
+  for (const runner of runners) {
+    // Skip if required fields are missing (org/repo still need to be checked)
+    if (!runner.runnerType || !runner.org || !runner.repo) {
+      console.warn(`Skipping runner ${runner.instanceId} due to missing required tags`);
+      continue;
+    }
+
+    // Get the RunnerType object from the string (we know it exists since we filtered by it)
+    const runnerType = runnerTypes.get(runner.runnerType);
+    if (!runnerType) {
+      console.warn(`Unknown runner type: ${runner.runnerType}, skipping`);
+      continue;
+    }
+
+    // Create repo object
+    const repo = getRepo(runner.org, runner.repo);
+
+    // For each runner send an EBS volume replacement task
+    const runnerInputParameters: RunnerInputParameters = {
+      runnerConfig: (awsRegion: string, experimentalRunner: boolean) => {
+        return createRunnerConfigArgument(
+          runnerType,
+          repo,
+          // NOTE: installationId can actually be undefined here but this may incur lower rate limits
+          // TODO: figure out if we need to pass an actual installationId here
+          undefined,
+          metrics,
+          awsRegion,
+          experimentalRunner,
+        );
+      },
+      environment: Config.Instance.environment,
+      runnerType: runnerType,
+    };
+
+    // Set orgName or repoName based on configuration
+    if (Config.Instance.enableOrganizationRunners) {
+      runnerInputParameters.orgName = runner.org;
+      metrics.scaleCycleRunnerReuseFoundOrg(runner.org, runner.runnerType);
+      console.info(`Reusing runner ${runner.instanceId} for ${runner.org}`);
+    } else {
+      // Compute the repo key once and use it for the parameters, the metric and the log line.
+      const repoKey = getRepoKey(repo);
+      runnerInputParameters.repoName = repoKey;
+      metrics.scaleCycleRunnerReuseFoundRepo(repoKey, runner.runnerType);
+      console.info(`Reusing runner ${runner.instanceId} for ${repoKey}`);
+    }
+
+    await tryReuseRunner(runnerInputParameters, metrics);
+  }
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
index 65752c9138..d5f7f9fb80 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts
@@ -163,7 +163,7 @@ export async function scaleUp(
   }
 }
 
-async function createRunnerConfigArgument(
+export async function createRunnerConfigArgument(
   runnerType: RunnerType,
   repo: Repo,
   installationId: number | undefined,
diff --git a/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json
new file mode 100644
index 0000000000..6152ed8e33
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json
@@ -0,0 +1,44 @@
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "ec2:DescribeInstances",
+        "ec2:DescribeTags",
+        "ec2:RunInstances",
+        "ec2:CreateNetworkInterface",
+        "ec2:DescribeNetworkInterfaces",
+        "ec2:DeleteNetworkInterface",
+        "ec2:DescribeImages",
+        "ec2:CreateTags",
+        "ec2:DeleteTags",
+        "ec2:CreateReplaceRootVolumeTask",
+        "ec2:DescribeReplaceRootVolumeTasks"
+      ],
+      "Resource": ["*"]
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "ec2:CreateTags"
+      ],
+      "Resource": ["*"],
+      "Condition": {
+        "StringEquals": {
+          "ec2:CreateAction" : "RunInstances"
+        }
+      }
+    },
+    {
+      "Effect": "Allow",
+      "Action": "iam:PassRole",
+      "Resource": "${arn_runner_instance_role}"
+    },
+    {
+      "Effect": "Allow",
+      "Action": ["ssm:PutParameter", "ssm:GetParameter", "ssm:DeleteParameter"],
+      "Resource": "*"
+    }
+  ]
+}
diff --git a/terraform-aws-github-runner/modules/runners/scale-cycle.tf b/terraform-aws-github-runner/modules/runners/scale-cycle.tf
new file mode 100644
index 0000000000..7a014038b8
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/scale-cycle.tf
@@ -0,0 +1,187 @@
+# Lets the scale-cycle lambda role decrypt with the configured KMS key (only when encryption is enabled).
+resource "aws_kms_grant" "scale_cycle" {
+  count             = var.encryption.encrypt ? (var.enable_scale_cycle ? 1 : 0) : 0
+  name              = "${var.environment}-scale-cycle"
+  key_id            = var.encryption.kms_key_id
+  grantee_principal = aws_iam_role.scale_cycle[0].arn
+  operations        = ["Decrypt"]
+
+  constraints {
+    encryption_context_equals = {
+      Environment = var.environment
+    }
+  }
+}
+
+# Scale-cycle lambda; runs the "index.scaleCycle" handler from the runners lambda package.
+resource "aws_lambda_function" "scale_cycle" {
+  count             = var.enable_scale_cycle ? 1 : 0
+  s3_bucket         = var.lambda_s3_bucket != null ? var.lambda_s3_bucket : null
+  s3_key            = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null
+  s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null
+  filename          = var.lambda_s3_bucket == null ? local.lambda_zip : null
+  source_code_hash  = var.lambda_s3_bucket == null ? filebase64sha256(local.lambda_zip) : null
+  function_name     = "${var.environment}-scale-cycle"
+  role              = aws_iam_role.scale_cycle[0].arn
+  handler           = "index.scaleCycle"
+  runtime           = "nodejs20.x"
+  timeout           = var.lambda_timeout_scale_cycle
+  tags              = local.tags
+  memory_size       = 2048
+
+  environment {
+    variables = {
+      DATETIME_DEPLOY                      = local.datetime_deploy
+      ENABLE_ORGANIZATION_RUNNERS          = var.enable_organization_runners
+      ENVIRONMENT                          = var.environment
+      GITHUB_APP_CLIENT_ID                 = var.github_app.client_id
+      GITHUB_APP_CLIENT_SECRET             = var.github_app_client_secret
+      GITHUB_APP_ID                        = var.github_app.id
+      GITHUB_APP_KEY_BASE64                = var.github_app_key_base64
+      KMS_KEY_ID                           = var.encryption.kms_key_id
+      LAMBDA_TIMEOUT                       = var.lambda_timeout_scale_cycle
+      LAUNCH_TEMPLATE_NAME_LINUX           = var.launch_template_name_linux
+      LAUNCH_TEMPLATE_NAME_LINUX_ARM64     = var.launch_template_name_linux_arm64
+      LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA    = var.launch_template_name_linux_nvidia
+      LAUNCH_TEMPLATE_NAME_WINDOWS         = var.launch_template_name_windows
+      LAUNCH_TEMPLATE_VERSION_LINUX        = var.launch_template_version_linux
+      LAUNCH_TEMPLATE_VERSION_LINUX_ARM64  = var.launch_template_version_linux_arm64
+      LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = var.launch_template_version_linux_nvidia
+      LAUNCH_TEMPLATE_VERSION_WINDOWS      = var.launch_template_version_windows
+      MINIMUM_RUNNING_TIME_IN_MINUTES      = var.minimum_running_time_in_minutes
+      REDIS_ENDPOINT                       = var.redis_endpoint
+      REDIS_LOGIN                          = var.redis_login
+      RUNNER_EXTRA_LABELS                  = var.runner_extra_labels
+      SCALE_CONFIG_ORG                     = var.scale_config_org
+      SCALE_CONFIG_REPO                    = var.scale_config_repo
+      SCALE_CONFIG_REPO_PATH               = var.scale_config_repo_path
+      SECRETSMANAGER_SECRETS_ID            = var.secretsmanager_secrets_id
+
+      AWS_REGIONS_TO_VPC_IDS = join(
+        ",",
+        sort(distinct([
+          for region_vpc in var.vpc_ids :
+          format("%s|%s", region_vpc.region, region_vpc.vpc)
+        ]))
+      )
+      VPC_ID_TO_SECURITY_GROUP_IDS = join(
+        ",",
+        sort(distinct(concat(
+          [
+            for vpc in var.vpc_ids :
+            format(
+              "%s|%s",
+              vpc.vpc,
+              var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]]
+            )
+          ],
+          [
+            for vpc_subnet in var.vpc_sgs :
+            format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg)
+          ]
+        )))
+      )
+      VPC_ID_TO_SUBNET_IDS = join(
+        ",",
+        sort(distinct([
+          for vpc_subnet in var.subnet_vpc_ids :
+          format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet)
+        ]))
+      )
+      SUBNET_ID_TO_AZ = join(
+        ",",
+        sort(distinct([
+          for subnet_az in var.subnet_azs :
+          format("%s|%s", subnet_az.subnet, subnet_az.az)
+        ]))
+      )
+    }
+  }
+
+  vpc_config {
+    security_group_ids = concat(
+      var.lambda_security_group_ids,
+      [var.runners_security_group_ids[0]]
+    )
+    subnet_ids = var.lambda_subnet_ids
+  }
+}
+
+# Log group for the scale-cycle lambda.
+resource "aws_cloudwatch_log_group" "scale_cycle" {
+  count             = var.enable_scale_cycle ? 1 : 0
+  name              = "/aws/lambda/${aws_lambda_function.scale_cycle[0].function_name}"
+  retention_in_days = var.logging_retention_in_days
+  tags              = var.tags
+}
+
+# Schedule that triggers the scale-cycle lambda.
+resource "aws_cloudwatch_event_rule" "scale_cycle" {
+  count               = var.enable_scale_cycle ? 1 : 0
+  name                = "${var.environment}-scale-cycle-rule"
+  schedule_expression = var.scale_cycle_schedule_expression
+  tags                = var.tags
+}
+
+# Routes the schedule rule to the scale-cycle lambda.
+resource "aws_cloudwatch_event_target" "scale_cycle" {
+  count = var.enable_scale_cycle ? 1 : 0
+  rule  = aws_cloudwatch_event_rule.scale_cycle[0].name
+  arn   = aws_lambda_function.scale_cycle[0].arn
+}
+
+# Allows the schedule rule to invoke the scale-cycle lambda.
+resource "aws_lambda_permission" "scale_cycle" {
+  count         = var.enable_scale_cycle ? 1 : 0
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.scale_cycle[0].function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.scale_cycle[0].arn
+}
+
+# Execution role of the scale-cycle lambda.
+resource "aws_iam_role" "scale_cycle" {
+  count                = var.enable_scale_cycle ? 1 : 0
+  name                 = "${var.environment}-action-scale-cycle-lambda-role"
+  assume_role_policy   = data.aws_iam_policy_document.lambda_assume_role_policy.json
+  path                 = local.role_path
+  permissions_boundary = var.role_permissions_boundary
+  tags                 = local.tags
+}
+
+# EC2, SSM and iam:PassRole permissions rendered from policies/lambda-scale-cycle.json.
+resource "aws_iam_role_policy" "scale_cycle" {
+  count = var.enable_scale_cycle ? 1 : 0
+  name  = "${var.environment}-lambda-scale-cycle-policy"
+  role  = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-scale-cycle.json", {
+    arn_runner_instance_role = var.role_runner_arn
+  })
+}
+
+# Logging policy rendered from policies/lambda-cloudwatch.json for the scale-cycle log group.
+resource "aws_iam_role_policy" "scale_cycle_logging" {
+  count = var.enable_scale_cycle ? 1 : 0
+  name  = "${var.environment}-lambda-logging"
+  role  = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", {
+    log_group_arn = aws_cloudwatch_log_group.scale_cycle[0].arn
+  })
+}
+
+# AWS managed VPC access policy, attached only when lambda subnets are configured.
+resource "aws_iam_role_policy_attachment" "scale_cycle_vpc_execution_role" {
+  count      = length(var.lambda_subnet_ids) > 0 ? (var.enable_scale_cycle ? 1 : 0) : 0
+  role       = aws_iam_role.scale_cycle[0].name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+}
+
+# Secrets Manager policy, created only when a secrets id is configured.
+resource "aws_iam_role_policy" "scale_cycle_secretsmanager_access" {
+  count = var.secretsmanager_secrets_id != null ? (var.enable_scale_cycle ? 1 : 0) : 0
+  role  = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-secretsmanager.json", {
+    secretsmanager_arn = data.aws_secretsmanager_secret_version.app_creds.arn
+  })
+}
diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf
index da1296973a..b0363d6b7f 100644
--- a/terraform-aws-github-runner/modules/runners/variables.tf
+++ b/terraform-aws-github-runner/modules/runners/variables.tf
@@ -100,6 +100,12 @@ variable "scale_up_chron_schedule_expression" {
   default     = "cron(*/10 * * * ? *)" # every 10 minutes
 }
 
+variable "scale_cycle_schedule_expression" {
+  description = "Scheduler expression for runner reuse cycle."
+  type        = string
+  default     = "cron(*/5 * * * ? *)" # every 5 minutes
+}
+
 variable "minimum_running_time_in_minutes" {
   description = "The time an ec2 action runner should be running at minimum before terminated if non busy."
   type        = number
@@ -124,6 +130,12 @@ variable "lambda_timeout_scale_up_chron" {
   default     = 900
 }
 
+variable "lambda_timeout_scale_cycle" {
+  description = "Time out for the scale cycle lambda in seconds."
+  type        = number
+  default     = 900
+}
+
 variable "role_permissions_boundary" {
   description = "Permissions boundary that will be added to the created role for the lambda."
   type        = string
@@ -323,3 +335,9 @@ variable "retry_scale_up_chron_hud_query_url" {
   description = "URL used in scale-up-chron to query HUD for queued jobs, if empty scale up cron will not run."
   type        = string
 }
+
+variable "enable_scale_cycle" {
+  description = "Enable the scale cycle lambda for runner reuse."
+  type        = bool
+  default     = false
+}
diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf
index 72e3eb1154..3692e954e7 100644
--- a/terraform-aws-github-runner/variables.tf
+++ b/terraform-aws-github-runner/variables.tf
@@ -374,6 +374,24 @@ variable "retry_scale_up_chron_hud_query_url" {
   default     = ""
 }
 
+variable "enable_scale_cycle" {
+  description = "Enable the scale cycle lambda for runner reuse."
+  type        = bool
+  default     = false
+}
+
+variable "scale_cycle_schedule_expression" {
+  description = "Scheduler expression for runner reuse cycle."
+  type        = string
+  default     = "cron(*/5 * * * ? *)" # every 5 minutes
+}
+
+variable "lambda_timeout_scale_cycle" {
+  description = "Time out for the scale cycle lambda in seconds."
+  type        = number
+  default     = 900
+}
+
 variable "wiz_secret_arn" {
   description = "ARN of AWS Secrets Manager secret that the runner role should have access to"
   type        = string