diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf
index 6202a95323..c60a0ba658 100644
--- a/terraform-aws-github-runner/main.tf
+++ b/terraform-aws-github-runner/main.tf
@@ -115,6 +115,10 @@ module "runners" {
 
   retry_scale_up_chron_hud_query_url = var.retry_scale_up_chron_hud_query_url
 
+  enable_scale_cycle              = var.enable_scale_cycle
+  scale_cycle_schedule_expression = var.scale_cycle_schedule_expression
+  lambda_timeout_scale_cycle      = var.lambda_timeout_scale_cycle
+
   must_have_issues_labels = var.must_have_issues_labels
   cant_have_issues_labels = var.cant_have_issues_labels
 
diff --git a/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json
new file mode 100644
index 0000000000..6152ed8e33
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-cycle.json
@@ -0,0 +1,32 @@
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "ec2:DescribeInstances",
+        "ec2:DescribeTags",
+        "ec2:RunInstances",
+        "ec2:CreateNetworkInterface",
+        "ec2:DescribeNetworkInterfaces",
+        "ec2:DeleteNetworkInterface",
+        "ec2:DescribeImages",
+        "ec2:CreateTags",
+        "ec2:DeleteTags",
+        "ec2:CreateReplaceRootVolumeTask",
+        "ec2:DescribeReplaceRootVolumeTasks"
+      ],
+      "Resource": ["*"]
+    },
+    {
+      "Effect": "Allow",
+      "Action": "iam:PassRole",
+      "Resource": "${arn_runner_instance_role}"
+    },
+    {
+      "Effect": "Allow",
+      "Action": ["ssm:PutParameter", "ssm:GetParameter", "ssm:DeleteParameter"],
+      "Resource": "*"
+    }
+  ]
+}
diff --git a/terraform-aws-github-runner/modules/runners/scale-cycle.tf b/terraform-aws-github-runner/modules/runners/scale-cycle.tf
new file mode 100644
index 0000000000..7a014038b8
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/scale-cycle.tf
@@ -0,0 +1,196 @@
+# Scale-cycle lambda and its supporting resources: KMS grant, log group,
+# schedule rule/target, invoke permission, and IAM role with its policies.
+# Every resource in this file is created only when var.enable_scale_cycle is set.
+
+# Grants the lambda role Decrypt on the configured KMS key; additionally gated
+# on var.encryption.encrypt.
+resource "aws_kms_grant" "scale_cycle" {
+  count             = var.encryption.encrypt && var.enable_scale_cycle ? 1 : 0
+  name              = "${var.environment}-scale-cycle"
+  key_id            = var.encryption.kms_key_id
+  grantee_principal = aws_iam_role.scale_cycle[0].arn
+  operations        = ["Decrypt"]
+
+  constraints {
+    encryption_context_equals = {
+      Environment = var.environment
+    }
+  }
+}
+
+# Code comes from S3 when var.lambda_s3_bucket is set, otherwise from the local zip.
+resource "aws_lambda_function" "scale_cycle" {
+  count             = var.enable_scale_cycle ? 1 : 0
+  s3_bucket         = var.lambda_s3_bucket
+  s3_key            = var.runners_lambda_s3_key
+  s3_object_version = var.runners_lambda_s3_object_version
+  filename          = var.lambda_s3_bucket == null ? local.lambda_zip : null
+  source_code_hash  = var.lambda_s3_bucket == null ? filebase64sha256(local.lambda_zip) : null
+  function_name     = "${var.environment}-scale-cycle"
+  role              = aws_iam_role.scale_cycle[0].arn
+  handler           = "index.scaleCycle"
+  runtime           = "nodejs20.x"
+  timeout           = var.lambda_timeout_scale_cycle
+  tags              = local.tags
+  memory_size       = 2048
+
+  environment {
+    variables = {
+      DATETIME_DEPLOY                      = local.datetime_deploy
+      ENABLE_ORGANIZATION_RUNNERS          = var.enable_organization_runners
+      ENVIRONMENT                          = var.environment
+      GITHUB_APP_CLIENT_ID                 = var.github_app.client_id
+      GITHUB_APP_CLIENT_SECRET             = var.github_app_client_secret
+      GITHUB_APP_ID                        = var.github_app.id
+      GITHUB_APP_KEY_BASE64                = var.github_app_key_base64
+      KMS_KEY_ID                           = var.encryption.kms_key_id
+      LAMBDA_TIMEOUT                       = var.lambda_timeout_scale_cycle
+      LAUNCH_TEMPLATE_NAME_LINUX           = var.launch_template_name_linux
+      LAUNCH_TEMPLATE_NAME_LINUX_ARM64     = var.launch_template_name_linux_arm64
+      LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA    = var.launch_template_name_linux_nvidia
+      LAUNCH_TEMPLATE_NAME_WINDOWS         = var.launch_template_name_windows
+      LAUNCH_TEMPLATE_VERSION_LINUX        = var.launch_template_version_linux
+      LAUNCH_TEMPLATE_VERSION_LINUX_ARM64  = var.launch_template_version_linux_arm64
+      LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = var.launch_template_version_linux_nvidia
+      LAUNCH_TEMPLATE_VERSION_WINDOWS      = var.launch_template_version_windows
+      MINIMUM_RUNNING_TIME_IN_MINUTES      = var.minimum_running_time_in_minutes
+      REDIS_ENDPOINT                       = var.redis_endpoint
+      REDIS_LOGIN                          = var.redis_login
+      RUNNER_EXTRA_LABELS                  = var.runner_extra_labels
+      SCALE_CONFIG_ORG                     = var.scale_config_org
+      SCALE_CONFIG_REPO                    = var.scale_config_repo
+      SCALE_CONFIG_REPO_PATH               = var.scale_config_repo_path
+      SECRETSMANAGER_SECRETS_ID            = var.secretsmanager_secrets_id
+
+      # Network lookups are passed as comma-separated "key|value" pairs,
+      # de-duplicated and sorted.
+      AWS_REGIONS_TO_VPC_IDS = join(
+        ",",
+        sort(distinct([
+          for region_vpc in var.vpc_ids :
+          format("%s|%s", region_vpc.region, region_vpc.vpc)
+        ]))
+      )
+      VPC_ID_TO_SECURITY_GROUP_IDS = join(
+        ",",
+        sort(distinct(concat(
+          [
+            for vpc in var.vpc_ids :
+            format(
+              "%s|%s",
+              vpc.vpc,
+              var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]]
+            )
+          ],
+          [
+            for vpc_subnet in var.vpc_sgs :
+            format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg)
+          ]
+        )))
+      )
+      VPC_ID_TO_SUBNET_IDS = join(
+        ",",
+        sort(distinct([
+          for vpc_subnet in var.subnet_vpc_ids :
+          format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet)
+        ]))
+      )
+      SUBNET_ID_TO_AZ = join(
+        ",",
+        sort(distinct([
+          for subnet_az in var.subnet_azs :
+          format("%s|%s", subnet_az.subnet, subnet_az.az)
+        ]))
+      )
+    }
+  }
+
+  vpc_config {
+    security_group_ids = concat(
+      var.lambda_security_group_ids,
+      [var.runners_security_group_ids[0]]
+    )
+    subnet_ids = var.lambda_subnet_ids
+  }
+}
+
+# Log group named after the lambda function, with configurable retention.
+resource "aws_cloudwatch_log_group" "scale_cycle" {
+  count             = var.enable_scale_cycle ? 1 : 0
+  name              = "/aws/lambda/${aws_lambda_function.scale_cycle[0].function_name}"
+  retention_in_days = var.logging_retention_in_days
+  tags              = var.tags
+}
+
+# Schedule that triggers the lambda.
+# NOTE(review): the default schedule (every 5 minutes) is shorter than the default
+# lambda timeout (900 s), so invocations may overlap - confirm the handler tolerates it.
+resource "aws_cloudwatch_event_rule" "scale_cycle" {
+  count               = var.enable_scale_cycle ? 1 : 0
+  name                = "${var.environment}-scale-cycle-rule"
+  schedule_expression = var.scale_cycle_schedule_expression
+  tags                = var.tags
+}
+
+# Points the schedule rule at the lambda.
+resource "aws_cloudwatch_event_target" "scale_cycle" {
+  count = var.enable_scale_cycle ? 1 : 0
+  rule  = aws_cloudwatch_event_rule.scale_cycle[0].name
+  arn   = aws_lambda_function.scale_cycle[0].arn
+}
+
+# Allows the schedule rule to invoke the lambda.
+resource "aws_lambda_permission" "scale_cycle" {
+  count         = var.enable_scale_cycle ? 1 : 0
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.scale_cycle[0].function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.scale_cycle[0].arn
+}
+
+# Execution role used by the lambda.
+resource "aws_iam_role" "scale_cycle" {
+  count                = var.enable_scale_cycle ? 1 : 0
+  name                 = "${var.environment}-action-scale-cycle-lambda-role"
+  assume_role_policy   = data.aws_iam_policy_document.lambda_assume_role_policy.json
+  path                 = local.role_path
+  permissions_boundary = var.role_permissions_boundary
+  tags                 = local.tags
+}
+
+# EC2, iam:PassRole and SSM parameter permissions rendered from lambda-scale-cycle.json.
+resource "aws_iam_role_policy" "scale_cycle" {
+  count  = var.enable_scale_cycle ? 1 : 0
+  name   = "${var.environment}-lambda-scale-cycle-policy"
+  role   = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-scale-cycle.json", {
+    arn_runner_instance_role = var.role_runner_arn
+  })
+}
+
+# Logging permissions rendered from lambda-cloudwatch.json for the lambda's log group.
+resource "aws_iam_role_policy" "scale_cycle_logging" {
+  count  = var.enable_scale_cycle ? 1 : 0
+  name   = "${var.environment}-lambda-logging"
+  role   = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", {
+    log_group_arn = aws_cloudwatch_log_group.scale_cycle[0].arn
+  })
+}
+
+# AWS-managed VPC access policy; only attached when lambda subnets are configured.
+resource "aws_iam_role_policy_attachment" "scale_cycle_vpc_execution_role" {
+  count      = length(var.lambda_subnet_ids) > 0 && var.enable_scale_cycle ? 1 : 0
+  role       = aws_iam_role.scale_cycle[0].name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+}
+
+# Secrets Manager access; only created when var.secretsmanager_secrets_id is set.
+resource "aws_iam_role_policy" "scale_cycle_secretsmanager_access" {
+  count  = var.secretsmanager_secrets_id != null && var.enable_scale_cycle ? 1 : 0
+  role   = aws_iam_role.scale_cycle[0].name
+  policy = templatefile("${path.module}/policies/lambda-secretsmanager.json", {
+    secretsmanager_arn = data.aws_secretsmanager_secret_version.app_creds.arn
+  })
+}
diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf
index da1296973a..b0363d6b7f 100644
--- a/terraform-aws-github-runner/modules/runners/variables.tf
+++ b/terraform-aws-github-runner/modules/runners/variables.tf
@@ -100,6 +100,12 @@ variable "scale_up_chron_schedule_expression" {
   default     = "cron(*/10 * * * ? *)" # every 10 minutes
 }
 
+variable "scale_cycle_schedule_expression" {
+  description = "Scheduler expression for the scale cycle lambda (runner reuse via EBS volume replacement)."
+  type        = string
+  default     = "cron(*/5 * * * ? *)" # every 5 minutes
+}
+
 variable "minimum_running_time_in_minutes" {
   description = "The time an ec2 action runner should be running at minimum before terminated if non busy."
   type        = number
@@ -124,6 +130,12 @@ variable "lambda_timeout_scale_up_chron" {
   default     = 900
 }
 
+variable "lambda_timeout_scale_cycle" {
+  description = "Time out for the scale cycle lambda in seconds."
+  type        = number
+  default     = 900
+}
+
 variable "role_permissions_boundary" {
   description = "Permissions boundary that will be added to the created role for the lambda."
   type        = string
@@ -323,3 +335,9 @@ variable "retry_scale_up_chron_hud_query_url" {
   description = "URL used in scale-up-chron to query HUD for queued jobs, if empty scale up cron will not run."
   type        = string
 }
+
+variable "enable_scale_cycle" {
+  description = "Enable the scale cycle lambda (runner reuse via EBS volume replacement)."
+  type        = bool
+  default     = false
+}
diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf
index 72e3eb1154..3692e954e7 100644
--- a/terraform-aws-github-runner/variables.tf
+++ b/terraform-aws-github-runner/variables.tf
@@ -374,6 +374,24 @@ variable "retry_scale_up_chron_hud_query_url" {
   default     = ""
 }
 
+variable "enable_scale_cycle" {
+  description = "Enable the scale cycle lambda (runner reuse via EBS volume replacement)."
+  type        = bool
+  default     = false
+}
+
+variable "scale_cycle_schedule_expression" {
+  description = "Scheduler expression for the scale cycle lambda (runner reuse via EBS volume replacement)."
+  type        = string
+  default     = "cron(*/5 * * * ? *)" # every 5 minutes
+}
+
+variable "lambda_timeout_scale_cycle" {
+  description = "Time out for the scale cycle lambda in seconds."
+  type        = number
+  default     = 900
+}
+
 variable "wiz_secret_arn" {
   description = "ARN of AWS Secrets Manager secret that the runner role should have access to"
   type        = string