Skip to content

runners: Add terraform module for scale-cycle #6893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: gh/seemethere/3/head
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions terraform-aws-github-runner/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ module "runners" {

retry_scale_up_chron_hud_query_url = var.retry_scale_up_chron_hud_query_url

enable_scale_cycle = var.enable_scale_cycle
scale_cycle_schedule_expression = var.scale_cycle_schedule_expression
lambda_timeout_scale_cycle = var.lambda_timeout_scale_cycle

must_have_issues_labels = var.must_have_issues_labels
cant_have_issues_labels = var.cant_have_issues_labels

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ec2:DescribeTags",
"ec2:RunInstances",
"ec2:CreateNetworkInterface",
"ec2:DescribeNetworkInterfaces",
"ec2:DeleteNetworkInterface",
Comment on lines +10 to +12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these actually needed? I do see that scale up has these as well, but it seems odd that they'd require this permission.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is technically what is needed to create an EC2 instance

Copy link
Collaborator

@zxiiro zxiiro Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is only necessary if we need more complex networking outside of what EC2 instance launch creates when you create a new EC2 instance. Like an instance with multiple NICs connected to more than 1 network.

Edit: I think ec2:RunInstances already creates the default NIC.

"ec2:DescribeImages",
"ec2:CreateTags",
"ec2:DeleteTags",
"ec2:CreateReplaceRootVolumeTask",
"ec2:DescribeReplaceRootVolumeTasks"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this permission needed for? Scale up/down don't use it

],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": [
"ec2:CreateTags"
],
"Resource": ["*"],
"Condition": {
"StringEquals": {
"ec2:CreateAction" : "RunInstances"
}
}
},
{
"Effect": "Allow",
"Action": "iam:PassRole",
"Resource": "${arn_runner_instance_role}"
},
{
"Effect": "Allow",
"Action": ["ssm:PutParameter", "ssm:GetParameter", "ssm:DeleteParameter"],
"Resource": "*"
}
]
}
176 changes: 176 additions & 0 deletions terraform-aws-github-runner/modules/runners/scale-cycle.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# KMS grant allowing the scale-cycle lambda role to decrypt with the configured
# key. Nothing is created unless encryption is on; when it is on, the grant
# follows the scale-cycle feature flag.
resource "aws_kms_grant" "scale_cycle" {
  count = !var.encryption.encrypt ? 0 : (var.enable_scale_cycle ? 1 : 0)

  name   = "${var.environment}-scale-cycle"
  key_id = var.encryption.kms_key_id

  # The grantee is the lambda's execution role; only Decrypt is granted.
  grantee_principal = aws_iam_role.scale_cycle[0].arn
  operations        = ["Decrypt"]

  # The grant only applies to requests carrying this encryption context.
  constraints {
    encryption_context_equals = {
      Environment = var.environment
    }
  }
}

# Lambda function for the "scale cycle" feature (handler `index.scaleCycle`).
# It is created only when var.enable_scale_cycle is true and is invoked on a
# schedule by aws_cloudwatch_event_rule.scale_cycle / its event target.
resource "aws_lambda_function" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  function_name = "${var.environment}-scale-cycle"
  role          = aws_iam_role.scale_cycle[0].arn
  handler       = "index.scaleCycle"
  runtime       = "nodejs20.x"
  timeout       = var.lambda_timeout_scale_cycle
  memory_size   = 2048
  tags          = local.tags

  # Deployment package: taken from S3 when a bucket is configured, otherwise
  # from the local zip. The S3 attributes are passed through as-is (a null
  # value simply leaves the attribute unset).
  s3_bucket         = var.lambda_s3_bucket
  s3_key            = var.runners_lambda_s3_key
  s3_object_version = var.runners_lambda_s3_object_version
  filename          = var.lambda_s3_bucket == null ? local.lambda_zip : null
  source_code_hash  = var.lambda_s3_bucket == null ? filebase64sha256(local.lambda_zip) : null

  environment {
    variables = {
      DATETIME_DEPLOY                      = local.datetime_deploy
      ENABLE_ORGANIZATION_RUNNERS          = var.enable_organization_runners
      ENVIRONMENT                          = var.environment
      GITHUB_APP_CLIENT_ID                 = var.github_app.client_id
      GITHUB_APP_CLIENT_SECRET             = var.github_app_client_secret
      GITHUB_APP_ID                        = var.github_app.id
      GITHUB_APP_KEY_BASE64                = var.github_app_key_base64
      KMS_KEY_ID                           = var.encryption.kms_key_id
      LAMBDA_TIMEOUT                       = var.lambda_timeout_scale_cycle
      LAUNCH_TEMPLATE_NAME_LINUX           = var.launch_template_name_linux
      LAUNCH_TEMPLATE_NAME_LINUX_ARM64     = var.launch_template_name_linux_arm64
      LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA    = var.launch_template_name_linux_nvidia
      LAUNCH_TEMPLATE_NAME_WINDOWS         = var.launch_template_name_windows
      LAUNCH_TEMPLATE_VERSION_LINUX        = var.launch_template_version_linux
      LAUNCH_TEMPLATE_VERSION_LINUX_ARM64  = var.launch_template_version_linux_arm64
      LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = var.launch_template_version_linux_nvidia
      LAUNCH_TEMPLATE_VERSION_WINDOWS      = var.launch_template_version_windows
      MINIMUM_RUNNING_TIME_IN_MINUTES      = var.minimum_running_time_in_minutes
      REDIS_ENDPOINT                       = var.redis_endpoint
      REDIS_LOGIN                          = var.redis_login
      RUNNER_EXTRA_LABELS                  = var.runner_extra_labels
      SCALE_CONFIG_ORG                     = var.scale_config_org
      SCALE_CONFIG_REPO                    = var.scale_config_repo
      SCALE_CONFIG_REPO_PATH               = var.scale_config_repo_path
      SECRETSMANAGER_SECRETS_ID            = var.secretsmanager_secrets_id

      # The four mappings below are serialized as comma-separated "key|value"
      # pairs, de-duplicated and sorted so the rendered value is stable
      # between plans.
      AWS_REGIONS_TO_VPC_IDS = join(
        ",",
        sort(distinct([
          for region_vpc in var.vpc_ids :
          format("%s|%s", region_vpc.region, region_vpc.vpc)
        ]))
      )
      # Each VPC is paired with the runner security group at its index in
      # local.vpc_id_to_idx, plus any explicit pairs from var.vpc_sgs.
      VPC_ID_TO_SECURITY_GROUP_IDS = join(
        ",",
        sort(distinct(concat(
          [
            for vpc in var.vpc_ids :
            format(
              "%s|%s",
              vpc.vpc,
              var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]]
            )
          ],
          [
            for vpc_subnet in var.vpc_sgs :
            format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg)
          ]
        )))
      )
      VPC_ID_TO_SUBNET_IDS = join(
        ",",
        sort(distinct([
          for vpc_subnet in var.subnet_vpc_ids :
          format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet)
        ]))
      )
      SUBNET_ID_TO_AZ = join(
        ",",
        sort(distinct([
          for subnet_az in var.subnet_azs :
          format("%s|%s", subnet_az.subnet, subnet_az.az)
        ]))
      )
    }
  }

  vpc_config {
    # Subnet IDs and security group IDs must be either both populated or both
    # empty. When no lambda subnets are configured (the case the VPC execution
    # role attachment also guards against) pass no security groups, so the
    # function is deployed outside a VPC instead of being rejected.
    security_group_ids = length(var.lambda_subnet_ids) > 0 ? concat(
      var.lambda_security_group_ids,
      [var.runners_security_group_ids[0]]
    ) : []
    subnet_ids = var.lambda_subnet_ids
  }
}

# CloudWatch log group for the scale-cycle lambda, named after the function
# under the /aws/lambda/ prefix. Created only with the feature enabled.
resource "aws_cloudwatch_log_group" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  name = "/aws/lambda/${aws_lambda_function.scale_cycle[0].function_name}"

  # Retention is shared with the module-wide logging setting.
  retention_in_days = var.logging_retention_in_days

  tags = var.tags
}

# Scheduled event rule that drives the scale-cycle lambda; the cadence comes
# from var.scale_cycle_schedule_expression.
resource "aws_cloudwatch_event_rule" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  name = "${var.environment}-scale-cycle-rule"
  tags = var.tags

  schedule_expression = var.scale_cycle_schedule_expression
}

# Connects the schedule rule to the scale-cycle lambda so each scheduled event
# invokes the function.
resource "aws_cloudwatch_event_target" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  # Target: the lambda function.
  arn = aws_lambda_function.scale_cycle[0].arn

  # Source: the schedule rule.
  rule = aws_cloudwatch_event_rule.scale_cycle[0].name
}

# Resource-based permission letting the events service invoke the scale-cycle
# lambda, restricted to the scale-cycle schedule rule.
resource "aws_lambda_permission" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  statement_id  = "AllowExecutionFromCloudWatch"
  function_name = aws_lambda_function.scale_cycle[0].function_name
  action        = "lambda:InvokeFunction"

  # Who may invoke, and from which rule.
  principal  = "events.amazonaws.com"
  source_arn = aws_cloudwatch_event_rule.scale_cycle[0].arn
}

# Execution role for the scale-cycle lambda. The policies defined for it in
# this file attach to this role.
resource "aws_iam_role" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  name = "${var.environment}-action-scale-cycle-lambda-role"
  path = local.role_path
  tags = local.tags

  # Trust policy and optional permissions boundary are shared module inputs.
  assume_role_policy   = data.aws_iam_policy_document.lambda_assume_role_policy.json
  permissions_boundary = var.role_permissions_boundary
}

# Main inline policy for the scale-cycle lambda role, rendered from the
# lambda-scale-cycle.json template.
resource "aws_iam_role_policy" "scale_cycle" {
  count = var.enable_scale_cycle ? 1 : 0

  role = aws_iam_role.scale_cycle[0].name
  name = "${var.environment}-lambda-scale-cycle-policy"

  policy = templatefile(
    "${path.module}/policies/lambda-scale-cycle.json",
    {
      # Substituted into the template's iam:PassRole resource.
      arn_runner_instance_role = var.role_runner_arn
    }
  )
}

# Inline logging policy for the scale-cycle lambda role, rendered from the
# lambda-cloudwatch.json template against the function's own log group.
resource "aws_iam_role_policy" "scale_cycle_logging" {
  count = var.enable_scale_cycle ? 1 : 0

  role = aws_iam_role.scale_cycle[0].name
  name = "${var.environment}-lambda-logging"

  policy = templatefile(
    "${path.module}/policies/lambda-cloudwatch.json",
    {
      log_group_arn = aws_cloudwatch_log_group.scale_cycle[0].arn
    }
  )
}

# Attaches the AWS-managed VPC access execution policy to the scale-cycle role.
# Skipped when no lambda subnets are configured; otherwise it follows the
# scale-cycle feature flag.
resource "aws_iam_role_policy_attachment" "scale_cycle_vpc_execution_role" {
  count = length(var.lambda_subnet_ids) == 0 ? 0 : (var.enable_scale_cycle ? 1 : 0)

  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
  role       = aws_iam_role.scale_cycle[0].name
}

# Inline policy rendered from lambda-secretsmanager.json for the app
# credentials secret. Skipped when no secret ID is configured; otherwise it
# follows the scale-cycle feature flag.
resource "aws_iam_role_policy" "scale_cycle_secretsmanager_access" {
  count = var.secretsmanager_secrets_id == null ? 0 : (var.enable_scale_cycle ? 1 : 0)

  role = aws_iam_role.scale_cycle[0].name

  policy = templatefile(
    "${path.module}/policies/lambda-secretsmanager.json",
    {
      secretsmanager_arn = data.aws_secretsmanager_secret_version.app_creds.arn
    }
  )
}
18 changes: 18 additions & 0 deletions terraform-aws-github-runner/modules/runners/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ variable "scale_up_chron_schedule_expression" {
default = "cron(*/10 * * * ? *)" # every 10 minutes
}

# Schedule for the scale-cycle event rule.
variable "scale_cycle_schedule_expression" {
  type        = string
  default     = "cron(*/5 * * * ? *)" # every 5 minutes
  description = "Scheduler expression for EBS volume replacement cycle."
}

variable "minimum_running_time_in_minutes" {
description = "The time an ec2 action runner should be running at minimum before terminated if non busy."
type = number
Expand All @@ -124,6 +130,12 @@ variable "lambda_timeout_scale_up_chron" {
default = 900
}

# Timeout applied to the scale-cycle lambda; also exported to the function as
# the LAMBDA_TIMEOUT environment variable.
variable "lambda_timeout_scale_cycle" {
  type        = number
  default     = 900
  description = "Time out for the scale cycle lambda in seconds."
}

variable "role_permissions_boundary" {
description = "Permissions boundary that will be added to the created role for the lambda."
type = string
Expand Down Expand Up @@ -323,3 +335,9 @@ variable "retry_scale_up_chron_hud_query_url" {
description = "URL used in scale-up-chron to query HUD for queued jobs, if empty scale up cron will not run."
type = string
}

# Feature flag gating every scale-cycle resource in this module (off by default).
variable "enable_scale_cycle" {
  type        = bool
  default     = false
  description = "Enable the scale cycle lambda for EBS volume replacement."
}
18 changes: 18 additions & 0 deletions terraform-aws-github-runner/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,24 @@ variable "retry_scale_up_chron_hud_query_url" {
default = ""
}

variable "enable_scale_cycle" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: enabling could be inferred based on whether scale_cycle_schedule_expression set

description = "Enable the scale cycle lambda for runner reuse."
type = bool
default = false
}

# Root-level input forwarded to the runners module as its scale-cycle schedule.
variable "scale_cycle_schedule_expression" {
  type        = string
  default     = "cron(*/5 * * * ? *)" # every 5 minutes
  description = "Scheduler expression for runner reuse cycle."
}

# Root-level input forwarded to the runners module as the scale-cycle lambda timeout.
variable "lambda_timeout_scale_cycle" {
  type        = number
  default     = 900
  description = "Time out for the scale cycle lambda in seconds."
}

variable "wiz_secret_arn" {
description = "ARN of AWS Secrets Manager secret that the runner role should have access to"
type = string
Expand Down
Loading