From 509f111e0dcb79aef4abbff67159fccbe953046a Mon Sep 17 00:00:00 2001 From: Jeff French <209994+jeff-french@users.noreply.github.com> Date: Fri, 26 Jun 2026 13:54:22 -0500 Subject: [PATCH] fix(pool): clamp pool top-up to runners_maximum_count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pool lambda (`adjustPool`) topped up purely against `pool_size` and never read `runners_maximum_count`, so under sustained load—where newly created runners immediately become busy and stop counting toward the idle-only pool size—it would launch a fresh `pool_size` batch every cycle with no upper bound, driving total runners far past the configured maximum while the scale-up lambda correctly refused to launch. Clamp the top-up to the remaining headroom under `runners_maximum_count` (busy + idle). `ec2runners` already holds every running runner for the type, so its length is the current total—no extra API call. The env var defaults to `-1` (unlimited), matching scale-up semantics and preserving behavior when unset. Thread the value into the pool lambda's environment via Terraform. Fixes #5186 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../control-plane/src/pool/pool.test.ts | 66 ++++++++++++++++++- .../functions/control-plane/src/pool/pool.ts | 20 +++++- modules/runners/pool.tf | 1 + modules/runners/pool/README.md | 2 +- modules/runners/pool/main.tf | 1 + modules/runners/pool/variables.tf | 1 + 6 files changed, 88 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/pool/pool.test.ts b/lambdas/functions/control-plane/src/pool/pool.test.ts index ee4e36a463..a60d787c11 100644 --- a/lambdas/functions/control-plane/src/pool/pool.test.ts +++ b/lambdas/functions/control-plane/src/pool/pool.test.ts @@ -130,7 +130,7 @@ beforeEach(() => { process.env.GITHUB_APP_ID = '1337'; process.env.GITHUB_APP_CLIENT_ID = 'TEST_CLIENT_ID'; process.env.GITHUB_APP_CLIENT_SECRET = 'TEST_CLIENT_SECRET'; - process.env.RUNNERS_MAXIMUM_COUNT = '3'; + process.env.RUNNERS_MAXIMUM_COUNT = '-1'; process.env.ENVIRONMENT = 'unit-test-environment'; process.env.ENABLE_ORGANIZATION_RUNNERS = 'true'; process.env.LAUNCH_TEMPLATE_NAME = 'lt-1'; @@ -360,4 +360,68 @@ describe('Test simple pool.', () => { ); }); }); + + describe('Respecting runners_maximum_count', () => { + beforeEach(() => { + (getGitHubEnterpriseApiUrl as ReturnType).mockReturnValue({ + ghesApiUrl: '', + ghesBaseUrl: '', + }); + }); + + it('Should not top up when the total number of running runners is at the maximum.', async () => { + // 4 running runners (2 idle, 1 busy, 1 offline) already meet the maximum, so a large pool size + // must not create more. This is the over-provisioning case from issue #5186. + process.env.RUNNERS_MAXIMUM_COUNT = '4'; + await adjust({ poolSize: 10 }); + expect(createRunners).not.toHaveBeenCalled(); + }); + + it('Should not top up when the total number of running runners exceeds the maximum.', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '3'; + await adjust({ poolSize: 10 }); + expect(createRunners).not.toHaveBeenCalled(); + }); + + it('Should clamp the top-up to the remaining headroom under the maximum.', async () => { + // 4 running runners with a maximum of 6 leaves headroom for 2, even though the pool of 10 and the + // 2 idle runners would otherwise request a top-up of 8. + process.env.RUNNERS_MAXIMUM_COUNT = '6'; + await adjust({ poolSize: 10 }); + expect(createRunners).toHaveBeenCalledWith( + expect.anything(), + expect.anything(), + 2, + expect.anything(), + 'pool-lambda', + ); + }); + + it('Should top up against the pool size when below the maximum headroom.', async () => { + // Headroom (6 - 4 = 2) is larger than the pool demand (5 - 2 idle = 3 would exceed it, so use a + // pool that stays within headroom): pool of 3 with 2 idle requests 1, which is under the cap. + process.env.RUNNERS_MAXIMUM_COUNT = '6'; + await adjust({ poolSize: 3 }); + expect(createRunners).toHaveBeenCalledWith( + expect.anything(), + expect.anything(), + 1, + expect.anything(), + 'pool-lambda', + ); + }); + + it('Should ignore the maximum when set to -1 (unlimited).', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '-1'; + // 2 idle of 4 running, pool of 10 tops up with 8 regardless of how many are already running. + await adjust({ poolSize: 10 }); + expect(createRunners).toHaveBeenCalledWith( + expect.anything(), + expect.anything(), + 8, + expect.anything(), + 'pool-lambda', + ); + }); + }); }); diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index cece8d9951..202bd8103c 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -47,6 +47,9 @@ export async function adjust(event: PoolEvent): Promise { ? validateSsmParameterStoreTags(process.env.SSM_PARAMETER_STORE_TAGS) : []; const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string]; + // -1 disables the maximum check, matching the scale-up lambda's semantics. Defaults to unlimited + // when unset so the pool keeps its previous behavior on stacks that do not provide the variable. + const maximumRunners = parseInt(process.env.RUNNERS_MAXIMUM_COUNT || '-1'); const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl(); @@ -70,7 +73,22 @@ export async function adjust(event: PoolEvent): Promise { }); const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses); - const topUp = event.poolSize - numberOfRunnersInPool; + let topUp = event.poolSize - numberOfRunnersInPool; + + // The pool must never push the total number of runners (busy + idle) past the configured maximum. + // ec2runners contains every running runner for this type, so its length is the current total and no + // extra API call is needed. Without this clamp the pool keeps topping up against idle-only counts and + // can overshoot runners_maximum_count, while the scale-up lambda correctly refuses to launch. + if (maximumRunners !== -1 && topUp > 0) { + const headroom = maximumRunners - ec2runners.length; + if (topUp > headroom) { + logger.info( + `Capping pool top-up from ${topUp} to ${Math.max(headroom, 0)} to respect the maximum of ` + + `${maximumRunners} runners (currently ${ec2runners.length} running).`, + ); + topUp = headroom; + } + } if (topUp > 0) { logger.info(`The pool will be topped up with ${topUp} runners.`); diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf index ae82f3f127..3e274a46c2 100644 --- a/modules/runners/pool.tf +++ b/modules/runners/pool.tf @@ -15,6 +15,7 @@ module "pool" { instance_max_spot_price = var.instance_max_spot_price instance_target_capacity_type = var.instance_target_capacity_type instance_types = var.instance_types + runners_maximum_count = var.runners_maximum_count kms_key_arn = local.kms_key_arn ami_kms_key_arn = local.ami_kms_key_arn ami_id_ssm_parameter_arn = local.ami_id_ssm_module_managed ? aws_ssm_parameter.runner_ami_id[0].arn : var.ami.id_ssm_parameter_arn diff --git a/modules/runners/pool/README.md b/modules/runners/pool/README.md index 560ad7f763..401c7f2cc8 100644 --- a/modules/runners/pool/README.md +++ b/modules/runners/pool/README.md @@ -49,7 +49,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [aws\_partition](#input\_aws\_partition) | (optional) partition for the arn if not 'aws' | `string` | `"aws"` | no | -| [config](#input\_config) | Lookup details in parent module. |
object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
log_class = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
parameter_store_tags = string
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
use_dedicated_host = bool
})
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
})
| n/a | yes | +| [config](#input\_config) | Lookup details in parent module. |
object({
lambda = object({
log_level = string
logging_retention_in_days = number
logging_kms_key_id = string
log_class = string
reserved_concurrent_executions = number
s3_bucket = string
s3_key = string
s3_object_version = string
security_group_ids = list(string)
runtime = string
architecture = string
memory_size = number
timeout = number
zip = string
subnet_ids = list(string)
parameter_store_tags = string
})
tags = map(string)
ghes = object({
url = string
ssl_verify = string
})
github_app_parameters = object({
key_base64 = map(string)
id = map(string)
})
subnet_ids = list(string)
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
name = string
})
group_name = string
name_prefix = string
pool_owner = string
role = object({
arn = string
})
use_dedicated_host = bool
})
runners_maximum_count = number
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
instance_max_spot_price = string
prefix = string
pool = list(object({
schedule_expression = string
schedule_expression_timezone = string
size = number
}))
role_permissions_boundary = string
kms_key_arn = string
ami_kms_key_arn = string
ami_id_ssm_parameter_arn = string
role_path = string
ssm_token_path = string
ssm_config_path = string
ami_id_ssm_parameter_name = string
ami_id_ssm_parameter_read_policy_arn = string
arn_ssm_parameters_path_config = string
lambda_tags = map(string)
user_agent = string
})
| n/a | yes | | [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. |
object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
})
| `{}` | no | ## Outputs diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf index 8f70713f9e..ec8b50bbd5 100644 --- a/modules/runners/pool/main.tf +++ b/modules/runners/pool/main.tf @@ -39,6 +39,7 @@ resource "aws_lambda_function" "pool" { RUNNER_GROUP_NAME = var.config.runner.group_name RUNNER_NAME_PREFIX = var.config.runner.name_prefix RUNNER_OWNER = var.config.runner.pool_owner + RUNNERS_MAXIMUM_COUNT = var.config.runners_maximum_count SSM_TOKEN_PATH = var.config.ssm_token_path SSM_CONFIG_PATH = var.config.ssm_config_path SUBNET_IDS = join(",", var.config.subnet_ids) diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf index a613c5563b..86ad585e09 100644 --- a/modules/runners/pool/variables.tf +++ b/modules/runners/pool/variables.tf @@ -48,6 +48,7 @@ variable "config" { }) use_dedicated_host = bool }) + runners_maximum_count = number instance_types = list(string) instance_target_capacity_type = string instance_allocation_strategy = string