From 509f111e0dcb79aef4abbff67159fccbe953046a Mon Sep 17 00:00:00 2001
From: Jeff French <209994+jeff-french@users.noreply.github.com>
Date: Fri, 26 Jun 2026 13:54:22 -0500
Subject: [PATCH] fix(pool): clamp pool top-up to runners_maximum_count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pool lambda (`adjustPool`) topped up purely against `pool_size` and
never read `runners_maximum_count`, so under sustained load—where newly
created runners immediately become busy and stop counting toward the
idle-only pool size—it would launch a fresh `pool_size` batch every cycle
with no upper bound, driving total runners far past the configured maximum
while the scale-up lambda correctly refused to launch.

Clamp the top-up to the remaining headroom under `runners_maximum_count`
(busy + idle). `ec2runners` already holds every running runner for the
type, so its length is the current total—no extra API call. The env var
defaults to `-1` (unlimited), matching scale-up semantics and preserving
behavior when unset. Thread the value into the pool lambda's environment
via Terraform.

Fixes #5186

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../control-plane/src/pool/pool.test.ts       | 66 ++++++++++++++++++-
 .../functions/control-plane/src/pool/pool.ts  | 20 +++++-
 modules/runners/pool.tf                       |  1 +
 modules/runners/pool/README.md                |  2 +-
 modules/runners/pool/main.tf                  |  1 +
 modules/runners/pool/variables.tf             |  1 +
 6 files changed, 88 insertions(+), 3 deletions(-)
diff --git a/lambdas/functions/control-plane/src/pool/pool.test.ts b/lambdas/functions/control-plane/src/pool/pool.test.ts
index ee4e36a463..a60d787c11 100644
--- a/lambdas/functions/control-plane/src/pool/pool.test.ts
+++ b/lambdas/functions/control-plane/src/pool/pool.test.ts
@@ -130,7 +130,7 @@ beforeEach(() => {
   process.env.GITHUB_APP_ID = '1337';
   process.env.GITHUB_APP_CLIENT_ID = 'TEST_CLIENT_ID';
   process.env.GITHUB_APP_CLIENT_SECRET = 'TEST_CLIENT_SECRET';
-  process.env.RUNNERS_MAXIMUM_COUNT = '3';
+  process.env.RUNNERS_MAXIMUM_COUNT = '-1';
   process.env.ENVIRONMENT = 'unit-test-environment';
   process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
   process.env.LAUNCH_TEMPLATE_NAME = 'lt-1';
@@ -360,4 +360,68 @@ describe('Test simple pool.', () => {
       );
     });
   });
+
+  describe('Respecting runners_maximum_count', () => {
+    beforeEach(() => {
+      (getGitHubEnterpriseApiUrl as ReturnType<typeof vi.fn>).mockReturnValue({
+        ghesApiUrl: '',
+        ghesBaseUrl: '',
+      });
+    });
+
+    it('Should not top up when the total number of running runners is at the maximum.', async () => {
+      // 4 running runners (2 idle, 1 busy, 1 offline) already meet the maximum, so a large pool size
+      // must not create more. This is the over-provisioning case from issue #5186.
+      process.env.RUNNERS_MAXIMUM_COUNT = '4';
+      await adjust({ poolSize: 10 });
+      expect(createRunners).not.toHaveBeenCalled();
+    });
+
+    it('Should not top up when the total number of running runners exceeds the maximum.', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '3';
+      await adjust({ poolSize: 10 });
+      expect(createRunners).not.toHaveBeenCalled();
+    });
+
+    it('Should clamp the top-up to the remaining headroom under the maximum.', async () => {
+      // 4 running runners with a maximum of 6 leaves headroom for 2, even though the pool of 10 and the
+      // 2 idle runners would otherwise request a top-up of 8.
+      process.env.RUNNERS_MAXIMUM_COUNT = '6';
+      await adjust({ poolSize: 10 });
+      expect(createRunners).toHaveBeenCalledWith(
+        expect.anything(),
+        expect.anything(),
+        2,
+        expect.anything(),
+        'pool-lambda',
+      );
+    });
+
+    it('Should top up against the pool size when below the maximum headroom.', async () => {
+      // Headroom (6 - 4 = 2) is larger than the pool demand (5 - 2 idle = 3 would exceed it, so use a
+      // pool that stays within headroom): pool of 3 with 2 idle requests 1, which is under the cap.
+      process.env.RUNNERS_MAXIMUM_COUNT = '6';
+      await adjust({ poolSize: 3 });
+      expect(createRunners).toHaveBeenCalledWith(
+        expect.anything(),
+        expect.anything(),
+        1,
+        expect.anything(),
+        'pool-lambda',
+      );
+    });
+
+    it('Should ignore the maximum when set to -1 (unlimited).', async () => {
+      process.env.RUNNERS_MAXIMUM_COUNT = '-1';
+      // 2 idle of 4 running, pool of 10 tops up with 8 regardless of how many are already running.
+      await adjust({ poolSize: 10 });
+      expect(createRunners).toHaveBeenCalledWith(
+        expect.anything(),
+        expect.anything(),
+        8,
+        expect.anything(),
+        'pool-lambda',
+      );
+    });
+  });
 });
diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts
index cece8d9951..202bd8103c 100644
--- a/lambdas/functions/control-plane/src/pool/pool.ts
+++ b/lambdas/functions/control-plane/src/pool/pool.ts
@@ -47,6 +47,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
       ? validateSsmParameterStoreTags(process.env.SSM_PARAMETER_STORE_TAGS)
       : [];
   const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
+  // -1 disables the maximum check, matching the scale-up lambda's semantics. Defaults to unlimited
+  // when unset so the pool keeps its previous behavior on stacks that do not provide the variable.
+  const maximumRunners = parseInt(process.env.RUNNERS_MAXIMUM_COUNT || '-1');
 
   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
 
@@ -70,7 +73,22 @@ export async function adjust(event: PoolEvent): Promise<void> {
   });
 
   const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses);
-  const topUp = event.poolSize - numberOfRunnersInPool;
+  let topUp = event.poolSize - numberOfRunnersInPool;
+
+  // The pool must never push the total number of runners (busy + idle) past the configured maximum.
+  // ec2runners contains every running runner for this type, so its length is the current total and no
+  // extra API call is needed. Without this clamp the pool keeps topping up against idle-only counts and
+  // can overshoot runners_maximum_count, while the scale-up lambda correctly refuses to launch.
+  if (maximumRunners !== -1 && topUp > 0) {
+    const headroom = maximumRunners - ec2runners.length;
+    if (topUp > headroom) {
+      logger.info(
+        `Capping pool top-up from ${topUp} to ${Math.max(headroom, 0)} to respect the maximum of ` +
+          `${maximumRunners} runners (currently ${ec2runners.length} running).`,
+      );
+      topUp = headroom;
+    }
+  }
 
   if (topUp > 0) {
     logger.info(`The pool will be topped up with ${topUp} runners.`);
diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf
index ae82f3f127..3e274a46c2 100644
--- a/modules/runners/pool.tf
+++ b/modules/runners/pool.tf
@@ -15,6 +15,7 @@ module "pool" {
     instance_max_spot_price       = var.instance_max_spot_price
     instance_target_capacity_type = var.instance_target_capacity_type
     instance_types                = var.instance_types
+    runners_maximum_count         = var.runners_maximum_count
     kms_key_arn                   = local.kms_key_arn
     ami_kms_key_arn               = local.ami_kms_key_arn
     ami_id_ssm_parameter_arn      = local.ami_id_ssm_module_managed ? aws_ssm_parameter.runner_ami_id[0].arn : var.ami.id_ssm_parameter_arn
diff --git a/modules/runners/pool/README.md b/modules/runners/pool/README.md
index 560ad7f763..401c7f2cc8 100644
--- a/modules/runners/pool/README.md
+++ b/modules/runners/pool/README.md
@@ -49,7 +49,7 @@ No modules.
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
 | <a name="input_aws_partition"></a> [aws\_partition](#input\_aws\_partition) | (optional) partition for the arn if not 'aws' | `string` | `"aws"` | no |
-| <a name="input_config"></a> [config](#input\_config) | Lookup details in parent module. | <pre>object({<br/>    lambda = object({<br/>      log_level                      = string<br/>      logging_retention_in_days      = number<br/>      logging_kms_key_id             = string<br/>      log_class                      = string<br/>      reserved_concurrent_executions = number<br/>      s3_bucket                      = string<br/>      s3_key                         = string<br/>      s3_object_version              = string<br/>      security_group_ids             = list(string)<br/>      runtime                        = string<br/>      architecture                   = string<br/>      memory_size                    = number<br/>      timeout                        = number<br/>      zip                            = string<br/>      subnet_ids                     = list(string)<br/>      parameter_store_tags           = string<br/>    })<br/>    tags = map(string)<br/>    ghes = object({<br/>      url        = string<br/>      ssl_verify = string<br/>    })<br/>    github_app_parameters = object({<br/>      key_base64 = map(string)<br/>      id         = map(string)<br/>    })<br/>    subnet_ids = list(string)<br/>    runner = object({<br/>      disable_runner_autoupdate            = bool<br/>      ephemeral                            = bool<br/>      enable_jit_config                    = bool<br/>      enable_on_demand_failover_for_errors = list(string)<br/>      scale_errors                         = list(string)<br/>      boot_time_in_minutes                 = number<br/>      labels                               = list(string)<br/>      launch_template = object({<br/>        name = string<br/>      })<br/>      group_name  = string<br/>      name_prefix = string<br/>      pool_owner  = string<br/>      role = object({<br/>        arn = string<br/>      })<br/>      use_dedicated_host = bool<br/>    })<br/>    instance_types                = list(string)<br/>    instance_target_capacity_type = string<br/>    instance_allocation_strategy  = string<br/>    instance_max_spot_price       = string<br/>    prefix                        = string<br/>    pool = list(object({<br/>      schedule_expression          = string<br/>      schedule_expression_timezone = string<br/>      size                         = number<br/>    }))<br/>    role_permissions_boundary            = string<br/>    kms_key_arn                          = string<br/>    ami_kms_key_arn                      = string<br/>    ami_id_ssm_parameter_arn             = string<br/>    role_path                            = string<br/>    ssm_token_path                       = string<br/>    ssm_config_path                      = string<br/>    ami_id_ssm_parameter_name            = string<br/>    ami_id_ssm_parameter_read_policy_arn = string<br/>    arn_ssm_parameters_path_config       = string<br/>    lambda_tags                          = map(string)<br/>    user_agent                           = string<br/>  })</pre> | n/a | yes |
+| <a name="input_config"></a> [config](#input\_config) | Lookup details in parent module. | <pre>object({<br/>    lambda = object({<br/>      log_level                      = string<br/>      logging_retention_in_days      = number<br/>      logging_kms_key_id             = string<br/>      log_class                      = string<br/>      reserved_concurrent_executions = number<br/>      s3_bucket                      = string<br/>      s3_key                         = string<br/>      s3_object_version              = string<br/>      security_group_ids             = list(string)<br/>      runtime                        = string<br/>      architecture                   = string<br/>      memory_size                    = number<br/>      timeout                        = number<br/>      zip                            = string<br/>      subnet_ids                     = list(string)<br/>      parameter_store_tags           = string<br/>    })<br/>    tags = map(string)<br/>    ghes = object({<br/>      url        = string<br/>      ssl_verify = string<br/>    })<br/>    github_app_parameters = object({<br/>      key_base64 = map(string)<br/>      id         = map(string)<br/>    })<br/>    subnet_ids = list(string)<br/>    runner = object({<br/>      disable_runner_autoupdate            = bool<br/>      ephemeral                            = bool<br/>      enable_jit_config                    = bool<br/>      enable_on_demand_failover_for_errors = list(string)<br/>      scale_errors                         = list(string)<br/>      boot_time_in_minutes                 = number<br/>      labels                               = list(string)<br/>      launch_template = object({<br/>        name = string<br/>      })<br/>      group_name  = string<br/>      name_prefix = string<br/>      pool_owner  = string<br/>      role = object({<br/>        arn = string<br/>      })<br/>      use_dedicated_host = bool<br/>    })<br/>    runners_maximum_count         = number<br/>    instance_types                = list(string)<br/>    instance_target_capacity_type = string<br/>    instance_allocation_strategy  = string<br/>    instance_max_spot_price       = string<br/>    prefix                        = string<br/>    pool = list(object({<br/>      schedule_expression          = string<br/>      schedule_expression_timezone = string<br/>      size                         = number<br/>    }))<br/>    role_permissions_boundary            = string<br/>    kms_key_arn                          = string<br/>    ami_kms_key_arn                      = string<br/>    ami_id_ssm_parameter_arn             = string<br/>    role_path                            = string<br/>    ssm_token_path                       = string<br/>    ssm_config_path                      = string<br/>    ami_id_ssm_parameter_name            = string<br/>    ami_id_ssm_parameter_read_policy_arn = string<br/>    arn_ssm_parameters_path_config       = string<br/>    lambda_tags                          = map(string)<br/>    user_agent                           = string<br/>  })</pre> | n/a | yes |
 | <a name="input_tracing_config"></a> [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. | <pre>object({<br/>    mode                  = optional(string, null)<br/>    capture_http_requests = optional(bool, false)<br/>    capture_error         = optional(bool, false)<br/>  })</pre> | `{}` | no |
 
 ## Outputs
diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf
index 8f70713f9e..ec8b50bbd5 100644
--- a/modules/runners/pool/main.tf
+++ b/modules/runners/pool/main.tf
@@ -39,6 +39,7 @@ resource "aws_lambda_function" "pool" {
       RUNNER_GROUP_NAME                        = var.config.runner.group_name
       RUNNER_NAME_PREFIX                       = var.config.runner.name_prefix
       RUNNER_OWNER                             = var.config.runner.pool_owner
+      RUNNERS_MAXIMUM_COUNT                    = var.config.runners_maximum_count
       SSM_TOKEN_PATH                           = var.config.ssm_token_path
       SSM_CONFIG_PATH                          = var.config.ssm_config_path
       SUBNET_IDS                               = join(",", var.config.subnet_ids)
diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf
index a613c5563b..86ad585e09 100644
--- a/modules/runners/pool/variables.tf
+++ b/modules/runners/pool/variables.tf
@@ -48,6 +48,7 @@ variable "config" {
       })
       use_dedicated_host = bool
     })
+    runners_maximum_count         = number
     instance_types                = list(string)
     instance_target_capacity_type = string
     instance_allocation_strategy  = string