Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion lambdas/functions/control-plane/src/pool/pool.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ beforeEach(() => {
process.env.GITHUB_APP_ID = '1337';
process.env.GITHUB_APP_CLIENT_ID = 'TEST_CLIENT_ID';
process.env.GITHUB_APP_CLIENT_SECRET = 'TEST_CLIENT_SECRET';
process.env.RUNNERS_MAXIMUM_COUNT = '3';
process.env.RUNNERS_MAXIMUM_COUNT = '-1';
process.env.ENVIRONMENT = 'unit-test-environment';
process.env.ENABLE_ORGANIZATION_RUNNERS = 'true';
process.env.LAUNCH_TEMPLATE_NAME = 'lt-1';
Expand Down Expand Up @@ -360,4 +360,68 @@ describe('Test simple pool.', () => {
);
});
});

describe('Respecting runners_maximum_count', () => {
beforeEach(() => {
(getGitHubEnterpriseApiUrl as ReturnType<typeof vi.fn>).mockReturnValue({
ghesApiUrl: '',
ghesBaseUrl: '',
});
});

it('Should not top up when the total number of running runners is at the maximum.', async () => {
// 4 running runners (2 idle, 1 busy, 1 offline) already meet the maximum, so a large pool size
// must not create more. This is the over-provisioning case from issue #5186.
process.env.RUNNERS_MAXIMUM_COUNT = '4';
await adjust({ poolSize: 10 });
expect(createRunners).not.toHaveBeenCalled();
});

it('Should not top up when the total number of running runners exceeds the maximum.', async () => {
process.env.RUNNERS_MAXIMUM_COUNT = '3';
await adjust({ poolSize: 10 });
expect(createRunners).not.toHaveBeenCalled();
});

it('Should clamp the top-up to the remaining headroom under the maximum.', async () => {
// 4 running runners with a maximum of 6 leaves headroom for 2, even though the pool of 10 and the
// 2 idle runners would otherwise request a top-up of 8.
process.env.RUNNERS_MAXIMUM_COUNT = '6';
await adjust({ poolSize: 10 });
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.anything(),
2,
expect.anything(),
'pool-lambda',
);
});

it('Should top up against the pool size when below the maximum headroom.', async () => {
// Headroom (6 - 4 = 2) is larger than the pool demand (5 - 2 idle = 3 would exceed it, so use a
// pool that stays within headroom): pool of 3 with 2 idle requests 1, which is under the cap.
process.env.RUNNERS_MAXIMUM_COUNT = '6';
await adjust({ poolSize: 3 });
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.anything(),
1,
expect.anything(),
'pool-lambda',
);
});

it('Should ignore the maximum when set to -1 (unlimited).', async () => {
process.env.RUNNERS_MAXIMUM_COUNT = '-1';
// 2 idle of 4 running, pool of 10 tops up with 8 regardless of how many are already running.
await adjust({ poolSize: 10 });
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.anything(),
8,
expect.anything(),
'pool-lambda',
);
});
});
});
20 changes: 19 additions & 1 deletion lambdas/functions/control-plane/src/pool/pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
? validateSsmParameterStoreTags(process.env.SSM_PARAMETER_STORE_TAGS)
: [];
const scaleErrors = JSON.parse(process.env.SCALE_ERRORS) as [string];
// -1 disables the maximum check, matching the scale-up lambda's semantics. Defaults to unlimited
// when unset so the pool keeps its previous behavior on stacks that do not provide the variable.
const maximumRunners = parseInt(process.env.RUNNERS_MAXIMUM_COUNT || '-1');

const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();

Expand All @@ -70,7 +73,22 @@ export async function adjust(event: PoolEvent): Promise<void> {
});

const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses);
const topUp = event.poolSize - numberOfRunnersInPool;
let topUp = event.poolSize - numberOfRunnersInPool;

// The pool must never push the total number of runners (busy + idle) past the configured maximum.
// ec2runners contains every running runner for this type, so its length is the current total and no
// extra API call is needed. Without this clamp the pool keeps topping up against idle-only counts and
// can overshoot runners_maximum_count, while the scale-up lambda correctly refuses to launch.
if (maximumRunners !== -1 && topUp > 0) {
const headroom = maximumRunners - ec2runners.length;
if (topUp > headroom) {
logger.info(
`Capping pool top-up from ${topUp} to ${Math.max(headroom, 0)} to respect the maximum of ` +
`${maximumRunners} runners (currently ${ec2runners.length} running).`,
);
topUp = headroom;
}
}

if (topUp > 0) {
logger.info(`The pool will be topped up with ${topUp} runners.`);
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ module "pool" {
instance_max_spot_price = var.instance_max_spot_price
instance_target_capacity_type = var.instance_target_capacity_type
instance_types = var.instance_types
runners_maximum_count = var.runners_maximum_count
kms_key_arn = local.kms_key_arn
ami_kms_key_arn = local.ami_kms_key_arn
ami_id_ssm_parameter_arn = local.ami_id_ssm_module_managed ? aws_ssm_parameter.runner_ami_id[0].arn : var.ami.id_ssm_parameter_arn
Expand Down
2 changes: 1 addition & 1 deletion modules/runners/pool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ No modules.
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_aws_partition"></a> [aws\_partition](#input\_aws\_partition) | (optional) partition for the arn if not 'aws' | `string` | `"aws"` | no |
| <a name="input_config"></a> [config](#input\_config) | Lookup details in parent module. | <pre>object({<br/> lambda = object({<br/> log_level = string<br/> logging_retention_in_days = number<br/> logging_kms_key_id = string<br/> log_class = string<br/> reserved_concurrent_executions = number<br/> s3_bucket = string<br/> s3_key = string<br/> s3_object_version = string<br/> security_group_ids = list(string)<br/> runtime = string<br/> architecture = string<br/> memory_size = number<br/> timeout = number<br/> zip = string<br/> subnet_ids = list(string)<br/> parameter_store_tags = string<br/> })<br/> tags = map(string)<br/> ghes = object({<br/> url = string<br/> ssl_verify = string<br/> })<br/> github_app_parameters = object({<br/> key_base64 = map(string)<br/> id = map(string)<br/> })<br/> subnet_ids = list(string)<br/> runner = object({<br/> disable_runner_autoupdate = bool<br/> ephemeral = bool<br/> enable_jit_config = bool<br/> enable_on_demand_failover_for_errors = list(string)<br/> scale_errors = list(string)<br/> boot_time_in_minutes = number<br/> labels = list(string)<br/> launch_template = object({<br/> name = string<br/> })<br/> group_name = string<br/> name_prefix = string<br/> pool_owner = string<br/> role = object({<br/> arn = string<br/> })<br/> use_dedicated_host = bool<br/> })<br/> instance_types = list(string)<br/> instance_target_capacity_type = string<br/> instance_allocation_strategy = string<br/> instance_max_spot_price = string<br/> prefix = string<br/> pool = list(object({<br/> schedule_expression = string<br/> schedule_expression_timezone = string<br/> size = number<br/> }))<br/> role_permissions_boundary = string<br/> kms_key_arn = string<br/> ami_kms_key_arn = string<br/> ami_id_ssm_parameter_arn = string<br/> role_path = string<br/> ssm_token_path = string<br/> ssm_config_path = string<br/> ami_id_ssm_parameter_name = string<br/> ami_id_ssm_parameter_read_policy_arn = string<br/> arn_ssm_parameters_path_config = string<br/> lambda_tags = map(string)<br/> user_agent = string<br/> })</pre> | n/a | yes |
| <a name="input_config"></a> [config](#input\_config) | Lookup details in parent module. | <pre>object({<br/> lambda = object({<br/> log_level = string<br/> logging_retention_in_days = number<br/> logging_kms_key_id = string<br/> log_class = string<br/> reserved_concurrent_executions = number<br/> s3_bucket = string<br/> s3_key = string<br/> s3_object_version = string<br/> security_group_ids = list(string)<br/> runtime = string<br/> architecture = string<br/> memory_size = number<br/> timeout = number<br/> zip = string<br/> subnet_ids = list(string)<br/> parameter_store_tags = string<br/> })<br/> tags = map(string)<br/> ghes = object({<br/> url = string<br/> ssl_verify = string<br/> })<br/> github_app_parameters = object({<br/> key_base64 = map(string)<br/> id = map(string)<br/> })<br/> subnet_ids = list(string)<br/> runner = object({<br/> disable_runner_autoupdate = bool<br/> ephemeral = bool<br/> enable_jit_config = bool<br/> enable_on_demand_failover_for_errors = list(string)<br/> scale_errors = list(string)<br/> boot_time_in_minutes = number<br/> labels = list(string)<br/> launch_template = object({<br/> name = string<br/> })<br/> group_name = string<br/> name_prefix = string<br/> pool_owner = string<br/> role = object({<br/> arn = string<br/> })<br/> use_dedicated_host = bool<br/> })<br/> runners_maximum_count = number<br/> instance_types = list(string)<br/> instance_target_capacity_type = string<br/> instance_allocation_strategy = string<br/> instance_max_spot_price = string<br/> prefix = string<br/> pool = list(object({<br/> schedule_expression = string<br/> schedule_expression_timezone = string<br/> size = number<br/> }))<br/> role_permissions_boundary = string<br/> kms_key_arn = string<br/> ami_kms_key_arn = string<br/> ami_id_ssm_parameter_arn = string<br/> role_path = string<br/> ssm_token_path = string<br/> ssm_config_path = string<br/> ami_id_ssm_parameter_name = string<br/> ami_id_ssm_parameter_read_policy_arn = string<br/> arn_ssm_parameters_path_config = string<br/> lambda_tags = map(string)<br/> user_agent = string<br/> })</pre> | n/a | yes |
| <a name="input_tracing_config"></a> [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. | <pre>object({<br/> mode = optional(string, null)<br/> capture_http_requests = optional(bool, false)<br/> capture_error = optional(bool, false)<br/> })</pre> | `{}` | no |

## Outputs
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ resource "aws_lambda_function" "pool" {
RUNNER_GROUP_NAME = var.config.runner.group_name
RUNNER_NAME_PREFIX = var.config.runner.name_prefix
RUNNER_OWNER = var.config.runner.pool_owner
RUNNERS_MAXIMUM_COUNT = var.config.runners_maximum_count
SSM_TOKEN_PATH = var.config.ssm_token_path
SSM_CONFIG_PATH = var.config.ssm_config_path
SUBNET_IDS = join(",", var.config.subnet_ids)
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ variable "config" {
})
use_dedicated_host = bool
})
runners_maximum_count = number
instance_types = list(string)
instance_target_capacity_type = string
instance_allocation_strategy = string
Expand Down