staging -> production: batch queue with on-demand instances #591

Merged: 13 commits, Oct 10, 2024
5 changes: 3 additions & 2 deletions app/models/pydantic/jobs.py
@@ -4,6 +4,7 @@

 from ...settings.globals import (
     AURORA_JOB_QUEUE,
+    ON_DEMAND_COMPUTE_JOB_QUEUE,
     DATA_LAKE_JOB_QUEUE,
     DEFAULT_JOB_DURATION,
     GDAL_PYTHON_JOB_DEFINITION,
@@ -138,9 +139,9 @@ class PixETLJob(Job):
 
 
 class GDALCOGJob(Job):
-    """Use for creating COG files using GDAL Python docker in PixETL queue."""
+    """Use for creating COG files using GDAL Python docker in on-demand compute queue."""
 
-    job_queue = PIXETL_JOB_QUEUE
+    job_queue = ON_DEMAND_COMPUTE_JOB_QUEUE
     job_definition = GDAL_PYTHON_JOB_DEFINITION
     vcpus = 8
     memory = 64000
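For context, a minimal sketch of how a job model like GDALCOGJob typically turns into an actual AWS Batch submission via boto3. The helper name and override shape are illustrative assumptions, not code from this PR; only the queue, definition, and resource values mirror the model above.

import boto3

from app.settings.globals import (
    GDAL_PYTHON_JOB_DEFINITION,
    ON_DEMAND_COMPUTE_JOB_QUEUE,
)

batch = boto3.client("batch")

def submit_cog_job(job_name, command):
    # Hypothetical submission helper: routes COG creation to the new
    # on-demand queue using the values configured on GDALCOGJob.
    return batch.submit_job(
        jobName=job_name,
        jobQueue=ON_DEMAND_COMPUTE_JOB_QUEUE,
        jobDefinition=GDAL_PYTHON_JOB_DEFINITION,
        containerOverrides={
            "command": command,
            "resourceRequirements": [
                {"type": "VCPU", "value": "8"},
                {"type": "MEMORY", "value": "64000"},
            ],
        },
    )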
1 change: 1 addition & 0 deletions app/settings/globals.py
@@ -116,6 +116,7 @@
 MAX_MEM = config("MAX_MEM", cast=int, default=760000)
 PIXETL_JOB_DEFINITION = config("PIXETL_JOB_DEFINITION", cast=str)
 PIXETL_JOB_QUEUE = config("PIXETL_JOB_QUEUE", cast=str)
+ON_DEMAND_COMPUTE_JOB_QUEUE = config("ON_DEMAND_COMPUTE_JOB_QUEUE", cast=str)
 PIXETL_CORES = config("PIXETL_CORES", cast=int, default=48)
 PIXETL_MAX_MEM = config("PIXETL_MAX_MEM", cast=int, default=380000)
 PIXETL_DEFAULT_RESAMPLING = config(
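The config() calls in this module match starlette's Config signature; assuming that is what globals.py imports, the new setting resolves as in this sketch (not repo code):

from starlette.config import Config

config = Config(".env")

# Real environment variables take precedence over values read from .env.
# With no default given, this raises at import time if the variable is
# missing -- the same failure mode as PIXETL_JOB_QUEUE above.
ON_DEMAND_COMPUTE_JOB_QUEUE = config("ON_DEMAND_COMPUTE_JOB_QUEUE", cast=str)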
1 change: 1 addition & 0 deletions docker-compose.dev.yml
@@ -39,6 +39,7 @@ services:
       - TILE_CACHE_CLUSTER=tile_cache_cluster
       - TILE_CACHE_SERVICE=tile_cache_service
       - PIXETL_JOB_QUEUE=pixetl_jq
+      - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq
       - API_URL=http://app_dev:80
       - RASTER_ANALYSIS_LAMBDA_NAME=raster-analysis-tiled_raster_analysis-default
       - RW_API_URL=https://staging-api.resourcewatch.org
1 change: 1 addition & 0 deletions docker-compose.prod.yml
@@ -34,6 +34,7 @@ services:
       - DATA_LAKE_JOB_QUEUE=data_lake_jq
       - TILE_CACHE_JOB_QUEUE=tile_cache_jq
       - PIXETL_JOB_QUEUE=pixetl_jq
+      - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq
       - RASTER_ANALYSIS_LAMBDA_NAME=raster_analysis
       - API_URL="http://app_dev:80"
       - RW_API_URL=https://api.resourcewatch.org
1 change: 1 addition & 0 deletions docker-compose.test.yml
@@ -49,6 +49,7 @@ services:
       - TILE_CACHE_CLUSTER=tile_cache_cluster
       - TILE_CACHE_SERVICE=tile_cache_service
       - PIXETL_JOB_QUEUE=pixetl_jq
+      - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq
       - PIXETL_CORES=1
       - MAX_CORES=1
       - NUM_PROCESSES=1
22 changes: 12 additions & 10 deletions terraform/data.tf
@@ -68,6 +68,7 @@ data "template_file" "container_definition" {
     tile_cache_job_queue        = module.batch_job_queues.tile_cache_job_queue_arn
     pixetl_job_definition       = module.batch_job_queues.pixetl_job_definition_arn
     pixetl_job_queue            = module.batch_job_queues.pixetl_job_queue_arn
+    on_demand_compute_job_queue = module.batch_job_queues.on_demand_compute_job_queue_arn
     raster_analysis_lambda_name = "raster-analysis-tiled_raster_analysis-default"
     raster_analysis_sfn_arn     = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn
     service_url                 = local.service_url
@@ -95,15 +96,16 @@ data "template_file" "container_definition" {
data "template_file" "task_batch_policy" {
template = file("${path.root}/templates/run_batch_policy.json.tmpl")
vars = {
aurora_job_definition_arn = module.batch_job_queues.aurora_job_definition_arn
aurora_job_queue_arn = module.batch_job_queues.aurora_job_queue_arn
aurora_job_queue_fast_arn = module.batch_job_queues.aurora_job_queue_fast_arn
data_lake_job_definition_arn = module.batch_job_queues.data_lake_job_definition_arn
data_lake_job_queue_arn = module.batch_job_queues.data_lake_job_queue_arn
tile_cache_job_definition_arn = module.batch_job_queues.tile_cache_job_definition_arn
tile_cache_job_queue_arn = module.batch_job_queues.tile_cache_job_queue_arn
pixetl_job_definition_arn = module.batch_job_queues.pixetl_job_definition_arn
pixetl_job_queue_arn = module.batch_job_queues.pixetl_job_queue_arn
aurora_job_definition_arn = module.batch_job_queues.aurora_job_definition_arn
aurora_job_queue_arn = module.batch_job_queues.aurora_job_queue_arn
aurora_job_queue_fast_arn = module.batch_job_queues.aurora_job_queue_fast_arn
data_lake_job_definition_arn = module.batch_job_queues.data_lake_job_definition_arn
data_lake_job_queue_arn = module.batch_job_queues.data_lake_job_queue_arn
tile_cache_job_definition_arn = module.batch_job_queues.tile_cache_job_definition_arn
tile_cache_job_queue_arn = module.batch_job_queues.tile_cache_job_queue_arn
pixetl_job_definition_arn = module.batch_job_queues.pixetl_job_definition_arn
pixetl_job_queue_arn = module.batch_job_queues.pixetl_job_queue_arn
on_demand_compute_job_queue_arn = module.batch_job_queues.on_demand_compute_job_queue_arn
}
depends_on = [
module.batch_job_queues.aurora_job_definition,
@@ -190,4 +192,4 @@ data "template_file" "step_function_policy" {
   vars = {
     raster_analysis_state_machine_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn
   }
-}
\ No newline at end of file
+}
35 changes: 29 additions & 6 deletions terraform/main.tf
@@ -174,21 +174,44 @@ module "batch_data_lake_writer" {
   tags                  = local.batch_tags
   use_ephemeral_storage = true
   # SPOT is actually the default, this is just a placeholder until GTC-1791 is done
-  launch_type = "SPOT"
-  instance_types = [
-    "r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge",
-    "r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge",
-    "r5d.large", "r5d.xlarge", "r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge"
-  ]
+  launch_type              = "SPOT"
+  instance_types           = var.data_lake_writer_instance_types
+  compute_environment_name = "data_lake_writer"
 }
 
+module "batch_cogify" {
+  source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/compute_environment?ref=v0.4.2.3"
+  ecs_role_policy_arns = [
+    aws_iam_policy.query_batch_jobs.arn,
+    aws_iam_policy.s3_read_only.arn,
+    data.terraform_remote_state.core.outputs.iam_policy_s3_write_data-lake_arn,
+    data.terraform_remote_state.core.outputs.secrets_postgresql-reader_policy_arn,
+    data.terraform_remote_state.core.outputs.secrets_postgresql-writer_policy_arn,
+    data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_policy_arn
+  ]
+  key_pair           = var.key_pair
+  max_vcpus          = var.data_lake_max_vcpus
+  project            = local.project
+  security_group_ids = [
+    data.terraform_remote_state.core.outputs.default_security_group_id,
+    data.terraform_remote_state.core.outputs.postgresql_security_group_id
+  ]
+  subnets                  = data.terraform_remote_state.core.outputs.private_subnet_ids
+  suffix                   = local.name_suffix
+  tags                     = local.batch_tags
+  use_ephemeral_storage    = true
+  launch_type              = "EC2"
+  instance_types           = var.data_lake_writer_instance_types
+  compute_environment_name = "batch_cogify"
+}
 
 module "batch_job_queues" {
   source                             = "./modules/batch"
   aurora_compute_environment_arn     = module.batch_aurora_writer.arn
   data_lake_compute_environment_arn  = module.batch_data_lake_writer.arn
   pixetl_compute_environment_arn     = module.batch_data_lake_writer.arn
   tile_cache_compute_environment_arn = module.batch_data_lake_writer.arn
+  cogify_compute_environment_arn     = module.batch_cogify.arn
   environment                        = var.environment
   name_suffix                        = local.name_suffix
   project                            = local.project
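One way to sanity-check the new compute environment after terraform apply is to confirm it really runs on on-demand EC2 capacity rather than SPOT. A sketch with boto3; the environment name is illustrative, since the real name is assembled by the gfw-terraform-modules module from project and suffix:

import boto3

batch = boto3.client("batch")

# "batch_cogify" is a placeholder for the actual compute environment name.
resp = batch.describe_compute_environments(computeEnvironments=["batch_cogify"])
for env in resp["computeEnvironments"]:
    # computeResources.type is "EC2" for on-demand instances, "SPOT" otherwise.
    assert env["computeResources"]["type"] == "EC2"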
9 changes: 8 additions & 1 deletion terraform/modules/batch/main.tf
@@ -52,6 +52,13 @@ resource "aws_batch_job_queue" "pixetl" {
   depends_on = [var.pixetl_compute_environment_arn]
 }
 
+resource "aws_batch_job_queue" "on_demand" {
+  name                 = substr("${var.project}-on-demand-job-queue${var.name_suffix}", 0, 64)
+  state                = "ENABLED"
+  priority             = 1
+  compute_environments = [var.cogify_compute_environment_arn]
+  depends_on           = [var.cogify_compute_environment_arn]
+}
 
 resource "aws_batch_job_definition" "tile_cache" {
   name = substr("${var.project}-tile_cache${var.name_suffix}", 0, 64)
@@ -190,4 +197,4 @@ data "template_file" "ecs-task_assume" {
   vars = {
     service = "ecs-tasks"
   }
-}
\ No newline at end of file
+}
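The aws_batch_job_queue resource above maps nearly one-to-one onto the Batch CreateJobQueue API; a rough boto3 equivalent for reference (the queue name and ARN are placeholders):

import boto3

batch = boto3.client("batch")

batch.create_job_queue(
    jobQueueName="gfw-on-demand-job-queue",  # terraform also truncates this to 64 chars via substr()
    state="ENABLED",
    priority=1,
    computeEnvironmentOrder=[
        # Placeholder ARN; in terraform this comes from var.cogify_compute_environment_arn.
        {
            "order": 1,
            "computeEnvironment": "arn:aws:batch:us-east-1:123456789012:compute-environment/batch_cogify",
        },
    ],
)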
6 changes: 5 additions & 1 deletion terraform/modules/batch/outputs.tf
@@ -38,6 +38,10 @@ output "pixetl_job_queue_arn" {
   value = aws_batch_job_queue.pixetl.arn
 }
 
+output "on_demand_compute_job_queue_arn" {
+  value = aws_batch_job_queue.on_demand.arn
+}
+
 output "tile_cache_job_definition_arn" {
   value = aws_batch_job_definition.tile_cache.arn
 }
@@ -48,4 +52,4 @@ output "tile_cache_job_definition" {

output "tile_cache_job_queue_arn" {
value = aws_batch_job_queue.tile_cache.arn
}
}
1 change: 1 addition & 0 deletions terraform/modules/batch/variables.tf
@@ -2,6 +2,7 @@ variable "project" { type = string }
variable "name_suffix" { type = string }
variable "aurora_compute_environment_arn" { type = string }
variable "data_lake_compute_environment_arn" { type = string }
variable "cogify_compute_environment_arn" { type = string }
variable "tile_cache_compute_environment_arn" { type = string }
variable "pixetl_compute_environment_arn" { type = string }
variable "gdal_repository_url" { type = string }
Expand Down
4 changes: 4 additions & 0 deletions terraform/templates/container_definition.json.tmpl
@@ -73,6 +73,10 @@
"name": "PIXETL_JOB_QUEUE",
"value": "${pixetl_job_queue}"
},
{
"name": "ON_DEMAND_COMPUTE_JOB_QUEUE",
"value": "${on_demand_compute_job_queue}"
},
{
"name": "API_URL",
"value": "${service_url}"
Expand Down
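Once the task definition is rendered and deployed, the running API container sees this value as an ordinary environment variable, which is what the config() call in app/settings/globals.py ultimately reads. A trivial sketch:

import os

# Inside the ECS task; raises KeyError if the template variable was not rendered.
queue_name = os.environ["ON_DEMAND_COMPUTE_JOB_QUEUE"]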
4 changes: 3 additions & 1 deletion terraform/templates/run_batch_policy.json.tmpl
@@ -21,7 +21,9 @@
"${tile_cache_job_definition_arn}",

"${pixetl_job_queue_arn}",
"${pixetl_job_definition_arn}"
"${pixetl_job_definition_arn}",

"${on_demand_compute_job_queue_arn}"
]
},
{
Expand Down
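Note that only the new queue ARN is added here; COG jobs reuse the existing GDAL Python job definition, whose ARN is presumably already listed elsewhere in the policy. After rendering, the statement plausibly looks like the following Python sketch; the Action list sits above this hunk and is an assumption:

statement = {
    "Effect": "Allow",
    "Action": ["batch:SubmitJob"],  # assumed; not visible in this hunk
    "Resource": [
        "${tile_cache_job_definition_arn}",
        "${pixetl_job_queue_arn}",
        "${pixetl_job_definition_arn}",
        "${on_demand_compute_job_queue_arn}",  # new: permits submissions to the on-demand queue
    ],
}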
10 changes: 10 additions & 0 deletions terraform/variables.tf
@@ -156,3 +156,13 @@ variable "api_gateway_url" {
description = "The invoke url of the API Gateway stage"
default = ""
}

variable "data_lake_writer_instance_types" {
type = list(string)
description = "memory optimized EC2 instances with local NVMe SSDs for data lake writer batche queues"
default = [
"r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge",
"r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge",
"r5d.large", "r5d.xlarge", "r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge"
]
}
5 changes: 5 additions & 0 deletions tests/conftest.py
@@ -33,6 +33,7 @@
     GDAL_PYTHON_JOB_DEFINITION,
     PIXETL_JOB_DEFINITION,
     PIXETL_JOB_QUEUE,
+    ON_DEMAND_COMPUTE_JOB_QUEUE,
     POSTGRESQL_CLIENT_JOB_DEFINITION,
     TILE_CACHE_BUCKET,
     TILE_CACHE_JOB_DEFINITION,
@@ -167,6 +168,7 @@ def patch_run(self, *k, **kwargs):
"s3_writer", subnet_id, sg_id, iam_arn
)
pixetl_env = aws_mock.add_compute_environment("pixetl", subnet_id, sg_id, iam_arn)
cogify_env = aws_mock.add_compute_environment("cogify", subnet_id, sg_id, iam_arn)

aws_mock.add_job_queue(AURORA_JOB_QUEUE, aurora_writer_env["computeEnvironmentArn"])
aws_mock.add_job_queue(
@@ -175,6 +177,9 @@ def patch_run(self, *k, **kwargs):
     aws_mock.add_job_queue(DATA_LAKE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"])
     aws_mock.add_job_queue(TILE_CACHE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"])
     aws_mock.add_job_queue(PIXETL_JOB_QUEUE, pixetl_env["computeEnvironmentArn"])
+    aws_mock.add_job_queue(
+        ON_DEMAND_COMPUTE_JOB_QUEUE, cogify_env["computeEnvironmentArn"]
+    )
 
     aws_mock.add_job_definition(GDAL_PYTHON_JOB_DEFINITION, "batch_gdal-python_test")
     aws_mock.add_job_definition(