I am trying to start using fargate but my terraform setup seems to be wrong since the public DNS of the ALB returns 503 Service Temporarily Unavailable
. The task never get started and health status is unknown.
Infra setup
Public ALB->ECS service in private subnets with Fargate
I have also created necessary vpc endpoints
NAT gateway : 1 per AZ
ECS
module "ecs_service" {
source = "terraform-aws-modules/ecs/aws//modules/service"
name = var.ecs_service_name
cluster_arn = module.ecs_cluster.arn
runtime_platform = {
operating_system_family = "LINUX"
cpu_architecture = "X86_64"
}
cpu = 1024
memory = 4096
# Container definition(s)
container_definitions = {
ecs-sample = {
cpu = 512
memory = 1024
essential = true
image = "public.ecr.aws/aws-containers/ecsdemo-frontend:latest"
port_mappings = [
{
name = var.ecs_service_name
containerPort = var.ecs_service_port
hostPort = var.ecs_service_port
protocol = "tcp"
}
]
}
}
subnet_ids = module.vpc.private_subnets
security_group_ids = [module.ecs-container-sg.security_group_id]
autoscaling_min_capacity = 1
autoscaling_max_capacity = 5
load_balancer = {
service = {
target_group_arn = element(module.alb.target_group_arns, 0)
container_name = var.ecs_service_name
container_port = var.ecs_service_port
}
}
create_security_group = false
depends_on = [
module.ecs-container-sg,
module.alb
]
}
vars.tf
ecs_service_name = "ecs-sample"
ecs_service_port = 3000
ALB
module "alb" {
source = "terraform-aws-modules/alb/aws"
version = "~> 8.0"
name = "${var.project_name}-alb"
load_balancer_type = "application"
vpc_id = module.vpc.vpc_id
subnets = module.vpc.public_subnets
security_groups = [module.alb-webtraffic-sg.security_group_id]
http_tcp_listeners = [
{
port = 80
protocol = "HTTP"
target_group_index = 0
}
]
target_groups = [
{
name = "${var.ecs_service_name}-blue-tg"
backend_protocol = "HTTP"
backend_port = var.ecs_service_port
target_type = "ip"
deregistration_delay = 10
load_balancing_cross_zone_enabled = false
health_check = {
enabled = true
interval = 5
path = "/"
port = var.ecs_service_port
healthy_threshold = 2
unhealthy_threshold = 3
timeout = 3
protocol = "HTTP"
matcher = "200"
}
}
]
}
Security Groups
module "alb-webtraffic-sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${var.project_name}-alb-webtraffic-sg"
description = "Allow Inbound traffic port 80 & 443 from anywhere"
vpc_id = module.vpc.vpc_id
ingress_cidr_blocks = ["0.0.0.0/0"]
ingress_rules = ["http-80-tcp", "https-443-tcp"]
egress_rules = ["all-all"]
tags = {
Name = "${var.project_name}-sg-webtraffic"
}
}
module "ecs-container-sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${var.project_name}-ecs-container-sg"
description = "Allow Inbound traffic from ALB Security Group"
vpc_id = module.vpc.vpc_id
ingress_with_source_security_group_id = [
{
description = "Allow Inbound traffic from ALB Security Group"
rule = "all-tcp"
source_security_group_id = module.alb-webtraffic-sg.security_group_id
}
]
tags = {
Name = "${var.project_name}-sg-webtraffic"
}
}
module "vpc-sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "vpc-sg"
description = "Allow all traffic within the VPC"
vpc_id = module.vpc.vpc_id
ingress_cidr_blocks = [var.main_vpc_prefix]
ingress_rules = ["all-all"]
egress_rules = ["all-all"]
tags = {
Name = "${var.project_name}-vpc-sg"
}
}
VPC endpoints
module "endpoints" {
source = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
vpc_id = module.vpc.vpc_id
security_group_ids = [module.vpc-sg.security_group_id]
endpoints = {
dynamodb = {
service = "dynamodb",
service_type = "Gateway"
route_table_ids = module.vpc.private_route_table_ids
tags = { Name = "dynamodb-vpc-endpoint" }
},
s3 = {
service = "s3",
service_type = "Gateway"
route_table_ids = module.vpc.private_route_table_ids
tags = { Name = "s3-vpc-endpoint" }
},
ecr = {
# interface endpoint
service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.api"
subnet_ids = module.vpc.private_subnets
private_dns_enabled = true
tags = { Name = "logs-vpc-endpoint" }
},
dkr = {
# interface endpoint
service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.dkr"
subnet_ids = module.vpc.private_subnets
private_dns_enabled = true
tags = { Name = "logs-vpc-endpoint" }
},
logs = {
# interface endpoint
service_name = "com.amazonaws.${data.aws_region.current.name}.logs"
subnet_ids = module.vpc.private_subnets
private_dns_enabled = true
tags = { Name = "logs-vpc-endpoint" }
}
}
}
I don't see any logs being generated in cloudwatch either
Edit It seems that the Task have been stopped for the following reason:
CannotPullContainerError: pull image manifest has been retried 5 time(s): failed to resolve ref public.ecr.aws/aws-containers/ecsdemo-frontend:latest: failed to do request: Head "https://public.ecr.aws/v2/aws-containers/ecsdemo-frontend/manifests/latest": dial tcp 99.83.145.10:443: i/o timeout
I do have a NAT gateway so it should work as expected.
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
name = var.main_vpc_name
cidr = var.main_vpc_prefix
azs = var.availability_zones
private_subnets = var.private_subnets
public_subnets = var.public_subnets
database_subnets = var.database_subnets
elasticache_subnets = var.elasticache_subnets
enable_nat_gateway = true
single_nat_gateway = false
one_nat_gateway_per_az = true
enable_vpn_gateway = false
enable_dns_support = true
enable_dns_hostnames = true
create_database_subnet_group = true
create_database_subnet_route_table = true
create_database_internet_gateway_route = true
}
Edit 2
I have added the extra permission to the ecs task execution role but pulling the ecr image still fails
resource "aws_iam_role_policy_attachment" "task_exec_ecs" {
role = "${module.ecs_service.task_exec_iam_role_name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
This is obviously because ECS cant reach ECR, as opposed to a permission issue which might appear clearly in the logs. My main suspect would be the Security Group of the ECS Service. I noticed you created ecs-container-sg with All inbound from the ALB. Assuming no outbound was rule was created, this might cause the error you are seeing.
Additionally, you might also want to look at the Security Group and the Policy for the VPC endpoint created for ECR and if they appropriate Inbound/Outbound rules defined.