amazon-web-servicesterraformamazon-ecsaws-fargateterraform-aws-modules

ECS with Fargate returns 503


I am trying to start using fargate but my terraform setup seems to be wrong since the public DNS of the ALB returns 503 Service Temporarily Unavailable. The task never get started and health status is unknown.

Infra setup

Public ALB->ECS service in private subnets with Fargate

I have also created necessary vpc endpoints

NAT gateway : 1 per AZ

ECS

module "ecs_service" {
  source = "terraform-aws-modules/ecs/aws//modules/service"

  name = var.ecs_service_name
  cluster_arn = module.ecs_cluster.arn

  runtime_platform = {
    operating_system_family = "LINUX"
    cpu_architecture        = "X86_64"
  }

  cpu    = 1024
  memory = 4096

  # Container definition(s)
  container_definitions = {

    ecs-sample = {
      cpu       = 512
      memory    = 1024
      essential = true
      image     = "public.ecr.aws/aws-containers/ecsdemo-frontend:latest"
      port_mappings = [
        {
          name          = var.ecs_service_name
          containerPort = var.ecs_service_port
          hostPort      = var.ecs_service_port
          protocol      = "tcp"
        }
      ]
    }
  }

  subnet_ids = module.vpc.private_subnets
  security_group_ids = [module.ecs-container-sg.security_group_id]

  autoscaling_min_capacity = 1
  autoscaling_max_capacity = 5
  
  load_balancer = {
    service = {
      target_group_arn = element(module.alb.target_group_arns, 0)
      container_name   = var.ecs_service_name
      container_port   = var.ecs_service_port
    }
  } 

  create_security_group = false

  depends_on = [
   module.ecs-container-sg,
   module.alb
 ]
}

vars.tf

ecs_service_name = "ecs-sample" 
ecs_service_port = 3000

ALB

 module "alb" {
  source  = "terraform-aws-modules/alb/aws"
  version = "~> 8.0"

  name = "${var.project_name}-alb"

  load_balancer_type = "application"

  vpc_id          = module.vpc.vpc_id
  subnets         = module.vpc.public_subnets
  security_groups = [module.alb-webtraffic-sg.security_group_id]

  http_tcp_listeners = [
    {
      port               = 80
      protocol           = "HTTP"
      target_group_index = 0
    }
  ]

 target_groups = [
    {
      name                       = "${var.ecs_service_name}-blue-tg"
      backend_protocol                  = "HTTP"
      backend_port                      = var.ecs_service_port
      target_type                       = "ip"
      deregistration_delay              = 10
      load_balancing_cross_zone_enabled = false
      health_check = {
        enabled             = true
        interval            = 5
        path                = "/"
        port                = var.ecs_service_port
        healthy_threshold   = 2
        unhealthy_threshold = 3
        timeout             = 3
        protocol            = "HTTP"
        matcher             = "200"
      }
    }
  ]

} 

Security Groups

module "alb-webtraffic-sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.project_name}-alb-webtraffic-sg"
  description = "Allow Inbound traffic port 80 & 443 from anywhere"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules       = ["http-80-tcp", "https-443-tcp"]
  egress_rules        = ["all-all"]

  tags = {
    Name = "${var.project_name}-sg-webtraffic"
  }
}

module "ecs-container-sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.project_name}-ecs-container-sg"
  description = "Allow Inbound traffic from ALB Security Group"
  vpc_id      = module.vpc.vpc_id

  ingress_with_source_security_group_id = [
    {
      description              = "Allow Inbound traffic from ALB Security Group"
      rule                     = "all-tcp"
      source_security_group_id = module.alb-webtraffic-sg.security_group_id
    }
  ]

  tags = {
    Name = "${var.project_name}-sg-webtraffic"
  }
}

module "vpc-sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "vpc-sg"
  description = "Allow all traffic within the VPC"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = [var.main_vpc_prefix]
  ingress_rules       = ["all-all"]
  egress_rules        = ["all-all"]

  tags = {
    Name = "${var.project_name}-vpc-sg"
  }
}

VPC endpoints

module "endpoints" {
  source = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"

  vpc_id             = module.vpc.vpc_id
  security_group_ids = [module.vpc-sg.security_group_id]

  endpoints = {
    dynamodb = {
      service         = "dynamodb",
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
      tags            = { Name = "dynamodb-vpc-endpoint" }
    },
    s3 = {
      service         = "s3",
      service_type    = "Gateway"
      route_table_ids = module.vpc.private_route_table_ids
      tags            = { Name = "s3-vpc-endpoint" }
    },
    ecr = {
      # interface endpoint
      service_name        = "com.amazonaws.${data.aws_region.current.name}.ecr.api"
      subnet_ids          = module.vpc.private_subnets
      private_dns_enabled = true
      tags                = { Name = "logs-vpc-endpoint" }
    },
    dkr = {
      # interface endpoint
      service_name        = "com.amazonaws.${data.aws_region.current.name}.ecr.dkr"
      subnet_ids          = module.vpc.private_subnets
      private_dns_enabled = true
      tags                = { Name = "logs-vpc-endpoint" }
    },
    logs = {
      # interface endpoint
      service_name        = "com.amazonaws.${data.aws_region.current.name}.logs"
      subnet_ids          = module.vpc.private_subnets
      private_dns_enabled = true
      tags                = { Name = "logs-vpc-endpoint" }
    }
  }
}

I don't see any logs being generated in cloudwatch either

Edit It seems that the Task have been stopped for the following reason:

CannotPullContainerError: pull image manifest has been retried 5 time(s): failed to resolve ref public.ecr.aws/aws-containers/ecsdemo-frontend:latest: failed to do request: Head "https://public.ecr.aws/v2/aws-containers/ecsdemo-frontend/manifests/latest": dial tcp 99.83.145.10:443: i/o timeout

I do have a NAT gateway so it should work as expected.

module "vpc" {
  source = "terraform-aws-modules/vpc/aws"

  name = var.main_vpc_name
  cidr = var.main_vpc_prefix

  azs                 = var.availability_zones
  private_subnets     = var.private_subnets
  public_subnets      = var.public_subnets
  database_subnets    = var.database_subnets
  elasticache_subnets = var.elasticache_subnets

  enable_nat_gateway     = true
  single_nat_gateway     = false
  one_nat_gateway_per_az = true


  enable_vpn_gateway   = false
  enable_dns_support   = true
  enable_dns_hostnames = true

  create_database_subnet_group           = true
  create_database_subnet_route_table     = true
  create_database_internet_gateway_route = true
}

Edit 2

I have added the extra permission to the ecs task execution role but pulling the ecr image still fails

resource "aws_iam_role_policy_attachment" "task_exec_ecs" {
  role       = "${module.ecs_service.task_exec_iam_role_name}"
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

Solution

  • This is obviously because ECS cant reach ECR, as opposed to a permission issue which might appear clearly in the logs. My main suspect would be the Security Group of the ECS Service. I noticed you created ecs-container-sg with All inbound from the ALB. Assuming no outbound was rule was created, this might cause the error you are seeing.

    Additionally, you might also want to look at the Security Group and the Policy for the VPC endpoint created for ECR and if they appropriate Inbound/Outbound rules defined.