My personal blog

How to handle EC2 Spot Instance Interruption

2020.09.21

TL;DR: this post was inspired by the official AWS documentation on Spot interruption handling. It is only a note to myself; all credit goes to the AWS engineering team. This tutorial only covers EC2 instances that are managed by an Auto Scaling group.

Previously, we talked about the average cost comparison between on-demand and Spot instances. In summary, we can save up to 90% of the regular on-demand cost by using Spot.

However, this cost saving comes at the expense of another aspect: reliability. When we use Spot, we are essentially borrowing unused physical capacity to run our EC2 instance, so AWS can take it back as soon as an on-demand request needs that capacity. Hence, our Spot instance can be interrupted at any time.

Luckily, AWS provides a 2-minute warning via a CloudWatch Event before the interruption happens. So we can at least make some preparations to gracefully stop our service and avoid downtime and transaction failures.

Workflow

So here is the sequence of the workflow:

  1. Two minutes before the interruption happens, CloudWatch emits an event containing the instance-id of the instance that will be interrupted
  2. Lambda consumes the event payload and executes the detach operation against the Auto Scaling group
  3. The Auto Scaling group detaches the instance from the ALB target group to drain the existing connections and ensure that no new traffic is routed to the instance
  4. Once the operation has completed, the instance is removed from the Auto Scaling group and can be safely terminated / stopped / hibernated by AWS

You can find a detailed explanation in the source code:

  • IAM Role and Policy for Lambda
# Permissions required by the spot-interruption Lambda: writing its own logs
# and inspecting / modifying the Auto Scaling group of the interrupted instance.
data "aws_iam_policy_document" "lambda_policy" {

  # Allow the function to create and write to its CloudWatch log group.
  statement {
    effect = "Allow"

    actions = [
      "logs:CreateLogGroup",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]

    # NOTE(review): depending on the AWS provider version, the log group ARN
    # may or may not end with ":*"; log-stream level actions need the ":*"
    # form - confirm against the provider version in use.
    resources = [
      aws_cloudwatch_log_group.this.arn,
    ]
  }

  # Allow the function to look up, resize and detach from Auto Scaling groups.
  statement {
    effect = "Allow"

    actions = [
      # Needed by the handler's waiter, which polls DescribeAutoScalingGroups
      # until the scaled-out group is in service.
      "autoscaling:DescribeAutoScalingGroups",
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DetachInstances",
      "autoscaling:SetDesiredCapacity",
    ]

    resources = [
      "*",
    ]
  }
}

# Execution role assumed by the spot-interruption Lambda function.
resource "aws_iam_role" "this" {
  # Identity
  name        = var.service_name
  description = "IAM Role of ${var.service_name}"
  path        = "/"

  # Trust policy and session settings.
  # NOTE(review): the trust policy comes from a `lambda` policy document that
  # is defined outside this snippet - confirm it allows lambda.amazonaws.com.
  assume_role_policy    = data.aws_iam_policy_document.lambda.json
  max_session_duration  = 3600
  force_detach_policies = true

  tags = {
    Name        = var.service_name
    Description = "IAM Role of ${var.service_name}"
    Environment = var.environment
    ManagedBy   = "terraform"
  }
}
  • Lambda resource definition
# Lambda function that handles EC2 Spot interruption warnings.
resource "aws_lambda_function" "this" {
  # Identity
  function_name = var.service_name
  description   = var.description
  role          = aws_iam_role.this.arn

  # Deployment package: a pre-built zip whose hash triggers re-deployment
  # whenever the archive content changes.
  filename         = "${path.module}/build/main.zip"
  source_code_hash = data.archive_file.lambda.output_base64sha256

  # Runtime configuration.
  # NOTE(review): AWS has deprecated the go1.x runtime; moving to a
  # provided.* runtime also requires changing the handler / binary name -
  # confirm before changing.
  runtime     = "go1.x"
  handler     = "main"
  memory_size = 128
  timeout     = var.lambda_timeout

  tags = {
    Service     = var.service_name
    Description = "Lambda Function for ${var.service_name}"
    Environment = var.environment
    ManagedBy   = "terraform"
  }
}
  • Code block for lambda
package main

import (
  "context"
  "encoding/json"
  "log"
  "os"
  "strings"
  "time"

  "github.com/aws/aws-lambda-go/events"
  "github.com/aws/aws-lambda-go/lambda"
  "github.com/aws/aws-sdk-go/aws"
  "github.com/aws/aws-sdk-go/aws/request"
  "github.com/aws/aws-sdk-go/aws/session"
  "github.com/aws/aws-sdk-go/service/autoscaling"
)

// eventDetail holds the fields decoded from the "detail" object of the
// incoming CloudWatch event.
type eventDetail struct {
  // InstanceID is decoded from the "instance-id" key; the handler uses it to
  // look up and detach the instance.
  InstanceID     string `json:"instance-id"`
  // InstanceAction is decoded from the "instance-action" key.
  InstanceAction string `json:"instance-action"`
}

// minGroupCapacity is the smallest desired capacity an Auto Scaling group
// must have before one of its instances is detached, so that at least one
// other instance keeps serving traffic.
const minGroupCapacity int64 = 2

// handleRequest reacts to an "EC2 Spot Instance Interruption Warning" event.
//
// For the Auto Scaling group that owns the interrupted instance it first makes
// sure the group has spare capacity (scaling out to minGroupCapacity when the
// group is smaller), then detaches the instance so that its load balancer
// connections are drained before AWS reclaims it.
//
// Events of any other type or source are ignored. Failures are logged and the
// handler returns early whenever it has no usable data to continue with.
func handleRequest(ctx context.Context, event events.CloudWatchEvent) {
  // Ignore anything that is not a Spot interruption warning emitted by EC2.
  if event.Source != "aws.ec2" || !strings.Contains(event.DetailType, "EC2 Spot Instance Interruption Warning") {
    return
  }

  var detail eventDetail
  if err := json.Unmarshal(event.Detail, &detail); err != nil {
    log.Printf("ERROR - Unable to parse event detail - %v", err)
    return
  }
  // Without an instance ID every subsequent API call would be meaningless.
  if detail.InstanceID == "" {
    log.Printf("ERROR - Event detail does not contain an instance-id")
    return
  }

  sess, err := session.NewSession()
  if err != nil {
    log.Printf("ERROR - Unable to create AWS session - %v", err)
    return
  }
  asg := autoscaling.New(sess)

  // Resolve the Auto Scaling group that owns the interrupted instance.
  described, err := asg.DescribeAutoScalingInstancesWithContext(ctx, &autoscaling.DescribeAutoScalingInstancesInput{
    InstanceIds: []*string{
      aws.String(detail.InstanceID),
    },
  })
  if err != nil {
    log.Printf("ERROR - Unable to get autoscaling group metadata of %s - %v", detail.InstanceID, err)
    return
  }
  if len(described.AutoScalingInstances) == 0 {
    log.Printf("INFO - Instance %s is not managed by an autoscaling group, nothing to do", detail.InstanceID)
    return
  }

  for _, asgInstance := range described.AutoScalingInstances {
    groupName := asgInstance.AutoScalingGroupName
    log.Printf("INFO - Handling %s of %s", event.DetailType, aws.StringValue(groupName))

    // Best effort: the instance is going away regardless, so a failure to
    // add capacity must not prevent the graceful detach below.
    ensureSpareCapacity(ctx, asg, groupName)

    // Detach the instance from the autoscaling group to also drain its
    // connections from the load balancer. Desired capacity is kept so the
    // group launches a replacement.
    if _, err := asg.DetachInstancesWithContext(ctx, &autoscaling.DetachInstancesInput{
      AutoScalingGroupName: groupName,
      InstanceIds: []*string{
        aws.String(detail.InstanceID),
      },
      ShouldDecrementDesiredCapacity: aws.Bool(false),
    }); err != nil {
      log.Printf("ERROR - Unable to detach %s from autoscaling-group %s - %v", detail.InstanceID, aws.StringValue(groupName), err)
    }
  }
}

// ensureSpareCapacity scales the group out to minGroupCapacity when its
// current desired capacity is lower, then waits for the new capacity to be in
// service. Groups that are already large enough are left untouched. It needs
// the autoscaling:DescribeAutoScalingGroups permission; every failure is
// logged and otherwise ignored.
func ensureSpareCapacity(ctx context.Context, asg *autoscaling.AutoScaling, groupName *string) {
  name := aws.StringValue(groupName)
  input := &autoscaling.DescribeAutoScalingGroupsInput{
    AutoScalingGroupNames: []*string{
      groupName,
    },
  }

  groups, err := asg.DescribeAutoScalingGroupsWithContext(ctx, input)
  if err != nil {
    log.Printf("ERROR - Unable to describe autoscaling-group %s - %v", name, err)
    return
  }
  if len(groups.AutoScalingGroups) == 0 {
    log.Printf("ERROR - Unable to find autoscaling-group %s", name)
    return
  }
  // Never lower the capacity of a group that is already big enough.
  if aws.Int64Value(groups.AutoScalingGroups[0].DesiredCapacity) >= minGroupCapacity {
    return
  }

  _, err = asg.SetDesiredCapacityWithContext(ctx, &autoscaling.SetDesiredCapacityInput{
    AutoScalingGroupName: groupName,
    DesiredCapacity:      aws.Int64(minGroupCapacity),
    HonorCooldown:        aws.Bool(false),
  })
  if err != nil {
    log.Printf("ERROR - Unable to increase capacity of autoscaling-group %s - %v", name, err)
    return
  }

  if err := waitUntilDesiredCapacityInService(ctx, asg, input); err != nil {
    log.Printf("ERROR - autoscaling-group %s did not reach its desired capacity in time - %v", name, err)
  }
}

// waitUntilDesiredCapacityInService polls DescribeAutoScalingGroups (up to 15
// attempts, 4 seconds apart) until every described group has at least
// DesiredCapacity instances in the InService lifecycle state. It returns the
// waiter's error when the attempts are exhausted or ctx is cancelled.
func waitUntilDesiredCapacityInService(ctx context.Context, asg *autoscaling.AutoScaling, input *autoscaling.DescribeAutoScalingGroupsInput) error {
  // Yields one boolean per group; a `false` entry means that group is still
  // short of its desired capacity.
  const anyGroupShort = "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= DesiredCapacity][], `false`)"

  w := request.Waiter{
    Name:        "WaitUntilNoSingleInstance",
    MaxAttempts: 15,
    Delay:       request.ConstantWaiterDelay(4 * time.Second),
    Acceptors: []request.WaiterAcceptor{
      {
        // No group is short of capacity: done.
        State:    request.SuccessWaiterState,
        Matcher:  request.PathWaiterMatch,
        Argument: anyGroupShort,
        Expected: false,
      },
      {
        // At least one group is still short: poll again.
        State:    request.RetryWaiterState,
        Matcher:  request.PathWaiterMatch,
        Argument: anyGroupShort,
        Expected: true,
      },
    },
    Logger: asg.Config.Logger,
    NewRequest: func(opts []request.Option) (*request.Request, error) {
      req, _ := asg.DescribeAutoScalingGroupsRequest(input)
      req.SetContext(ctx)
      req.ApplyOptions(opts...)
      return req, nil
    },
  }
  return w.WaitWithContext(ctx)
}

// main hands control to the Lambda runtime, refusing to start when the
// AWS_LAMBDA_RUNTIME_API environment variable is not set.
func main() {
  if _, insideLambda := os.LookupEnv("AWS_LAMBDA_RUNTIME_API"); !insideLambda {
    log.Fatal("This is only intended to be run inside AWS Lambda")
  }
  lambda.Start(handleRequest)
}
  • Create a CloudWatch Event rule that only matches EC2 Spot interruption events
# CloudWatch Event rule that matches only events whose detail-type is
# "EC2 Spot Instance Interruption Warning" and whose source is "aws.ec2".
resource "aws_cloudwatch_event_rule" "spot" {
  name          = var.service_name
  # The heredoc below is the JSON event pattern passed to the rule as-is.
  event_pattern = <<PATTERN
  {
    "detail-type": [
      "EC2 Spot Instance Interruption Warning"
    ],
    "source": [
      "aws.ec2"
    ]
  }
  PATTERN
}
  • Set the cloudwatch event target to lambda handler
# Routes every event matched by the spot-interruption rule to the Lambda function.
resource "aws_cloudwatch_event_target" "lambda" {
  # Destination: the handler function.
  arn = aws_lambda_function.this.arn

  # Source: the rule that filters Spot interruption warnings.
  rule = aws_cloudwatch_event_rule.spot.name
}

Testing

  • You can go to the example directory
  • Apply the infrastructure and code with terraform apply
  • Go to Lambda console and create test event with this sample payload
{
    "version": "0",
    "id": "12345678-1234-1234-1234-123456789012",
    "detail-type": "EC2 Spot Instance Interruption Warning",
    "source": "aws.ec2",
    "account": "123456789012",
    "time": "yyyy-mm-ddThh:mm:ssZ",
    "region": "us-east-2",
    "resources": ["arn:aws:ec2:us-east-2:123456789012:instance/i-1234567890abcdef0"],
    "detail": {
        "instance-id": "i-1234567890abcdef0",
        "instance-action": "action"
    }
}

note:

  • Adjust the mandatory keys, such as instance-id and time
  • Test the Lambda invocation and make sure your test instance is detached gracefully and no exception occurs in your service

Reference

comments powered by Disqus