TL;DR: this post was inspired by the official AWS documentation on Spot interruption handling. It is mainly a note to myself, and all credit should go to the AWS engineering team. This tutorial only covers EC2 instances that are managed by an Auto Scaling group.
Previously we talked about the average cost comparison between on-demand and Spot instances. In summary, we can save up to 90% of the regular on-demand cost by using Spot.
However, this cost saving comes at the expense of another aspect: reliability. When we use Spot, we are essentially borrowing unused physical capacity to run our EC2 instances, so AWS can take it back as soon as an on-demand request needs that capacity. Hence, our Spot instances can be interrupted at any time.
Luckily, AWS provides a two-minute warning via a CloudWatch Event before the interruption happens, so we can at least make some preparations to stop our service gracefully and avoid downtime and failed transactions.
Workflow
So here is the sequence of the workflow:
- Two minutes before the interruption happens, CloudWatch emits an event containing the instance-id of the instance that will be interrupted
- Lambda consumes the event payload and executes the detach operation against the Auto Scaling group
- The Auto Scaling group detaches the instance from the ALB target group to drain the existing connections and ensure no new traffic is routed to the instance
- Once the operation has completed, the instance is removed from the Auto Scaling group and can be safely terminated / stopped / hibernated by AWS
You can check detailed explanation in the source code
- IAM Role and Policy for Lambda
# Permissions granted to the Spot interruption handler Lambda.
data "aws_iam_policy_document" "lambda_policy" {
  # Allow the function to write its own logs.
  statement {
    effect = "Allow"

    actions = [
      "logs:CreateLogGroup",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]

    # NOTE(review): depending on the AWS provider version the log group ARN may
    # not end in ":*"; if log stream writes are denied, confirm whether the
    # suffix has to be appended here.
    resources = [
      aws_cloudwatch_log_group.this.arn,
    ]
  }

  # Allow the function to inspect and modify Auto Scaling groups.
  # DescribeAutoScalingGroups is required by the handler's wait step, which
  # polls the group until the scale-out has completed.
  statement {
    effect = "Allow"

    actions = [
      "autoscaling:DescribeAutoScalingGroups",
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DetachInstances",
      "autoscaling:SetDesiredCapacity",
    ]

    resources = [
      "*",
    ]
  }
}
# IAM role used as the execution role of the Lambda function.
resource "aws_iam_role" "this" {
  name        = var.service_name
  description = "IAM Role of ${var.service_name}"
  path        = "/"

  # The trust policy is taken from the "lambda" policy document.
  assume_role_policy = data.aws_iam_policy_document.lambda.json

  max_session_duration  = 3600
  force_detach_policies = true

  tags = {
    Name        = var.service_name
    Description = "IAM Role of ${var.service_name}"
    Environment = var.environment
    ManagedBy   = "terraform"
  }
}
- Lambda resource definition
# Lambda function that reacts to Spot interruption warnings.
resource "aws_lambda_function" "this" {
  function_name = var.service_name
  description   = var.description
  role          = aws_iam_role.this.arn

  # Deployment package and entry point.
  # NOTE(review): the "go1.x" runtime has been deprecated by AWS; moving to a
  # "provided.*" runtime also requires renaming the binary/handler to
  # "bootstrap" in the build step - confirm before changing.
  runtime          = "go1.x"
  handler          = "main"
  filename         = "${path.module}/build/main.zip"
  source_code_hash = data.archive_file.lambda.output_base64sha256

  # Sizing.
  memory_size = 128
  timeout     = var.lambda_timeout

  tags = {
    Service     = var.service_name
    Environment = var.environment
    Description = "Lambda Function for ${var.service_name}"
    ManagedBy   = "terraform"
  }
}
- Code block for lambda
package main
import (
"context"
"encoding/json"
"log"
"os"
"strings"
"time"
"github.com/aws/aws-lambda-go/events"
"github.com/aws/aws-lambda-go/lambda"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
)
// eventDetail mirrors the "detail" object of the incoming CloudWatch event.
type eventDetail struct {
	// InstanceID is decoded from the "instance-id" key.
	InstanceID string `json:"instance-id"`
	// InstanceAction is decoded from the "instance-action" key.
	InstanceAction string `json:"instance-action"`
}
// handleRequest is the Lambda entry point for EC2 Spot interruption warnings.
//
// For a matching event it looks up the Auto Scaling group that owns the
// instance named in the event detail, makes sure the group is not running on a
// single instance, and then detaches the instance without decrementing the
// desired capacity. Events with any other source or detail-type are ignored.
// Failures are logged rather than returned.
//
// The function's role needs autoscaling:DescribeAutoScalingInstances,
// autoscaling:DescribeAutoScalingGroups, autoscaling:SetDesiredCapacity and
// autoscaling:DetachInstances.
func handleRequest(ctx context.Context, event events.CloudWatchEvent) {
	// Ignore anything that is not a Spot interruption warning emitted by EC2.
	if event.Source != "aws.ec2" || !strings.Contains(event.DetailType, "EC2 Spot Instance Interruption Warning") {
		return
	}

	var detail eventDetail
	if err := json.Unmarshal(event.Detail, &detail); err != nil {
		log.Printf("ERROR - Unable to parse event detail - %v", err)
		return
	}
	if detail.InstanceID == "" {
		log.Printf("ERROR - Event detail does not contain an instance-id")
		return
	}

	asg := autoscaling.New(session.Must(session.NewSession()))

	// Find the Auto Scaling group (if any) that owns the interrupted instance.
	owners, err := asg.DescribeAutoScalingInstancesWithContext(ctx, &autoscaling.DescribeAutoScalingInstancesInput{
		InstanceIds: []*string{aws.String(detail.InstanceID)},
	})
	if err != nil {
		log.Printf("ERROR - Unable to get autoscaling group metadata of %s - %v", detail.InstanceID, err)
		return
	}
	if len(owners.AutoScalingInstances) == 0 {
		log.Printf("INFO - Instance %s is not managed by any autoscaling-group, nothing to do", detail.InstanceID)
		return
	}

	for _, asgInstance := range owners.AutoScalingInstances {
		groupName := asgInstance.AutoScalingGroupName
		log.Printf("INFO - Handling %s of %s", event.DetailType, aws.StringValue(groupName))

		// Avoid downtime during the detach when the group runs a single instance.
		scaleOutIfSingleInstance(ctx, asg, groupName)

		// Detach the instance from autoscaling group to also drain the connection from Load Balancer.
		_, err := asg.DetachInstancesWithContext(ctx, &autoscaling.DetachInstancesInput{
			AutoScalingGroupName:           groupName,
			InstanceIds:                    []*string{aws.String(detail.InstanceID)},
			ShouldDecrementDesiredCapacity: aws.Bool(false),
		})
		if err != nil {
			log.Printf("ERROR - Unable to detach %s from autoscaling-group %s - %v", detail.InstanceID, aws.StringValue(groupName), err)
		}
	}
}

// scaleOutIfSingleInstance raises the desired capacity of groupName to 2 when
// it is currently below 2, then waits for the extra instance to be InService.
// It never lowers an existing desired capacity. The step is best effort:
// failures are logged and the caller still proceeds with the detach.
func scaleOutIfSingleInstance(ctx context.Context, asg *autoscaling.AutoScaling, groupName *string) {
	name := aws.StringValue(groupName)
	describeInput := &autoscaling.DescribeAutoScalingGroupsInput{
		AutoScalingGroupNames: []*string{groupName},
	}

	groups, err := asg.DescribeAutoScalingGroupsWithContext(ctx, describeInput)
	if err != nil {
		log.Printf("ERROR - Unable to describe autoscaling-group %s - %v", name, err)
		return
	}
	if len(groups.AutoScalingGroups) == 0 {
		return
	}
	// The size of the group itself decides whether a scale-out is needed;
	// groups that already want two or more instances are left untouched.
	if aws.Int64Value(groups.AutoScalingGroups[0].DesiredCapacity) >= 2 {
		return
	}

	_, err = asg.SetDesiredCapacityWithContext(ctx, &autoscaling.SetDesiredCapacityInput{
		AutoScalingGroupName: groupName,
		DesiredCapacity:      aws.Int64(2),
		HonorCooldown:        aws.Bool(false),
	})
	if err != nil {
		log.Printf("ERROR - Unable to increase capacity of autoscaling-group %s - %v", name, err)
		return
	}

	if err := waitUntilDesiredCapacityInService(ctx, asg, describeInput); err != nil {
		log.Printf("ERROR - Autoscaling-group %s did not reach its desired capacity in time - %v", name, err)
	}
}

// waitUntilDesiredCapacityInService polls DescribeAutoScalingGroups every 4
// seconds, at most 15 times, until every described group has at least
// DesiredCapacity instances in the InService lifecycle state. It returns the
// waiter's error when that does not happen in time or ctx is cancelled.
func waitUntilDesiredCapacityInService(ctx aws.Context, asg *autoscaling.AutoScaling, input *autoscaling.DescribeAutoScalingGroupsInput) error {
	// Evaluates to true while at least one group is still below its desired capacity.
	const belowDesired = "contains(AutoScalingGroups[].[length(Instances[?LifecycleState=='InService']) >= DesiredCapacity][], `false`)"

	w := request.Waiter{
		Name:        "WaitUntilNoSingleInstance",
		MaxAttempts: 15,
		Delay:       request.ConstantWaiterDelay(4 * time.Second),
		Acceptors: []request.WaiterAcceptor{
			{
				State:    request.SuccessWaiterState,
				Matcher:  request.PathWaiterMatch,
				Argument: belowDesired,
				Expected: false,
			},
			{
				State:    request.RetryWaiterState,
				Matcher:  request.PathWaiterMatch,
				Argument: belowDesired,
				Expected: true,
			},
		},
		Logger: asg.Config.Logger,
		NewRequest: func(opts []request.Option) (*request.Request, error) {
			// Work on a copy so repeated attempts never share a mutated input.
			var inCpy *autoscaling.DescribeAutoScalingGroupsInput
			if input != nil {
				tmp := *input
				inCpy = &tmp
			}
			req, _ := asg.DescribeAutoScalingGroupsRequest(inCpy)
			req.SetContext(ctx)
			req.ApplyOptions(opts...)
			return req, nil
		},
	}
	return w.WaitWithContext(ctx)
}
// main starts the Lambda handler, refusing to run anywhere but inside the
// AWS Lambda runtime (detected through the AWS_LAMBDA_RUNTIME_API variable).
func main() {
	if _, insideLambda := os.LookupEnv("AWS_LAMBDA_RUNTIME_API"); !insideLambda {
		log.Fatal("This is only intended to be run inside AWS Lambda")
	}
	lambda.Start(handleRequest)
}
- Create Cloudwatch Event rule to only filter event for EC2 spot interruption
# Event rule that matches only EC2 Spot interruption warnings.
resource "aws_cloudwatch_event_rule" "spot" {
  name = var.service_name

  # Same pattern as before, built with jsonencode instead of a heredoc.
  event_pattern = jsonencode({
    "detail-type" = ["EC2 Spot Instance Interruption Warning"]
    "source"      = ["aws.ec2"]
  })
}
- Set the cloudwatch event target to lambda handler
# Send every event matched by the Spot interruption rule to the Lambda function.
# NOTE(review): the rule can only invoke the function if an aws_lambda_permission
# for events.amazonaws.com exists - it is not part of this snippet, confirm it is
# declared elsewhere.
resource "aws_cloudwatch_event_target" "lambda" {
  arn  = aws_lambda_function.this.arn
  rule = aws_cloudwatch_event_rule.spot.name
}
Testing
- You can go to the example directory
- Apply the infrastructure and code with
terraform apply
- Go to Lambda console and create test event with this sample payload
{
"version": "0",
"id": "12345678-1234-1234-1234-123456789012",
"detail-type": "EC2 Spot Instance Interruption Warning",
"source": "aws.ec2",
"account": "123456789012",
"time": "yyyy-mm-ddThh:mm:ssZ",
"region": "us-east-2",
"resources": ["arn:aws:ec2:us-east-2:123456789012:instance/i-1234567890abcdef0"],
"detail": {
"instance-id": "i-1234567890abcdef0",
"instance-action": "action"
}
}
Note:
- Adjust the mandatory keys such as instance-id and time
- Test the Lambda invocation and make sure your test instance is detached gracefully and no exceptions occur in your service
Reference
- https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html#spot-instance-termination-notices
- https://aws.amazon.com/blogs/compute/best-practices-for-handling-ec2-spot-instance-interruptions/#:~:text=Lean%20on%20Spot%20Integrated%20Services,%2C%20sizes%2C%20and%20Availability%20Zones.