Monitoring and troubleshooting AWS
John Q. Martin
Principal Consultant
# Alarm "HighCPUUtilization": evaluates the Average of the AWS/EC2
# CPUUtilization metric over 300-second periods, with 2 evaluation periods,
# against a threshold of 80 (GreaterThanThreshold).
# Entering ALARM notifies the production-alerts topic; returning to OK
# notifies the recovery-notifications topic.
aws cloudwatch put-metric-alarm \
  --alarm-name 'HighCPUUtilization' \
  --namespace 'AWS/EC2' \
  --metric-name 'CPUUtilization' \
  --statistic 'Average' \
  --period 300 \
  --evaluation-periods 2 \
  --comparison-operator 'GreaterThanThreshold' \
  --threshold 80 \
  --alarm-actions 'arn:aws:sns:us-east-1:123456789012:production-alerts' \
  --ok-actions 'arn:aws:sns:us-east-1:123456789012:recovery-notifications'


# Create a standard SNS topic named production-alerts (no extra attributes).
aws sns create-topic --name 'production-alerts'
# Create a FIFO topic (the name carries the .fifo suffix) with the FifoTopic
# and ContentBasedDeduplication attributes both set to true.
# The attribute list is passed as one comma-separated argument.
aws sns create-topic \
  --name 'production-alerts.fifo' \
  --attributes 'FifoTopic=true,ContentBasedDeduplication=true'
# Create the production-alerts topic with its KmsMasterKeyId attribute set to
# the alias/aws/sns key.
aws sns create-topic \
  --name 'production-alerts' \
  --attributes 'KmsMasterKeyId=alias/aws/sns'
# Subscribe the ProcessAlert Lambda function to the production-alerts topic.
aws sns subscribe \
  --protocol 'lambda' \
  --topic-arn 'arn:aws:sns:us-east-1:123456789012:production-alerts' \
  --notification-endpoint 'arn:aws:lambda:us-east-1:123456789012:function:ProcessAlert'
# Subscribe a phone number (SMS protocol) to the critical-alerts topic.
aws sns subscribe \
  --protocol 'sms' \
  --topic-arn 'arn:aws:sns:us-east-1:123456789012:critical-alerts' \
  --notification-endpoint '+1234567890'
{
  "AlarmName": "HighCPUUtilization",
  "NewStateValue": "ALARM",
  "OldStateValue": "OK",
  "NewStateReason": "Threshold Crossed: 2 datapoints [85.0, 90.0] were greater than the threshold (80.0).",
  "StateChangeTime": "2026-03-27T10:30:45.123+0000",
  "Trigger": {
    "MetricName": "CPUUtilization",
    "Namespace": "AWS/EC2",
    "Statistic": "AVERAGE",
    "Period": 300,
    "Threshold": 80.0,
    "ComparisonOperator": "GreaterThanThreshold"
  }
}
# Step 1: attach a filter policy to the subscription so it only matches
# notifications whose AlarmName is HighCPUUtilization and whose NewStateValue
# is ALARM.
aws sns set-subscription-attributes \
  --subscription-arn 'arn:aws:sns:...:production-alerts:abc123' \
  --attribute-name 'FilterPolicy' \
  --attribute-value '{"AlarmName":["HighCPUUtilization"],"NewStateValue":["ALARM"]}'

# Step 2: set the filter policy scope to MessageBody so the policy above is
# evaluated against the message body.
aws sns set-subscription-attributes \
  --subscription-arn 'arn:aws:sns:...:production-alerts:abc123' \
  --attribute-name 'FilterPolicyScope' \
  --attribute-value 'MessageBody'
def lambda_handler(event, context):
    """Reformat a CloudWatch alarm notification and republish it to SNS.

    Args:
        event: SNS-triggered Lambda event. Only the first record is read;
            its ``Sns.Message`` field must be the alarm notification as a
            JSON string.
        context: Lambda context object (unused).

    Returns:
        None. The formatted alert is published to the formatted-alerts topic.

    Raises:
        KeyError: if the alarm JSON lacks AlarmName, NewStateValue or
            NewStateReason.
    """
    # SNS delivers the alarm as a JSON string inside the record envelope.
    alarm = json.loads(event['Records'][0]['Sns']['Message'])

    # An alarm defined without dimensions carries no Trigger.Dimensions entry
    # to index, so fall back to a placeholder instead of raising.
    dimensions = (alarm.get('Trigger') or {}).get('Dimensions') or []
    resource = dimensions[0].get('value', 'N/A') if dimensions else 'N/A'

    message = f"""
ALERT: {alarm['AlarmName']}
Status: {alarm['NewStateValue']}
Reason: {alarm['NewStateReason']}
Resource: {resource}
Runbook: https://wiki.example.com/runbooks/high-cpu
"""
    sns.publish(
        TopicArn='arn:aws:sns:...:formatted-alerts',
        # SNS rejects subjects longer than 100 characters, while alarm names
        # may be longer; the full name is still present in the message body.
        Subject=f"{alarm['AlarmName']}"[:100],
        Message=message
    )


{
  "Effect": "Allow",
  "Principal": { "Service": "sns.amazonaws.com" },
  "Action": "sqs:SendMessage",
  "Resource": "arn:aws:sqs:...:alarm-logging-queue",
  "Condition": {
    "ArnEquals": {
      "aws:SourceArn": "arn:aws:sns:...:production-alerts"
    }
  }
}
# Subscribe the alarm-logging-queue SQS queue to the production-alerts topic.
aws sns subscribe \
  --protocol 'sqs' \
  --topic-arn 'arn:aws:sns:...:production-alerts' \
  --notification-endpoint 'arn:aws:sqs:...:alarm-logging-queue'
# Fetch up to 10 messages in one call, waiting up to 20 seconds for any to
# arrive.
response = sqs.receive_message(
    QueueUrl=queue_url,
    MaxNumberOfMessages=10,
    WaitTimeSeconds=20  # Long polling
)
# 'Messages' is read with a default, so a response without it skips the loop.
for message in response.get('Messages', []):
    # The body is decoded twice: first the outer JSON envelope, then the
    # alarm JSON string stored in its 'Message' field.
    sns_msg = json.loads(message['Body'])
    alarm = json.loads(sns_msg['Message'])
    # Process alarm data
    # Delete runs last, so an exception raised above skips it for this
    # message.
    sqs.delete_message(QueueUrl=queue_url,
                       ReceiptHandle=message['ReceiptHandle'])
{"NewStateValue":["ALARM"],"Severity":["Critical"]}
{"MessageType":["Metric"]}
# Set a RedrivePolicy on alarm-logging-queue that names alarm-logging-dlq as
# the dead-letter target with a maxReceiveCount of 3.
# The JSON argument is single-quoted so the inner \" escapes reach the CLI
# unchanged.
aws sqs set-queue-attributes \
  --queue-url 'https://sqs..../alarm-logging-queue' \
  --attributes '{
"RedrivePolicy": "{\"deadLetterTargetArn\":\"arn:aws:sqs:...:alarm-logging-dlq\",\"maxReceiveCount\":\"3\"}"
}'





Monitoring and troubleshooting AWS