From bc4807f7a5aebb045daae35ef307d1a413681994 Mon Sep 17 00:00:00 2001
From: hitesh
Date: Tue, 28 Jan 2025 16:08:19 +0530
Subject: [PATCH] scripts to simulate ec2 resources and test cloudwatch alarms

---
 ops-scripts/check_cloudwatch_logs.sh                | 113 ++++++++
 ops-scripts/simulate_volume_load.sh                 | 221 +++++++++++++++
 ops-scripts/stress_ec2_with_alert_check.sh          | 252 ++++++++++++++++++
 ops-scripts/trigger_and_verify_cloudwatch_alarms.sh | 166 ++++++++++++
 4 files changed, 752 insertions(+)
 create mode 100755 ops-scripts/check_cloudwatch_logs.sh
 create mode 100755 ops-scripts/simulate_volume_load.sh
 create mode 100755 ops-scripts/stress_ec2_with_alert_check.sh
 create mode 100755 ops-scripts/trigger_and_verify_cloudwatch_alarms.sh

diff --git a/ops-scripts/check_cloudwatch_logs.sh b/ops-scripts/check_cloudwatch_logs.sh
new file mode 100755
index 0000000..601dfe4
--- /dev/null
+++ b/ops-scripts/check_cloudwatch_logs.sh
#!/bin/bash
#
################################################################################
# Script Name: check_cloudwatch_logs.sh
# Description: Checks whether logs are still arriving in a set of AWS
#              CloudWatch Logs streams across multiple log groups. For each
#              configured group/stream pair the latest log event is fetched
#              and its age is compared against a caller-supplied window
#              (in minutes).
# Usage:       ./check_cloudwatch_logs.sh <LOG_WINDOW_PERIOD_MINUTES>
# Requires:    aws CLI, jq; AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY exported.
# Author: Hitesh Bhati
# Email: hitesh.bhati@napses.com
# Version: 1.6
# Date: 2025-01-16
################################################################################

# LOG_WINDOW_PERIOD must be supplied...
if [ -z "$1" ]; then
    echo "Error: LOG_WINDOW_PERIOD (in minutes) argument is required."
    exit 1
fi
# ...and must be a positive integer, since it feeds shell arithmetic below.
case "$1" in
    *[!0-9]*)
        echo "Error: LOG_WINDOW_PERIOD must be a positive integer (minutes)."
        exit 1
        ;;
esac

# Convert the LOG_WINDOW_PERIOD from minutes to seconds
LOG_WINDOW_PERIOD_MINUTES=$1
LOG_WINDOW_PERIOD_SECONDS=$((LOG_WINDOW_PERIOD_MINUTES * 60))

# AWS region: override via environment variable, default to ap-south-1.
AWS_REGION=${AWS_REGION:-"ap-south-1"}

# Credentials must be provided through environment variables.
if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
    echo "Error: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables must be set."
    exit 1
fi

# Log groups and their log streams, one JSON object per array entry.
log_groups_and_streams=(
    '{"nidana": ["nidana"]}'
    '{"deon": ["deon-qa"]}'
    '{"magically": ["magically-qa"]}'
    '{"circle": ["circle-qa", "circle-prod"]}'
    # Add more log groups and streams as needed
)

# Identify the caller. Test the command directly instead of inspecting $?
# after the assignment (clearer and immune to intervening statements).
if CALLER_ID=$(aws sts get-caller-identity --region "$AWS_REGION" --output text --query 'Arn' 2>&1); then
    echo "Using AWS identity: $CALLER_ID"
else
    echo "No role found, using provided access key and secret key."
fi

# Current time in seconds since the Unix epoch (for age comparison).
CURRENT_TIME=$(date +%s)

RED='\033[0;31m'
NC='\033[0m' # No Color

for entry in "${log_groups_and_streams[@]}"; do
    # Extract the log group name and its streams with jq.
    # Quote "$entry" — unquoted expansion would word-split the JSON.
    LOG_GROUP=$(echo "$entry" | jq -r 'keys[0]')
    LOG_STREAMS=$(echo "$entry" | jq -r '.[keys[0]][]')

    echo "Checking logs in log group: $LOG_GROUP"

    for LOG_STREAM in $LOG_STREAMS; do
        echo "  Checking log stream: $LOG_STREAM"

        # Newest event timestamp (milliseconds since epoch). With the default
        # tail-first ordering, --limit 1 yields the most recent event.
        if ! LATEST_LOG_EVENT=$(aws logs get-log-events \
                --log-group-name "$LOG_GROUP" \
                --log-stream-name "$LOG_STREAM" \
                --limit 1 \
                --region "$AWS_REGION" \
                --query 'events[0].timestamp' \
                --output text 2>&1); then
            echo "  Error: Failed to retrieve log events. Details: $LATEST_LOG_EVENT"
            continue
        fi

        # The CLI prints the literal string "None" when the stream is empty.
        if [ "$LATEST_LOG_EVENT" == "None" ]; then
            echo "  No logs found in the stream '$LOG_STREAM' under the log group '$LOG_GROUP'."
            continue
        fi

        # CloudWatch timestamps are milliseconds; convert to seconds.
        LATEST_LOG_TIME=$((LATEST_LOG_EVENT / 1000))
        TIME_DIFF=$((CURRENT_TIME - LATEST_LOG_TIME))

        # NOTE: 'date -d @<epoch>' is GNU date; on BSD/macOS use 'date -r'.
        echo "  Last log received at $(date -d "@$LATEST_LOG_TIME")"

        if [ "$TIME_DIFF" -le "$LOG_WINDOW_PERIOD_SECONDS" ]; then
            echo "  Logs are being received within the last $LOG_WINDOW_PERIOD_MINUTES minutes."
        else
            # Highlight the stale-stream case in red.
            echo -e "  ${RED}No logs received in the last $LOG_WINDOW_PERIOD_MINUTES minutes.${NC}"
        fi
    done
done
diff --git a/ops-scripts/simulate_volume_load.sh b/ops-scripts/simulate_volume_load.sh
new file mode 100755
index 0000000..57d21d8
--- /dev/null
+++ b/ops-scripts/simulate_volume_load.sh
#!/bin/bash

##################################################################################
# Script Name: simulate_volume_load.sh
# Description: Connects to EC2 instances in a specified AWS region, simulates a
#              disk load by creating a temporary file that raises root-volume
#              usage to roughly 93% of total capacity, then removes the file
#              again, reporting disk usage before and after. The simulation can
#              run on one instance or on all instances in parallel.
#
# Arguments:
#   --aws-profile  AWS CLI profile to use for accessing AWS resources (required).
#   --region       AWS region where the EC2 instances are located (required).
#   --parallel     Optionally run the disk load simulation on all instances in parallel.
#
# Author: Hitesh Bhati
# Version: 1.1
# Contact: hitesh.bhati@napses.com
#
# Usage Example:
#   ./simulate_volume_load.sh --aws-profile 474888828713_AdministratorAccess --region ap-southeast-1
##################################################################################

# BUGFIX: LOG_FILE must be defined before argument parsing — log_error()
# appends to it, and a parsing error would otherwise hit an ambiguous
# redirect (>> "").
LOG_FILE="volume_simulation.log"

# Print usage and exit 0.
help() {
    echo "Usage: $0 --aws-profile <profile> --region <region> [--parallel]"
    echo "This script simulates a disk load on EC2 instances by creating a temporary file to simulate load."
    echo "You can run the simulation on one instance or all instances in parallel."
    echo
    echo "Mandatory Arguments:"
    echo "  --aws-profile   AWS CLI profile to use."
    echo "  --region        AWS region where the EC2 instances are located."
    echo
    echo "Optional Arguments:"
    echo "  --parallel      Run the disk load simulation on all instances in parallel."
    echo
    echo "Example:"
    echo "  $0 --aws-profile 474888828713_AdministratorAccess --region ap-southeast-1"
    echo "  $0 --aws-profile 474888828713_AdministratorAccess --region ap-southeast-1 --parallel"
    echo
    exit 0
}

# Log a timestamped message to both console and log file.
log() {
    local message="$1"
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $message" | tee -a "$LOG_FILE"
}

# Log a timestamped error — red on the console, plain in the log file.
log_error() {
    local message="$1"
    echo -e "\e[31m[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $message\e[0m"
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $message" >> "$LOG_FILE"
}

# Print the IDs of all running EC2 instances for the given profile/region.
fetch_instance_ids() {
    local aws_profile="$1"
    local aws_region="$2"
    aws ec2 describe-instances \
        --profile "$aws_profile" \
        --region "$aws_region" \
        --filters "Name=instance-state-name,Values=running" \
        --query "Reservations[*].Instances[*].InstanceId" \
        --output text
}

# Validate and parse command-line arguments
if [[ $# -eq 0 ]]; then
    help
fi

while [[ $# -gt 0 ]]; do
    case "$1" in
        --aws-profile)
            AWS_PROFILE="$2"
            shift 2
            ;;
        --region)
            AWS_REGION="$2"
            shift 2
            ;;
        --parallel)
            PARALLEL=true
            shift
            ;;
        --help)
            help
            ;;
        *)
            log_error "Unknown argument: $1"
            help
            exit 1
            ;;
    esac
done

# Validate mandatory arguments
if [[ -z "$AWS_PROFILE" || -z "$AWS_REGION" ]]; then
    log_error "Both --aws-profile and --region are mandatory arguments."
    help
    exit 1
fi

# Default variables
IDENTITY_PATH="/home/hitesh/napses/deon/scripts/config/deon.pem"  # SSH key path
REMOTE_USER="ubuntu"                                              # Remote SSH user for EC2 (Ubuntu default)
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")                       # Start time for CloudWatch metrics (optional)

# Initialize (truncate) the log file.
echo "Script execution started at $(date)" > "$LOG_FILE"

log "Fetching EC2 instances in region: $AWS_REGION"
INSTANCE_IDS=$(fetch_instance_ids "$AWS_PROFILE" "$AWS_REGION")

if [[ -z "$INSTANCE_IDS" ]]; then
    log_error "No running EC2 instances found in region $AWS_REGION."
    exit 1
fi

log "Found EC2 instances: $INSTANCE_IDS"

# Run the disk-fill simulation on one instance over SSH, tunneled through
# EC2 Instance Connect.
simulate_disk_load() {
    local instance_id="$1"

    log "Connecting to instance $instance_id and starting disk load simulation..."

    ssh -i "$IDENTITY_PATH" "$REMOTE_USER@$instance_id" \
        -o ProxyCommand="aws ec2-instance-connect open-tunnel --instance-id $instance_id --profile $AWS_PROFILE --region $AWS_REGION" << 'EOF'

    echo "=============================================================="
    echo "Starting disk load simulation on EC2 instance..."
    echo "=============================================================="

    # Step 1: Show current disk usage in human-readable format.
    echo "Step 1: Checking current disk usage..."
    df -h /
    echo "=============================================================="

    # Step 2: Read total/used/available 1K blocks for the root volume.
    df_output=$(df /)
    total_kb=$(echo "$df_output" | awk 'NR==2 {print $2}')
    used_kb=$(echo "$df_output" | awk 'NR==2 {print $3}')
    available_kb=$(echo "$df_output" | awk 'NR==2 {print $4}')

    # Step 3: BUGFIX — the goal is 93% *total* usage. The original computed
    # 93% of the *available* space (and then capped at 90% of total), which
    # lands at a different, lower usage level. Target size is the gap between
    # current usage and 93% of the volume.
    target_kb=$((total_kb * 93 / 100 - used_kb))

    # Never try to write more than is actually available.
    if [ "$target_kb" -gt "$available_kb" ]; then
        target_kb=$available_kb
    fi

    if [ "$target_kb" -le 0 ]; then
        echo "Disk usage is already at or above 93%; nothing to do."
    else
        # Convert to MB for dd (1MB = 1024*1024 bytes; target_kb is in KB).
        desired_file_size_mb=$((target_kb / 1024))

        # Step 4: Show the calculation result.
        echo "Step 4: Calculating the size of the temporary file..."
        echo "Desired file size to bring disk usage to 93%: ${desired_file_size_mb}MB"
        echo "=============================================================="

        # Step 5: Create the temporary file of the desired size.
        echo "Step 5: Creating a temporary file with the calculated size..."
        dd if=/dev/zero of=/tmp/tempfile bs=1M count=$desired_file_size_mb status=progress
        echo "Temporary file created. Size: ${desired_file_size_mb}MB"
        echo "=============================================================="

        # Step 6: Show disk usage again after the temporary file is created.
        echo "Step 6: Checking disk usage after creating the temporary file..."
        df -h /
        echo "=============================================================="

        # Step 7: Remove the temporary file to free up space.
        echo "Step 7: Removing the temporary file to free up space..."
        rm /tmp/tempfile
        echo "Temporary file removed."
        echo "=============================================================="

        # Step 8: Show disk usage again after the temporary file is removed.
        echo "Step 8: Checking disk usage after removing the temporary file..."
        df -h /
        echo "=============================================================="
    fi

    echo "Disk load simulation complete."

EOF
}

# Run the disk load simulation
if [[ "$PARALLEL" == true ]]; then
    log "Running the disk load simulation on all instances in parallel..."
    for INSTANCE_ID in $INSTANCE_IDS; do
        simulate_disk_load "$INSTANCE_ID" &
    done
    wait  # Wait for all background jobs to finish
else
    # BUGFIX: pick the instance *before* logging it — the original logged
    # $INSTANCE_ID while it was still unset.
    FIRST_INSTANCE_ID=$(echo "$INSTANCE_IDS" | head -n 1)
    log "Running the disk load simulation on a single instance ($FIRST_INSTANCE_ID)..."
    simulate_disk_load "$FIRST_INSTANCE_ID"
fi

log "Script execution completed at $(date)."
diff --git a/ops-scripts/stress_ec2_with_alert_check.sh b/ops-scripts/stress_ec2_with_alert_check.sh
new file mode 100755
index 0000000..4173d3c
--- /dev/null
+++ b/ops-scripts/stress_ec2_with_alert_check.sh
#!/bin/bash

##################################################################################
# Script Name: stress_ec2_with_alert_check.sh
# Description: Performs a stress test on EC2 instances by simulating high CPU
#              load, then checks whether the CloudWatch CPU-utilization alarms
#              associated with those instances were triggered. Instances can be
#              stressed one at a time or all in parallel.
#
# The script performs the following:
#   1. Fetches a list of EC2 instances that are in the 'running' state.
#   2. Executes stress tests to simulate high CPU utilization (above 80%).
#   3. Waits 5 minutes to allow alarms to trigger.
#   4. Checks CloudWatch alarm history for OK -> ALARM transitions since start.
#
# Author: Hitesh Bhati
# Version: 2.8
# Date Created: 2025-01-17
# Contact: hitesh.bhati@napses.com
##################################################################################

# Print usage and exit 0.
help() {
    echo "Usage: $0 [OPTIONS] AWS_PROFILE AWS_REGION"
    echo "Perform a stress test on EC2 instances and check for CloudWatch CPU utilization alarms."
    echo
    echo "Mandatory Arguments:"
    echo "  AWS_PROFILE        AWS CLI profile to use."
    echo "  AWS_REGION         AWS region where instances are located."
    echo
    echo "Optional Arguments:"
    echo "  -a, --alarm-name   CloudWatch alarm name to check (default: 'UI-EC2-CPU-Utilization-High')."
    echo "  -p, --parallel     Stress all instances in parallel (default: 'no')."
    echo "  -h, --help         Show this help message and exit."
    echo
    echo "Examples:"
    echo "  $0 my-aws-profile ap-southeast-1"
    echo "  $0 my-aws-profile ap-southeast-1 -a MyAlarmName -p yes"
    echo
    exit 0
}

# Default values
AWS_PROFILE=""
AWS_REGION=""
ALARM_NAME="UI-EC2-CPU-Utilization-High"
STRESS_ALL_PARALLEL="no"
IDENTITY_PATH="/home/hitesh/napses/deon/scripts/config/deon.pem"  # SSH key path
REMOTE_USER="ubuntu"                                              # Remote SSH user for EC2 (Ubuntu default)

# Parse command-line arguments (positional: profile then region).
while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            help
            ;;
        -a|--alarm-name)
            ALARM_NAME="$2"
            shift 2
            ;;
        -p|--parallel)
            STRESS_ALL_PARALLEL="$2"
            shift 2
            ;;
        *)
            if [[ -z "$AWS_PROFILE" ]]; then
                AWS_PROFILE="$1"
            elif [[ -z "$AWS_REGION" ]]; then
                AWS_REGION="$1"
            else
                echo "Error: Unknown argument '$1'."
                help  # help exits; the original's trailing 'exit 1' was unreachable
            fi
            shift
            ;;
    esac
done

# Validate mandatory arguments
if [[ -z "$AWS_PROFILE" || -z "$AWS_REGION" ]]; then
    echo "Error: AWS_PROFILE and AWS_REGION are mandatory arguments."
    help
fi

# Variables
INSTANCE_IDS=""                                              # Fetched instance IDs
STRESSED_INSTANCE_IDS=""                                     # Instances that were stressed
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")                  # Start time for CloudWatch metrics
REPORT_FILE="stress_test_report_$(date +'%Y%m%d_%H%M%S').log"

# Log a timestamped message to console and report file.
log() {
    local message="$1"
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $message" | tee -a "$REPORT_FILE"
}

# Populate INSTANCE_IDS with all running instances; exit if none.
fetch_instance_ids() {
    log "Fetching instance IDs in region: $AWS_REGION"
    INSTANCE_IDS=$(aws ec2 describe-instances \
        --profile "$AWS_PROFILE" \
        --region "$AWS_REGION" \
        --filters "Name=instance-state-name,Values=running" \
        --query "Reservations[*].Instances[*].InstanceId" \
        --output text)

    if [ -z "$INSTANCE_IDS" ]; then
        log "Error: No running instances found."
        exit 1
    fi
    log "Fetched instance IDs: $INSTANCE_IDS"
}

# Return 0 if the instance ID exists, 1 otherwise.
validate_instance_id() {
    local instance_id="$1"
    log "Validating instance ID: $instance_id"
    if ! aws ec2 describe-instances \
            --profile "$AWS_PROFILE" \
            --region "$AWS_REGION" \
            --instance-ids "$instance_id" &> /dev/null; then
        log "Error: Instance ID $instance_id is invalid or does not exist."
        return 1
    fi
    log "Instance ID $instance_id is valid."
}

# Populate ALARMS with the CloudWatch alarms dimensioned on this instance.
get_alarms_for_instance() {
    local instance_id="$1"
    log "Fetching CloudWatch alarms associated with instance: $instance_id"
    ALARMS=$(aws cloudwatch describe-alarms \
        --profile "$AWS_PROFILE" \
        --region "$AWS_REGION" \
        --query "MetricAlarms[?Dimensions[?Name=='InstanceId' && Value=='$instance_id']].AlarmName" \
        --output text)

    if [ -z "$ALARMS" ]; then
        log "No alarms found for instance: $instance_id."
    else
        log "Found alarms: $ALARMS"
    fi
}

# Report whether the alarm transitioned OK -> ALARM since START_TIME.
get_alarm_triggered_history() {
    local alarm_name="$1"
    local instance_id="$2"
    local end_time
    end_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    log "Checking CloudWatch alarm history for alarm: $alarm_name on instance: $instance_id"
    latest_timestamp=$(aws cloudwatch describe-alarm-history \
        --profile "$AWS_PROFILE" \
        --region "$AWS_REGION" \
        --alarm-name "$alarm_name" \
        --start-date "$START_TIME" \
        --end-date "$end_time" \
        --history-item-type "StateUpdate" \
        --query "AlarmHistoryItems[?HistorySummary=='Alarm updated from OK to ALARM'] | [0].Timestamp" \
        --output text)

    # Lexicographic '>' is valid here because both strings are ISO-8601 UTC.
    if [[ "$latest_timestamp" != "None" && "$latest_timestamp" > "$START_TIME" ]]; then
        log "Alarm triggered for instance $instance_id at $latest_timestamp."
    else
        log "No matching alarm history found for instance $instance_id."
    fi
}

# Run the CPU stress test on one instance over SSH (EC2 Instance Connect tunnel).
run_stress_test() {
    local instance_id="$1"
    log "Starting stress test on instance: $instance_id"

    ssh -i "$IDENTITY_PATH" "$REMOTE_USER@$instance_id" \
        -o ProxyCommand="aws ec2-instance-connect open-tunnel --instance-id $instance_id --profile $AWS_PROFILE --region $AWS_REGION" \
        -t << 'EOF'
    # Install dependencies if not present
    if ! command -v stress-ng &> /dev/null; then
        echo "Installing stress-ng..."
        sudo apt-get update -y || { echo "Failed to update package list"; exit 1; }
        sudo apt-get install -y stress-ng || { echo "Failed to install stress-ng"; exit 1; }
    fi

    # Run stress-ng to push CPU load above 80% for a sustained period
    echo "Simulating high CPU load (over 80%)..."

    sudo stress-ng --cpu 16 --cpu-method fft --timeout 300s --metrics-brief || { echo "Failed to run stress-ng"; exit 1; }
    sudo stress-ng --cpu 16 --cpu-method matrixprod --timeout 300s --metrics-brief || { echo "Failed to run stress-ng"; exit 1; }
EOF

    if [ $? -ne 0 ]; then
        log "Error: Stress test failed on instance $instance_id."
        return 1
    fi
    log "Stress test completed on instance $instance_id."
}

# Main driver: fetch instances, stress them, then verify alarms.
main() {
    log "Starting script execution."
    fetch_instance_ids

    if [[ "$STRESS_ALL_PARALLEL" == "yes" ]]; then
        log "Stressing all instances in parallel."
        for INSTANCE_ID in $INSTANCE_IDS; do
            if validate_instance_id "$INSTANCE_ID"; then
                run_stress_test "$INSTANCE_ID" &
                # BUGFIX: record the instance in the parent shell. The original
                # appended inside the backgrounded function, i.e. in a subshell,
                # so STRESSED_INSTANCE_IDS stayed empty and no alarms were
                # ever checked in parallel mode.
                STRESSED_INSTANCE_IDS="$STRESSED_INSTANCE_IDS $INSTANCE_ID"
            fi
        done
        # Wait for all background processes to complete
        wait
    else
        log "Stressing only one instance."
        INSTANCE_ID=$(echo "$INSTANCE_IDS" | head -n 1)  # First instance from the list
        if validate_instance_id "$INSTANCE_ID"; then
            run_stress_test "$INSTANCE_ID" && \
                STRESSED_INSTANCE_IDS="$STRESSED_INSTANCE_IDS $INSTANCE_ID"
        fi
    fi

    # BUGFIX: wait once for CloudWatch to evaluate, instead of sleeping 5
    # minutes inside the per-alarm loop (which multiplied the wait by the
    # number of alarms).
    log "Stress tests completed. Waiting 5 minutes for CloudWatch alarms to evaluate..."
    sleep 5m

    for INSTANCE_ID in $STRESSED_INSTANCE_IDS; do
        log "Checking alarms for instance: $INSTANCE_ID"
        get_alarms_for_instance "$INSTANCE_ID"

        if [ -n "$ALARMS" ]; then
            # BUGFIX: use a dedicated loop variable; the original reused
            # ALARM_NAME and clobbered the value supplied via --alarm-name.
            for alarm in $ALARMS; do
                get_alarm_triggered_history "$alarm" "$INSTANCE_ID"
            done
        fi
    done

    log "Script execution completed. Report saved to $REPORT_FILE."
}

# Execute the main function
main
diff --git a/ops-scripts/trigger_and_verify_cloudwatch_alarms.sh b/ops-scripts/trigger_and_verify_cloudwatch_alarms.sh
new file mode 100755
index 0000000..3b1f3b2
--- /dev/null
+++ b/ops-scripts/trigger_and_verify_cloudwatch_alarms.sh
#!/bin/bash

##################################################################################
# Script Name: trigger_and_verify_cloudwatch_alarms.sh
# Description: Manually triggers CloudWatch alarms associated with running EC2
#              instances in a specific region, and verifies whether the alarms
#              were triggered successfully.
#
# The script performs the following steps:
#   1. Fetches a list of running EC2 instances in the specified region.
#   2. Retrieves the CloudWatch alarms associated with each EC2 instance.
#   3. Manually triggers each alarm by setting the state to 'ALARM'.
#   4. Verifies the trigger by checking CloudWatch alarm history.
#   5. Logs all actions and results to a log file for reference.
#
# Author: Hitesh Bhati
# Version: 1.3
# Date Created: 2025-01-12
# Contact: hitesh.bhati@napses.com
##################################################################################
# (BUGFIX: the header comment block above was previously left unterminated.)

# BUGFIX: LOG_FILE must exist before log()/log_error() run — both are called
# during argument parsing below, and 'tee -a ""' would fail.
LOG_FILE="cloudwatch_alarm_trigger_$(date +'%Y%m%d_%H%M%S').log"
touch "$LOG_FILE"

# Print usage and exit 0.
help() {
    echo "Usage: $0 --aws-profile <profile> --region <region>"
    echo "Manually trigger CloudWatch alarms associated with running EC2 instances and verify their triggering."
    echo
    echo "Mandatory Arguments:"
    echo "  --aws-profile   AWS CLI profile to use."
    echo "  --region        AWS region where instances are located."
    echo
    echo "Example:"
    echo "  $0 --aws-profile 474888828713_AdministratorAccess --region ap-southeast-1"
    echo
    exit 0
}

# Log a timestamped message to console and log file.
log() {
    local message="$1"
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $message" | tee -a "$LOG_FILE"
}

# Log a timestamped error in red to console and log file.
log_error() {
    local message="$1"
    echo -e "\e[31m[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $message\e[0m" | tee -a "$LOG_FILE"
}

# Validate and parse command-line arguments
if [[ $# -eq 0 ]]; then
    help
fi

while [[ $# -gt 0 ]]; do
    case "$1" in
        --aws-profile)
            AWS_PROFILE="$2"
            shift 2
            ;;
        --region)
            AWS_REGION="$2"
            shift 2
            ;;
        --help)
            help
            ;;
        *)
            log_error "Unknown argument: $1"
            help
            exit 1
            ;;
    esac
done

# Validate mandatory arguments
if [[ -z "$AWS_PROFILE" || -z "$AWS_REGION" ]]; then
    log_error "Both --aws-profile and --region are mandatory arguments."
    help
    exit 1
fi

# Script start time (ISO-8601 UTC, used as the alarm-history window start).
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
log "Script start time: $START_TIME"

# Get all running instances
log "Fetching running instances in region: $AWS_REGION"
INSTANCE_IDS=$(aws ec2 describe-instances \
    --profile "$AWS_PROFILE" \
    --region "$AWS_REGION" \
    --filters "Name=instance-state-name,Values=running" \
    --query "Reservations[*].Instances[*].InstanceId" \
    --output text)

if [[ -z "$INSTANCE_IDS" ]]; then
    log_error "No running instances found."
    exit 1
fi
log "Found running instances: $INSTANCE_IDS"

# Iterate through each instance and trigger associated alarms
for INSTANCE_ID in $INSTANCE_IDS; do
    log "Processing instance: $INSTANCE_ID"

    # Get alarms dimensioned on this instance.
    ALARM_NAMES=$(aws cloudwatch describe-alarms \
        --profile "$AWS_PROFILE" \
        --region "$AWS_REGION" \
        --query "MetricAlarms[?Dimensions[?Name=='InstanceId' && Value=='$INSTANCE_ID']].AlarmName" \
        --output text)

    if [[ -z "$ALARM_NAMES" ]]; then
        log "No CloudWatch alarms found for instance: $INSTANCE_ID"
        continue
    fi

    log "Found alarms for instance $INSTANCE_ID: $ALARM_NAMES"

    # Trigger each alarm
    for ALARM_NAME in $ALARM_NAMES; do
        log "Triggering alarm: $ALARM_NAME"
        if ! aws cloudwatch set-alarm-state \
                --alarm-name "$ALARM_NAME" \
                --state-value ALARM \
                --state-reason "Manually setting alarm state for testing" \
                --profile "$AWS_PROFILE" \
                --region "$AWS_REGION"; then
            log_error "Failed to trigger alarm: $ALARM_NAME"
            continue
        fi

        log "Successfully triggered alarm: $ALARM_NAME"

        # Verify if the alarm was triggered
        END_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
        log "Verifying alarm triggering for: $ALARM_NAME"
        LAST_TRIGGERED_TIME=$(aws cloudwatch describe-alarm-history \
            --profile "$AWS_PROFILE" \
            --region "$AWS_REGION" \
            --alarm-name "$ALARM_NAME" \
            --start-date "$START_TIME" \
            --end-date "$END_TIME" \
            --history-item-type "StateUpdate" \
            --query "AlarmHistoryItems[?HistorySummary=='Alarm updated from OK to ALARM'] | [0].Timestamp" \
            --output text)

        # BUGFIX: with --output text the AWS CLI prints the literal string
        # "None" (not an empty string) when no history item matches, so the
        # original '-z' test never detected the failure case.
        if [[ -z "$LAST_TRIGGERED_TIME" || "$LAST_TRIGGERED_TIME" == "None" ]]; then
            log_error "Alarm $ALARM_NAME was not triggered."
        else
            log "Alarm $ALARM_NAME was last triggered at: $LAST_TRIGGERED_TIME"
            # Lexicographic '>' is valid for same-format ISO-8601 UTC strings.
            if [[ "$LAST_TRIGGERED_TIME" > "$START_TIME" ]]; then
                log "Verification successful: Alarm $ALARM_NAME was triggered after the script started."
            else
                log_error "Verification failed: Alarm $ALARM_NAME was not triggered after the script started."
            fi
        fi
    done
done

log "Script execution completed. Logs saved to: $LOG_FILE"