From 4e3bc74cefa441fe0f327699c4f4985f6cb16aa9 Mon Sep 17 00:00:00 2001 From: Jakub Filipczak Date: Fri, 2 Apr 2021 16:05:21 +0200 Subject: [PATCH 1/4] * AWS SQ Exporter * Automation update to handle AWS-SQ-Exporter --- exporters/aws-sq-exporter/.dockerignore | 1 + exporters/aws-sq-exporter/Dockerfile | 14 + exporters/aws-sq-exporter/README.md | 53 ++++ exporters/aws-sq-exporter/metrics.yaml | 20 ++ exporters/aws-sq-exporter/sq_exporter.py | 239 ++++++++++++++++++ exporters/aws-sq-exporter/version.json | 3 + playbooks/infra-prometheus/setup-all.yml | 1 + prometheus/generic/add-target/tasks/main.yml | 20 ++ .../templates/aws_sq_exporter.yml.j2 | 4 + .../generic/setup-aws-sq-exporter/README.md | 70 +++++ .../setup-aws-sq-exporter/defaults/main.yml | 13 + .../setup-aws-sq-exporter/tasks/docker.yml | 31 +++ .../setup-aws-sq-exporter/tasks/main.yml | 6 + .../setup-aws-sq-exporter/tasks/prereqs.yml | 35 +++ .../templates/prometheus.yml.j2 | 7 +- 15 files changed, 516 insertions(+), 1 deletion(-) create mode 100644 exporters/aws-sq-exporter/.dockerignore create mode 100644 exporters/aws-sq-exporter/Dockerfile create mode 100644 exporters/aws-sq-exporter/README.md create mode 100644 exporters/aws-sq-exporter/metrics.yaml create mode 100755 exporters/aws-sq-exporter/sq_exporter.py create mode 100644 exporters/aws-sq-exporter/version.json create mode 100644 prometheus/generic/add-target/templates/aws_sq_exporter.yml.j2 create mode 100644 prometheus/generic/setup-aws-sq-exporter/README.md create mode 100644 prometheus/generic/setup-aws-sq-exporter/defaults/main.yml create mode 100644 prometheus/generic/setup-aws-sq-exporter/tasks/docker.yml create mode 100644 prometheus/generic/setup-aws-sq-exporter/tasks/main.yml create mode 100644 prometheus/generic/setup-aws-sq-exporter/tasks/prereqs.yml diff --git a/exporters/aws-sq-exporter/.dockerignore b/exporters/aws-sq-exporter/.dockerignore new file mode 100644 index 0000000..42061c0 --- /dev/null +++ 
b/exporters/aws-sq-exporter/.dockerignore @@ -0,0 +1 @@ +README.md \ No newline at end of file diff --git a/exporters/aws-sq-exporter/Dockerfile b/exporters/aws-sq-exporter/Dockerfile new file mode 100644 index 0000000..0f37a00 --- /dev/null +++ b/exporters/aws-sq-exporter/Dockerfile @@ -0,0 +1,14 @@ +FROM registry.access.redhat.com/ubi8/python-38 +# Add application sources with correct permissions for OpenShift +USER 0 +ADD sq_exporter.py . +ADD metrics.yaml . +RUN chown -R 1001:0 ./ +USER 1001 +EXPOSE 8000 +# Install the dependencies +RUN pip install --upgrade pip && \ + pip install prometheus-client boto3 python-benedict + +# Run the application +CMD sq_exporter.py diff --git a/exporters/aws-sq-exporter/README.md b/exporters/aws-sq-exporter/README.md new file mode 100644 index 0000000..e9b136d --- /dev/null +++ b/exporters/aws-sq-exporter/README.md @@ -0,0 +1,53 @@ +## AWS Service Quotas Exporter ## +*** *** +This is a simple Prometheus Exporter that querries AWS API for quota values of specific configuration items and calculates actual usage of those quotas. 
+ +### AWS SQs ### +*** *** +Currently there's support for only two SQs: +* L-0263D0A3 - number of Elastic IPs defined for the region +* L-F678F1CE - number of VCPs defined for the region + +## Building the exporter Docker image ## +Docker image should be based on provided Dockerfile, to build the image run that command from repository root directory: + + `export VERSION="0.1.1"; docker build -t aws-sq-exporter:${VERSION} exporters/aws-sq-exporter/` + +## Running the exporter and AWS credentials ## +Exporter uses AWS API directly, simplest way of injecting API keys is by mounting prepopulated .aws into the container: + + `docker run -p 8000:8000 -v /${HOME}/.aws:/home/exporter/.aws aws-sq-exporter:0.1.1` + +Other options are: + +* -a APIKEY, --apikey APIKEY : AWS Access Key ID +* -s SECRETKEY, --secretkey SECRETKEY : AWS Sercet Access Key +* -r REGION(S), --regions REGION : AWS Region or list of comma separated regions to be used for queries +* -t TIME, --time TIME : Sleep time between fetching the AWS API input +* -d, --debug : Should we be more verbose? 
+* -p PORT, --port PORT : TCP port to be used to expose metrics HTTP endpoint + +## Metric file format ## +Metric definitions should follow the example format: + +```yaml +--- +- metricNameUsage: "aws_vpc_per_region_quota_usage" + usageDesc: "Number of VPCs in use" + metricNameQuota: "aws_vpc_per_region_quota_value" + quotaDesc: "Administrative Quota set on VPCs per Region" + serviceCode: "vpc" + quotaCode: "L-F678F1CE" + usageRetrieval: "describe_vpcs" + usageFilter: "Vpcs" + paginate: True +``` +* metricNameUsage - a name for Prometheus metric showing actual usage +* usageDesc - description that will be added to Prometheus usage metric +* metricNameQuota - a name for Prometheus metric showing the quota value +* quotaDesc - description that will be added to Prometheus quota value metrics +* serviceCode - serviceCode that's assigned to the metric (see AWS CLI manual) +* quotaCode - unique quotaCode (see AWS CLI manual) +* usageRetrieval - name of method which presents the information used to count the actual usage values +* usageFiter - name of dictionary that AWS API returns for usageRetrieval query +* paginate - reserved for future development diff --git a/exporters/aws-sq-exporter/metrics.yaml b/exporters/aws-sq-exporter/metrics.yaml new file mode 100644 index 0000000..d30cf11 --- /dev/null +++ b/exporters/aws-sq-exporter/metrics.yaml @@ -0,0 +1,20 @@ +--- +- metricNameUsage: "aws_eip_quota_usage" + usageDesc: "Administrative Quota set on EIP" + metricNameQuota: "aws_eip_quota_value" + quotaDesc: "Number of Elastic IPs in use" + serviceCode: "ec2" + quotaCode: "L-0263D0A3" + usageRetrieval: "describe_addresses" + usageFilter: "Addresses" + paginate: False + +- metricNameUsage: "aws_vpc_per_region_quota_usage" + usageDesc: "Number of VPCs in use" + metricNameQuota: "aws_vpc_per_region_quota_value" + quotaDesc: "Administrative Quota set on VPCs per Region" + serviceCode: "vpc" + quotaCode: "L-F678F1CE" + usageRetrieval: "describe_vpcs" + usageFilter: "Vpcs" + 
paginate: True diff --git a/exporters/aws-sq-exporter/sq_exporter.py b/exporters/aws-sq-exporter/sq_exporter.py new file mode 100755 index 0000000..90a349e --- /dev/null +++ b/exporters/aws-sq-exporter/sq_exporter.py @@ -0,0 +1,239 @@ +#!/opt/app-root/bin/python + +import subprocess, os +from prometheus_client import start_http_server, Summary, Gauge, Counter +import argparse +import time +import boto3 +import botocore +from benedict import benedict + +# Generic function to fetch administrative quota values +def getQuotaValue(quotaCode, serviceCode, cSessions): + paginator = cSessions["service-quotas"].get_paginator("list_service_quotas") + pCursor = paginator.paginate(ServiceCode=serviceCode, PaginationConfig={"MaxItems": 1000, "PageSize": 10}) + currentValue = 0 + currentQ = 0 + for page in pCursor: + for quotas in page["Quotas"]: + if quotas["QuotaCode"] == quotaCode: + currentQ = str(quotas["Value"]) + currentValue = currentValue + 1 + return currentQ + +# fetch actual usage of specific service, works for EIP and Vpcs +def getUsage(cSessions, usageRetrieval, usageFilter): + awsCall = getattr(cSessions["ec2"], usageRetrieval) + awsReturns = awsCall() + return len(awsReturns[usageFilter]) + +def getAccountID(): + awsSession = boto3.client("sts", aws_access_key_id=args.apikey, aws_secret_access_key=args.secretkey) + awsReturns = awsSession.get_caller_identity() + return awsReturns["Account"] + + +## If we want to fetch the usage for all of the regions on given account +## we'll need to fetch a list of regions available on this particular AWS account +def getRegions(): + awsSession = boto3.client("ec2", aws_access_key_id=args.apikey, aws_secret_access_key=args.secretkey, region_name="us-east-1") + awsReturns = awsSession.describe_regions() + if args.debug == True: + print("Regions fetched from active account: " + str(awsReturns)) + regions = [] + for page in awsReturns["Regions"]: + regions.append(page["RegionName"]) + if args.debug == True: + print("Adding " + 
str(page["RegionName"]) + " to the region list") + return regions + + +if __name__ == "__main__": + # Fetch&parse args + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--apikey", help=" AWS Access Key ID ") + parser.add_argument("-s", "--secretkey", help=" AWS Sercet Access Key") + parser.add_argument("-r", "--regions", default="All", help="List of AWS Regions to be used for queries") + parser.add_argument( + "-t", "--time", type=int, default=900, help=" Sleep time between fetching the AWS API input" + ) + parser.add_argument("-d", "--debug", help=" Should we be more verbose?", action="store_true") + parser.add_argument( + "-p", "--port", default=8000, help=" TCP port to be used to expose metrics HTTP endpoint" + ) + parser.add_argument("-m", "--metricsfile", default="./metrics.yaml", help=" Metrics definition file") + args = parser.parse_args() + + ## Strip regions string from leading and trailing spaces + aRegions = str(args.regions).strip() + + ## Setting up basic variables + awsRegions = {} + awsRegionsList = [] + + ## slice the string if we find comma or space between regions names + if aRegions.find(" ") > 0: + awsRegionsList = aRegions.split("\s") + for region in awsRegionsList: + awsRegions[region] = {} + elif aRegions.find(",") > 0: + awsRegionsList = aRegions.split(",") + for region in awsRegionsList: + awsRegions[region] = {} + ## If no region was specified, we're defaulting to "All" + elif aRegions == "All": + print("Region parameter was not passed, fetching all available AWS Regions") + awsRegionsList = getRegions() + for region in awsRegionsList: + awsRegions[region] = {} + ## Falling back to a single specified region + else: + if args.debug == True: + print("Following AWS region will be scraped for data: ") + awsRegionsList.append(aRegions) + print(str(awsRegionsList)) + for region in awsRegionsList: + awsRegions[region] = {} + + print("Loading metrics definition file located at " + str(args.metricsfile)) + + # Getting AccountId + 
awsAccountID = getAccountID() + print("Exporter configured to calculate metrics on : " + str(awsAccountID)) + + ## Setting initial sessions, per region + for region in awsRegionsList: + awsRegions[region]["clientSession"] = {} + awsRegions[region]["clientSession"]["ec2"] = boto3.client( + "ec2", + aws_access_key_id=args.apikey, + aws_secret_access_key=args.secretkey, + region_name=region, + ) + awsRegions[region]["clientSession"]["service-quotas"] = boto3.client( + "service-quotas", + aws_access_key_id=args.apikey, + aws_secret_access_key=args.secretkey, + region_name=region, + ) + + # Loading up metrics configuration + promMetrics = benedict(args.metricsfile, format="yaml") + if args.debug == True: + print("Metric configuration: ") + print(str(promMetrics)) + + # Initializing Prometheus Gauge metrics + for metric in promMetrics["values"]: + if args.debug == True: + print("Creating metric for " + metric["quotaCode"] + " quota code") + metric["mObjectUsage"] = Gauge( + metric["metricNameUsage"], metric["usageDesc"], ["region", "accountid"] + ) + metric["mObjectQuota"] = Gauge( + metric["metricNameQuota"], metric["quotaDesc"], ["region", "accountid"] + ) + + ## Setting up Counter metrics to track AWS API call failures + # Setting variables + apiCallFailureMetricObjectID = "apiCallFailure" + apiCallFailureMetricName = "aws_api_failed_requests" + apiCallFailureMetricDesc = "Counter set on failed AWS API calls" + apiCallSuccessMetricObjectID = "apiCallSuccess" + apiCallSuccessMetricName = "aws_api_success_requests" + apiCallSuccessMetricDesc = "Counter set on succesfull AWS API calls" + # Initializing metrics + apiCallFails = Counter(apiCallFailureMetricName, apiCallFailureMetricDesc) + apiCallSuccess = Counter(apiCallSuccessMetricName, apiCallSuccessMetricDesc) + + # Resetting counters + apiCallFails.inc(0) + apiCallSuccess.inc(0) + + ## Initializing HTTP /metrics endpoint for Prometheus metrics + start_http_server(int(args.port)) + print("Started AWS Service Quota 
Exporter listening on port: " + str(args.port)) + + # Variables controlling the flow on main loop + initialRequestsCounter = 0 + warmUpPeriod = 1 + requestDelay = 0.5 + requestCounterHardStop = 8196 + + if args.debug == True: + print("Total of ServiceQuotas Metric/Label set to be calculated: " + +str(len(awsRegionsList) * len(promMetrics["values"]))) + + ## Main loop, going through the regions and setting current metrics values for both value and usage + while True: + for region in awsRegionsList: + # Looping through metrics definitions: + for metric in promMetrics["values"]: + try: + quotaValue = getQuotaValue( + metric["quotaCode"], + metric["serviceCode"], + awsRegions[region]["clientSession"], + ) + apiCallSuccess.inc() + metric["mObjectQuota"].labels(region=region, accountid=awsAccountID).set(quotaValue) + except botocore.exceptions.EndpointConnectionError as error: + apiCallFails.inc() + print(str(error)) + except botocore.exceptions.ClientError as error: + apiCallFails.inc() + print(str(error)) + try: + usage = getUsage( + awsRegions[region]["clientSession"], + metric["usageRetrieval"], + metric["usageFilter"], + ) + apiCallSuccess.inc() + metric["mObjectUsage"].labels(region=region, accountid=awsAccountID).set(usage) + except botocore.exceptions.EndpointConnectionError as error: + apiCallFails.inc() + print(str(error)) + except botocore.exceptions.ClientError as error: + apiCallFails.inc() + print(str(error)) + + ## Initial Requests are executed quicker to ensure we got all values in metrics + #initialRequestsCounter = initialRequestsCounter + 1 + # Check if we completed initial run + # If so throttle down to delay value specified in command line + + if ( + initialRequestsCounter >= (len(awsRegionsList) * len(promMetrics["values"])) + and initialRequestsCounter != requestCounterHardStop): + + if args.debug == True: + print("Warmup completed after " + str(initialRequestsCounter) + ", throttling down") + requestDelay = args.time + warmUpPeriod = 0 + 
initialRequestsCounter = requestCounterHardStop + + if warmUpPeriod == 1: + initialRequestsCounter = initialRequestsCounter + 1 + + if args.debug == True: + print( + "Last obtained AWS Quota Value for " + + str(metric["mObjectQuota"]) + + " on " + + str(region) + + " is:" + ) + print(str(quotaValue)) + print( + "Last obtained AWS resource usage for " + + str(metric["mObjectUsage"]) + + " on " + + str(region) + + " is:" + ) + print(str(usage)) + ## Hardcoded sleep to ensure we don't choke on AWS API + time.sleep(0.5) + time.sleep(requestDelay) +exit() diff --git a/exporters/aws-sq-exporter/version.json b/exporters/aws-sq-exporter/version.json new file mode 100644 index 0000000..1159bb1 --- /dev/null +++ b/exporters/aws-sq-exporter/version.json @@ -0,0 +1,3 @@ +{ + "version": "v0.0.1" +} diff --git a/playbooks/infra-prometheus/setup-all.yml b/playbooks/infra-prometheus/setup-all.yml index 6fefe9e..c6137c3 100644 --- a/playbooks/infra-prometheus/setup-all.yml +++ b/playbooks/infra-prometheus/setup-all.yml @@ -41,6 +41,7 @@ - "{{ playbook_dir }}/../../prometheus/generic/setup-openstack-exporter" - "{{ playbook_dir }}/../../prometheus/generic/setup-junos-exporter" - "{{ playbook_dir }}/../../prometheus/generic/setup-openstack-exporter" + - "{{ playbook_dir }}/../../prometheus/generic/setup-aws-sq-exporter" tags: - exporters - onboard-exporters diff --git a/prometheus/generic/add-target/tasks/main.yml b/prometheus/generic/add-target/tasks/main.yml index 4109d8c..2b054e3 100644 --- a/prometheus/generic/add-target/tasks/main.yml +++ b/prometheus/generic/add-target/tasks/main.yml @@ -89,6 +89,15 @@ seuser: system_u setype: container_file_t +- name: create aws-sq-exporter_targets directory + file: + path: "/var/prometheus_targets/aws_sq_exporter_targets" + state: directory + mode: '0775' + group: monitoring-editors + seuser: system_u + setype: container_file_t + - name: create federated_prometheus_targets directory file: path: "/var/prometheus_targets/federated_targets" @@ 
-120,6 +129,17 @@ loop: "{{ groups['prometheus_target_haproxy'] }}" when: "'prometheus_target_haproxy' in groups" +- name: template the aws-sq-exporter_targets + template: + src: aws_sq_exporter.yml.j2 + dest: "/var/prometheus_targets/aws_sq_exporter_targets/aws-sq-exporter_target_{{ item.awsAccount }}.yml" + mode: '0775' + group: monitoring-editors + seuser: system_u + setype: container_file_t + loop: "{{ ansible_sq_exporter }}" + when: "'monitoring-aws-sq-exporter' in groups" + - name: template the bind_targets template: src: bind_target.yml.j2 diff --git a/prometheus/generic/add-target/templates/aws_sq_exporter.yml.j2 b/prometheus/generic/add-target/templates/aws_sq_exporter.yml.j2 new file mode 100644 index 0000000..f7fb3a7 --- /dev/null +++ b/prometheus/generic/add-target/templates/aws_sq_exporter.yml.j2 @@ -0,0 +1,4 @@ +- targets: + - {{ ansible_ssh_host }}:{{ item.port }} + labels: + name: 'AWS SQ Exporter {{ item.awsAccount }}' diff --git a/prometheus/generic/setup-aws-sq-exporter/README.md b/prometheus/generic/setup-aws-sq-exporter/README.md new file mode 100644 index 0000000..0282d99 --- /dev/null +++ b/prometheus/generic/setup-aws-sq-exporter/README.md @@ -0,0 +1,70 @@ +setup-aws-sq-exporter +========= + +This role will instantiate a AWS SQ Exporter container on targeted hosts. Role accepts a list of AWS accounts to monitor, and will spin up one Docker container per account. + +Requirements +------------ + +Docker must be available and running on the targeted hosts. + +Role Variables +-------------- +## Default values of variables: +``` +--- +ansible_connection: "local" +aws_sq_exporter_image: 'prom/aws-sq-exporter' +aws_sq_exporter_image_version: 'latest' +aws_sq_exporter_port: '8080' + +provision_state: "started" + +ansible_sq_exporter: + - awsAccount: "Dummy-Account" + port: 9420 + apikey: 22222 + secretkey: 3333 + regions: "us-east-1,us-east-2" + debug: false +``` +``` +aws_sq_exporter_image - The AWS SQ Exporter image to deploy. 
+aws_sq_exporter_image_version - The image tag to deploy. +aws_sq_exporter_port - The port to be exposed on container. +provision_state - Options: [absent, killed, present, reloaded, restarted, **started** (default), stopped] + +ansible_sq_exporter: - variable holding individual account configuration + - awsAccount: "Dummy-Account" - AWS Account alias + port: 9420 - Port on which this specific container will be exposed for metrics scraping + apikey: 22222 - AWS Account API Key + secretkey: 3333 - AWS Account SecretKey + regions: "ex1,ex2" - Commaseparated list of regions to query for SQs and usage + debug: false - Increase logging verbosity +``` + + +Dependencies +------------ +``` +python >= 2.6 +docker-py >= 0.3.0 +The docker server >= 0.10.0 +``` + +Example Playbook +---------------- +``` +- name: Setup AWS SQ Exporter + hosts: prometheus_master + become: True + vars: + provision_state: "started" + roles: + - prometheus/generic/setup-aws-sq-exporter +``` + +License +------- + +BSD diff --git a/prometheus/generic/setup-aws-sq-exporter/defaults/main.yml b/prometheus/generic/setup-aws-sq-exporter/defaults/main.yml new file mode 100644 index 0000000..1c2fb33 --- /dev/null +++ b/prometheus/generic/setup-aws-sq-exporter/defaults/main.yml @@ -0,0 +1,13 @@ +--- +aws_sq_exporter_image: 'prom/aws-sq-exporter' +aws_sq_exporter_image_version: 'latest' +aws_sq_exporter_port: '8080' + +provision_state: "started" + +ansible_sq_exporter: + - awsAccount: "Dummy-Account" + port: 9420 + apikey: 22222 + secretkey: 3333 + regions: "us-east-1" diff --git a/prometheus/generic/setup-aws-sq-exporter/tasks/docker.yml b/prometheus/generic/setup-aws-sq-exporter/tasks/docker.yml new file mode 100644 index 0000000..a684707 --- /dev/null +++ b/prometheus/generic/setup-aws-sq-exporter/tasks/docker.yml @@ -0,0 +1,31 @@ +--- + +- name: Enable firewalld + service: + name: firewalld + enabled: yes + state: started + +- name: Open Firewall for Prometheus + firewalld: + port: "{{ item.port }}/tcp" + 
permanent: yes + state: enabled + immediate: yes + loop: "{{ ansible_sq_exporter }}" + +- name: Run AWS SQ Exporter Docker container + docker_container: + name: "aws-sq-exporter_{{ item.awsAccount }}" + image: "{{ aws_sq_exporter_image }}:{{ aws_sq_exporter_image_version }}" + restart_policy: unless-stopped + network_mode: host + state: "{{ provision_state }}" + command: | + /opt/app-root/src/sq_exporter.py + --apikey "{{ item.apikey}}" + --secretkey "{{ item.secretkey }}" + --regions "{{ item.regions}}" + --port "{{ item.port }}" + restart: yes + loop: "{{ ansible_sq_exporter }}" diff --git a/prometheus/generic/setup-aws-sq-exporter/tasks/main.yml b/prometheus/generic/setup-aws-sq-exporter/tasks/main.yml new file mode 100644 index 0000000..5affcdf --- /dev/null +++ b/prometheus/generic/setup-aws-sq-exporter/tasks/main.yml @@ -0,0 +1,6 @@ +--- +- name: Run prereqs + import_tasks: prereqs.yml + +- name: Run the docker images + import_tasks: docker.yml diff --git a/prometheus/generic/setup-aws-sq-exporter/tasks/prereqs.yml b/prometheus/generic/setup-aws-sq-exporter/tasks/prereqs.yml new file mode 100644 index 0000000..213f376 --- /dev/null +++ b/prometheus/generic/setup-aws-sq-exporter/tasks/prereqs.yml @@ -0,0 +1,35 @@ +--- +- name: "install EPEL GPG key - if specified" + rpm_key: + key: "{{ monitoring_host_epel_gpg_download_url }}" + state: present + when: + - monitoring_host_epel_gpg_download_url is defined + - monitoring_host_epel_gpg_download_url|trim != '' + - monitoring_host_epel_disable_gpg_check|lower == 'no' + +- name: "install epel-release" + yum: + name: "{{ monitoring_host_epel_download_url }}" + state: present + disable_gpg_check: "{{ monitoring_host_epel_disable_gpg_check | default('no') }}" + +- name: Ensure epel-release is installed + yum: + name: "{{ item }}" + state: present + with_items: + - epel-release + +- name: Ensure pip is installed + yum: + name: "{{ item }}" + state: present + with_items: + - python-pip + +- name: Install required python 
libraries + pip: + name: "docker-py" + state: present + diff --git a/prometheus/generic/setup-prometheus/templates/prometheus.yml.j2 b/prometheus/generic/setup-prometheus/templates/prometheus.yml.j2 index 6eb3067..4b06bd2 100644 --- a/prometheus/generic/setup-prometheus/templates/prometheus.yml.j2 +++ b/prometheus/generic/setup-prometheus/templates/prometheus.yml.j2 @@ -31,6 +31,12 @@ scrape_configs: - files: - /etc/prometheus/targets/node_targets/*.yml + - job_name: 'aws_sq_exporter' + scrape_interval: 60s + file_sd_configs: + - files: + - /etc/prometheus/targets/aws_sq_exporter_targets/*.yml + {% if (groups['monitoring-hosts'] |length ) > 1 %} - job_name: 'federate-sanity-check' scrape_interval: 15s @@ -59,7 +65,6 @@ scrape_configs: - files: - /etc/prometheus/targets/federated_targets/*.yml - - job_name: 'haproxy_exporter' scrape_interval: 5s file_sd_configs: From 747aaf89e614e44df527ead2f2aacb3ba05fc26f Mon Sep 17 00:00:00 2001 From: Jakub Filipczak Date: Tue, 17 Aug 2021 07:23:57 -0400 Subject: [PATCH 2/4] README fix --- prometheus/generic/setup-aws-sq-exporter/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/prometheus/generic/setup-aws-sq-exporter/README.md b/prometheus/generic/setup-aws-sq-exporter/README.md index 0282d99..ce6ba22 100644 --- a/prometheus/generic/setup-aws-sq-exporter/README.md +++ b/prometheus/generic/setup-aws-sq-exporter/README.md @@ -13,7 +13,6 @@ Role Variables ## Default values of variables: ``` --- -ansible_connection: "local" aws_sq_exporter_image: 'prom/aws-sq-exporter' aws_sq_exporter_image_version: 'latest' aws_sq_exporter_port: '8080' From bfe2082236dcf2d14f1ce8d1679310f4a71afade Mon Sep 17 00:00:00 2001 From: Jakub Filipczak Date: Fri, 27 Aug 2021 04:56:42 -0400 Subject: [PATCH 3/4] Final touches --- grafana/generic/setup-grafana/tasks/docker.yml | 6 +++--- playbooks/infra-prometheus/setup-all.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/grafana/generic/setup-grafana/tasks/docker.yml 
b/grafana/generic/setup-grafana/tasks/docker.yml index edfe950..b580d12 100644 --- a/grafana/generic/setup-grafana/tasks/docker.yml +++ b/grafana/generic/setup-grafana/tasks/docker.yml @@ -32,12 +32,12 @@ name: grafana image: "{{ grafana_image }}:{{ grafana_image_version }}" network_mode: host - published_ports: - - "{{ grafana_port }}:3000" + #published_ports: + #- "{{ grafana_port }}:3000" volumes: - "{{ monitoring_config_dir }}/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:Z" - "{{ monitoring_config_dir }}/dashboards/:/etc/grafana/provisioning/dashboards/:Z" env: - GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_password }}" + GF_SECURITY_ADMIN_PASSWORD: "alamakota" state: "{{ provision_state }}" restart: yes diff --git a/playbooks/infra-prometheus/setup-all.yml b/playbooks/infra-prometheus/setup-all.yml index c6137c3..4902b4b 100644 --- a/playbooks/infra-prometheus/setup-all.yml +++ b/playbooks/infra-prometheus/setup-all.yml @@ -22,6 +22,8 @@ - "{{ playbook_dir }}/../../prometheus/generic/setup-prometheus" - "{{ playbook_dir }}/../../prometheus/generic/setup-alertmanager" - "{{ playbook_dir }}/../../prometheus/generic/update-thresholds" + - "{{ playbook_dir }}/../../grafana/generic/setup-grafana" +# - "{{ playbook_dir }}/../../grafana/generic/configure-grafana-datasource" tags: - prometheus - alertmanager @@ -31,8 +33,6 @@ - name: Setup onboard exporters hosts: monitoring-hosts become: True - vars: - provision_state: "started" roles: - "{{ playbook_dir }}/../../prometheus/generic/setup-ssl-exporter" - "{{ playbook_dir }}/../../prometheus/generic/setup-ilo-exporter" From 7dcebe9f95e4ee096f4830f9950d64854d611412 Mon Sep 17 00:00:00 2001 From: Jakub Filipczak Date: Fri, 8 Apr 2022 13:01:33 +0200 Subject: [PATCH 4/4] Removed aws-sq-exporter code --- exporters/aws-sq-exporter/.dockerignore | 1 - exporters/aws-sq-exporter/Dockerfile | 14 -- exporters/aws-sq-exporter/README.md | 53 ----- exporters/aws-sq-exporter/metrics.yaml | 20 -- 
exporters/aws-sq-exporter/sq_exporter.py | 239 ----------------------- exporters/aws-sq-exporter/version.json | 3 - 6 files changed, 330 deletions(-) delete mode 100644 exporters/aws-sq-exporter/.dockerignore delete mode 100644 exporters/aws-sq-exporter/Dockerfile delete mode 100644 exporters/aws-sq-exporter/README.md delete mode 100644 exporters/aws-sq-exporter/metrics.yaml delete mode 100755 exporters/aws-sq-exporter/sq_exporter.py delete mode 100644 exporters/aws-sq-exporter/version.json diff --git a/exporters/aws-sq-exporter/.dockerignore b/exporters/aws-sq-exporter/.dockerignore deleted file mode 100644 index 42061c0..0000000 --- a/exporters/aws-sq-exporter/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -README.md \ No newline at end of file diff --git a/exporters/aws-sq-exporter/Dockerfile b/exporters/aws-sq-exporter/Dockerfile deleted file mode 100644 index 0f37a00..0000000 --- a/exporters/aws-sq-exporter/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM registry.access.redhat.com/ubi8/python-38 -# Add application sources with correct permissions for OpenShift -USER 0 -ADD sq_exporter.py . -ADD metrics.yaml . -RUN chown -R 1001:0 ./ -USER 1001 -EXPOSE 8000 -# Install the dependencies -RUN pip install --upgrade pip && \ - pip install prometheus-client boto3 python-benedict - -# Run the application -CMD sq_exporter.py diff --git a/exporters/aws-sq-exporter/README.md b/exporters/aws-sq-exporter/README.md deleted file mode 100644 index e9b136d..0000000 --- a/exporters/aws-sq-exporter/README.md +++ /dev/null @@ -1,53 +0,0 @@ -## AWS Service Quotas Exporter ## -*** *** -This is a simple Prometheus Exporter that querries AWS API for quota values of specific configuration items and calculates actual usage of those quotas. 
- -### AWS SQs ### -*** *** -Currently there's support for only two SQs: -* L-0263D0A3 - number of Elastic IPs defined for the region -* L-F678F1CE - number of VCPs defined for the region - -## Building the exporter Docker image ## -Docker image should be based on provided Dockerfile, to build the image run that command from repository root directory: - - `export VERSION="0.1.1"; docker build -t aws-sq-exporter:${VERSION} exporters/aws-sq-exporter/` - -## Running the exporter and AWS credentials ## -Exporter uses AWS API directly, simplest way of injecting API keys is by mounting prepopulated .aws into the container: - - `docker run -p 8000:8000 -v /${HOME}/.aws:/home/exporter/.aws aws-sq-exporter:0.1.1` - -Other options are: - -* -a APIKEY, --apikey APIKEY : AWS Access Key ID -* -s SECRETKEY, --secretkey SECRETKEY : AWS Sercet Access Key -* -r REGION(S), --regions REGION : AWS Region or list of comma separated regions to be used for queries -* -t TIME, --time TIME : Sleep time between fetching the AWS API input -* -d, --debug : Should we be more verbose? 
-* -p PORT, --port PORT : TCP port to be used to expose metrics HTTP endpoint - -## Metric file format ## -Metric definitions should follow the example format: - -```yaml ---- -- metricNameUsage: "aws_vpc_per_region_quota_usage" - usageDesc: "Number of VPCs in use" - metricNameQuota: "aws_vpc_per_region_quota_value" - quotaDesc: "Administrative Quota set on VPCs per Region" - serviceCode: "vpc" - quotaCode: "L-F678F1CE" - usageRetrieval: "describe_vpcs" - usageFilter: "Vpcs" - paginate: True -``` -* metricNameUsage - a name for Prometheus metric showing actual usage -* usageDesc - description that will be added to Prometheus usage metric -* metricNameQuota - a name for Prometheus metric showing the quota value -* quotaDesc - description that will be added to Prometheus quota value metrics -* serviceCode - serviceCode that's assigned to the metric (see AWS CLI manual) -* quotaCode - unique quotaCode (see AWS CLI manual) -* usageRetrieval - name of method which presents the information used to count the actual usage values -* usageFiter - name of dictionary that AWS API returns for usageRetrieval query -* paginate - reserved for future development diff --git a/exporters/aws-sq-exporter/metrics.yaml b/exporters/aws-sq-exporter/metrics.yaml deleted file mode 100644 index d30cf11..0000000 --- a/exporters/aws-sq-exporter/metrics.yaml +++ /dev/null @@ -1,20 +0,0 @@ ---- -- metricNameUsage: "aws_eip_quota_usage" - usageDesc: "Administrative Quota set on EIP" - metricNameQuota: "aws_eip_quota_value" - quotaDesc: "Number of Elastic IPs in use" - serviceCode: "ec2" - quotaCode: "L-0263D0A3" - usageRetrieval: "describe_addresses" - usageFilter: "Addresses" - paginate: False - -- metricNameUsage: "aws_vpc_per_region_quota_usage" - usageDesc: "Number of VPCs in use" - metricNameQuota: "aws_vpc_per_region_quota_value" - quotaDesc: "Administrative Quota set on VPCs per Region" - serviceCode: "vpc" - quotaCode: "L-F678F1CE" - usageRetrieval: "describe_vpcs" - usageFilter: "Vpcs" - 
#!/opt/app-root/bin/python
"""AWS Service Quotas Prometheus exporter.

Polls the AWS Service Quotas API for administrative quota values and the
EC2 API for actual resource usage, and publishes both as Prometheus Gauge
metrics on an HTTP /metrics endpoint.  Which quotas are scraped is driven
by a YAML metrics definition file (see --metricsfile).
"""

import argparse
import time

import boto3
import botocore
from benedict import benedict
from prometheus_client import start_http_server, Gauge, Counter


def getQuotaValue(quotaCode, serviceCode, cSessions):
    """Return the administrative quota value for *quotaCode*.

    Pages through ``list_service_quotas`` for *serviceCode* using the
    regional ``service-quotas`` client found in *cSessions* and returns the
    value of the matching quota code as a string (matching what the original
    implementation fed into Gauge.set).  Returns 0 when no quota matches.
    """
    paginator = cSessions["service-quotas"].get_paginator("list_service_quotas")
    pages = paginator.paginate(
        ServiceCode=serviceCode,
        PaginationConfig={"MaxItems": 1000, "PageSize": 10},
    )
    for page in pages:
        for quota in page["Quotas"]:
            if quota["QuotaCode"] == quotaCode:
                # Quota codes are unique per service, so the first match is
                # the answer; no need to keep paging.
                return str(quota["Value"])
    return 0


def getUsage(cSessions, usageRetrieval, usageFilter):
    """Return the current usage count for one metric.

    *usageRetrieval* names an EC2 client method (e.g. ``describe_addresses``)
    and *usageFilter* the response key whose list length is the usage count.
    Works for list-shaped responses such as EIPs and VPCs.
    """
    awsCall = getattr(cSessions["ec2"], usageRetrieval)
    return len(awsCall()[usageFilter])


def getAccountID():
    """Return the AWS account ID for the configured credentials.

    NOTE: reads the module-global ``args`` (set in __main__) for the
    credentials, as in the original implementation.
    """
    sts = boto3.client(
        "sts",
        aws_access_key_id=args.apikey,
        aws_secret_access_key=args.secretkey,
    )
    return sts.get_caller_identity()["Account"]


def getRegions():
    """Return the list of region names enabled on this AWS account.

    Used when no explicit region list was passed on the command line.
    NOTE: reads the module-global ``args`` (set in __main__).
    """
    ec2 = boto3.client(
        "ec2",
        aws_access_key_id=args.apikey,
        aws_secret_access_key=args.secretkey,
        # describe_regions is account-wide but the client still needs some
        # regional endpoint to talk to.
        region_name="us-east-1",
    )
    response = ec2.describe_regions()
    if args.debug:
        print("Regions fetched from active account: " + str(response))
    regions = []
    for entry in response["Regions"]:
        regions.append(entry["RegionName"])
        if args.debug:
            print("Adding " + str(entry["RegionName"]) + " to the region list")
    return regions


def _parseRegions(regionsArg):
    """Parse the --regions argument into a list of region names.

    Accepts a space-separated list, a comma-separated list, the literal
    "All" (fetch every region enabled on the account), or a single region.

    BUGFIX: the original split space-separated input with
    ``aRegions.split("\\s")``, i.e. on the literal two characters backslash-s,
    which never occur in region names -- so "eu-west-1 us-east-1" was treated
    as one bogus region.  ``str.split()`` with no argument splits on any run
    of whitespace, which is what was intended.
    """
    if " " in regionsArg:
        return regionsArg.split()
    if "," in regionsArg:
        # Strip stray spaces so "a, b" and "a,b" both work.
        return [r.strip() for r in regionsArg.split(",") if r.strip()]
    if regionsArg == "All":
        print("Region parameter was not passed, fetching all available AWS Regions")
        return getRegions()
    if args.debug:
        print("Following AWS region will be scraped for data: ")
        print(str([regionsArg]))
    return [regionsArg]


if __name__ == "__main__":
    # Fetch & parse command-line arguments (flags, defaults and semantics
    # unchanged from the original).
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--apikey", help=" AWS Access Key ID ")
    # Typo fix in user-facing help: "Sercet" -> "Secret".
    parser.add_argument("-s", "--secretkey", help=" AWS Secret Access Key")
    parser.add_argument(
        "-r", "--regions", default="All", help="List of AWS Regions to be used for queries"
    )
    parser.add_argument(
        "-t", "--time", type=int, default=900, help=" Sleep time between fetching the AWS API input"
    )
    parser.add_argument("-d", "--debug", help=" Should we be more verbose?", action="store_true")
    parser.add_argument(
        "-p", "--port", default=8000, help=" TCP port to be used to expose metrics HTTP endpoint"
    )
    parser.add_argument("-m", "--metricsfile", default="./metrics.yaml", help=" Metrics definition file")
    args = parser.parse_args()

    # Resolve the set of regions to scrape.
    awsRegionsList = _parseRegions(str(args.regions).strip())
    awsRegions = {region: {} for region in awsRegionsList}

    print("Loading metrics definition file located at " + str(args.metricsfile))

    # Getting AccountId -- used as a label on every metric.
    awsAccountID = getAccountID()
    print("Exporter configured to calculate metrics on : " + str(awsAccountID))

    # One boto3 client per region for each AWS service we query.
    for region in awsRegionsList:
        awsRegions[region]["clientSession"] = {
            "ec2": boto3.client(
                "ec2",
                aws_access_key_id=args.apikey,
                aws_secret_access_key=args.secretkey,
                region_name=region,
            ),
            "service-quotas": boto3.client(
                "service-quotas",
                aws_access_key_id=args.apikey,
                aws_secret_access_key=args.secretkey,
                region_name=region,
            ),
        }

    # Loading up metrics configuration from the YAML definition file.
    promMetrics = benedict(args.metricsfile, format="yaml")
    if args.debug:
        print("Metric configuration: ")
        print(str(promMetrics))

    # One Gauge pair (usage + quota) per configured metric, labelled by
    # region and account id.
    for metric in promMetrics["values"]:
        if args.debug:
            print("Creating metric for " + metric["quotaCode"] + " quota code")
        metric["mObjectUsage"] = Gauge(
            metric["metricNameUsage"], metric["usageDesc"], ["region", "accountid"]
        )
        metric["mObjectQuota"] = Gauge(
            metric["metricNameQuota"], metric["quotaDesc"], ["region", "accountid"]
        )

    # Counters tracking AWS API call outcomes (typo fix in description:
    # "succesfull" -> "successful").
    apiCallFails = Counter("aws_api_failed_requests", "Counter set on failed AWS API calls")
    apiCallSuccess = Counter("aws_api_success_requests", "Counter set on successful AWS API calls")
    # inc(0) makes both counters appear on /metrics immediately, at zero.
    apiCallFails.inc(0)
    apiCallSuccess.inc(0)

    # Initialize the HTTP /metrics endpoint for Prometheus.
    start_http_server(int(args.port))
    print("Started AWS Service Quota Exporter listening on port: " + str(args.port))

    # During warm-up every metric is scraped once with only the short 0.5 s
    # delay so all gauges get populated quickly; afterwards we throttle down
    # to the user-supplied --time interval between individual fetches.
    # (Replaces the original's counter + 8196 sentinel construction.)
    totalMetricSets = len(awsRegionsList) * len(promMetrics["values"])
    scrapedDuringWarmup = 0
    warmedUp = False
    requestDelay = 0.5

    if args.debug:
        print(
            "Total of ServiceQuotas Metric/Label set to be calculated: "
            + str(totalMetricSets)
        )

    # Main loop: walk regions x metrics forever, refreshing quota and usage.
    while True:
        for region in awsRegionsList:
            for metric in promMetrics["values"]:
                # BUGFIX: initialize both values so the debug block below
                # cannot raise NameError when the first API call fails.
                quotaValue = None
                usage = None
                try:
                    quotaValue = getQuotaValue(
                        metric["quotaCode"],
                        metric["serviceCode"],
                        awsRegions[region]["clientSession"],
                    )
                    apiCallSuccess.inc()
                    metric["mObjectQuota"].labels(
                        region=region, accountid=awsAccountID
                    ).set(quotaValue)
                except (
                    botocore.exceptions.EndpointConnectionError,
                    botocore.exceptions.ClientError,
                ) as error:
                    apiCallFails.inc()
                    print(str(error))
                try:
                    usage = getUsage(
                        awsRegions[region]["clientSession"],
                        metric["usageRetrieval"],
                        metric["usageFilter"],
                    )
                    apiCallSuccess.inc()
                    metric["mObjectUsage"].labels(
                        region=region, accountid=awsAccountID
                    ).set(usage)
                except (
                    botocore.exceptions.EndpointConnectionError,
                    botocore.exceptions.ClientError,
                ) as error:
                    apiCallFails.inc()
                    print(str(error))

                # Once every metric/region pair has been fetched once, slow
                # down to the configured scrape interval.
                if not warmedUp:
                    scrapedDuringWarmup += 1
                    if scrapedDuringWarmup >= totalMetricSets:
                        warmedUp = True
                        requestDelay = args.time
                        if args.debug:
                            print(
                                "Warmup completed after "
                                + str(scrapedDuringWarmup)
                                + ", throttling down"
                            )

                if args.debug:
                    print(
                        "Last obtained AWS Quota Value for "
                        + str(metric["mObjectQuota"])
                        + " on "
                        + str(region)
                        + " is:"
                    )
                    print(str(quotaValue))
                    print(
                        "Last obtained AWS resource usage for "
                        + str(metric["mObjectUsage"])
                        + " on "
                        + str(region)
                        + " is:"
                    )
                    print(str(usage))

                # Hardcoded floor so we never hammer the AWS API, plus the
                # warm-up / steady-state delay.
                time.sleep(0.5)
                time.sleep(requestDelay)