diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d00545b --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. +*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc + +# Include example automated tfvars file +!template.automated.tfvars + +# Exclude pycache +__pycache__/ +**/__pycache__ + +**/.env diff --git a/README.md b/README.md index 351c2b4..c41eb78 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,1049 @@ # CTFp - CTF Pilot's CTF Platform +> [!TIP] +> If you are looking for **how to build challenges for CTFp**, please check out the **[CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template)** and **[CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit)** repositories. +> +> To learn more about the ecosystem, and view all related repositories, please visit the **[CTF Pilot organization page](https://github.com/ctfpilot)**. 
+ +CTFp (CTF Pilot's CTF Platform) is a CTF platform designed to host large-scale Capture The Flag (CTF) competitions, with a focus on scalability, resilience, and ease of use. +The platform uses Kubernetes as the underlying orchestration system, where the management, scoreboard, and challenge infrastructure are deployed as Kubernetes resources. It then leverages GitOps through [ArgoCD](https://argo-cd.readthedocs.io/en/stable/) for managing the platform's configuration and deployments, including the CTF challenges. + +CTFp acts as the orchestration layer for deploying and managing the platform, while utilizing a variety of CTF Pilot's components to provide the full functionality of the platform. + +CTFp provides a CLI tool for managing the deployment of the platform, but it is possible to use the individual Terraform components directly if desired. To manage the platform after initial deployment, you will primarily interact with the Kubernetes cluster using `kubectl`, ArgoCD, and the other monitoring systems deployed. + +> [!IMPORTANT] +> In order to run CTFp properly, you will need to have a working knowledge of **Cloud**, **Kubernetes**, **Terraform/OpenTofu**, **GitOps**, and **CTFd**. +> The platform is designed to work with CTF Pilot's Challenges ecosystem, to ensure secure hosting of CTF challenges. +> +> This platform is not intended for beginners, and it is assumed that you have prior experience with these technologies and systems. +> Incorrect handling of Kubernetes resources can lead to data loss, downtime, and security vulnerabilities. +> Incorrectly configured challenges may lead to security vulnerabilities or platform instability. + +This platform deploys real-world infrastructure and will incur costs when deployed. 
+ +## Table of Contents + +- [CTFp - CTF Pilot's CTF Platform](#ctfp---ctf-pilots-ctf-platform) + - [Table of Contents](#table-of-contents) + - [Features](#features) + - [Quick start](#quick-start) + - [How to run](#how-to-run) + - [Pre-requisites](#pre-requisites) + - [Environments](#environments) + - [Configuring the platform](#configuring-the-platform) + - [CLI Tool](#cli-tool) + - [Commands](#commands) + - [`init` - Initialize Platform Configuration](#init---initialize-platform-configuration) + - [`generate-keys` - Generate SSH Keys](#generate-keys---generate-ssh-keys) + - [`insert-keys` - Insert SSH Keys into Configuration](#insert-keys---insert-ssh-keys-into-configuration) + - [`generate-images` - Generate Custom Server Images](#generate-images---generate-custom-server-images) + - [`generate-backend` - Generate Terraform Backend Configuration](#generate-backend---generate-terraform-backend-configuration) + - [`deploy` - Deploy Platform Components](#deploy---deploy-platform-components) + - [`destroy` - Destroy Platform Components](#destroy---destroy-platform-components) + - [Workflow Overview](#workflow-overview) + - [Guides](#guides) + - [Updating sizes of nodes in a running platform](#updating-sizes-of-nodes-in-a-running-platform) + - [Deploying a new challenge](#deploying-a-new-challenge) + - [Updating a challenge](#updating-a-challenge) + - [Deploying a page](#deploying-a-page) + - [The CLI tool does not seem to support my setup](#the-cli-tool-does-not-seem-to-support-my-setup) + - [Restoring the database from a backup](#restoring-the-database-from-a-backup) + - [Restoring the CTFd-manager](#restoring-the-ctfd-manager) + - [Architecture](#architecture) + - [Directory structure](#directory-structure) + - [Overview](#overview) + - [Cluster](#cluster) + - [Ops](#ops) + - [Platform](#platform) + - [Challenges](#challenges) + - [Challenge deployment](#challenge-deployment) + - [Network](#network) + - [Cluster networking](#cluster-networking) + - [Challenge 
networking](#challenge-networking) + - [Getting help](#getting-help) + - [Contributing](#contributing) + - [Background](#background) + - [License](#license) + - [Code of Conduct](#code-of-conduct) + +## Features + +CTFp offers a wide range of features to facilitate the deployment and management of CTF competitions. Below is an overview of the key features: + +- **Infrastructure & Deployment** + - **Multi-environment support** with isolated configurations for Test, Dev, and Production + - **Component-based architecture** with four deployable components: Cluster, Ops, Platform, and Challenges + - **Infrastructure as Code** using Terraform/OpenTofu with automated state management and S3 backend + - **Multi-region Kubernetes clusters** on Hetzner Cloud with configurable node types and auto-scaling + - **Custom server images** generation using Packer + - **Cloudflare DNS integration** for management, platform, and CTF zones +- **Operations & Monitoring** + - **GitOps workflow** powered by ArgoCD for automated deployments + - **Comprehensive monitoring** with Prometheus, Grafana, and metrics exporters + - **Log aggregation** via Filebeat to Elasticsearch + - **Traefik ingress controller** with SSL certificate management (cert-manager) + - **Discord webhook notifications** for platform events + - **Automated descheduling** for optimal resource distribution +- **Scoreboard** + - **Customizable CTFd scoreboard deployment** allowing for bring-your-own CTFd configuration + - **Auto deployment of CTFd configuration** providing a ready-to-use CTFd instance + - **Flexible CTF settings** supporting a large portion of CTFd's configuration options + - **S3 storage configuration** for challenge files and user uploads in CTFd + - **Clustered database setup** with MariaDB operator and automated backups to S3 + - **Redis caching** with Redis operator for ease of use + - **Automatic deployment of CTFd pages** from GitHub +- **Challenge Management** + - **Full support for CTF Pilot's 
Challenges ecosystem**, including KubeCTF integration + - **Support for three challenge deployment modes**: Static, Shared, and Instanced + - **Git-based deployment** with branch-specific configurations + - **IP whitelisting** for challenge access control + - **Custom fallback pages** for errors and instancing states +- **CLI Tool** + - **Simple command-line interface** for managing the deployment and lifecycle of the platform + - **Modular commands** for initializing, deploying, destroying, and managing components + - **Environment management** for handling multiple deployment environments (Test, Dev, Prod) + - **State management** with automated backend configuration, with states stored in S3 + - **Plan generation and review** before applying changes + - **Under 20 minutes** deployment time for the entire platform (excluding image generation) + - **Fully configured through configuration files** for easy setup and management + +## Quick start + +> [!TIP] +> **This is a quick start guide for getting the platform up and running, and acts as a quick reference guide.** +> If it is your first time working with CTFp, we recommend going through the full documentation for a more in-depth understanding of the platform and its components. + +To use the CTFp CLI tool, you first need to clone the repository: + +```bash +git clone https://github.com/ctfpilot/ctfp +cd ctfp +``` + +First, you need to initialize the platform configuration for your desired environment (test, dev, prod): + +```bash +./ctfp.py init +``` + +> [!NOTE] +> You can add `--test`, `--dev` or `--prod` to specify the environment you want to initialize. +> The default environment is `test` (`--test`). +> +> These flags are accepted by all commands except the `generate-images` command, which instead asks for the Hetzner Cloud project to use when generating images. + +Next, you need to fill out the configuration located in the `automated.<env>.tfvars` file. 
+ +In order to deploy, ensure you have SSH keys created, and inserted into your configuration: + +```bash +./ctfp.py generate-keys --insert +``` + +To create the server images used for the Kubernetes cluster nodes, run: + +```bash +./ctfp.py generate-images +``` + +To use the Terraform modules, you need to generate the backend configuration for each component. + +```bash +./ctfp.py generate-backend cluster <bucket> <region> <endpoint> +./ctfp.py generate-backend ops <bucket> <region> <endpoint> +./ctfp.py generate-backend platform <bucket> <region> <endpoint> +./ctfp.py generate-backend challenges <bucket> <region> <endpoint> +``` + +*Replace `<bucket>`, `<region>`, and `<endpoint>` with your S3 bucket details.* + +Finally, you can deploy the entire platform with: + +```bash +./ctfp.py deploy all +``` + +To destroy the entire platform, run: + +```bash +./ctfp.py destroy all +``` + +`all` can be replaced with any of the individual components: `cluster`, `ops`, `platform`, `challenges`. + +To interact with the cluster, run the following command to configure your `kubectl` context: + +```bash +source kubectl.sh [test|dev|prod] +``` + +*`source` is required to set the environment variables in your current shell session.* + +## How to run + +### Pre-requisites + +In order to even deploy the platform, the following software needs to be installed on your local machine: + +- [OpenTofu](https://opentofu.org) (Alternative version of [Terraform](https://www.terraform.io/downloads.html)) +- [Packer](https://developer.hashicorp.com/packer/tutorials/docker-get-started/get-started-install-cli#installing-packer) - For initial generation of server images +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) - For interacting with the Kubernetes cluster +- [hcloud CLI tool](https://github.com/hetznercloud/cli) - For interacting with the Hetzner Cloud API (Otherwise use the Hetzner web interface) +- SSH client - For connecting to the servers +- Python 3 - For running the CTFp CLI tool +- Python package [`python-hcl2`](https://github.com/amplify-education/python-hcl2) - Required by the CTFp CLI tool for 
parsing Terraform configuration files + +And the following is required in order to deploy the platform: + +- [Hetzner Cloud](https://www.hetzner.com/cloud) account with one or more Hetzner Cloud projects +- [Hetzner Cloud API Token](https://console.hetzner.cloud/projects) - For authenticating with the Hetzner Cloud API +- [Hetzner S3 buckets](https://console.hetzner.cloud/projects) - For storing the Terraform state files, backups, and challenge data. We recommend using 3 separate buckets with separate access keys for security reasons +- [Cloudflare](https://www.cloudflare.com/) account +- [Cloudflare API Token](https://dash.cloudflare.com/profile/api-tokens) - For authenticating with the Cloudflare API +- [3 Cloudflare-managed domains](https://dash.cloudflare.com/) - For allowing the system to allocate a domain for the Kubernetes cluster. Used to allocate management, platform, and challenge domains. +- SMTP mail server - To allow CTFd to send emails to users (Password resets, notifications, etc.). The system is set up to allow outbound connections to [Brevo](https://brevo.com) SMTP on port 587. +- [Discord](https://discord.com) channels to receive notifications. One for monitoring alerts and one for first-blood notifications. +- GitHub repository following [CTF Pilot's Challenges template](https://github.com/ctfpilot/challenges-template) for CTF challenges and CTFd pages - A Git repository containing the CTF challenges to be deployed. This should be your own private repository using the CTF Pilot Challenges Template as a base. This may also contain the pages to be used in CTFd. +- GitHub repository containing the CTFd configuration - We recommend forking [CTF Pilot's CTFd configuration repository](https://github.com/ctfpilot/ctfd). 
+- Access tokens to access the GitHub repositories and container registry - Fine-grained personal access token and Personal Access Tokens (PAT) with read access to the repositories containing the CTF challenges and CTFd configuration and GitHub container registry. We recommend setting up a bot account for this purpose. +- [Elasticsearch endpoint](https://www.elastic.co/) - Elasticsearch instance with an endpoint and user credentials for log aggregation. Used to connect Filebeat to Elasticsearch. + +### Environments + +CTFp supports three different environments for deployment: + +- **Test**: Intended for testing and experimentation. This environment is suitable for trying out new features, configurations, and updates without affecting the production environment. It is recommended to use smaller server sizes and fewer nodes to minimize costs. +- **Dev**: Intended for development and staging purposes. This environment is suitable for testing new challenges, configurations, and updates before deploying them to production. It should closely resemble the production environment in terms of server sizes and configurations, but can still be scaled down to save costs. +- **Prod**: Intended for hosting live CTF competitions. This environment should be configured for high availability, performance, and security. It is recommended to use larger server sizes, more nodes, and robust configurations to ensure a smooth experience for participants. + +The environments are configured through separate `automated.<env>.tfvars` files, allowing for isolated configurations and deployments. + +In the CLI tool, you can specify the environment using the `--test`, `--dev`, or `--prod` flags in the commands. If no flag is provided, the default environment is `test`. 
+ +### Configuring the platform + +> [!TIP] +> To understand the full configuration options and their implications, please refer to the documentation in the `automated.<env>.tfvars` or [`template.automated.tfvars`](./template.automated.tfvars) file. + +To configure the platform, you need to configure the `automated.<env>.tfvars` file located in the root of the repository. + +It contains a number of configuration options for the platform. +Each configuration option is documented within the file, with an explanation and a list of its possible values. + +An automated check verifies that all values are filled out correctly when running the CLI tool. +Therefore, be sure to fill out all required values before attempting to deploy the platform. +Optional values are commented out by default, and can be left as is if the default value is acceptable. + +The configuration file is the single source of truth for the platform's configuration, and is used by the CLI tool to deploy and manage the platform. +If the configuration file is changed, the changes will be applied to the platform during the next deployment. +If the platform is manually changed outside of the CLI tool, the changes will be reverted during the next deployment. + +> [!IMPORTANT] +> The `template.automated.tfvars` file is git tracked, and **MUST NOT** be changed in the repository to include sensitive information. +> Instead, copy the file to `automated.<env>.tfvars` and fill out the values there. +> The `automated.<env>.tfvars` files are git ignored, and will not be tracked by git. +> +> The file can be initialized using the `./ctfp.py init` command. + +The components are not fully configurable through this file, and may in certain situations require advanced configuration. These advanced options are not included in the main configuration file. +These options are either intended to be static, or require manual configuration through the individual Terraform components. 
+Changing these options may lead to instability or data loss, and should be done with caution. + +### CLI Tool + +The CTFp CLI tool is a Python script that can be executed directly from the command line, and manages the deployment and lifecycle of the CTFp platform. + +**Prerequisites:** + +1. Install required Python dependencies: + + ```bash + pip install -r requirements.txt + ``` + + This installs `python-hcl2`, which is required for parsing Terraform configuration files. + +2. Ensure the script has executable permissions: + + ```bash + chmod +x ctfp.py + ``` + +**Running the CLI tool:** + +You can now run commands directly: + +```bash +./ctfp.py <command> [options] +``` + +Alternatively, you can always run it explicitly with Python: + +```bash +python3 ctfp.py <command> [options] +``` + +Both methods are functionally equivalent. The direct execution method (first example) is more convenient for regular use. + +#### Commands + +> [!TIP] +> You can run any command with the `--help` flag to get more information about the command and its options. 
+> For example: `./ctfp.py deploy --help` +> +> Available commands: +> +> - [`init`](#init---initialize-platform-configuration) - Initialize Platform Configuration +> - [`generate-keys`](#generate-keys---generate-ssh-keys) - Generate SSH Keys +> - [`insert-keys`](#insert-keys---insert-ssh-keys-into-configuration) - Insert SSH Keys into Configuration +> - [`generate-images`](#generate-images---generate-custom-server-images) - Generate Custom Server Images +> - [`generate-backend`](#generate-backend---generate-terraform-backend-configuration) - Generate Terraform Backend Configuration +> - [`deploy`](#deploy---deploy-platform-components) - Deploy Platform Components +> - [`destroy`](#destroy---destroy-platform-components) - Destroy Platform Components + +Below is a detailed overview of each available command: + +##### `init` - Initialize Platform Configuration + +Initializes the platform configuration for a specified environment by creating an `automated.<env>.tfvars` file based on the template. + +**Syntax:** + +```bash +./ctfp.py init [--force] [--test|--dev|--prod] +``` + +**Options:** + +- `--force`: Force overwrite the configuration file if it already exists (by default, the tool prompts before overwriting) +- `--test`: Initialize TEST environment (default) +- `--dev`: Initialize DEV environment +- `--prod`: Initialize PROD environment + +**Example:** + +```bash +./ctfp.py init --test +./ctfp.py init --prod --force +``` + +**Output:** Creates `automated.test.tfvars`, `automated.dev.tfvars`, or `automated.prod.tfvars` in the repository root. + +##### `generate-keys` - Generate SSH Keys + +Generates SSH keys (ed25519) required for accessing the cluster nodes. Optionally inserts the base64-encoded keys directly into the configuration file. 
+ +**Syntax:** + +```bash +./ctfp.py generate-keys [--insert] [--test|--dev|--prod] +``` + +**Options:** + +- `--insert`: Automatically insert the generated keys into the `automated.<env>.tfvars` file +- `--test`: Generate keys for TEST environment (default) +- `--dev`: Generate keys for DEV environment +- `--prod`: Generate keys for PROD environment + +**Example:** + +```bash +./ctfp.py generate-keys --insert --test +./ctfp.py generate-keys --dev +``` + +**Output:** Creates `keys/k8s-<env>.pub` (public key) and `keys/k8s-<env>` (private key) in the `keys/` directory. + +##### `insert-keys` - Insert SSH Keys into Configuration + +Manually inserts previously generated SSH keys into the configuration file. Useful if keys were generated separately or if you need to update existing keys. + +**Syntax:** + +```bash +./ctfp.py insert-keys [--test|--dev|--prod] +``` + +**Options:** + +- `--test`: Insert keys for TEST environment (default) +- `--dev`: Insert keys for DEV environment +- `--prod`: Insert keys for PROD environment + +**Example:** + +```bash +./ctfp.py insert-keys --test +./ctfp.py insert-keys --prod +``` + +**Prerequisite:** Keys must already exist in the `keys/` directory. + +##### `generate-images` - Generate Custom Server Images + +Generates custom Packer images for Kubernetes cluster nodes. These images are used when provisioning the cluster infrastructure on Hetzner Cloud. + +**Syntax:** + +```bash +./ctfp.py generate-images +``` + +> [!NOTE] +> The `generate-images` command does not use environment flags. It requires you to select the Hetzner Cloud project interactively during execution. + +**Output:** Packer creates and uploads custom images to your Hetzner Cloud project. + +**Time:** This is typically the longest-running operation, taking 5-15 minutes. + +##### `generate-backend` - Generate Terraform Backend Configuration + +Generates the Terraform backend configuration file (`backend.tf`) for the specified component. 
This file configures the S3 backend for storing Terraform state files. + +**Syntax:** + +```bash +./ctfp.py generate-backend <component> <bucket> <region> <endpoint> +``` + +**Arguments:** + +- `<component>`: Component for which to generate the backend configuration: `cluster`, `ops`, `platform`, or `challenges` +- `<bucket>`: Name of the S3 bucket to use for storing the Terraform state +- `<region>`: Region where the S3 bucket is located +- `<endpoint>`: Endpoint URL for the S3-compatible storage. For example `nbg1.your-objectstorage.com` for Hetzner Cloud Object Storage in the `nbg1` region. + +**Example:** + +```bash +./ctfp.py generate-backend cluster ctfp-cluster-state nbg1 nbg1.your-objectstorage.com +./ctfp.py generate-backend platform ctfp-platform-state fsn1 fsn1.your-objectstorage.com +``` + +**Output:** Creates an HCL configuration for the specified component's Terraform backend in the `backend/generated/` directory. + +See more about this command in the [backend directory](./backend). + +##### `deploy` - Deploy Platform Components + +Deploys one or more components of the platform to the specified environment. Can deploy individual components or the entire platform at once. 
+ +**Syntax:** + +```bash +./ctfp.py deploy <component> [--auto-apply] [--test|--dev|--prod] +``` + +**Arguments:** + +- `<component>`: Component to deploy: `cluster`, `ops`, `platform`, `challenges`, or `all` + - `cluster`: Provisions Kubernetes infrastructure on Hetzner Cloud + - `ops`: Deploys operational tools (ArgoCD, monitoring, logging, ingress) + - `platform`: Deploys CTFd scoreboard and associated services + - `challenges`: Deploys CTF challenges infrastructure + - `all`: Deploys all components in sequence + +**Options:** + +- `--auto-apply`: Automatically apply Terraform changes without interactive prompts (use with extreme caution) +- `--test`: Deploy to TEST environment (default) +- `--dev`: Deploy to DEV environment +- `--prod`: Deploy to PROD environment + +**Example:** + +```bash +./ctfp.py deploy all --test +./ctfp.py deploy cluster --prod +./ctfp.py deploy platform --dev --auto-apply +``` + +**Deployment Order:** When deploying `all`, components are deployed in this order: `cluster` → `ops` → `platform` → `challenges`. Each component must be successfully deployed before the next begins. + +**Output:** Creates Terraform state files in the `terraform/` directory and outputs deployment status and timing information. + +##### `destroy` - Destroy Platform Components + +> [!WARNING] +> Destroying the platform will **delete all data** associated with the environment, including databases, user data, and challenge instances. This action cannot be undone. Always ensure you have backups before destroying production environments. + +Destroys one or more components of the platform. This is the reverse of `deploy` and tears down infrastructure, databases, and services. 
+ +**Syntax:** + +```bash +./ctfp.py destroy <component> [--auto-apply] [--test|--dev|--prod] +``` + +**Arguments:** + +- `<component>`: Component to destroy: `cluster`, `ops`, `platform`, `challenges`, or `all` + +**Options:** + +- `--auto-apply`: Automatically confirm destruction without interactive prompts (use with extreme caution) +- `--test`: Destroy TEST environment (default) +- `--dev`: Destroy DEV environment +- `--prod`: Destroy PROD environment + +**Example:** + +```bash +./ctfp.py destroy all --prod +./ctfp.py destroy challenges --test --auto-apply +``` + +**Destruction Order:** When destroying `all`, components are destroyed in reverse order: `challenges` → `platform` → `ops` → `cluster`. This ensures dependencies are properly cleaned up. + +### Workflow Overview + +The workflow for deploying and managing CTFp can be summarized in the following key phases: + +1. **Setup Phase**: + - Clone the repository and generate backend configurations. + +2. **Preparation Phase**: + - Generate custom server images (one-time setup per Hetzner project). + - Generate SSH keys. + - Create needed pre-requisites. + - Configure the platform using the `automated.<env>.tfvars` file. + +3. **Deployment Phase**: + - Deploy components in sequence: `Cluster → Ops → Platform → Challenges`. + - Use `deploy all` for automated deployment or deploy components individually. + +4. **Live Operations**: + - Monitor the platform using tools like ArgoCD, Grafana, and Prometheus. + - Manage challenges, and apply updates as needed. + +5. **Teardown Phase**: + - Destroy components in reverse order: `Challenges → Platform → Ops → Cluster`. + - Use `destroy all` for automated teardown or destroy components individually. + +### Guides + +#### Updating sizes of nodes in a running platform + +> [!TIP] +> When upgrading existing clusters, it is recommended to drain node pools before changing their sizes, to avoid disruption of running workloads. +> Update one node pool at a time, to minimize the impact on the cluster. 
+ +When updating the sizes of nodes in an existing cluster, it is important to follow a specific procedure to ensure a smooth transition and avoid downtime or data loss. +Below are the steps to update the sizes of nodes in an existing cluster: + +1. **Drain the Node Pool**: Before making any changes, drain the node pool that you intend to update. This will safely evict all workloads from the nodes in the pool, allowing them to be rescheduled on other nodes in the cluster. + + ```bash + # List nodes + kubectl get nodes + + # Drain each node in the node pool + kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data + ``` + + *You will need to repeat this for each node in the node pool. You can use tools such as [`draino`](https://github.com/planetlabs/draino) to automate this process.* + +2. **Update the Configuration**: Modify the `automated.<env>.tfvars` file to reflect the new sizes for the nodes in the node pool. Ensure that you only change the sizes for the specific node pool you are updating. +3. **Deploy the Changes**: Use the CTFp CLI tool to deploy the changes to the cluster. This will apply the updated configuration and resize the nodes in the specified node pool. + + ```bash + ./ctfp.py deploy cluster --<env> + ``` + + *Replace `--<env>` with the appropriate environment flag (`--test`, `--dev`, or `--prod`).* +4. **Monitor the Deployment**: Keep an eye on the deployment process to ensure that the nodes are resized correctly and that there are no issues. You can use `kubectl get nodes` to check the status of the nodes in the cluster. +5. **Uncordon the Node Pool**: Once the nodes have been resized and are ready, uncordon the node pool to allow workloads to be scheduled on the nodes again. + + ```bash + kubectl uncordon <node-name> + ``` + + *Repeat this for each node in the node pool.* +6. **Verify the Changes**: Finally, verify that the workloads are running correctly on the resized nodes and that there are no issues in the cluster. +7. 
**Repeat for Other Node Pools**: If you have multiple node pools to update, repeat the above steps for each node pool, one at a time. + +> [!WARNING] +> Changing node sizes can lead to temporary disruption of workloads. +> Always ensure that you have backups of critical data before making changes to the cluster configuration. + +Changes to the `scale_type` will only affect new nodes being created, and will not resize existing nodes, as the deployment of these nodes is done as resources are needed. + +You may need to manually intervene to resize existing nodes if required, or delete them, forcing the system to create new nodes with the updated sizes. However, this may lead to downtime for workloads running on the nodes being deleted. + +> [!NOTE] +> Downscaling nodes may not be possible, depending on the initial size of the nodes and the new size. + +Hetzner does not support downsizing nodes if they were initially created with a larger size. +In such cases, the nodes will need to be deleted, forcing the system to create new nodes with the desired size. + +#### Deploying a new challenge + +To deploy a new challenge, you will need to add the challenge to the configuration file, and then deploy the changes to the platform. + +Challenges are split into three types: + +- `static` - Static challenge, often with a handout (files, puzzles, etc.). +- `shared` - Challenge with a single instance for all teams to connect to. +- `instanced` - Challenge with individual instances for each team. + +The challenge should be formatted using the [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template), and built using the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenge Schema](https://github.com/ctfpilot/challenge-schema). + +In the configuration file, you will need to add the challenge under the `Challenges configuration` section. 
+ +For static challenges, add the challenge under the `challenges_static` list: + +```hcl +challenges_static = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +For shared challenges, add the challenge under the `challenges_shared` list: + +```hcl +challenges_shared = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +For instanced challenges, add the challenge under the `challenges_instanced` list: + +```hcl +challenges_instanced = { + <category> = [ + "<challenge-slug>" + ] +} +``` + +An example of this, using [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example), would look like this: + +```hcl +challenges_static = { + forensics = ["oh-look-a-flag"], +} +challenges_shared = { + web = ["the-shared-site"], +} +challenges_instanced = { + web = ["where-robots-cannot-search"], + misc = ["a-true-connection"], +} +``` + +In order to deploy the new challenge, you need to deploy the `challenges` component using the CLI tool: + +```bash +./ctfp.py deploy challenges --<env> +``` + +To remove a challenge, delete it from the configuration file, and then deploy the `challenges` component again. + +Challenge changes are automatically and continuously deployed through ArgoCD, so no manual intervention is required after the initial deployment. + +#### Updating a challenge + +Challenge updates are handled through the Git repository containing the challenges. + +If a challenge's slug has been changed, you need to remove the old slug from the configuration file, and add the new slug. +For this, follow the [Deploying a new challenge](#deploying-a-new-challenge) guide. + +#### Deploying a page + +To deploy a new page to CTFd, you will need to add the page to a Git repository that should be formatted using the [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template), and built using the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Page Schema](https://github.com/ctfpilot/page-schema). 
+ +In the configuration file, you will need to add the page under the `Pages configuration` section. + +For pages, add the page under the `pages` list: + +```hcl +pages = [ + "" +] +``` + +An example of this, using the [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example), would look like this: + +```hcl +pages = ["index"] +``` + +In order to deploy the new page, you need to deploy the `platform` component using the CLI tool: + +```bash +./ctfp.py deploy platform -- +``` + +To remove a page, you need to remove it from the configuration file, and then deploy the `platform` component again. + +Page changes are automatically and continuously deployed through ArgoCD, so no manual intervention is required after the initial deployment. + +#### The CLI tool does not seem to support my setup + +The CLI tool is designed to cover a wide range of deployment scenarios, but it may be that your specific setup requires some customization in each Terraform component. + +Each component is located in its own directory, and can be deployed manually using OpenTofu/terraform commands. + +However, be aware that the CLI tool also manages the Terraform backend configuration, and you will need to set this up manually if you choose to deploy the components manually. + +Documentation is located within each component directory, explaining the configuration options and how to deploy the component manually. +A template tfvars file is also located in each component directory in `tfvars/template.tfvars`, explaining the configuration options available for that component. + +#### Restoring the database from a backup + +By default, the platform is set up to create automated backups of the database every 15 minutes, and store them in the configured S3 bucket. + +You can restore the database from any available backup by timestamp. + +To restore the database from a backup, follow these steps: + +1. 
**Identify the Backup**: Determine the timestamp of the backup you want to restore from. Backups are stored in the S3 bucket specified in the configuration file, under the `s3_bucket` setting. You can list the backups using your S3 management tool or CLI. +2. **Create a restore resource**: The MariaDB operator provides an easy-to-use restore resource that can be used to restore the database from a backup. + Create a YAML file named `mariadb-restore.yaml` with the following content, setting `targetRecoveryTime` to the timestamp of the backup you want to restore from: + + ```yaml + apiVersion: k8s.mariadb.com/v1alpha1 + kind: Restore + metadata: + name: restore + namespace: db + spec: + mariaDbRef: + name: ctfd-db + namespace: db + backupRef: + name: db-backup-ctfd-db + targetRecoveryTime: 2025-07-17T20:25:00Z + ``` + + The `targetRecoveryTime` value must be a timestamp in [RFC 3339 format](https://www.ietf.org/rfc/rfc3339.txt). The time does not need to be exact, as the restore operation will restore to the nearest available backup before the specified time. + + *This requires the platform to be running, with the database operator and platform component both deployed, as this will set up the necessary resources for the restore operation.* +3. **Apply the restore resource**: Apply the restore resource using `kubectl`: + + ```bash + kubectl apply -f mariadb-restore.yaml + ``` + +4. **Monitor the restore process**: Monitor the restore process by checking the status of the restore resource: + + ```bash + kubectl -n db get restore + ``` + +5. **Cleanup**: Once the restore is complete, you can delete the restore resource: + + ```bash + kubectl -n db delete -f mariadb-restore.yaml + ``` + +If you are restoring the full platform, you need to first deploy the `cluster`, `ops`, `platform`, and `challenges` components, before applying the restore resource. 
+After this, follow the ["Restoring the CTFd-manager"](#restoring-the-ctfd-manager) guide to restore the CTFd-manager data. + +If you want to restore the database to another MariaDB instance, you can copy the backup files from the S3 bucket, and use the MariaDB tools to restore the database manually. +The backup files are cleartext SQL dump files. + +#### Restoring the CTFd-manager + +The CTFd-manager is responsible for maintaining page and challenge states within CTFd, and has local configuration to keep track of what challenges are deployed and their IDs within CTFd. +To ensure there does not exist a disconnect, and the manager can correctly connect and manage the challenges, it is important to restore the CTFd-manager data alongside the database. + +You must manually update the challenge IDs in the challenge manager. +In order to do this, the following flow can be used: + +1. Retrieve the current challenge-id mapping from the ctfd-manager + + ```sh + kubectl -n challenge-config get configmap ctfd-challenges -o yaml > challenges.yaml + ``` + +2. Open the `challenges.yaml` file and update the challenge ids. (See CTFd dashboard for challenge names and IDs) + +3. Apply the updated challenge mapping: + + ```sh + kubectl -n challenge-config apply -f challenges.yaml + ``` + +4. Generate new access token for the CTFd manager. This is done on the admin user in CTFd. +5. Update the access token in the secrets for the CTFd manager: + + ```sh + kubectl -n challenge-config edit configmap ctfd-access-token + ``` + +6. Replace the `token` value with the new access token generated in step 4. + +7. Restart the ctfd-manager to ensure it picks up the new configs: + + ```sh + kubectl -n challenge-config rollout restart deployment ctfd-manager + ``` + + *If it does not pick up the data, you can empty out the `challenge-configmap-hashset` configmap to force a reload.* + +The CTFd manager is now updated with the new challenge IDs and access token. 
+The system should therefore self-heal with files and missing elements of the challenges. + +If you are restoring the full platform, you need to first deploy the `cluster`, `ops`, `platform`, and `challenges` components, before applying the restore resource. +*You need to restore the CTFd-manager after restoring the database. You may restore the CTFd-manager before deploying the `challenges` component, but the configmap `ctfd-challenges` will then be empty, and you will need to manually format it.* + +## Architecture + +CTFp is composed of four main components, each responsible for different aspects of the platform's functionality: + +1. **Cluster**: Responsible for provisioning and managing the underlying Kubernetes cluster infrastructure on Hetzner Cloud. + This includes setting up the necessary servers, networking, and storage resources required for the cluster to operate. + This can be found in the [`cluster`](./cluster) directory, and as the `cluster` component in the CLI tool. +2. **Ops** (Operations): Focuses on deploying and managing the operational tools and monitoring systems for the platform. + This includes setting up ArgoCD, monitoring, logging, ingress controllers, and other essential services that ensure the smooth operation of the platform. + This can be found in the [`ops`](./ops) directory, and as the `ops` component in the CLI tool. +3. **Platform**: Handles the deployment and configuration of the CTFd scoreboard and its associated services. + This includes setting up the database, caching, and storage solutions required for the scoreboard to function effectively. + This can be found in the [`platform`](./platform) directory, and as the `platform` component in the CLI tool. +4. **Challenges**: Manages the deployment and configuration of the CTF challenges. + This includes setting up the necessary resources and configurations to host and manage the challenges securely and efficiently. 
+ This can be found in the [`challenges`](./challenges) directory, and as the `challenges` component in the CLI tool. + +Each component is designed to be modular and can be deployed independently or together, allowing for flexibility in managing the platform's infrastructure and services. + +### Directory structure + +The CTFp repository is structured as follows: + +```txt +ctfp/ +├── backend/ # Terraform backend configurations +├── keys/ # Generated SSH keys +├── terraform/ # Terraform plans +├── tf-modules/ # Reusable Terraform modules +├── cluster/ # Cluster component Terraform configurations +├── ops/ # Ops component Terraform configurations +├── platform/ # Platform component Terraform configurations +├── challenges/ # Challenges component Terraform configurations +├── ctfp.py # CTFp CLI tool +├── kubectl.sh # Script for configuring kubectl context +├── README.md # This README file +├── requirements.txt # Python dependencies for the CLI tool +├── template.automated.tfvars # Template for CTFp CLI configuration +└── ... # Other files and directories, such as license, contributing guidelines, etc. +``` + +### Overview + +![CTFp Architecture](./docs/attachments/architecture/overview.svg) + +The figure above details how the different components come together to form the complete CTFp platform. +It highlights the central elements: [CTFd](https://github.com/ctfpilot/ctfd), DB Cluster, Redis, [CTFd-manager](https://github.com/ctfpilot/ctfd-manager), [KubeCTF](https://github.com/ctfpilot/kube-ctf), monitoring and deployment flow. + +*The figure serves as an overview of the platform's architecture, and does therefore not include all components and services involved in the platform.* + +#### Cluster + +The Cluster component is responsible for provisioning and managing the Kubernetes cluster infrastructure on Hetzner Cloud. 
+ +It deploys a [kube-hetzner](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner) cluster within the Hetzner Cloud environment, setting up the necessary servers, networking, and storage resources required for the cluster to operate. + +Specifically, it handles: + +- **Cluster provisioning**: Creating and configuring the Kubernetes cluster using Hetzner Cloud resources. +- **Node management**: Setting up and managing the worker nodes that will run the workloads. + This includes configuring node pools, scaling, and updating nodes as needed, along with setting up the node-autoscaler for automatic scaling based on demand. +- **Networking**: Configuring the network settings to ensure proper communication between cluster components. + This includes setting up a private network, configuring VPN connectivity between the nodes and setting up Flannel CNI for pod networking. + It opens the required firewall rules to allow communication between nodes, and outbound connections to required services. +- **Storage**: Setting up storage controller (CSI) to use Hetzner Block storage volumes. +- **Traefik proxy**: Deploying Traefik as the ingress controller for managing incoming traffic to the cluster. + +If an alternative cluster setup is desired, the Cluster component can be replaced with a different Kubernetes cluster, as long as it meets the requirements for running the platform. + +**Cluster requirements**: + +The Kubernetes cluster used for CTFp must meet the following requirements: + +- Kubernetes version 1.33 or higher +- Traefik ingress controller, with correctly configured load balancer +- Persistent storage support (CSI). You may use whatever storage solution you prefer, as long as it supports dynamic provisioning of Persistent Volumes, and is set as the default storage class. +- Provides a kubeconfig file for the cluster, to allow the CLI tool to interact with the cluster. This config should have full admin access to the cluster. 
+- Has at least a single node with the taint `cluster.ctfpilot.com/node=scaler:PreferNoSchedule` for running challenge instances. + *May be skipped, if no instanced challenges are to be deployed, or you change the taints in the challenge deployment configuration.* +- Enough resources to run the platform components. + *This depends on the CTFd setup, challenges and CTF size.* +- Has correct firewall rules to allow outbound connections to required services, such as logging aggregation, SMTP servers, Discord, Cloudflare API, GitHub, and reverse connections from challenges (if they need internet access). +- Flannel CNI installed for networking. +- Cert-manager is not installed, as it is managed by the Ops component. + +#### Ops + +The Ops component is responsible for deploying and managing the operational tools, services, and configurations required for the platform to function. + +It deploys essential infrastructure components on top of the Kubernetes cluster, providing foundational services that other platform components depend on. This component must be deployed after the Cluster and before the Platform and Challenges components. + +Specifically, it deploys the following: + +- **ArgoCD**: GitOps continuous delivery tool used to deploy and manage applications within the Kubernetes cluster. ArgoCD continuously synchronizes the cluster state with Git repositories, enabling declarative infrastructure management. +- **Cert-manager**: Certificate management system for automating TLS/SSL certificate provisioning and renewal. It integrates with Cloudflare for DNS validation challenges. +- **Traefik configuration**: Deploys additional Helm chart configuration for the Traefik ingress controller already present in the cluster, enabling advanced routing and middleware features, along with additional logging with Filebeat log aggregation. 
+- **Descheduler**: Continuously rebalances the cluster by evicting workloads from nodes, ensuring optimal resource utilization and distribution across available nodes. +- **Error Fallback**: Deploys [CTF Pilot's Error Fallback](https://github.com/ctfpilot/error-fallback) page service, providing custom error pages for HTTP error responses (e.g., 404, 502, 503). +- **Filebeat**: Log aggregation and forwarding system that sends logs to Elasticsearch or other log aggregation services, enabling centralized logging and analysis. +- **MariaDB Operator**: Kubernetes operator for managing MariaDB database instances. Allows automated provisioning, scaling, and management of MySQL-compatible databases. +- **Redis Operator**: Kubernetes operator for managing Redis cache instances. Enables automated deployment and management of Redis clusters for caching and data storage. +- **Prometheus & Grafana Stack**: Comprehensive monitoring and visualization solution. Prometheus scrapes metrics from cluster components, while Grafana provides dashboards for monitoring cluster health, resource usage, and application performance. Custom dashboards for Kubernetes, CTFd, and KubeCTF are included. +- **Alertmanager**: Alerting system integrated with Prometheus, used to send notifications based on defined alerting rules. Configured to send alerts to Discord channels for monitoring purposes. + +#### Platform + +The Platform component is responsible for deploying and managing the CTFd scoreboard and its associated services. + +It handles the complete setup of the CTF competition's scoring system, database infrastructure, and management services. The Platform component must be deployed after both the Cluster and Ops components, as it depends on services provided by the Ops component. + +Specifically, it deploys the following: + +- **CTFd**: The main CTF scoreboard application. 
This is deployed as a customizable instance that manages team registration, challenge submissions, scoring, and leaderboards. It deploys using the provided CTFd configuration from the defined GitHub repository. See [CTF Pilot's CTFd configuration](https://github.com/ctfpilot/ctfd) for more information. +- [**CTFd-manager**](https://github.com/ctfpilot/ctfd-manager): A companion service for CTFd that provides automated configuration management and administrative functions. It handles initial setup of CTFd and continuous synchronization of pages and challenges. +- **MariaDB database cluster**: A highly available database cluster for storing CTFd data, user accounts, challenge information, and competition state. Deployed using the MariaDB Operator with automated backups to S3. +- **Redis caching layer**: A Redis cluster for caching CTFd data and improving performance. +- **S3 storage configuration**: Integration with S3-compatible object storage for storing challenge files, user uploads, and other assets uploaded to CTFd. +- **Metrics and monitoring**: Deploys metrics exporters and monitoring configurations specific to the CTFd instance for tracking performance and availability. +- **Pages deployment**: Automatically deploys CTF-related pages (e.g., rules, schedule, information pages) from the defined GitHub repository using [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). +- **Traefik ingress configuration**: Sets up ingress routing rules to expose CTFd and related services through the Traefik ingress controller. +- **Initial CTFd setup**: Configures initial CTFd settings, such as competition name, start/end times, and other global settings using [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). + +The Platform automatically sets up Kubernetes secrets and configurations for the components deployed, so that this information is not required to be tracked within Git. 
+This means that critical secrets are stored within Kubernetes secrets once the Platform component is deployed. + +Backups of the database are automatically created and stored in the configured S3 storage, allowing for disaster recovery and data retention. Currently backups are configured to run every 15 minutes, and retained for 30 days. +Backups are stored as cleartext SQL dump files, so ensure that the S3 storage has proper access policies in place to prevent unauthorized access. + +#### Challenges + +The Challenges component is responsible for managing the deployment and configuration of CTF challenges within the platform. + +It handles the infrastructure setup required to host, isolate, and manage challenges across the Kubernetes cluster. Challenge instances can be deployed in different modes (static, shared or instanced), and the component manages the networking, resource allocation, and lifecycle of challenge containers. The Challenges component must be deployed after the Cluster, Ops, and Platform components. + +Specifically, it manages the following: + +- **Challenge deployment infrastructure**: Sets up the necessary Kubernetes resources for hosting challenges. This includes namespaces, network policies, and RBAC configurations for proper challenge isolation and access control. +- **KubeCTF integration**: Integrates with [KubeCTF](https://github.com/ctfpilot/kube-ctf) to enable dynamic challenge instance management. [KubeCTF](https://github.com/ctfpilot/kube-ctf) handles the creation, scaling, and destruction of challenge instances. +- **Challenge mode support**: Supports three deployment modes: + - **Static challenges**: Challenges that are deployed as static files (e.g., forensics challenges) and are only deployed to CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). + - **Shared challenges**: Challenges that have a single instance shared among all teams (e.g., web challenges). This is deployed through ArgoCD. 
+ - **Instanced challenges**: Challenges that have individual instances for each team (e.g., dynamic web challenges). This is managed through [KubeCTF](https://github.com/ctfpilot/kube-ctf). +- **IP whitelisting**: Implements IP-based access control to challenges, allowing restrictions on which IPs or networks can access specific challenges. For public access, the `0.0.0.0/0` CIDR can be used. +- **Custom fallback pages**: Deploys custom error pages for various challenge states (e.g., instancing fallback page for when a challenge is being provisioned). +- **Challenge deployment and configuration management**: Deploys challenge deployment configurations through ArgoCD, allowing for GitOps-style management of challenge definitions and updates, controlled through the configured GitHub repository and the challenge slugs selected for deployment. + +Challenges are deployed and managed through Git repositories, with configurations defined in challenge definition files. Use the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template) for challenge development. + +By default, the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) deployment templates use taints to control which nodes challenge instances are scheduled on. Therefore, the cluster must have at least one node with the taint `cluster.ctfpilot.com/node=scaler:PreferNoSchedule` if using Instanced challenges, to ensure challenge instances are properly scheduled and deployed. + +### Challenge deployment + +![CTFp Challenge Deployment](./docs/attachments/architecture/challenge-deployment.svg) + +The challenge deployment system utilizes a combination of GitOps principles and dynamic instance management to efficiently deploy and manage CTF challenges. 
+ +It is built to use [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template) for preparing the challenge definitions, and ArgoCD for deploying the challenge configurations to the Kubernetes cluster. +Here, ArgoCD continuously monitors the defined GitHub repository for changes, and automatically applies updates to the cluster. + +Static challenges are deployed as configurations for CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager), while Shared challenges are deployed as single instances through ArgoCD. +Instanced challenges are managed through [KubeCTF](https://github.com/ctfpilot/kube-ctf), where ArgoCD deploys deployment templates to [KubeCTF](https://github.com/ctfpilot/kube-ctf). + +Container images can be stored in any container registry, as long as the Kubernetes cluster has access to pull the images. +By default, pull secrets are configured for GitHub Container Registry, and are currently **not** configurable through the platform configuration. +Any additional pull secrets must be created manually in the cluster, and referenced in the challenge deployment configuration. + +For more information on how to develop challenges, see the [CTF Pilot's Challenge Toolkit](https://github.com/ctfpilot/challenge-toolkit) and [CTF Pilot's Challenges Template](https://github.com/ctfpilot/challenges-template). An example challenges repository can be found at [CTF Pilot's Challenges example repository](https://github.com/ctfpilot/challenges-example). + +### Network + +The following diagrams provide an overview of CTFp's cluster and challenge networking setups. 
+ +#### Cluster networking + +![CTFp Cluster Networking Overview](./docs/attachments/architecture/cluster-network-architecture.svg) + +CTFp requires three domains, as it configures different services under different domains: + +- **Management domain**: Used for accessing the management services, such as ArgoCD, Grafana, and Prometheus. + This domain should only be distributed to administrators. +- **Platform domain**: Used for accessing the CTFd scoreboard and related services. + This domain is distributed to participants for accessing the CTF platform. +- **CTF domain**: Used for accessing the challenges. + This domain is also distributed to participants for accessing the challenges. + +The platform does not require you to allocate the full top-level domain (TLD) for CTFp, as subdomains for each of the three domains can be configured. + +Management and Platform domains are configured to be proxied through Cloudflare, to take advantage of their CDN and DDoS protection services. +CTF domain is not proxied, as challenges often require direct access to the challenge instances. + +Domain management is built into the system, and DNS entries are therefore automatically created and managed through Cloudflare's API. + +Hetzner Cloud's Load Balancers are used to distribute incoming traffic to the Traefik ingress controllers deployed on each node in the cluster. +Within the cluster, Traefik handles routing of incoming requests to the appropriate services based on the configured ingress rules. +Network is shared between nodes using Hetzner Cloud's private networking, ensuring efficient and secure communication between cluster components. + +#### Challenge networking + +![CTFp Challenge Networking Overview](./docs/attachments/architecture/challenge-network-architecture.svg) + +As described in the [Cluster networking](#cluster-networking) section, CTFp utilizes three main domains for different purposes. 
+Challenges are accessed through the CTF domain, which is specifically designated for hosting and serving challenge instances, and are therefore not proxied through Cloudflare; they point directly to the Hetzner Cloud Load Balancers. + +This load balancer is set up to forward all incoming traffic to the Traefik ingress controllers deployed within the Kubernetes cluster. + +Traefik supports TCP and HTTP(S) routing, allowing it to handle a wide range of challenge types and protocols. +However, a limited number of middleware options are available for TCP routing, so ensure that your challenges are compatible with the available features. + +IP whitelisting is implemented at the ingress level, allowing challenges to restrict access based on IP addresses or CIDR ranges. + +By default, HTTP(S) traffic is configured with fallback middleware, providing custom error pages for various HTTP error responses (e.g., 404, 502, 503). +When an instanced challenge is being provisioned, the custom error page will inform the user that the challenge is being started and automatically refresh the page until the challenge is ready. + +Shared and Instanced challenges are deployed within either `ctfpilot-challenges` or `ctfpilot-challenges-instanced` namespaces, while static challenges are only deployed to CTFd through [CTFd-manager](https://github.com/ctfpilot/ctfd-manager). +The two namespaces are configured with network policies to restrict any outgoing local traffic, allowing only outbound internet access. + +Challenges can therefore not talk to each other, nor communicate across multiple deployments. +If your challenge requires multiple containers, they need to be deployed within the same challenge deployment, and set up in a sidecar pattern. + +Cluster DNS is not available for challenges, so any service discovery must be handled through external DNS services. +Challenges allow for multiple endpoints to be defined, across both HTTP(S) and TCP protocols. 
+ +TCP endpoints are handled either through a custom Traefik port (only available for shared TCP challenges), or as an SSL TCP endpoint using SNI routing (recommended). +Hetzner limits the number of ports available for Load Balancers, so ensure that you plan accordingly when deploying challenges requiring TCP endpoints using custom ports. +*Currently, configuring custom ports for TCP endpoints is not supported through the platform configuration, and must be set up manually after deployment, or manually in the cluster Terraform module.* + +SSL TCP connections can be made using one of the following command examples: + +```bash +# Using openssl +openssl s_client -connect <challenge-hostname>:443 -servername <challenge-hostname> + +# Netcat +ncat --ssl <challenge-hostname> 443 +``` + +*The netcat command is the one displayed in the [CTFd plugin for Kube-CTF](https://github.com/ctfpilot/ctfd-kubectf-plugin).* + +We understand that this increases the complexity of challenge connection, but it provides a way to easily and dynamically allocate TCP endpoints without the need for managing multiple ports on the Load Balancer. + +## Getting help + +If you need help or have questions regarding CTFp, you can reach out through the following channels: + +- **GitHub Issues**: You can open an issue in the [CTFp GitHub repository](https://github.com/ctfpilot/ctfp/issues) for bug reports, feature requests, or general questions. +- **Discord**: Join the [CTF Pilot Discord server](https://discord.ctfpilot.com) to engage with the community, ask questions, and get support from other users and contributors. + +*The project is delivered as-is, and we do not provide official support services. However, we encourage community engagement and collaboration to help each other out.* +*Contributors and maintainers may assist with questions and issues as time permits.* + ## Contributing We welcome contributions of all kinds, from **code** and **documentation** to **bug reports** and **feedback**! 
@@ -16,14 +1060,25 @@ To administrate the CLA signing process, we are using **[CLA assistant lite](htt ## Background -CTF Pilot started as a CTF Platform project, originating in **[Brunnerne](https://github.com/brunnerne)**. +CTF Pilot started as a CTF platform project, originating in **[Brunnerne](https://github.com/brunnerne)**. + +The goal of the project is to provide a scalable, resilient, and easy-to-use CTF platform for hosting large-scale Capture The Flag competitions, starting with BrunnerCTF 2025. + +The project is still in active development, and we welcome contributions from the community to help improve and expand the platform's capabilities. ## License -CTFp is licensed under a dual license, the **PolyForm Noncommercial License 1.0.0** for non-commercial use, and a **Commercial License** for commercial use. +CTFp is licensed under a dual license, the **PolyForm Noncommercial License 1.0.0** for non-commercial use, and a **Commercial License** for commercial use. You can find the full license for non-commercial use in the **[LICENSE.md](LICENSE.md)** file. For commercial licensing, please contact **[The0Mikkel](https://github.com/The0Mikkel)**. +Without commercial licensing, the platform **MUST NOT** be used for commercial purposes, including but not limited to: + +- Hosting CTF competitions for profit +- Hosting a CTF as a commercial organization, even if the CTF itself is free or only provided to internal users +- Offering CTF hosting as a paid service +- Using the platform in any commercial product or service + We encourage all modifications and contributions to be shared back with the community, for example through pull requests to this repository. We also encourage all derivative works to be publicly available under **PolyForm Noncommercial License 1.0.0**. At all times must the license terms be followed. 
diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..2030ce3 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,26 @@ +# Backend Terraform Configuration Generator + +This script generates Terraform backend configuration files for different components of CTFp. + +## Usage + +To generate a backend configuration file, run the script with the required arguments: + +```bash +python generate.py <component> <bucket> <region> <endpoint> +``` + +It will create a backend configuration file in the `generated` directory. + +This can be used when initializing Terraform for the respective component: + +```bash +tofu init -backend-config=../backend/generated/<component>.hcl +``` + +### Arguments + +- `<component>`: The component for which to generate the backend configuration. Valid options are `cluster`, `ops`, `platform`, and `challenges`. +- `<bucket>`: The S3 bucket name where the Terraform state will be stored. +- `<region>`: The region of the S3 bucket. +- `<endpoint>`: The endpoint URL for the S3-compatible storage. diff --git a/backend/backend.hcl b/backend/backend.hcl new file mode 100644 index 0000000..18899b6 --- /dev/null +++ b/backend/backend.hcl @@ -0,0 +1,23 @@ +# BACKEND CONFIGURATION TEMPLATE FOR TERRAFORM +# This file is a template for the backend configurations located in the `generated` directory. 
+ +key = "%%KEY%%" + +bucket = "%%S3_BUCKET%%" +region = "%%S3_REGION%%" +endpoints = { + s3 = "%%S3_ENDPOINT%%" +} + +workspace_key_prefix = "state/%%COMPONENT%%" + +# The following settings are to skip various +# aws related checks and validation +# which is not possible when using third party s3 compatible storage +skip_region_validation = true +skip_credentials_validation = true +skip_requesting_account_id = true +skip_metadata_api_check = true + +skip_s3_checksum = false +use_path_style = false diff --git a/backend/generate.py b/backend/generate.py new file mode 100644 index 0000000..ef408d2 --- /dev/null +++ b/backend/generate.py @@ -0,0 +1,125 @@ +import os +import sys +import argparse + +class Args: + args = None + subcommand = False + + def __init__(self, parent_parser = None): + if parent_parser: + self.subcommand = True + self.parser = parent_parser.add_parser("generate-backend", help="Generate Terraform backend configuration") + else: + self.parser = argparse.ArgumentParser(description="Backend generator for Terraform") + + self.parser.add_argument("component", help="Component to generate backend for", choices=["cluster", "ops", "platform", "challenges"]) + self.parser.add_argument("bucket", help="S3 bucket name for Terraform state storage") + self.parser.add_argument("region", help="Region for S3 bucket") + self.parser.add_argument("endpoint", help="Endpoint URL for S3-compatible storage") + + def parse(self): + if self.subcommand: + self.args = self.parser.parse_args(sys.argv[2:]) + else: + self.args = self.parser.parse_args() + + def __getattr__(self, name): + return getattr(self.args, name) + +class Template: + component = None + bucket = None + region = None + endpoint = None + + def __init__(self, component, bucket, region, endpoint): + self.component = component + self.bucket = bucket + self.region = region + self.endpoint = endpoint + pass + + def replace(self, template_str, replacements): + for key, value in replacements.items(): + template_str 
= template_str.replace(f"%%{key}%%", value) + return template_str + + def get_template_path(self): + base_dir = os.path.dirname(os.path.abspath(__file__)) + template_path = os.path.join(base_dir, "backend.hcl") + return template_path + + def get_target_path(self): + base_dir = os.path.dirname(os.path.abspath(__file__)) + target_dir = os.path.join(base_dir, "generated") + if not os.path.exists(target_dir): + os.makedirs(target_dir) + target_path = os.path.join(target_dir, f"{self.component}.hcl") + return target_path + + def get_template(self): + template_path = self.get_template_path() + with open(template_path, "r") as f: + template_str = f.read() + return template_str + + def template(self) -> str: + template = self.get_template() + replacements = { + "COMPONENT": self.component, + "KEY": f"{self.component}.tfstate", + "S3_BUCKET": self.bucket, + "S3_REGION": self.region, + "S3_ENDPOINT": self.endpoint + } + output = self.replace(template, replacements) + return output + + def run(self): + backend = self.template() + target_path = self.get_target_path() + with open(target_path, "w") as f: + f.write(backend) + print(f"Generated backend file at: {target_path}") + + +class Generator: + args = None + + def __init__(self, subparser = None): + if not subparser: + self.subparser = argparse.ArgumentParser(description="Backend generator for Terraform") + self.subparser.set_defaults(func=self.run) + return + + self.subparser = subparser.add_parser("generate-backend", help="Generate Terraform backend configuration", description="Generate Terraform backend configuration for specified component") + self.subparser.set_defaults(func=self.run) + + def register_subcommand(self): + self.subparser.add_argument("component", help="Component to generate backend for", choices=["cluster", "ops", "platform", "challenges"]) + self.subparser.add_argument("bucket", help="S3 bucket name for Terraform state storage") + self.subparser.add_argument("region", help="Region for S3 bucket") + 
self.subparser.add_argument("endpoint", help="Endpoint URL for S3-compatible storage") + + def run(self, args): + template = Template( + component=args.component, + bucket=args.bucket, + region=args.region, + endpoint=args.endpoint + ) + template.run() + +if __name__ == "__main__": + args = Args() + if args.parser is None: + print("Failed to initialize argument parser") + exit(1) + + generator = Generator() + generator.register_subcommand() + + namespace = args.parser.parse_args() + + generator.run(namespace) diff --git a/backend/generated/.gitignore b/backend/generated/.gitignore new file mode 100644 index 0000000..c96a04f --- /dev/null +++ b/backend/generated/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/challenges/.env.example b/challenges/.env.example new file mode 100644 index 0000000..5fe1f9d --- /dev/null +++ b/challenges/.env.example @@ -0,0 +1,2 @@ +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/challenges/.gitignore b/challenges/.gitignore new file mode 100644 index 0000000..2faf43d --- /dev/null +++ b/challenges/.gitignore @@ -0,0 +1,37 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. 
+*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc diff --git a/challenges/.terraform.lock.hcl b/challenges/.terraform.lock.hcl new file mode 100644 index 0000000..df63395 --- /dev/null +++ b/challenges/.terraform.lock.hcl @@ -0,0 +1,79 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/alekc/kubectl" { + version = "2.1.3" + constraints = ">= 2.0.2" + hashes = [ + "h1:AymCb0DCWzmyLqn1qEhVs2pcFUZGT/kxPK+I/BObFH8=", + "zh:0e601ae36ebc32eb8c10aff4c48c1125e471fa09f5668465af7581c9057fa22c", + "zh:1773f08a412d1a5f89bac174fe1efdfd255ecdda92d31a2e31937e4abf843a2f", + "zh:1da2db1f940c5d34e31c2384c7bd7acba68725cc1d3ba6db0fec42efe80dbfb7", + "zh:20dc810fb09031bcfea4f276e1311e8286d8d55705f55433598418b7bcc76357", + "zh:326a01c86ba90f6c6eb121bacaabb85cfa9059d6587aea935a9bbb6d3d8e3f3f", + "zh:5a3737ea1e08421fe3e700dc833c6fd2c7b8c3f32f5444e844b3fe0c2352757b", + "zh:5f490acbd0348faefea273cb358db24e684cbdcac07c71002ee26b6cfd2c54a0", + "zh:777688cda955213ba637e2ac6b1994e438a5af4d127a34ecb9bb010a8254f8a8", + "zh:7acc32371053592f55ee0bcbbc2f696a8466415dea7f4bc5a6573f03953fc926", + "zh:81f0108e2efe5ae71e651a8826b61d0ce6918811ccfdc0e5b81b2cfb0f7f57fe", + "zh:88b785ea7185720cf40679cb8fa17e57b8b07fd6322cf2d4000b835282033d81", + "zh:89d833336b5cd027e671b46f9c5bc7d10c5109e95297639bbec8001da89aa2f7", + "zh:df108339a89d4372e5b13f77bd9d53c02a04362fb5d85e1d9b6b47292e30821c", + 
"zh:e8a2e3a5c50ca124e6014c361d72a9940d8e815f37ae2d1e9487ac77c3043013", + ] +} + +provider "registry.opentofu.org/hashicorp/http" { + version = "3.5.0" + hashes = [ + "h1:yvwvVZ0vdbsTUMru+7Cr0On1FVgDJHAaC6TNvy/OWzM=", + "zh:0a2b33494eec6a91a183629cf217e073be063624c5d3f70870456ddb478308e9", + "zh:180f40124fa01b98b3d2f79128646b151818e09d6a1a9ca08e0b032a0b1e9cb1", + "zh:3e29e1de149dc10bf78620526c7cb8c62cd76087f5630dfaba0e93cda1f3aa7b", + "zh:4420950200cf86042ec940d0e2c9b7c89966bf556bf8038ba36217eae663bca5", + "zh:5d1f7d02109b2e2dca7ec626e5563ee765583792d0fd64081286f16f9433bd0d", + "zh:8500b138d338b1994c4206aa577b5c44e1d7260825babcf43245a7075bfa52a5", + "zh:b42165a6c4cfb22825938272d12b676e4a6946ac4e750f85df870c947685df2d", + "zh:b919bf3ee8e3b01051a0da3433b443a925e272893d3724ee8fc0f666ec7012c9", + "zh:d13b81ea6755cae785b3e11634936cdff2dc1ec009dc9610d8e3c7eb32f42e69", + "zh:f1c9d2eb1a6b618ae77ad86649679241bd8d6aacec06d0a68d86f748687f4eb3", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = ">= 2.32.0" + hashes = [ + "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} + +provider "registry.opentofu.org/loafoe/htpasswd" { + version = "1.2.1" + hashes = [ + "h1:W1euQGM6t+QlB6Rq4fDbRKRHmeCIyYdIYdHrxL97BeE=", + 
"zh:14460c85ddc40a9ecadf583c22a7de91b83798a8ca4843949d50c3288c6f5bdd", + "zh:1af9416e28dd0a77c5d2c685561c4f60e19e2d606df0477ebc18eaa110c77807", + "zh:2245325864faaf027701ab12a04d641359a0dc439dd23c6e8f768407b78a5c18", + "zh:3813ff98198405d7c467565b52c7f0ad4533f43957da6390477dc898f8ed02c2", + "zh:3c0658e132232a181223f7ff65678d99cd2e8431c317f72281b67464e5e16892", + "zh:43505c0f42bc7635ec7c1fe5043c502f9b00ae4b5e74b81464bc494936643fc1", + "zh:52efdabb0abba99a33fd3ed981610f13c99bb383f94e997f90d95441d8558177", + "zh:75b5d9b4a610dfd0ff4dfb4039f61e79a0e56338e0a4cd45e0bc0edec34dfa62", + "zh:7aee5df091672d29f29dda57382a41d771fa21740cef6bb9a1b15afc6d84ffa4", + "zh:7ff618706e2953a21a22c7555e11f5cbe8e95c171704fcfdc6beedb0c25e49c0", + "zh:94e8a15c83a1a5a60ff1b58938dd9692d800fe05c5d8269e0916b5de03d89d3a", + "zh:c1ace4f322f9ec4956e4f30086da5b6a73f4d05e1266047d629b14a485c5a76d", + "zh:d4570075de49e3ee98494f7c44eab12e964c9776029ed536fd9352c3203cc635", + "zh:d99403b843de5939ea2e54b3ca46fd901d5c5b7fe34f44b8aeb8b38f4f792df6", + ] +} diff --git a/challenges/README.md b/challenges/README.md new file mode 100644 index 0000000..0adcc3b --- /dev/null +++ b/challenges/README.md @@ -0,0 +1,55 @@ +# CTF Pilot's Kubernetes Challenges + +> [!IMPORTANT] +> You are leaving the automated CTF Pilot setup and entering a more advanced manual setup. +> This requires knowledge of Kubernetes, Terraform/OpenTofu, and cloud infrastructure management. +> If you are not comfortable with these technologies, it is recommended to use the automated setup provided by CTF Pilot. +> Learn more about the automated setup in the [CTFp main README](../README.md). + +This directory contains deployment configuration for the challenges within the CTFp system. 
+ +## Pre-requisites + +The following software needs to be installed on your local machine: + +- [Terraform](https://www.terraform.io/downloads.html) / [OpenTofu](https://opentofu.org) +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For interacting with the Kubernetes cluster) + +The following services are required, in order to deploy the services to the cluster: + +- A Kubernetes cluster (Deployed using the [CTF Pilot's Kubernetes Cluster on Hetzner Cloud](../cluster/README.md) guide or other means) +- Correctly deployed [ArgoCD](https://argo-cd.readthedocs.io/) within the Kubernetes cluster. +- Correctly deployed [CTF Pilot's Kubernetes Platform](../platform/README.md) within the Kubernetes cluster. + +> [!NOTE] +> The challenges have only been tested within the CTFp system. + +## Setup + +Copy the `tfvars/template.tfvars` file to `tfvars/data.tfvars` and edit the file with your own values. +The [`tfvars/template.tfvars`](tfvars/template.tfvars) file contains further information on each variable. + +> [!IMPORTANT] +> Make sure you generate the backend configuration file before deploying the challenges. +> See the [backend generation instructions](../backend/README.md) for more information. +> +> You will also need to set the following environment variables for authentication to the S3 backend: +> - `AWS_ACCESS_KEY_ID` +> - `AWS_SECRET_ACCESS_KEY` +> +> See [OpenTofu backend S3 configuration](https://opentofu.org/docs/language/settings/backends/s3/) for more information. 
+ +Run the following commands to apply the resources to the Kubernetes cluster: + +```bash +tofu init -backend-config=../backend/generated/challenges.hcl +tofu apply --var-file tfvars/data.tfvars +``` + +### Destroying the challenges + +To destroy the deployed challenges, run the following command: + +```bash +tofu destroy --var-file tfvars/data.tfvars +``` diff --git a/challenges/challenge/challenge.tf b/challenges/challenge/challenge.tf new file mode 100644 index 0000000..22c3d61 --- /dev/null +++ b/challenges/challenge/challenge.tf @@ -0,0 +1,79 @@ +variable "enabled" { + description = "Enable or disable the challenge deployment" + default = true + nullable = false +} + +variable "revision" { + description = "The revision of the repository to use" + default = "main" + nullable = false +} + +variable "category" { + description = "The category of the challenge" +} + +variable "identifier" { + description = "The identifier of the challenge" +} + +variable "path" { + description = "The path to the challenge" + default = null +} + +variable "argocd_project" { + description = "The ArgoCD project to use" + nullable = false +} + +variable "argocd_namespace" { + description = "The namespace where ArgoCD is installed" + default = "argocd" +} + +variable "challenge_namespace" { + description = "The namespace where the challenge will be deployed" + nullable = false +} + +variable "application_name" { + description = "The name of the application" + default = null +} + +variable "application_repo_url" { + description = "The URL of the repository where the application manifests are stored" + nullable = false +} + +variable "helm" { + description = "Helm chart configuration" + type = any + default = null +} + +module "argocd-challenge" { + source = "../../tf-modules/argocd/application" + + count = var.enabled ? 1 : 0 + + argocd_namespace = var.argocd_namespace + application_namespace = var.challenge_namespace + application_name = var.application_name != null ? 
var.application_name : "${var.category}-${var.identifier}" + application_repo_url = var.application_repo_url + application_repo_path = var.path != null ? var.path : "challenges/${var.category}/${var.identifier}/k8s/challenge" + application_repo_revision = var.revision + application_project = var.argocd_project + helm = var.helm + + argocd_labels = { + "part-of" = "ctfpilot" + "component" = "challenge" + "version" = var.revision + "category" = var.category + "instance" = var.identifier + } +} + diff --git a/challenges/challenges-config.tf b/challenges/challenges-config.tf new file mode 100644 index 0000000..0baf95d --- /dev/null +++ b/challenges/challenges-config.tf @@ -0,0 +1,16 @@ +locals { + instanced_challenges = var.challenges_instanced + shared_challenges = var.challenges_shared + static_challenges = var.challenges_static + + challenges_branch = var.challenges_branch == "" ? local.env_branch : var.challenges_branch + + challenge_repo_url = var.challenges_repository + branch = local.challenges_branch + + argocd_project_instanced = "instanced-challenges" + argocd_project_shared = "shared-challenges" + argocd_project_static = "static-challenges" + + config_namespace = "challenge-config" +} diff --git a/challenges/challenges-deployment.tf b/challenges/challenges-deployment.tf new file mode 100644 index 0000000..f97cfb8 --- /dev/null +++ b/challenges/challenges-deployment.tf @@ -0,0 +1,200 @@ +locals { + categories_config = keys(local.static_challenges) + categories_shared = keys(local.shared_challenges) + categories_instanced = keys(local.instanced_challenges) +} + +module "argocd_project_shared" { + source = "../tf-modules/argocd/project" + + argocd_namespace = var.argocd_namespace + project_name = local.argocd_project_shared + project_destinations = [ + { + namespace = module.kube_ctf.namespace_standard_challenges + server = "*" + } + ] + + depends_on = [ + module.kube_ctf + ] +} + +module "argocd_project_instanced" { + source = "../tf-modules/argocd/project" + 
+ argocd_namespace = var.argocd_namespace + project_name = local.argocd_project_instanced + project_destinations = [ + { + namespace = module.kube_ctf.namespace_instanced_challenges + server = "*" + } + ] + + depends_on = [ + module.kube_ctf + ] +} + +module "argocd_project_static" { + source = "../tf-modules/argocd/project" + + argocd_namespace = var.argocd_namespace + project_name = local.argocd_project_static + project_destinations = [ + { + namespace = local.config_namespace + server = "*" + } + ] + + depends_on = [ + module.kube_ctf + ] +} + +module "repo_access_standard" { + source = "../tf-modules/private-repo" + + name = local.argocd_project_shared + argocd_namespace = var.argocd_namespace + ghcr_username = var.ghcr_username + git_token = var.git_token + git_repo = local.challenge_repo_url + argocd_project = local.argocd_project_shared + + depends_on = [ + module.kube_ctf, + module.argocd_project_shared + ] +} + +module "repo_access_isolated" { + source = "../tf-modules/private-repo" + + name = local.argocd_project_instanced + argocd_namespace = var.argocd_namespace + ghcr_username = var.ghcr_username + git_token = var.git_token + git_repo = local.challenge_repo_url + argocd_project = local.argocd_project_instanced + + depends_on = [ + module.kube_ctf, + module.argocd_project_instanced + ] +} + +module "repo_access_config" { + source = "../tf-modules/private-repo" + + name = local.argocd_project_static + argocd_namespace = var.argocd_namespace + ghcr_username = var.ghcr_username + git_token = var.git_token + git_repo = local.challenge_repo_url + argocd_project = local.argocd_project_static + + depends_on = [ + module.kube_ctf, + module.argocd_project_static + ] +} + +module "shared_challenges" { + source = "./challenges" + + for_each = toset(local.categories_shared) + + revision = local.branch + category = each.key + challenges = local.shared_challenges[each.key] + + config_only = false + argocd_project = local.argocd_project_shared + argocd_config_project 
= local.argocd_project_static + argocd_namespace = var.argocd_namespace + application_repo_url = local.challenge_repo_url + challenge_namespace = module.kube_ctf.namespace_standard_challenges + config_namespace = local.config_namespace + helm = { + valuesObject = { + kubectf = { + host = "challs.${var.cluster_dns_ctf}" + } + } + } + + depends_on = [ + module.kube_ctf, + module.argocd_project_static, + module.argocd_project_shared, + module.repo_access_standard + ] +} + +module "instanced_challenges" { + source = "./challenges" + + for_each = toset(local.categories_instanced) + + revision = local.branch + category = each.key + challenges = local.instanced_challenges[each.key] + + config_only = false + argocd_project = local.argocd_project_instanced + argocd_config_project = local.argocd_project_static + argocd_namespace = var.argocd_namespace + application_repo_url = local.challenge_repo_url + challenge_namespace = module.kube_ctf.namespace_instanced_challenges + config_namespace = local.config_namespace + helm = { + valuesObject = { + kubectf = { + host = "challs.${var.cluster_dns_ctf}" + } + } + } + config_helm_only = true + + depends_on = [ + module.kube_ctf, + module.argocd_project_static, + module.argocd_project_instanced, + module.repo_access_isolated + ] +} + +module "static_challenges" { + source = "./challenges" + + for_each = toset(local.categories_config) + + revision = local.branch + category = each.key + challenges = local.static_challenges[each.key] + + config_only = true + argocd_project = local.argocd_project_static + argocd_config_project = local.argocd_project_static + argocd_namespace = var.argocd_namespace + application_repo_url = local.challenge_repo_url + challenge_namespace = local.config_namespace + config_namespace = local.config_namespace + helm = { + valuesObject = { + kubectf = { + host = "challs.${var.cluster_dns_ctf}" + } + } + } + + depends_on = [ + module.kube_ctf, + module.argocd_project_static, + module.repo_access_config + ] +} 
diff --git a/challenges/challenges/challenges.tf b/challenges/challenges/challenges.tf new file mode 100644 index 0000000..34853d9 --- /dev/null +++ b/challenges/challenges/challenges.tf @@ -0,0 +1,115 @@ +variable "config_only" { + description = "Should challenges only be deployed with config" + type = bool + default = false +} + +variable "revision" { + description = "The revision of the repository to use" + default = "main" + nullable = false +} + +variable "category" { + description = "The category of the challenge" +} + +variable "challenges" { + description = "The challenges to deploy in a given category" + type = list(string) + default = [] +} + +variable "path" { + description = "The path to the challenge" + default = null +} + +variable "path_config" { + description = "The path to the challenge config" + default = null +} + +variable "argocd_project" { + description = "The ArgoCD project to use" + nullable = false +} + +variable "argocd_config_project" { + description = "The ArgoCD project to use for config only challenges" + nullable = false +} + +variable "argocd_namespace" { + description = "The namespace where ArgoCD is installed" + default = "argocd" +} + +variable "challenge_namespace" { + description = "The namespace where the challenge will be deployed" + nullable = false +} + +variable "config_namespace" { + description = "The namespace where the challenge config will be deployed" + nullable = false +} + +variable "application_name" { + description = "The name of the application" + default = null +} + +variable "application_repo_url" { + description = "The URL of the repository where the application manifests are stored" + nullable = false +} + +variable "helm" { + description = "Helm chart configuration" + type = any + default = null +} + +variable "config_helm_only" { + description = "Helm chart configuration for config only challenges" + type = bool + default = false +} + +module "argocd-challenge" { + source = "../challenge" + + for_each = 
toset(var.challenges) + enabled = !var.config_only + + identifier = each.value + + revision = var.revision + category = var.category + argocd_project = var.argocd_project + argocd_namespace = var.argocd_namespace + application_repo_url = var.application_repo_url + challenge_namespace = var.challenge_namespace + application_name = var.application_name + path = var.path + helm = var.config_helm_only ? null : var.helm +} + +module "argocd-challenge-config" { + source = "../config" + + for_each = toset(var.challenges) + + identifier = each.value + + revision = var.revision + category = var.category + argocd_project = var.argocd_config_project + argocd_namespace = var.argocd_namespace + application_repo_url = var.application_repo_url + config_namespace = var.config_namespace + application_name = var.application_name + path = var.path_config + helm = var.helm +} diff --git a/challenges/config/config.tf b/challenges/config/config.tf new file mode 100644 index 0000000..172524e --- /dev/null +++ b/challenges/config/config.tf @@ -0,0 +1,70 @@ +variable "revision" { + description = "The revision of the repository to use" + default = "main" + nullable = false +} + +variable "category" { + description = "The category of the challenge" +} + +variable "identifier" { + description = "The identifier of the challenge" +} + +variable "path" { + description = "The path to the challenge config" + default = null +} + +variable "argocd_project" { + description = "The ArgoCD project to use" + nullable = false +} + +variable "argocd_namespace" { + description = "The namespace where ArgoCD is installed" + default = "argocd" +} + +variable "config_namespace" { + description = "The namespace where the challenge config will be deployed" + nullable = false +} + +variable "application_name" { + description = "The name of the application" + default = null +} + +variable "application_repo_url" { + description = "The URL of the repository where the application manifests are stored" + nullable = 
false +} + +variable "helm" { + description = "Helm chart configuration" + type = any + default = null +} + +module "argocd-challenge-config" { + source = "../../tf-modules/argocd/application" + + argocd_namespace = var.argocd_namespace + application_namespace = var.config_namespace + application_name = var.application_name != null ? "${var.application_name}-config" : "${var.category}-${var.identifier}-config" + application_repo_url = var.application_repo_url + application_repo_path = var.path != null ? var.path : "challenges/${var.category}/${var.identifier}/k8s/config" + application_repo_revision = var.revision + application_project = var.argocd_project + helm = var.helm + + argocd_labels = { + "part-of" = "ctfpilot" + "component" = "challenge-config" + "version" = var.revision + "category" = var.category + "instance" = var.identifier + } +} diff --git a/challenges/kube-ctf.tf b/challenges/kube-ctf.tf new file mode 100644 index 0000000..837c0e9 --- /dev/null +++ b/challenges/kube-ctf.tf @@ -0,0 +1,28 @@ +module "kube_ctf" { + source = "../tf-modules/kubectf" + + challenge_dns = var.cluster_dns_ctf + management_dns = var.cluster_dns_management + + org_name = "ctfpilot.com" + cert_manager = "cert-manager-global" + + management_auth_secret = var.kubectf_auth_secret + container_secret = var.kubectf_container_secret + + image_landing = var.image_instancing_fallback + image_challenge_manager = var.image_kubectf + registry_prefix = "docker.io" # Optional, used in rendering templates + + ghcr_username = var.ghcr_username + ghcr_token = var.ghcr_token + + max_instances = 6 +} + +output "Hosts" { + value = { + "Challenges" = module.kube_ctf.challenge_host + "Management" = module.kube_ctf.challenge_manager_host + } +} diff --git a/challenges/providers.tf b/challenges/providers.tf new file mode 100644 index 0000000..75c454e --- /dev/null +++ b/challenges/providers.tf @@ -0,0 +1,59 @@ +# ---------------------- +# Terraform Configuration +# ---------------------- + +terraform 
{ + required_version = ">= 1.9.5" + + backend "s3" {} + + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.32.0" + } + + kubectl = { + source = "alekc/kubectl" + version = ">= 2.0.2" + } + + htpasswd = { + source = "loafoe/htpasswd" + } + + http = { + source = "hashicorp/http" + } + } +} + +# ---------------------- +# Providers +# ---------------------- + +locals { + kube_config = yamldecode(base64decode(var.kubeconfig)) +} + +provider "kubernetes" { + host = local.kube_config.clusters[0].cluster.server + cluster_ca_certificate = base64decode(local.kube_config.clusters[0].cluster.certificate-authority-data) + + client_certificate = base64decode(local.kube_config.users[0].user.client-certificate-data) + client_key = base64decode(local.kube_config.users[0].user.client-key-data) +} + +provider "kubectl" { + load_config_file = false + + host = local.kube_config.clusters[0].cluster.server + cluster_ca_certificate = base64decode(local.kube_config.clusters[0].cluster.certificate-authority-data) + + client_certificate = base64decode(local.kube_config.users[0].user.client-certificate-data) + client_key = base64decode(local.kube_config.users[0].user.client-key-data) +} + +locals { + env_branch = var.environment == "prod" ? 
"main" : "develop" +} diff --git a/challenges/tfvars/.gitignore b/challenges/tfvars/.gitignore new file mode 100644 index 0000000..8147f77 --- /dev/null +++ b/challenges/tfvars/.gitignore @@ -0,0 +1 @@ +!template.tfvars diff --git a/challenges/tfvars/template.tfvars b/challenges/tfvars/template.tfvars new file mode 100644 index 0000000..38808ab --- /dev/null +++ b/challenges/tfvars/template.tfvars @@ -0,0 +1,51 @@ +# ------------------------ +# Kubernetes variables +# ------------------------ +kubeconfig = "AA==" # Base64 encoded kubeconfig file + +# ------------------------ +# Generic information +# ------------------------ +environment = "test" # Environment name for the CTF +cluster_dns_management = "" # The specific domain name to use for the DNS records for the management part of the cluster +cluster_dns_ctf = "" # The domain name to use for the DNS records for the CTF part of the cluster + +# ------------------------ +# GitHub variables +# ------------------------ +ghcr_username = "" # GitHub Container Registry username +ghcr_token = "" # GitHub Container Registry token. This token is used to pull images from the GitHub Container Registry. Only let this token have registry read access +git_token = "" # GitHub repo token. Only let this token have read access to the needed repositories. + +# ---------------------- +# CTF configuration +# ---------------------- +kubectf_auth_secret = "" # The secret to use for the authSecret in the CTF configuration +kubectf_container_secret = "" # The secret to use for the containerSecret in the CTF configuration + +# ------------------------ +# Challenges configuration +# ------------------------ +chall_whitelist_ips = ["", ""] # List of IPs to whitelist for challenge access + +challenges_static = { + "" = ["", ""], +} # List of static challenges to deploy. Needs to be the slugs of the challenges +challenges_shared = { + "" = ["", ""], +} # List of shared challenges to deploy. 
Needs to be the slugs of the challenges +challenges_instanced = { + "" = ["", ""], +} # List of instanced challenges to deploy. Needs to be the slugs of the challenges + +challenges_repository = "" # URL of the Git repository containing the challenge definitions +challenges_branch = "" # Branch of the Git repository to use for the challenge definitions. Leave empty for environment based branch (environment == prod ? main : develop) + +# ---------------------- +# Docker images +# ---------------------- +# Values are maintained in the variables.tf file. +# You can override these values by uncommenting and setting your own images here. + +# image_instancing_fallback = "ghcr.io/ctfpilot/instancing-fallback:1.0.2" # The docker image for the instancing fallback deployment. See https://github.com/ctfpilot/instancing-fallback +# image_kubectf = "ghcr.io/ctfpilot/kube-ctf:1.0.2" # The docker image for the kube-ctf deployment. See https://github.com/ctfpilot/kube-ctf diff --git a/challenges/variables.tf b/challenges/variables.tf new file mode 100644 index 0000000..401d90a --- /dev/null +++ b/challenges/variables.tf @@ -0,0 +1,122 @@ +# ------------------------ +# Variables +# ------------------------ + +variable "kubeconfig" { + type = string + description = "Base64 encoded kubeconfig file" + sensitive = true + nullable = false +} + +variable "environment" { + type = string + description = "Environment name for the CTF" + default = "test" + nullable = false +} + +variable "cluster_dns_management" { + type = string + description = "The specific domain name to use for the DNS records for the management part of the cluster. Must be the TLD or subdomain of `cloudflare_dns_management`" + nullable = false +} + +variable "cluster_dns_ctf" { + type = string + description = "The domain name to use for the DNS records for the CTF challenges part of the cluster. 
Must be the TLD or subdomain of `cloudflare_dns_ctf`" + nullable = false +} + +variable "ghcr_username" { + description = "GitHub Container Registry username" + type = string + nullable = false +} + +variable "ghcr_token" { + description = "GitHub Container Registry token. This token is used to pull images from the GitHub Container Registry. Only let this token have registry read access" + type = string + sensitive = true + nullable = false +} + +variable "git_token" { + description = "GitHub repo token. Only let this token have read access to the needed repositories." + type = string + sensitive = true + nullable = false +} + +variable "argocd_namespace" { + description = "Namespace for ArgoCD" + type = string + default = "argocd" +} + +variable "kubectf_auth_secret" { + type = string + nullable = false + description = "The secret to use for the authSecret in the CTF configuration" + sensitive = true +} + +variable "kubectf_container_secret" { + type = string + nullable = false + description = "The secret to use for the containerSecret in the CTF configuration" + sensitive = true +} + +variable "chall_whitelist_ips" { + type = list(string) + description = "List of IPs to whitelist for challenges, e.g., [ \"\", \"\" ]" + default = [] + sensitive = true + validation { + condition = length(var.chall_whitelist_ips) > 0 + error_message = "At least one IP address must be whitelisted" + } +} + +variable "challenges_static" { + type = map(list(string)) + description = "List of static challenges to deploy. In the format { \"\" = [\"\", \"\"] }" + default = {} +} + +variable "challenges_shared" { + type = map(list(string)) + description = "List of shared challenges to deploy. In the format { \"\" = [\"\", \"\"] }" + default = {} +} + +variable "challenges_instanced" { + type = map(list(string)) + description = "List of instanced challenges to deploy. 
In the format { \"\" = [\"\", \"\"] }" + default = {} +} + +variable "challenges_repository" { + type = string + description = "Repository URL for challenges, generated using the challenge-toolkit. See https://github.com/ctfpilot/challenge-toolkit" + nullable = false +} + +variable "challenges_branch" { + type = string + description = "Git branch for challenges. Leave empty for environment based branch (environment == prod ? main : develop)" + default = "" +} + +variable "image_instancing_fallback" { + type = string + description = "The docker image for the instancing fallback deployment. See https://github.com/ctfpilot/instancing-fallback" + default = "ghcr.io/ctfpilot/instancing-fallback:1.0.2" +} + +variable "image_kubectf" { + type = string + description = "The docker image for the kube-ctf deployment. See https://github.com/ctfpilot/kube-ctf" + default = "ghcr.io/ctfpilot/kube-ctf:1.0.2" +} diff --git a/challenges/whitelist.tf b/challenges/whitelist.tf new file mode 100644 index 0000000..7dee1eb --- /dev/null +++ b/challenges/whitelist.tf @@ -0,0 +1,63 @@ +resource "kubernetes_manifest" "ip_whitelist_web" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = "challenge-ipwhitelist-web" + namespace = "ctfpilot-challenges" + } + spec = { + ipAllowList = { + sourceRange = var.chall_whitelist_ips + } + } + } +} + +resource "kubernetes_manifest" "ip_whitelist_instanced_web" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = "challenge-ipwhitelist-web" + namespace = "ctfpilot-challenges-instanced" + } + spec = { + ipAllowList = { + sourceRange = var.chall_whitelist_ips + } + } + } +} + +resource "kubernetes_manifest" "ip_whitelist_tcp" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "MiddlewareTCP" + metadata = { + name = "challenge-ipwhitelist-tcp" + namespace = "ctfpilot-challenges" + } + spec = { + ipAllowList = { + sourceRange = var.chall_whitelist_ips + 
} + } + } +} + +resource "kubernetes_manifest" "ip_whitelist_instanced_tcp" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "MiddlewareTCP" + metadata = { + name = "challenge-ipwhitelist-tcp" + namespace = "ctfpilot-challenges-instanced" + } + spec = { + ipAllowList = { + sourceRange = var.chall_whitelist_ips + } + } + } +} diff --git a/cluster/.env.example b/cluster/.env.example new file mode 100644 index 0000000..5fe1f9d --- /dev/null +++ b/cluster/.env.example @@ -0,0 +1,2 @@ +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/cluster/.gitignore b/cluster/.gitignore new file mode 100644 index 0000000..943abaa --- /dev/null +++ b/cluster/.gitignore @@ -0,0 +1,43 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. 
+*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc + +**/k3s_kubeconfig.yaml +**/k3s_kustomization_backup.yaml + +# Extracted kubeconfig file +kube-config.yml diff --git a/cluster/.terraform.lock.hcl b/cluster/.terraform.lock.hcl new file mode 100644 index 0000000..8c4f5c2 --- /dev/null +++ b/cluster/.terraform.lock.hcl @@ -0,0 +1,204 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/anapsix/semvers" { + version = "0.7.1" + constraints = ">= 0.7.1" + hashes = [ + "h1:muej1ceXoABJVeyCSQa42xSfRCCOYuX+HGNYaa91cdo=", + "zh:049fa2bc555b1264427296c55462c24151aedd251ec32673e7775c451c5b0339", + "zh:18d72a3d0e3e502ea68e477396651922e59d97ddaeda132004ca6bc8e13334ed", + "zh:20cacd13b826250ce29e19691492e958db95fd8e66163bb6402050f791f82c93", + "zh:3d13be2d81197f66e69d544c7b708184a4f0341b3fd413a76ec7ff37dbd67999", + "zh:408012764fab3b5d79751ff1c3413dc17ff02a0b1e64655e131218b5a2c970da", + "zh:416c589984585c19952e75866a08a7299c9a2eeb81b015302962bbe09004484a", + "zh:4227a3dcac531608b6b89a2db5f40e99def6864c24b5e8f3e04c15290e2233dd", + "zh:6c1f4226a2fa7ee74c87974f0b5b1420668541d614a958744e17090ea0e6476e", + "zh:90c381aab648cd7507e93725c2bed847c91d2b186eaff09193adec771e689a5c", + "zh:9aa5755bdadff19265f3a434fcadc4a04cd622a40bac7381b5a6ac74b4d5fe8f", + "zh:ac6b01165bc361ddff7d3392311ea494aa6af2d089ab0b43b48f24ac949612ae", + 
"zh:ba589c0dfa18929244578f02c2d9f4a4c32c79cb57cd9c3ad7e8cbe123cc98fa", + "zh:bc0f3ff5b24e1d2ccf8e1e85fb0f931fd58e1ddaac4abe6b07f94844a1425cd1", + "zh:ced293727d8d91f7ddf85d07c897d23c0a3188251d1065dea5a65342637c5853", + "zh:d3f92af0ee440a540826a61a7b07f30d0155ddc40d70484d1c68c5abdbdabed2", + "zh:d4736f830f2913ba018868fe888a267441b33d1ecb8275ce941fb60150dd60f3", + "zh:df202a4bf895a3a60fdcd39a55296c95e44d3ce300cf6ce3e3e12744c24a941c", + "zh:ec198fa89c5039e3a41dfe0a704e9428389a2f663a69dc438136945887cc6711", + "zh:f809ab383cca0a5f83072981c64208cbd7fa67e986a86ee02dd2c82333221e32", + "zh:fe0d8d1bbe515d52b04b01cb93f3cdd25fb3f74833d5e62b3a7ea40ea8f229bf", + ] +} + +provider "registry.opentofu.org/cloudflare/cloudflare" { + version = "4.52.0" + constraints = "4.52.0" + hashes = [ + "h1:NTaOQfYINA0YTG/V1/9+SYtgX1it63+cBugj4WK4FWc=", + "zh:19be1a91c982b902c42aba47766860dfa5dc151eed1e95fd39ca642229381ef0", + "zh:1de451c4d1ecf7efbe67b6dace3426ba810711afdd644b0f1b870364c8ae91f8", + "zh:352b4a2120173298622e669258744554339d959ac3a95607b117a48ee4a83238", + "zh:3c6f1346d9154afbd2d558fabb4b0150fc8d559aa961254144fe1bc17fe6032f", + "zh:4c4c92d53fb535b1e0eff26f222bbd627b97d3b4c891ec9c321268676d06152f", + "zh:53276f68006c9ceb7cdb10a6ccf91a5c1eadd1407a28edb5741e84e88d7e29e8", + "zh:7925a97773948171a63d4f65bb81ee92fd6d07a447e36012977313293a5435c9", + "zh:7dfb0a4496cfe032437386d0a2cd9229a1956e9c30bd920923c141b0f0440060", + "zh:890df766e9b839623b1f0437355032a3c006226a6c200cd911e15ee1a9014e9f", + "zh:8d4aa79f0a414bb4163d771063c70cd991c8fac6c766e685bac2ee12903c5bd6", + "zh:a67540c13565616a7e7e51ee9366e88b0dc60046e1d75c72680e150bd02725bb", + "zh:a936383a4767f5393f38f622e92bf2d0c03fe04b69c284951f27345766c7b31b", + "zh:d4887d73c466ff036eecf50ad6404ba38fd82ea4855296b1846d244b0f13c380", + "zh:e9093c8bd5b6cd99c81666e315197791781b8f93afa14fc2e0f732d1bb2a44b7", + "zh:efd3b3f1ec59a37f635aa1d4efcf178734c2fcf8ddb0d56ea690bec342da8672", + ] +} + +provider "registry.opentofu.org/hashicorp/assert" { + 
version = "0.16.0" + constraints = ">= 0.16.0" + hashes = [ + "h1:2jeV46S9jN2rk0GXOa+HGNlVvyWzaB3wz0T65elbjOc=", + "zh:3c04d08d1bb4ae810b7972a219c8dd42a8ab901a9bc25197b250c38f3fa57033", + "zh:46119bcc47b545809c0ee873a72d44f4f875cca4d7228605f5c7a8956a5e7d55", + "zh:511949ee8a6ac8ff7296b4c9778deb2aec2783f5b85c4f27382a3b623fc50a4a", + "zh:b4ebb8b832bae26443880d2e17493f754495db2d6c3f02c6d0070cbf5ae21598", + "zh:bebed6c1873871eb824103f08e72055c077f01b10a40944760d19ffdd721d9ab", + "zh:e412855fd2fd81e0a847e45308bdbac99995315c503fdddf262ee59e1b7c5263", + "zh:ed47c4fe28c6f148f11fa4098516abea008c49fa670c3cedd2ff94596cac0831", + "zh:edee914b1d12ac6db241a1fecaa5186c47f361f4ceb2deb23ad45d67bf95c7b1", + "zh:eff5b2e1c2128217bdbc600eda4fe011831e5c655bf4acd84b6495fc20d128d3", + "zh:ff64424784171a3361b1ea95d8cef334ec1c4a395812edd0a77a1ed6b4119b0f", + ] +} + +provider "registry.opentofu.org/hashicorp/cloudinit" { + version = "2.3.7" + hashes = [ + "h1:dkGeAxGbAGgglocp0fl1OzvT6O4KKsJTEsCW0ixdQJs=", + "zh:2d48b8452eae9bac2e62273e8f535f73694d8cb05ea38f4b27ee735dcc38eed4", + "zh:4add11b87e48d0e6ecd19243a06ecfc42fc07d0a3748fe568c2971d5f4767486", + "zh:4c9c4e3319cf3328595ea2d68eba7c604325fbcba38cd443e39e982b0b4e29f2", + "zh:503dd83a05b0421ecbcb140d5fdbe3a6b82f163495a82587a1390cf66d7a27be", + "zh:7dd34de7e68036dbbb70c249968a2a10bccba1cb92d3b4dccbc0eb65a3fc58ea", + "zh:a4d7b4480d38446b8da96ce4ecbc2e5a081c4ddc3da2bad97d7b228821b77895", + "zh:bdec6329c3d2d5f034080d9cd6f9a15a2c052faacd716f981e247b48e6845c01", + "zh:e1519544ae3f67196d144e18c21ad681dc29da3133a537ffdd5c2c6271b8db0c", + "zh:e58cd6b05ed51a6fa072e5de2208ba36a58557c3fb414d50c42b3d40a11366b7", + "zh:fafc4a49c297516f2a40490f9a7e6d2b437d77a94330797d4eead178c987ccb5", + ] +} + +provider "registry.opentofu.org/hashicorp/local" { + version = "2.5.3" + constraints = ">= 2.5.2" + hashes = [ + "h1:mC9+u1eaUILTjxey6Ivyf/3djm//RNNze9kBVX/trng=", + "zh:32e1d4b0595cea6cda4ca256195c162772ddff25594ab4008731a2ec7be230bf", + 
"zh:48c390af0c87df994ec9796f04ec2582bcac581fb81ed6bb58e0671da1c17991", + "zh:4be7289c969218a57b40902e2f359914f8d35a7f97b439140cb711aa21e494bd", + "zh:4cf958e631e99ed6c8b522c9b22e1f1b568c0bdadb01dd002ca7dffb1c927764", + "zh:7a0132c0faca4c4c96aa70808effd6817e28712bf5a39881666ac377b4250acf", + "zh:7d60de08fac427fb045e4590d1b921b6778498eee9eb16f78c64d4c577bde096", + "zh:91003bee5981e99ec3925ce2f452a5f743827f9d0e131a86613549c1464796f0", + "zh:9fe2fe75977c8149e2515fb30c6cc6cfd57b225d4ce592c570d81a3831d7ffa3", + "zh:e210e6be54933ce93e03d0994e520ba289aa01b2c1f70e77afb8f2ee796b0fe3", + "zh:e8793e5f9422f2b31a804e51806595f335b827c9a38db18766960464566f21d5", + ] +} + +provider "registry.opentofu.org/hashicorp/null" { + version = "3.2.4" + hashes = [ + "h1:jsKjBiLb+v3OIC3xuDiY4sR0r1OHUMSWPYKult9MhT0=", + "zh:1769783386610bed8bb1e861a119fe25058be41895e3996d9216dd6bb8a7aee3", + "zh:32c62a9387ad0b861b5262b41c5e9ed6e940eda729c2a0e58100e6629af27ddb", + "zh:339bf8c2f9733fce068eb6d5612701144c752425cebeafab36563a16be460fb2", + "zh:36731f23343aee12a7e078067a98644c0126714c4fe9ac930eecb0f2361788c4", + "zh:3d106c7e32a929e2843f732625a582e562ff09120021e510a51a6f5d01175b8d", + "zh:74bcb3567708171ad83b234b92c9d63ab441ef882b770b0210c2b14fdbe3b1b6", + "zh:90b55bdbffa35df9204282251059e62c178b0ac7035958b93a647839643c0072", + "zh:ae24c0e5adc692b8f94cb23a000f91a316070fdc19418578dcf2134ff57cf447", + "zh:b5c10d4ad860c4c21273203d1de6d2f0286845edf1c64319fa2362df526b5f58", + "zh:e05bbd88e82e1d6234988c85db62fd66f11502645838fff594a2ec25352ecd80", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.7.2" + hashes = [ + "h1:yHMBbZOIHlXUuBQ8Mhioe0hwmhermuboq2eNNoCJaf8=", + "zh:2ffeb1058bd7b21a9e15a5301abb863053a2d42dffa3f6cf654a1667e10f4727", + "zh:519319ed8f4312ed76519652ad6cd9f98bc75cf4ec7990a5684c072cf5dd0a5d", + "zh:7371c2cc28c94deb9dba62fbac2685f7dde47f93019273a758dd5a2794f72919", + "zh:9b0ac4c1d8e36a86b59ced94fa517ae9b015b1d044b3455465cc6f0eab70915d", + 
"zh:c6336d7196f1318e1cbb120b3de8426ce43d4cacd2c75f45dba2dbdba666ce00", + "zh:c71f18b0cb5d55a103ea81e346fb56db15b144459123f1be1b0209cffc1deb4e", + "zh:d2dc49a6cac2d156e91b0506d6d756809e36bf390844a187f305094336d3e8d8", + "zh:d5b5fc881ccc41b268f952dae303501d6ec9f9d24ee11fe2fa56eed7478e15d0", + "zh:db9723eaca26d58c930e13fde221d93501529a5cd036b1f167ef8cff6f1a03cc", + "zh:fe3359f733f3ab518c6f85f3a9cd89322a7143463263f30321de0973a52d4ad8", + ] +} + +provider "registry.opentofu.org/hetznercloud/hcloud" { + version = "1.51.0" + constraints = ">= 1.51.0" + hashes = [ + "h1:yER+O3OKYfxBAO7KVYZzH+4EYrmorCO0J0hlnRUfH00=", + "zh:0e8e78084c12866e8e3873011bcac125780b62afeaa518d4749b9a063ae6e32b", + "zh:145738cee21bcdeea1cf82f0d44f7f239c27c2214249e5e5079668c479522a8a", + "zh:164406be8ee83952f58a449d514837cc6d9763b6d29e72262d5582d5d5b89315", + "zh:1a0e6ffab3196b35ca65eb445622615bb8dddd68d0bf350ed60d25e1e74f67dc", + "zh:3b7729d1bb5cc7a5af60b42a607f7b3fec690192b1efb55e2341cee88405ecb0", + "zh:3bcfc5c40d1b7702f39dac5d2dd9eef58c9c934effb4676e26fbe85fe2057e8f", + "zh:3ce193892dca025b804de6d99316c50a33462eb36336006a9db7ea44be439eba", + "zh:4f92437e1eba8eafe4417f8b61d557ed47f121622305ee2b3c13c31e45c69ca4", + "zh:554c308bf64b603a075a8f13a151a136b68ba382c2d83977a0df26de7dea2d3d", + "zh:8c57aa6032fed5da43a0102a4f26262c0496803b99f2f92e5ceb02c80161e291", + "zh:99cd4d246d0ad3a3529176df22a47f254700f8c4fc33f62c14464259284945b7", + "zh:af38a4d1e93f2392a296970ba4ecea341204e888d579cd74642e9f23a94b3b06", + "zh:f0766d42dd97b3eac6fa614fa5809ff2511c9104f3834d0d4b6e84674f13f092", + "zh:f20f7379876ede225f3b6f0719826706a171ea4c1dd438a8a3103dee8fe43ccc", + ] +} + +provider "registry.opentofu.org/integrations/github" { + version = "6.6.0" + constraints = ">= 6.4.0" + hashes = [ + "h1:Fp0RrNe+w167AQkVUWC1WRAsyjhhHN7aHWUky7VkKW8=", + "zh:0b1b5342db6a17de7c71386704e101be7d6761569e03fb3ff1f3d4c02c32d998", + "zh:2fb663467fff76852126b58315d9a1a457e3b04bec51f04bf1c0ddc9dfbb3517", + 
"zh:4183e557a1dfd413dae90ca4bac37dbbe499eae5e923567371f768053f977800", + "zh:48b2979f88fb55cdb14b7e4c37c44e0dfbc21b7a19686ce75e339efda773c5c2", + "zh:5d803fb06625e0bcf83abb590d4235c117fa7f4aa2168fa3d5f686c41bc529ec", + "zh:6f1dd094cbab36363583cda837d7ca470bef5f8abf9b19f23e9cd8b927153498", + "zh:772edb5890d72b32868f9fdc0a9a1d4f4701d8e7f8acb37a7ac530d053c776e3", + "zh:798f443dbba6610431dcef832047f6917fb5a4e184a3a776c44e6213fb429cc6", + "zh:cc08dfcc387e2603f6dbaff8c236c1254185450d6cadd6bad92879fe7e7dbce9", + "zh:d5e2c8d7f50f91d6847ddce27b10b721bdfce99c1bbab42a68fa271337d73d63", + "zh:e69a0045440c706f50f84a84ff8b1df520ec9bf757de4b8f9959f2ed20c3f440", + "zh:efc5358573a6403cbea3a08a2fcd2407258ac083d9134c641bdcb578966d8bdf", + "zh:f627a255e5809ec2375f79949c79417847fa56b9e9222ea7c45a463eb663f137", + "zh:f7c02f762e4cf1de7f58bde520798491ccdd54a5bd52278d579c146d1d07d4f0", + "zh:fbd1fee2c9df3aa19cf8851ce134dea6e45ea01cb85695c1726670c285797e25", + ] +} + +provider "registry.opentofu.org/loafoe/ssh" { + version = "2.7.0" + constraints = "2.7.0" + hashes = [ + "h1:MYcyNF/9w/O0nEeKmopbji1NqeD9kpd2a55r9E4rFXs=", + "zh:0301be53defa9294c713fb3ce4c9925e83051b7444b6eb7262c692ad514f9c46", + "zh:2670797441d6fefddaaac4498f31b0dc8053fe82a3744fca44da7471e6449f1f", + "zh:2d70166644fba761aec397920e9e843cce2c060875ddd224f7791ea2cd7bd6e6", + "zh:30bda314598fee47cf890adfb6f3e1db606feab99252ccfdd0e5c93108f38fdd", + "zh:3a0c0c9f1aff15818fb5fe97b361b879baf19886d413fa468165c3c6de49d348", + "zh:5183c1a7fb5d1f1394bfcfe716a61c4191198ccbd64311601c68c52a3a1ea7e2", + "zh:5190fd7e18f0e46d2263fafa04a6862578abb1c14d60ea3e6597f1b00b041ec7", + "zh:825e2a7eb6c176dc96b82a1123d63ce6e04ef502a973a7ac44ab156cae4f991a", + "zh:8e0716c9a628801284663cad3a8f70e026780f34d04fa5ffb822f0cd5876c353", + "zh:8f19c94a72fb4cecdc70ac97f04c24fa24c46a4e125bbb7c24f642e95f753c70", + "zh:a965929f10651c7139009aa509a6929f2205f90e85ce91a8354416d17624ed04", + ] +} diff --git a/cluster/README.md b/cluster/README.md new file mode 100644 
index 0000000..ac978bd --- /dev/null +++ b/cluster/README.md @@ -0,0 +1,106 @@ +# CTF Pilot's Kubernetes Cluster on Hetzner Cloud + +> [!IMPORTANT] +> You are leaving the automated CTF Pilot setup and entering a more advanced manual setup. +> This requires knowledge of Kubernetes, Terraform/OpenTofu, and cloud infrastructure management. +> If you are not comfortable with these technologies, it is recommended to use the automated setup provided by CTF Pilot. +> Learn more about the automated setup in the [CTFp main README](../README.md). + +This setup uses [Terraform](https://www.terraform.io/) / [OpenTofu](https://opentofu.org) to create and manage a Kubernetes cluster on [Hetzner Cloud](https://www.hetzner.com/cloud), using the [kube-hetzner](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner). The cluster is configured to use [Cloudflare](https://www.cloudflare.com/) for DNS management. + +## Pre-requisites + +The following software needs to be installed on your local machine: + +- [Terraform](https://www.terraform.io/downloads.html) / [OpenTofu](https://opentofu.org) +- [Packer](https://developer.hashicorp.com/packer/tutorials/docker-get-started/get-started-install-cli#installing-packer) (For initial setup of snapshots for the servers) +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For interacting with the Kubernetes cluster) +- [hcloud cli tool](https://github.com/hetznercloud/cli) (For interacting with the Hetzner Cloud API) +- SSH client (For connecting to the servers) + +The following services are required, in order to deploy the Kubernetes cluster: + +- [Hetzner Cloud](https://www.hetzner.com/cloud) account +- [Hetzner Cloud API Token](https://console.hetzner.cloud/projects) (For authenticating with the Hetzner Cloud API) +- [Cloudflare](https://www.cloudflare.com/) account +- [Cloudflare API Token](https://dash.cloudflare.com/profile/api-tokens) (For authenticating with the Cloudflare API) +- [Cloudflare controlled 
domain](https://dash.cloudflare.com/) (For allowing the system to allocate a domain for the Kubernetes cluster) + +### SSH keys + +In order to connect to the servers, you need to have an SSH key pair. The keys path needs to be set in the tfvars file. + +*The SSH key should be ssh-ed25519 or rsa-sha2-512 (for easy use, passphrase-less)* +`ssh-keygen -t ed25519` + +*Can be generated by running: `./keys/create.sh` and copying the keys into tfvars.* + +## Setup + +### Image creation (Done once) + +> [!TIP] +> +> Image creation is only required once, to create the snapshots for the servers. +> If you want to update the snapshots, this step can be repeated. + +In order to create the cluster, we need to create snapshots of the servers that will be used in the cluster. This is done by running the following command (say yes, to build snapshots using packer): + +```bash +tmp_script=$(mktemp) && curl -sSL -o "${tmp_script}" https://raw.githubusercontent.com/kube-hetzner/terraform-hcloud-kube-hetzner/master/scripts/create.sh && chmod +x "${tmp_script}" && "${tmp_script}" && rm "${tmp_script}" +``` + +**Note:** This will create a snapshot of the server, which will be used as the base image for the Kubernetes cluster, as well as ensuring local software is installed. +*The software has been provided by the [kube-hetzner](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner) project.* + +### Cluster creation + +Copy the `tfvars/template.tfvars` file to `tfvars/data.tfvars` and edit the file with your own values. +The [`tfvars/template.tfvars`](tfvars/template.tfvars) file contains further information on each variable. + +For deeper customization, you can edit the `kube.tf` file to change the cluster configuration. +This file is a configured version of the [kube-hetzner `kube.tf`](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner). + +> [!IMPORTANT] +> Make sure you generate the backend configuration file before creating the cluster. 
+> See the [backend generation instructions](../backend/README.md) for more information. +> +> You will also need to set the following environment variables for authentication to the S3 backend: +> - `AWS_ACCESS_KEY_ID` +> - `AWS_SECRET_ACCESS_KEY` +> +> See [OpenTofu backend S3 configuration](https://opentofu.org/docs/language/settings/backends/s3/) for more information. + +Run the following command to create the Kubernetes cluster: + +```bash +tofu init -backend-config=../backend/generated/cluster.hcl +tofu apply --var-file tfvars/data.tfvars +``` + +The creation process may take several minutes to complete. + +If you experience issues, it may be one or multiple of the following issues: + +- The type of servers in Hetzner may not be available. Check to see if they are available in your selected datacenter. +- Your firewall may also be blocking ssh requests to the servers, which causes a deadlock in the configuration of the servers. + +For more detailed troubleshooting, refer to the [kube-hetzner documentation](https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner). + +During creation, Cloudflare DNS records will be created for the cluster. + +### Cluster deletion + +To delete the Kubernetes cluster, run the following command: + +```bash +tofu destroy --var-file tfvars/data.tfvars +``` + +## Accessing the cluster + +To access the Kubernetes cluster, you need to set up the `kubeconfig` file. 
This can be done by running the following command: + +```bash +tofu output --raw kubeconfig > ./kube-config.yml +``` diff --git a/cluster/cloudflare.tf b/cluster/cloudflare.tf new file mode 100644 index 0000000..1f70dfb --- /dev/null +++ b/cluster/cloudflare.tf @@ -0,0 +1,103 @@ +# ---------------------- +# DNS +# ---------------------- + +data "cloudflare_zones" "domain_name_zone_management" { + filter { + name = var.cloudflare_dns_management + } +} + +# Create DNS A record +resource "cloudflare_record" "domain_name_management" { + zone_id = data.cloudflare_zones.domain_name_zone_management.zones.0.id + name = var.cluster_dns_management + content = module.kube-hetzner.ingress_public_ipv4 + type = "A" + ttl = 1 + + depends_on = [ + data.cloudflare_zones.domain_name_zone_management, + ] +} + +# Create DNS CNAME wildcard record pointing at the management A record +resource "cloudflare_record" "wildcard_domain_name_management" { + zone_id = data.cloudflare_zones.domain_name_zone_management.zones.0.id + name = "*.${var.cluster_dns_management}" + content = var.cluster_dns_management + type = "CNAME" + ttl = 1 + + depends_on = [ + data.cloudflare_zones.domain_name_zone_management, + ] +} + +data "cloudflare_zones" "domain_name_zone_ctf" { + filter { + name = var.cloudflare_dns_ctf + } +} + +# Create DNS A record +resource "cloudflare_record" "domain_name_ctf" { + zone_id = data.cloudflare_zones.domain_name_zone_ctf.zones.0.id + name = var.cluster_dns_ctf + content = module.kube-hetzner.ingress_public_ipv4 + type = "A" + ttl = 1 + proxied = true + + depends_on = [ + data.cloudflare_zones.domain_name_zone_ctf, + ] +} + +# Create DNS A wildcard record +resource "cloudflare_record" "wildcard_domain_name_ctf" { + zone_id = data.cloudflare_zones.domain_name_zone_ctf.zones.0.id + name = "*.${var.cluster_dns_ctf}" + content = module.kube-hetzner.ingress_public_ipv4 + type = "A" + ttl = 1 + proxied = false + + depends_on = [ + data.cloudflare_zones.domain_name_zone_ctf, + ] +} + +data "cloudflare_zones" 
"domain_name_zone_platform" { + filter { + name = var.cloudflare_dns_platform + } +} + +# Create DNS A record +resource "cloudflare_record" "domain_name_platform" { + zone_id = data.cloudflare_zones.domain_name_zone_platform.zones.0.id + name = var.cluster_dns_platform + content = module.kube-hetzner.ingress_public_ipv4 + type = "A" + ttl = 1 + proxied = true + + depends_on = [ + data.cloudflare_zones.domain_name_zone_platform, + ] +} + +# Create DNS A wildcard record +resource "cloudflare_record" "wildcard_domain_name_platform" { + zone_id = data.cloudflare_zones.domain_name_zone_platform.zones.0.id + name = "*.${var.cluster_dns_platform}" + content = module.kube-hetzner.ingress_public_ipv4 + type = "A" + ttl = 1 + proxied = true + + depends_on = [ + data.cloudflare_zones.domain_name_zone_platform, + ] +} diff --git a/cluster/hcloud-microos-snapshots.pkr.hcl b/cluster/hcloud-microos-snapshots.pkr.hcl new file mode 100644 index 0000000..0e3a954 --- /dev/null +++ b/cluster/hcloud-microos-snapshots.pkr.hcl @@ -0,0 +1,164 @@ +/* + * Creates a MicroOS snapshot for Kube-Hetzner + */ +packer { + required_plugins { + hcloud = { + version = ">= 1.0.5" + source = "github.com/hashicorp/hcloud" + } + } +} + +variable "hcloud_token" { + type = string + default = env("HCLOUD_TOKEN") + sensitive = true +} + +# We download the OpenSUSE MicroOS x86 image from an automatically selected mirror. +variable "opensuse_microos_x86_mirror_link" { + type = string + default = "https://download.opensuse.org/tumbleweed/appliances/openSUSE-MicroOS.x86_64-ContainerHost-OpenStack-Cloud.qcow2" +} + +# We download the OpenSUSE MicroOS ARM image from an automatically selected mirror. 
+variable "opensuse_microos_arm_mirror_link" { + type = string + default = "https://download.opensuse.org/ports/aarch64/tumbleweed/appliances/openSUSE-MicroOS.aarch64-ContainerHost-OpenStack-Cloud.qcow2" +} + +# If you need to add other packages to the OS, do it here in the default value, like ["vim", "curl", "wget"] +# When looking for packages, you need to search for OpenSUSE Tumbleweed packages, as MicroOS is based on Tumbleweed. +variable "packages_to_install" { + type = list(string) + default = [] +} + +locals { + needed_packages = join(" ", concat(["restorecond policycoreutils policycoreutils-python-utils setools-console audit bind-utils wireguard-tools open-iscsi nfs-client xfsprogs cryptsetup lvm2 git cifs-utils bash-completion mtr tcpdump"], var.packages_to_install)) + + # Add local variables for inline shell commands + download_image = "wget --timeout=5 --waitretry=5 --tries=5 --retry-connrefused --inet4-only " + + write_image = <<-EOT + set -ex + echo 'MicroOS image loaded, writing to disk... ' + qemu-img convert -p -f qcow2 -O host_device $(ls -a | grep -ie '^opensuse.*microos.*qcow2$') /dev/sda + echo 'done. Rebooting...' + sleep 1 && udevadm settle && reboot + EOT + + install_packages = <<-EOT + set -ex + echo "First reboot successful, installing needed packages..." + transactional-update --continue pkg install -y ${local.needed_packages} + transactional-update --continue shell <<- EOF + setenforce 0 + rpm --import https://rpm.rancher.io/public.key + zypper install -y https://github.com/k3s-io/k3s-selinux/releases/download/v1.4.stable.1/k3s-selinux-1.4-1.sle.noarch.rpm + zypper addlock k3s-selinux + restorecon -Rv /etc/selinux/targeted/policy + restorecon -Rv /var/lib + setenforce 1 + EOF + sleep 1 && udevadm settle && reboot + EOT + + clean_up = <<-EOT + set -ex + echo "Second reboot successful, cleaning-up..." 
+ rm -rf /etc/ssh/ssh_host_* + echo "Make sure to use NetworkManager" + touch /etc/NetworkManager/NetworkManager.conf + sleep 1 && udevadm settle + EOT +} + +# Source for the MicroOS x86 snapshot +source "hcloud" "microos-x86-snapshot" { + image = "ubuntu-22.04" + rescue = "linux64" + location = "fsn1" + server_type = "cpx11" # disk size of >= 40GiB is needed to install the MicroOS image + snapshot_labels = { + microos-snapshot = "yes" + creator = "kube-hetzner" + } + snapshot_name = "OpenSUSE MicroOS x86 by Kube-Hetzner" + ssh_username = "root" + token = var.hcloud_token +} + +# Source for the MicroOS ARM snapshot +source "hcloud" "microos-arm-snapshot" { + image = "ubuntu-22.04" + rescue = "linux64" + location = "fsn1" + server_type = "cax11" # disk size of >= 40GiB is needed to install the MicroOS image + snapshot_labels = { + microos-snapshot = "yes" + creator = "kube-hetzner" + } + snapshot_name = "OpenSUSE MicroOS ARM by Kube-Hetzner" + ssh_username = "root" + token = var.hcloud_token +} + +# Build the MicroOS x86 snapshot +build { + sources = ["source.hcloud.microos-x86-snapshot"] + + # Download the MicroOS x86 image + provisioner "shell" { + inline = ["${local.download_image}${var.opensuse_microos_x86_mirror_link}"] + } + + # Write the MicroOS x86 image to disk + provisioner "shell" { + inline = [local.write_image] + expect_disconnect = true + } + + # Ensure connection to MicroOS x86 and do house-keeping + provisioner "shell" { + pause_before = "5s" + inline = [local.install_packages] + expect_disconnect = true + } + + # Ensure connection to MicroOS x86 and do house-keeping + provisioner "shell" { + pause_before = "5s" + inline = [local.clean_up] + } +} + +# Build the MicroOS ARM snapshot +build { + sources = ["source.hcloud.microos-arm-snapshot"] + + # Download the MicroOS ARM image + provisioner "shell" { + inline = ["${local.download_image}${var.opensuse_microos_arm_mirror_link}"] + } + + # Write the MicroOS ARM image to disk + provisioner "shell" { + 
inline = [local.write_image] + expect_disconnect = true + } + + # Ensure connection to MicroOS ARM and do house-keeping + provisioner "shell" { + pause_before = "5s" + inline = [local.install_packages] + expect_disconnect = true + } + + # Ensure connection to MicroOS ARM and do house-keeping + provisioner "shell" { + pause_before = "5s" + inline = [local.clean_up] + } +} diff --git a/cluster/keys/.gitignore b/cluster/keys/.gitignore new file mode 100644 index 0000000..b2e756f --- /dev/null +++ b/cluster/keys/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!create.sh diff --git a/cluster/keys/create.sh b/cluster/keys/create.sh new file mode 100755 index 0000000..40f0fc4 --- /dev/null +++ b/cluster/keys/create.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# Resolve the directory containing this script, so the keys are written next to it regardless of the caller's working directory +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" + +ssh-keygen -t ed25519 -f "$DIR/k8s" -q -N "" + +# base64 encode the keys (each into a single-line base64 string); paths are quoted so a checkout path with spaces works +base64 "$DIR/k8s" | tr -d '\n' >"$DIR/k8s.b64" +base64 "$DIR/k8s.pub" | tr -d '\n' >"$DIR/k8s.pub.b64" diff --git a/cluster/kube.tf b/cluster/kube.tf new file mode 100644 index 0000000..1b88ec3 --- /dev/null +++ b/cluster/kube.tf @@ -0,0 +1,1282 @@ +# ---------------------- +# Cluster setup +# ---------------------- + +module "kube-hetzner" { + providers = { + hcloud = hcloud + } + hcloud_token = var.hcloud_token + + # Then fill or edit the below values. Only the first values starting with a * are obligatory; the rest can remain with their default values, or you + # could adapt them to your needs. + + # * source can be specified in multiple ways: + # 1. For normal use, (the official version published on the Terraform Registry), use + source = "kube-hetzner/kube-hetzner/hcloud" + # When using the terraform registry as source, you can optionally specify a version number. 
+ # See https://registry.terraform.io/modules/kube-hetzner/kube-hetzner/hcloud for the available versions + version = var.kube_hetzner_version # Not possible to make a variable when using Terraform - See https://github.com/hashicorp/terraform/issues/28912 + # 2. For local dev, path to the git repo + # source = "../../kube-hetzner/" + # 3. If you want to use the latest master branch (see https://developer.hashicorp.com/terraform/language/modules/sources#github), use + # source = "github.com/The0mikkel/terraform-hcloud-kube-hetzner?ref=2.18.0-fix" + + # Note that some values, notably "location" and "public_key" have no effect after initializing the cluster. + # This is to keep Terraform from re-provisioning all nodes at once, which would lose data. If you want to update + # those, you should instead change the value here and manually re-provision each node. Grep for "lifecycle". + + # Customize the SSH port (by default 22) + # ssh_port = 2222 + + # * Your ssh public key + ssh_public_key = base64decode(var.ssh_key_public_base64) + # * Your private key must be "ssh_private_key = null" when you want to use ssh-agent for a Yubikey-like device authentication or an SSH key-pair with a passphrase. + # For more details on SSH see https://github.com/kube-hetzner/kube-hetzner/blob/master/docs/ssh.md + ssh_private_key = base64decode(var.ssh_key_private_base64) + # You can add additional SSH public Keys to grant other team members root access to your cluster nodes. + # ssh_additional_public_keys = var.ssh_extra_keys_path + + # You can also add additional SSH public Keys which are saved in the hetzner cloud by a label. 
+ # See https://docs.hetzner.cloud/#label-selector + # ssh_hcloud_key_label = var.ssh_extra_keys_label + + # These can be customized, or left with the default values + # * For Hetzner locations see https://docs.hetzner.com/general/others/data-centers-and-connection/ + network_region = var.network_zone + + # IMPORTANT: Before you create your cluster, you can do anything you want with the nodepools, but you need at least one of each, control plane and agent. + # Once the cluster is up and running, you can change nodepool count and even set it to 0 (in the case of the first control-plane nodepool, the minimum is 1). + # You can also rename it (if the count is 0), but do not remove a nodepool from the list. + + # You can safely add or remove nodepools at the end of each list. That is due to how subnets and IPs get allocated (FILO). + # The maximum number of nodepools you can create combined for both lists is 50 (see above). + # Also, before decreasing the count of any nodepools to 0, it's essential to drain and cordon the nodes in question. Otherwise, it will leave your cluster in a bad state. + + # Before initializing the cluster, you can change all parameters and add or remove any nodepools. You need at least one nodepool of each kind, control plane, and agent. + # ⚠️ The nodepool names are entirely arbitrary, but all lowercase, no special characters or underscore (dashes are allowed), and they must be unique. + + # If you want to have a single node cluster, have one control plane nodepools with a count of 1, and one agent nodepool with a count of 0. + + # Please note that changing labels and taints after the first run will have no effect. If needed, you can do that through Kubernetes directly. + + # Multi-architecture clusters are OK for most use cases, as container underlying images tend to be multi-architecture too. 
+ + # * Example below: + + control_plane_nodepools = [ + { + name = "control-plane-1", + server_type = var.control_plane_type_1, + location = var.region_1, + labels = [ + "ressource-type=node", + "node-type=control-plane", + "control-plane=fsn1", + "node-pool=control-plane-fsn1", + "cluster.ctfpilot.com/node=control-plane", + ], + taints = [], + count = var.control_plane_count_1 + # swap_size = "2G" # remember to add the suffix, examples: 512M, 1G + # zram_size = "2G" # remember to add the suffix, examples: 512M, 1G + kubelet_args = ["kube-reserved=cpu=250m,memory=1500Mi,ephemeral-storage=1Gi", "system-reserved=cpu=250m,memory=300Mi"] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + + # To disable public ips (default: false) + # WARNING: If both values are set to "true", your server will only be accessible via a private network. Make sure you have followed + # the instructions regarding this type of setup in README.md: "Use only private IPs in your cluster". 
+ # disable_ipv4 = true + # disable_ipv6 = true + }, + { + name = "control-plane-2", + server_type = var.control_plane_type_2, + location = var.region_2, + labels = [ + "ressource-type=node", + "node-type=control-plane", + "control-plane=nbg1", + "node-pool=control-plane-nbg1", + "cluster.ctfpilot.com/node=control-plane", + ], + taints = [], + count = var.control_plane_count_2 + kubelet_args = ["kube-reserved=cpu=250m,memory=1500Mi,ephemeral-storage=1Gi", "system-reserved=cpu=250m,memory=300Mi"] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + }, + { + name = "control-plane-3", + server_type = var.control_plane_type_3, + location = var.region_3, + labels = [ + "ressource-type=node", + "node-type=control-plane", + "control-plane=hel1", + "node-pool=control-plane-hel1", + "cluster.ctfpilot.com/node=control-plane", + ], + taints = [], + count = var.control_plane_count_3 + kubelet_args = ["kube-reserved=cpu=250m,memory=1500Mi,ephemeral-storage=1Gi", "system-reserved=cpu=250m,memory=300Mi"] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + } + ] + + agent_nodepools = [ + { + name = "agents-1", + server_type = var.agent_type_1, + location = var.region_1, + labels = [ + "ressource-type=node", + "node-type=agent", + "region=${var.region_1}", + "node-pool=agents-${var.region_1}", + "cluster.ctfpilot.com/node=agent", + ], + taints = [], + count = var.agent_count_1 + kubelet_args = [ + "kube-reserved=cpu=250m,memory=750Mi,ephemeral-storage=1Gi", + "system-reserved=cpu=400m,memory=750Mi", + "eviction-soft=memory.available<512Mi", # Recommend 1Gi for 
larger nodes + "eviction-soft-grace-period=memory.available=1m", + "eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%", + ] + # swap_size = "2G" # remember to add the suffix, examples: 512M, 1G + # zram_size = "2G" # remember to add the suffix, examples: 512M, 1G + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + }, + { + name = "agents-2", + server_type = var.agent_type_2, + location = var.region_2, + labels = [ + "ressource-type=node", + "node-type=agent", + "region=${var.region_2}", + "node-pool=agents-${var.region_2}", + "cluster.ctfpilot.com/node=agent", + ], + taints = [], + count = var.agent_count_2 + kubelet_args = [ + "kube-reserved=cpu=250m,memory=750Mi,ephemeral-storage=1Gi", + "system-reserved=cpu=400m,memory=750Mi", + "eviction-soft=memory.available<512Mi", # Recommend 1Gi for larger nodes + "eviction-soft-grace-period=memory.available=1m", + "eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%", + ] + # swap_size = "2G" # remember to add the suffix, examples: 512M, 1G + # zram_size = "2G" # remember to add the suffix, examples: 512M, 1G + # kubelet_args = ["kube-reserved=cpu=50m,memory=300Mi,ephemeral-storage=1Gi", "system-reserved=cpu=250m,memory=300Mi"] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + }, + { + name = "agents-3", + server_type = var.agent_type_3, + location = var.region_3, + labels = [ + "ressource-type=node", + "node-type=agent", + "region=${var.region_3}", + "node-pool=agents-${var.region_3}", + "cluster.ctfpilot.com/node=agent", + ], + taints = [], + count 
= var.agent_count_3 + kubelet_args = [ + "kube-reserved=cpu=250m,memory=750Mi,ephemeral-storage=1Gi", + "system-reserved=cpu=400m,memory=750Mi", + "eviction-soft=memory.available<512Mi", # Recommend 1Gi for larger nodes + "eviction-soft-grace-period=memory.available=1m", + "eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%", + ] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + }, + { + name = "challs-1", + server_type = var.challs_type, + location = var.region_1, + labels = [ + "ressource-type=node", + "node-type=scale", + "region=${var.region_1}", + "node-pool=challs", + "cluster.ctfpilot.com/node=scaler" + ] + taints = [ + "cluster.ctfpilot.com/node=scaler:PreferNoSchedule" + ] + count = var.challs_count + kubelet_args = [ + "kube-reserved=cpu=150m,memory=750Mi,ephemeral-storage=1Gi", + "system-reserved=cpu=300m,memory=750Mi", + "eviction-soft=memory.available<2Gi", # Recommend 3Gi for larger nodes + "eviction-soft-grace-period=memory.available=10m", + "eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%", + ] + + # Fine-grained control over placement groups (nodes in the same group are spread over different physical servers, 10 nodes per placement group max): + # placement_group = "default" + + # Enable automatic backups via Hetzner (default: false) + backups = false + }, + #{ + # name = "storage", + # server_type = "cx21", + # location = "hel1", + # # Fully optional, just a demo. 
+ # labels = [ + # "node.kubernetes.io/server-usage=storage" + # ], + # taints = [], + # count = 1 + + # In the case of using Longhorn, you can use Hetzner volumes instead of using the node's own storage by specifying a value from 10 to 10240 (in GB) + # It will create one volume per node in the nodepool, and configure Longhorn to use them. + # Something worth noting is that Volume storage is slower than node storage, which is achieved by not mentioning longhorn_volume_size or setting it to 0. + # So for something like DBs, you definitely want node storage, for other things like backups, volume storage is fine, and cheaper. + # longhorn_volume_size = 20 + + # Enable automatic backups via Hetzner (default: false) + # backups = true + #}, + # Egress nodepool useful to route egress traffic using Hetzner Floating IPs (https://docs.hetzner.com/cloud/floating-ips) + # used with Cilium's Egress Gateway feature https://docs.cilium.io/en/stable/gettingstarted/egress-gateway/ + # See the https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner#examples for an example use case. + #{ + # name = "egress", + # server_type = "cx21", + # location = "fsn1", + # labels = [ + # "node.kubernetes.io/role=egress" + # ], + # taints = [ + # "node.kubernetes.io/role=egress:NoSchedule" + # ], + # floating_ip = true + # Optionally associate a reverse DNS entry with the floating IP(s). + # This is useful in combination with the Egress Gateway feature for hosting certain services in the cluster, such as email servers. + # floating_ip_rns = "my.domain.com" + # count = 1 + #}, + # Arm based nodes + #{ + # name = "agent-arm-small", + # server_type = "cax11", + # location = "fsn1", + # labels = [], + # taints = [], + # count = 1 + #}, + # For fine-grained control over the nodes in a node pool, replace the count variable with a nodes map. + # In this case, the node-pool variables are defaults which can be overridden on a per-node basis. 
+ # Each key in the nodes map refers to a single node and must be an integer string ("1", "123", ...). + #{ + # name = "agent-arm-small", + # server_type = "cax11", + # location = "fsn1", + # labels = [], + # taints = [], + # nodes = { + # "1" : { + # location = "nbg1" + # labels = [ + # "testing-labels=a1", + # ] + # }, + # "20" : { + # labels = [ + # "testing-labels=b1", + # ] + # } + # } + #}, + ] + # Add additional configuration options for control planes here. + # E.g to enable monitoring for etcd, proxy etc: + # control_planes_custom_config = { + # etcd-expose-metrics = true, + # kube-controller-manager-arg = "bind-address=0.0.0.0", + # kube-proxy-arg ="metrics-bind-address=0.0.0.0", + # kube-scheduler-arg = "bind-address=0.0.0.0", + # } + + # Add additional configuration options for agent nodes and autoscaler nodes here. + # E.g to enable monitoring for proxy: + # agent_nodes_custom_config = { + # kube-proxy-arg ="metrics-bind-address=0.0.0.0", + # } + + # You can enable encrypted wireguard for the CNI by setting this to "true". Default is "false". + # FYI, Hetzner says "Traffic between cloud servers inside a Network is private and isolated, but not automatically encrypted." + # Source: https://docs.hetzner.com/cloud/networks/faq/#is-traffic-inside-hetzner-cloud-networks-encrypted + # It works with all CNIs that we support. + # Just note that if you use Cilium with cilium_values, the responsibility of enabling or disabling WireGuard falls on you. + enable_wireguard = true + + # * LB location and type, the latter will depend on how much load you want it to handle, see https://www.hetzner.com/cloud/load-balancer + load_balancer_type = var.load_balancer_type + load_balancer_location = var.region_1 + + # Disable IPv6 for the load balancer, the default is false. + # load_balancer_disable_ipv6 = true + + # Disables the public network of the load balancer. (default: false). 
+ # load_balancer_disable_public_network = true + + # Specifies the algorithm type of the load balancer. (default: round_robin). + # load_balancer_algorithm_type = "least_connections" + + # Specifies the interval at which a health check is performed. Minimum is 3s (default: 15s). + # load_balancer_health_check_interval = "5s" + + # Specifies the timeout of a single health check. Must not be greater than the health check interval. Minimum is 1s (default: 10s). + # load_balancer_health_check_timeout = "3s" + + # Specifies the number of times a health check is retried before a target is marked as unhealthy. (default: 3) + # load_balancer_health_check_retries = 3 + + # Setup a NAT router, and automatically disable public ips on all control plane and agent nodes. + # To use this, you must also set use_control_plane_lb = true, otherwise kubectl can never + # reach the cluster. The NAT router will also function as bastion. This makes securing the cluster + # easier, as all public traffic passes through a single strongly secured node. It does + # however also introduce a single point of failure, so if you need high-availability on your + # egress, you should consider other configurations. + # + # + # nat_router = { + # server_type = "cax21" + # location = "fsn1" + # enable_sudo = false # optional, default to false. Set to true to add nat-router user to the sudo'ers. Note that ssh as root is disabled. + # labels = {} # optionally add labels. + # } + + ### The following values are entirely optional (and can be removed from this if unused) + + # You can refine a base domain name to be use in this form of nodename.base_domain for setting the reverse dns inside Hetzner + # base_domain = "mycluster.example.com" + + # Cluster Autoscaler + # Providing at least one map for the array enables the cluster autoscaler feature, default is disabled. 
+ # ⚠️ Based on how the autoscaler works with this project, you can only choose either x86 instances or ARM server types for ALL autoscaler nodepools. + # If you are curious, it's ok to have a multi-architecture cluster, as most underlying container images are multi-architecture too. + # + # ⚠️ Setting labels and taints will only work on cluster-autoscaler images versions released after > 20 October 2023. Or images built from master after that date. + # + # * Example below: + autoscaler_nodepools = [ + { + name = "autoscaled-challs-nodes" + server_type = var.scale_type + location = var.region_1 + min_nodes = 0 + max_nodes = var.scale_max + labels = { + "ressource-type" : "node", + "node-type" : "scale", + "region" : "${var.region_1}", + "node-pool" : "scaled", + "cluster.ctfpilot.com/node" : "scaler" + } + taints = [ + { + key = "cluster.ctfpilot.com/node" + value = "scaler" + effect = "PreferNoSchedule" + } + ] + kubelet_args = [ + "kube-reserved=cpu=150m,memory=750Mi,ephemeral-storage=1Gi", + "system-reserved=cpu=300m,memory=750Mi", + "eviction-soft=memory.available<512Mi", # Recommend 3Gi for larger nodes + "eviction-soft-grace-period=memory.available=10m", + "eviction-hard=memory.available<500Mi,nodefs.available<5%,imagefs.available<5%", + ] + } + ] + # + # To disable public ips on your autoscaled nodes, uncomment the following lines: + # autoscaler_disable_ipv4 = true + # autoscaler_disable_ipv6 = true + + # ⚠️ Deprecated, will be removed after a new Cluster Autoscaler version has been released which support the new way of setting labels and taints. See above. 
+ # Add extra labels on nodes started by the Cluster Autoscaler + # This argument is not used if autoscaler_nodepools is not set, because the Cluster Autoscaler is installed only if autoscaler_nodepools is set + # autoscaler_labels = [ + # "node.kubernetes.io/role=peak-workloads" + # ] + + # Add extra taints on nodes started by the Cluster Autoscaler + # This argument is not used if autoscaler_nodepools is not set, because the Cluster Autoscaler is installed only if autoscaler_nodepools is set + # autoscaler_taints = [ + # "node.kubernetes.io/role=specific-workloads:NoExecute" + # ] + + # Configuration of the Cluster Autoscaler binary + # + # These arguments and variables are not used if autoscaler_nodepools is not set, because the Cluster Autoscaler is installed only if autoscaler_nodepools is set. + # + # Image and version of Kubernetes Cluster Autoscaler for Hetzner Cloud: + # - cluster_autoscaler_image: Image of Kubernetes Cluster Autoscaler for Hetzner Cloud to be used. + # The default is the official image from the Kubernetes project: registry.k8s.io/autoscaling/cluster-autoscaler + # - cluster_autoscaler_version: Version of Kubernetes Cluster Autoscaler for Hetzner Cloud. Should be aligned with Kubernetes version. + # Available versions for the official image can be found at https://explore.ggcr.dev/?repo=registry.k8s.io%2Fautoscaling%2Fcluster-autoscaler + # + # Logging related arguments are managed using separate variables: + # - cluster_autoscaler_log_level: Controls the verbosity of logs (--v), the value is from 0 to 5, default is 4, for max debug info set it to 5. + # - cluster_autoscaler_log_to_stderr: Determines whether to log to stderr (--logtostderr). + # - cluster_autoscaler_stderr_threshold: Sets the threshold for logs that go to stderr (--stderrthreshold). 
+ # + # Server/node creation timeout variable: + # - cluster_autoscaler_server_creation_timeout: Sets the timeout (in minutes) until which a newly created server/node has to become available before giving up and destroying it (defaults to 15, unit is minutes) + # + # Example: + # + # cluster_autoscaler_image = "registry.k8s.io/autoscaling/cluster-autoscaler" + # cluster_autoscaler_version = "v1.30.3" + # cluster_autoscaler_log_level = 4 + # cluster_autoscaler_log_to_stderr = true + # cluster_autoscaler_stderr_threshold = "INFO" + # cluster_autoscaler_server_creation_timeout = 15 + + # Additional Cluster Autoscaler binary configuration + # + # cluster_autoscaler_extra_args can be used for additional arguments. The default is an empty array. + # + # Please note that following arguments are managed by terraform-hcloud-kube-hetzner or the variables above and should not be set manually: + # - --v=${var.cluster_autoscaler_log_level} + # - --logtostderr=${var.cluster_autoscaler_log_to_stderr} + # - --stderrthreshold=${var.cluster_autoscaler_stderr_threshold} + # - --cloud-provider=hetzner + # - --nodes ... + # + # See the Cluster Autoscaler FAQ for the full list of arguments: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-are-the-parameters-to-ca + # + # Example: + # + cluster_autoscaler_extra_args = [ + "--ignore-daemonsets-utilization=true", + "--enforce-node-group-min-size=true", + ] + + # Enable delete protection on compatible resources to prevent accidental deletion from the Hetzner Cloud Console. + # This does not protect deletion from Terraform itself. + # enable_delete_protection = { + # floating_ip = true + # load_balancer = true + # volume = true + # } + + # Enable etcd snapshot backups to S3 storage. + # Just provide a map with the needed settings (according to your S3 storage provider) and backups to S3 will + # be enabled (with the default settings for etcd snapshots). 
+ # Cloudflare's R2 offers 10GB, 10 million reads and 1 million writes per month for free. + # For proper context, have a look at https://docs.k3s.io/datastore/backup-restore. + # You also can use additional parameters from https://docs.k3s.io/cli/etcd-snapshot, such as `etcd-s3-folder` + # etcd_s3_backup = { + # etcd-s3-endpoint = "xxxx.r2.cloudflarestorage.com" + # etcd-s3-access-key = "" + # etcd-s3-secret-key = "" + # etcd-s3-bucket = "k3s-etcd-snapshots" + # etcd-s3-region = "" + # } + + # To enable Hetzner Storage Box support, you can enable csi-driver-smb, default is "false". + # enable_csi_driver_smb = true + + # To enable iscsid without setting enable_longhorn = true, set enable_iscsid = true. You will need this if + # you install your own version of longhorn outside of this module. + # Default is false. If enable_longhorn=true, this variable is ignored and iscsid is enabled anyway. + # enable_iscsid = true + + # To use local storage on the nodes, you can enable Longhorn, default is "false". + # See a full recap on how to configure agent nodepools for longhorn here https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner/discussions/373#discussioncomment-3983159 + # Also see Longhorn best practices here https://gist.github.com/ifeulner/d311b2868f6c00e649f33a72166c2e5b + enable_longhorn = false + + # By default, longhorn is pulled from https://charts.longhorn.io. + # If you need a version of longhorn which assures compatibility with rancher you can set this variable to https://charts.rancher.io. + # longhorn_repository = "https://charts.rancher.io" + + # The namespace for longhorn deployment, default is "longhorn-system". + # longhorn_namespace = "longhorn-system" + + # The file system type for Longhorn, if enabled (ext4 is the default, otherwise you can choose xfs). + # longhorn_fstype = "xfs" + + # how many replica volumes should longhorn create (default is 3). 
+ # longhorn_replica_count = 1 + + # When you enable Longhorn, you can go with the default settings and just modify the above two variables OR you can add a longhorn_values variable + # with all needed helm values, see towards the end of the file in the advanced section. + # If that file is present, the system will use it during the deploy, if not it will use the default values with the two variable above that can be customized. + # After the cluster is deployed, you can always use HelmChartConfig definition to tweak the configuration. + + # Also, you can choose to use a Hetzner volume with Longhorn. By default, it will use the nodes own storage space, but if you add an attribute of + # longhorn_volume_size (⚠️ not a variable, just a possible agent nodepool attribute) with a value between 10 and 10000 GB to your agent nodepool definition, it will create and use the volume in question. + # See the agent nodepool section for an example of how to do that. + + # To disable Hetzner CSI storage, you can set the following to "true", default is "false". + # disable_hetzner_csi = true + + # If you want to use a specific Hetzner CCM and CSI version, set them below; otherwise, leave them as-is for the latest versions. + # hetzner_ccm_version = "" + # hetzner_csi_version = "" + + # If you want to specify the Kured version, set it below - otherwise it'll use the latest version available. + # See https://github.com/kubereboot/kured/releases for the available versions. + # kured_version = "" + + # Default is "traefik". + # If you want to enable the Nginx (https://kubernetes.github.io/ingress-nginx/) or HAProxy ingress controller instead of Traefik, you can set this to "nginx" or "haproxy". + # By the default we load optimal Traefik, Nginx or HAProxy ingress controller config for Hetzner, however you may need to tweak it to your needs, so to do, + # we allow you to add a traefik_values, nginx_values or haproxy_values, see towards the end of this file in the advanced section. 
+ # After the cluster is deployed, you can always use HelmChartConfig definition to tweak the configuration. + # If you want to disable both controllers set this to "none" + # ingress_controller = "nginx" + # Namespace in which to deploy the ingress controllers. Defaults to the ingress_controller variable, eg (haproxy, nginx, traefik) + # ingress_target_namespace = "" + + # Use the klipperLB (similar to metalLB), instead of the default Hetzner one, that has an advantage of dropping the cost of the setup. + # Automatically "true" in the case of single node cluster (as it does not make sense to use the Hetzner LB in that situation). + # It can work with any ingress controller that you choose to deploy. + # Please note that because the klipperLB points to all nodes, we automatically allow scheduling on the control plane when it is active. + # enable_klipper_metal_lb = "true" + + # If you want to configure additional arguments for traefik, enter them here as a list and in the form of traefik CLI arguments; see https://doc.traefik.io/traefik/reference/static-configuration/cli/ + # They are the options that go into the additionalArguments section of the Traefik helm values file. + # We already add "providers.kubernetesingress.ingressendpoint.publishedservice" by default so that Traefik works automatically with services such as External-DNS and ArgoCD. + # Example: + traefik_additional_options = ["--api", "--api.dashboard=true", "--api.insecure=true"] # ["--log.level=DEBUG", "--tracing=true"] + + # By default traefik image tag is an empty string which uses latest image tag. + # The default is "". + # traefik_image_tag = "v3.0.0-beta5" + + # By default traefik is configured to redirect http traffic to https, you can set this to "false" to disable the redirection. + # The default is true. + # traefik_redirect_to_https = false + + # Enable or disable Horizontal Pod Autoscaler for traefik. + # The default is true. 
+ # traefik_autoscaling = false + + # Enable or disable pod disruption budget for traefik. Values are maxUnavailable: 33% and minAvailable: 1. + # The default is true. + # traefik_pod_disruption_budget = false + + # Enable or disable default resource requests and limits for traefik. Values requested are 100m & 50Mi and limits 300m & 150Mi. + # The default is true. + # traefik_resource_limits = false + + # If you want to configure additional ports for traefik, enter them here as a list of objects with name, port, and exposedPort properties. + # Example: + traefik_additional_ports = [ + # { + # name = "blockchain", + # port = 8545, + # exposedPort = 8545 + # }, + ] + + # If you want to configure additional trusted IPs for traefik, enter them here as a list of IPs (strings). + # Example for Cloudflare: + # traefik_additional_trusted_ips = [ + # "173.245.48.0/20", + # "103.21.244.0/22", + # "103.22.200.0/22", + # "103.31.4.0/22", + # "141.101.64.0/18", + # "108.162.192.0/18", + # "190.93.240.0/20", + # "188.114.96.0/20", + # "197.234.240.0/22", + # "198.41.128.0/17", + # "162.158.0.0/15", + # "104.16.0.0/13", + # "104.24.0.0/14", + # "172.64.0.0/13", + # "131.0.72.0/22", + # "2400:cb00::/32", + # "2606:4700::/32", + # "2803:f800::/32", + # "2405:b500::/32", + # "2405:8100::/32", + # "2a06:98c0::/29", + # "2c0f:f248::/32" + # ] + + # If you want to disable the metric server set this to "false". Default is "true". + # enable_metrics_server = false + + # If you want to enable the k3s built-in local-storage controller set this to "true". Default is "false". + # Warning: When enabled together with the Hetzner CSI, there will be two default storage classes: "local-path" and "hcloud-volumes"! + # Even if patched to remove the "default" label, the local-path storage class will be reset as default on each reboot of + # the node where the controller runs. + # This is not a problem if you explicitly define which storageclass to use in your PVCs. 
+ # Workaround if you don't want two default storage classes: leave this to false and add the local-path-provisioner helm chart + # as an extra (https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner#adding-extras). + # enable_local_storage = false + + # If you want to allow non-control-plane workloads to run on the control-plane nodes, set this to "true". The default is "false". + # True by default for single node clusters, and when enable_klipper_metal_lb is true. In those cases, the value below will be ignored. + # allow_scheduling_on_control_plane = true + + # If you want to disable the automatic upgrade of k3s, you can set below to "false". + # Ideally, keep it on, to always have the latest Kubernetes version, but lock the initial_k3s_channel to a kube major version, + # of your choice, like v1.25 or v1.26. That way you get the best of both worlds without the breaking changes risk. + # For production use, always use an HA setup with at least 3 control-plane nodes and 2 agents, and keep this on for maximum security. + + # The default is "true" (in HA setup i.e. at least 3 control plane nodes & 2 agents, just keep it enabled since it works flawlessly). + automatically_upgrade_k3s = false + + # By default nodes are drained before k3s upgrade, which will delete and transfer all pods to other nodes. + # Set this to false to cordon nodes instead, which just prevents scheduling new pods on the node during upgrade + # and keeps all pods running. This may be useful if you have pods which are known to be slow to start e.g. + # because they have to mount volumes with many files which require to get the right security context applied. + system_upgrade_use_drain = true + + # During k3s via system-upgrade-manager pods are evicted by default. + # On small clusters this can lead to hanging upgrades and indefinitely unschedulable nodes, + # in that case, set this to false to immediately delete pods before upgrading. 
+ # NOTE: Turning this flag off might lead to downtimes of services (which may be acceptable for your use case) + # NOTE: This flag takes effect only when system_upgrade_use_drain is set to true. + # system_upgrade_enable_eviction = false + + # The default is "true" (in HA setup it works wonderfully well, with automatic roll-back to the previous snapshot in case of an issue). + # IMPORTANT! For non-HA clusters i.e. when the number of control-plane nodes is < 3, you have to turn it off. + automatically_upgrade_os = false + + # If you need more control over kured and the reboot behaviour, you can pass additional options to kured. + # For example limiting reboots to certain timeframes. For all options see: https://kured.dev/docs/configuration/ + # By default, the kured lock does not expire and is only released once a node successfully reboots. You can add the option + # "lock-ttl" : "30m", if you have a single node which sometimes gets stuck. Note however, that in that case, kured continues + # draining the next node because the lock was released. You may end up with all nodes drained and your cluster completely down. + # The default options are: `--reboot-command=/usr/bin/systemctl reboot --pre-reboot-node-labels=kured=rebooting --post-reboot-node-labels=kured=done --period=5m` + # Defaults can be overridden by using the same key. + # kured_options = { + # "reboot-days": "su", + # "start-time": "3am", + # "end-time": "8am", + # "time-zone": "Local", + # "lock-ttl" : "30m", + # } + + # Allows you to specify the k3s version. If defined, supersedes initial_k3s_channel. + # See https://github.com/k3s-io/k3s/releases for the available versions. + # install_k3s_version = "v1.30.2+k3s2" + + # Allows you to specify either stable, latest, testing or supported minor versions. 
+ # see https://rancher.com/docs/k3s/latest/en/upgrades/basic/ and https://update.k3s.io/v1-release/channels + # ⚠️ If you are going to use Rancher addons for instance, it's always a good idea to fix the kube version to one minor version below the latest stable, + # e.g. v1.29 instead of the stable v1.30. + # The default is "v1.30". + # initial_k3s_channel = "stable" + + # Allows to specify the version of the System Upgrade Controller for automated upgrades of k3s + # See https://github.com/rancher/system-upgrade-controller/releases for the available versions. + # sys_upgrade_controller_version = "v0.14.2" + + # The cluster name, by default "k3s" + # cluster_name = "k3s" + + # Whether to use the cluster name in the node name, in the form of {cluster_name}-{nodepool_name}, the default is "true". + # use_cluster_name_in_node_name = false + + # Extra k3s registries. This is useful if you have private registries and you want to pull images without additional secrets. + # Or if you want to proxy registries for various reasons like rate-limiting. + # It will create the registries.yaml file, more info here https://docs.k3s.io/installation/private-registry. + # Note that you do not need to get this right from the first time, you can update it when you want during the life of your cluster. + # The default is blank. + /* k3s_registries = <<-EOT + mirrors: + hub.my_registry.com: + endpoint: + - "hub.my_registry.com" + configs: + hub.my_registry.com: + auth: + username: username + password: password + EOT */ + + # Additional environment variables for the host OS on which k3s runs. See for example https://docs.k3s.io/advanced#configuring-an-http-proxy . + # additional_k3s_environment = { + # "CONTAINERD_HTTP_PROXY" : "http://your.proxy:port", + # "CONTAINERD_HTTPS_PROXY" : "http://your.proxy:port", + # "NO_PROXY" : "127.0.0.0/8,10.0.0.0/8,", + # } + + # Additional commands to execute on the host OS before the k3s install, for example fetching and installing certs. 
+ # preinstall_exec = [ + # "curl https://somewhere.over.the.rainbow/ca.crt > /root/ca.crt", + # "trust anchor --store /root/ca.crt", + # ] + + # Structured authentication configuration. Multiple authentication providers support requires v1.30+ of + # kubernetes. + # https://kubernetes.io/docs/reference/access-authn-authz/authentication/#using-authentication-configuration + # + # authentication_config = <<-EOT + # apiVersion: apiserver.config.k8s.io/v1beta1 + # kind: AuthenticationConfiguration + # jwt: + # - issuer: + # url: "https://token.actions.githubusercontent.com" + # audiences: + # - "https://github.com/octo-org" + # claimMappings: + # username: + # claim: sub + # prefix: "gh:" + # groups: + # claim: repository_owner + # prefix: "gh:" + # claimValidationRules: + # - claim: repository + # requiredValue: "octo-org/octo-repo" + # - claim: "repository_visibility" + # requiredValue: "public" + # - claim: "ref" + # requiredValue: "refs/heads/main" + # - claim: "ref_type" + # requiredValue: "branch" + # - issuer: + # url: "https://your.oidc.issuer" + # audiences: + # - "oidc_client_id" + # claimMappings: + # username: + # claim: oidc_username_claim + # prefix: "oidc:" + # groups: + # claim: oidc_groups_claim + # prefix: "oidc:" + # EOT + + # Set to true if util-linux breaks on the OS (temporary regression fixed in util-linux v2.41.1). + # k3s_prefer_bundled_bin = true + + # Additional flags to pass to the k3s server command (the control plane). + # k3s_exec_server_args = "--kube-apiserver-arg enable-admission-plugins=PodTolerationRestriction,PodNodeSelector" + + # Additional flags to pass to the k3s agent command (every agents nodes, including autoscaler nodepools). + # k3s_exec_agent_args = "--kubelet-arg kube-reserved=cpu=100m,memory=200Mi,ephemeral-storage=1Gi" + + # The vars below here passes it to the k3s config.yaml. 
This way it persist across reboots + # Make sure you set "feature-gates=NodeSwap=true,CloudDualStackNodeIPs=true" if want to use swap_size + # see https://github.com/k3s-io/k3s/issues/8811#issuecomment-1856974516 + # k3s_global_kubelet_args = ["kube-reserved=cpu=100m,ephemeral-storage=1Gi", "system-reserved=cpu=memory=200Mi", "image-gc-high-threshold=50", "image-gc-low-threshold=40"] + # k3s_control_plane_kubelet_args = [] + # k3s_agent_kubelet_args = [] + # k3s_autoscaler_kubelet_args = [] + + # If you want to allow all outbound traffic you can set this to "false". Default is "true". + # restrict_outbound_traffic = false + + # Allow access to the Kube API from the specified networks. The default is ["0.0.0.0/0", "::/0"]. + # Allowed values: null (disable Kube API rule entirely) or a list of allowed networks with CIDR notation. + # For maximum security, it's best to disable it completely by setting it to null. However, in that case, to get access to the kube api, + # you would have to connect to any control plane node via SSH, as you can run kubectl from within these. + # Please be advised that this setting has no effect on the load balancer when the use_control_plane_lb variable is set to true. This is + # because firewall rules cannot be applied to load balancers yet. + # firewall_kube_api_source = null + + # Allow SSH access from the specified networks. Default: ["0.0.0.0/0", "::/0"] + # Allowed values: null (disable SSH rule entirely) or a list of allowed networks with CIDR notation. + # Ideally you would set your IP there. And if it changes after cluster deploy, you can always update this variable and apply again. + # firewall_ssh_source = ["1.2.3.4/32"] + + # By default, SELinux is enabled in enforcing mode on all nodes. For container-specific SELinux issues, + # consider using the pre-installed 'udica' tool to create custom, targeted SELinux policies instead of + # disabling SELinux globally. 
See the "Fix SELinux issues with udica" example in the README for details. + # disable_selinux = false + + # Adding extra firewall rules, like opening a port + # More info on the format here https://registry.terraform.io/providers/hetznercloud/hcloud/latest/docs/resources/firewall + extra_firewall_rules = [ + { + description = "Allow Outbound SMTP Requests - Brevo" + direction = "out" + protocol = "tcp" + port = "587" + source_ips = [] + destination_ips = ["1.179.112.0/20", "172.246.240.0/20"] + }, + { + description = "CTF - Allow all outbound connection" + direction = "out" + protocol = "tcp" + port = "1024-65535" + source_ips = [] + destination_ips = ["0.0.0.0/0", "::/0"] + }, + # { + # description = "For tcpsecure" + # direction = "in" + # protocol = "tcp" + # port = "32000" + # source_ips = ["0.0.0.0/0", "::/0"] + # destination_ips = [] # Won't be used for this rule + # }, + # { + # description = "For tcp" + # direction = "in" + # protocol = "tcp" + # port = "33000" + # source_ips = ["0.0.0.0/0", "::/0"] + # destination_ips = [] # Won't be used for this rule + # }, + + # { + # description = "For tcp 2" + # direction = "in" + # protocol = "tcp" + # port = "31000" + # source_ips = ["0.0.0.0/0", "::/0"] + # destination_ips = [] # Won't be used for this rule + # }, + ] + + # If you want to configure a different CNI for k3s, use this flag + # possible values: flannel (Default), calico, and cilium + # As for Cilium, we allow infinite configurations via helm values, please check the CNI section of the readme over at https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner/#cni. + # Also, see the cilium_values at towards the end of this file, in the advanced section. + # ⚠️ Depending on your setup, sometimes you need your control-planes to have more than + # 2GB of RAM if you are going to use Cilium, otherwise the pods will not start. + # cni_plugin = "cilium" + + # You can choose the version of Cilium that you want. 
By default we keep the version up to date and configure Cilium with compatible settings according to the version. + # See https://github.com/cilium/cilium/releases for the available versions. + # cilium_version = "v1.14.0" + + # Set native-routing mode ("native") or tunneling mode ("tunnel"). Default: tunnel + # cilium_routing_mode = "native" + + # Used when Cilium is configured in native routing mode. The CNI assumes that the underlying network stack will forward packets to this destination without the need to apply SNAT. Default: value of "cluster_ipv4_cidr" + # cilium_ipv4_native_routing_cidr = "10.0.0.0/8" + + # Enables egress gateway to redirect and SNAT the traffic that leaves the cluster. Default: false + # cilium_egress_gateway_enabled = true + + # Enables Hubble Observability to collect and visualize network traffic. Default: false + # cilium_hubble_enabled = true + + # You can choose the version of Calico that you want. By default, the latest is used. + # More info on available versions can be found at https://github.com/projectcalico/calico/releases + # Please note that if you are getting 403s from Github, it's also useful to set the version manually. However there is rarely a need for that! + # calico_version = "v3.27.2" + + # If you want to disable the k3s kube-proxy, use this flag. The default is "false". + # Ensure that your CNI is capable of handling all the functionalities typically covered by kube-proxy. + # disable_kube_proxy = true + + # If you want to disable the k3s default network policy controller, use this flag! + # Both Calico and Cilium cni_plugin values override this value to true automatically, the default is "false". + # disable_network_policy = true + + # If you want to disable the automatic use of placement group "spread". See https://docs.hetzner.com/cloud/placement-groups/overview/ + # We advise to not touch that setting, unless you have a specific purpose. + # The default is "false", meaning it's enabled by default. 
+ # placement_group_disable = true + + # By default, we allow ICMP ping in to the nodes, to check for liveness for instance. If you do not want to allow that, you can. Just set this flag to true (false by default). + # block_icmp_ping_in = true + + # You can enable cert-manager (installed by Helm behind the scenes) with the following flag, the default is "true". + enable_cert_manager = false + + # IP Addresses to use for the DNS Servers, the defaults are the ones provided by Hetzner https://docs.hetzner.com/dns-console/dns/general/recursive-name-servers/. + # The number of different DNS servers is limited to 3 by Kubernetes itself. + # It's always a good idea to have at least 1 IPv4 and 1 IPv6 DNS server for robustness. + dns_servers = [ + "1.1.1.1", + "8.8.8.8", + "2606:4700:4700::1111", + ] + + # When this is enabled, rather than the first node, all external traffic will be routed via a control-plane loadbalancer, allowing for high availability. + # The default is false. + use_control_plane_lb = true + + # When the above use_control_plane_lb is enabled, you can change the lb type for it, the default is "lb11". + # control_plane_lb_type = "lb21" + + # When the above use_control_plane_lb is enabled, you can change to disable the public interface for control plane load balancer, the default is true. + # control_plane_lb_enable_public_interface = false + + # Let's say you are not using the control plane LB solution above, and still want to have one hostname point to all your control-plane nodes. + # You could create multiple A records of to let's say cp.cluster.my.org pointing to all of your control-plane nodes ips. + # In which case, you need to define that hostname in the k3s TLS-SANs config to allow connection through it. It can be hostnames or IP addresses. 
+ # additional_tls_sans = ["cp.cluster.my.org"] + + # If you create a hostname with multiple A records pointing to all of your + # control-plane nodes ips, you may want to use that hostname in the generated + # kubeconfig. + # kubeconfig_server_address = "cp.cluster.my.org" + + # lb_hostname Configuration: + # + # Purpose: + # The lb_hostname setting optimizes communication between services within the Kubernetes cluster + # when they use domain names instead of direct service names. By associating a domain name directly + # with the Hetzner Load Balancer, this setting can help reduce potential communication delays. + # + # Scenario: + # If Service B communicates with Service A using a domain (e.g., `a.mycluster.domain.com`) that points + # to an external Load Balancer, there can be a slowdown in communication. + # + # Guidance: + # - If your internal services use domain names pointing to an external LB, set lb_hostname to a domain + # like `mycluster.domain.com`. + # - Create an A record pointing `mycluster.domain.com` to your LB's IP. + # - Create a CNAME record for `a.mycluster.domain.com` (or xyz.com) pointing to `mycluster.domain.com`. + # + # Technical Note: + # This setting sets the `load-balancer.hetzner.cloud/hostname` in the Hetzner LB definition, suitable for + # HAProxy, Nginx and Traefik ingress controllers. + # + # Recommendation: + # This setting is optional. If services communicate using direct service names, you can leave this unset. + # For inter-namespace communication, use `.service_name` as per Kubernetes norms. + # + # Example: + # lb_hostname = var.cluster_dns + + # You can enable Rancher (installed by Helm behind the scenes) with the following flag, the default is "false". + # ⚠️ Rancher often doesn't support the latest Kubernetes version. You will need to set initial_k3s_channel to a supported version. + # When Rancher is enabled, it automatically installs cert-manager too, and it uses rancher's own self-signed certificates. 
+ # For options, see https://ranchermanager.docs.rancher.com/getting-started/installation-and-upgrade/install-upgrade-on-a-kubernetes-cluster#3-choose-your-ssl-configuration + # The easiest thing is to leave everything as is (using the default rancher self-signed certificate) and put Cloudflare in front of it. + # As for the number of replicas, by default it is set to the number of control plane nodes. + # You can customize all of the above by adding a rancher_values variable; see the end of this file, in the advanced section. + # After the cluster is deployed, you can always use a HelmChartConfig definition to tweak the configuration. + # IMPORTANT: Rancher's install is quite memory intensive, you will require at least 4GB of RAM, meaning cx21 server type (for your control plane). + # ALSO, in order for Rancher to successfully deploy, you have to set the "rancher_hostname". + # enable_rancher = true + + # If using Rancher you can set the Rancher hostname, it must be a unique hostname even if you do not use it. + # If not pointing the DNS, you can just port-forward locally via kubectl to get access to the dashboard. + # If you already set the lb_hostname above and are using a Hetzner LB, you do not need to set this one, as it will be used by default. + # But if you set this one explicitly, it will have preference over the lb_hostname in rancher settings. + # rancher_hostname = "rancher.xyz.dev" + + # When Rancher is deployed, by default it uses the "latest" channel. But this can be customized. + # The allowed values are "stable" or "latest". + # rancher_install_channel = "stable" + + # Finally, you can specify a bootstrap-password for your rancher instance. Minimum 48 characters long! + # If you leave it empty, one will be generated for you. + # (Can be used by another rancher2 provider to continue setup of rancher outside this module.) + # rancher_bootstrap_password = "" + + # Separate from the above Rancher config (only use one or the other). 
You can import this cluster directly into + # an already active Rancher install by clicking "import cluster", choosing "generic", giving it a name and pasting + # the cluster registration url below. However, you can also ignore that and apply the url via kubectl as instructed + # by Rancher in the wizard, and that would register your cluster too. + # More information about the registration can be found here https://rancher.com/docs/rancher/v2.6/en/cluster-provisioning/registered-clusters/ + # rancher_registration_manifest_url = "https://rancher.xyz.dev/v3/import/xxxxxxxxxxxxxxxxxxYYYYYYYYYYYYYYYYYYYzzzzzzzzzzzzzzzzzzzzz.yaml" + + # Extra commands to be executed after the `kubectl apply -k` (useful for post-install actions, e.g. wait for CRD, apply additional manifests, etc.). + # extra_kustomize_deployment_commands="" + + # Extra values that will be passed to the `extra-manifests/kustomization.yaml.tpl` if it is present. + # extra_kustomize_parameters={} + + # See a working example for just a manifest.yaml, a HelmChart and a HelmChartConfig in examples/kustomization_user_deploy/README.md + + # It is best practice to turn this off, but for backwards compatibility it is set to "true" by default. + # See https://github.com/kube-hetzner/terraform-hcloud-kube-hetzner/issues/349 + # When "false", the kubeconfig file can instead be created by executing: "terraform output --raw kubeconfig > cluster_kubeconfig.yaml" + # Always be careful to not commit this file! + create_kubeconfig = false + + # Don't create the kustomize backup. This can be helpful for automation. + # create_kustomization = false + + # Export the values.yaml files used for the deployment of traefik, longhorn, cert-manager, etc. + # This can be helpful to use them for later deployments like with ArgoCD. + # The default is false. + # export_values = true + + # MicroOS snapshot IDs to be used. Per default empty, the most recent image created using createkh will be used. 
+ # We recommend the default, but if you want to use specific IDs you can. + # You can fetch the ids with the hcloud cli by running the "hcloud image list --selector 'microos-snapshot=yes'" command. + # microos_x86_snapshot_id = "1234567" + # microos_arm_snapshot_id = "1234567" + + ### ADVANCED - Custom helm values for packages above (search _values if you want to located where those are mentioned upper in this file) + # ⚠️ Inside the _values variable below are examples, up to you to find out the best helm values possible, we do not provide support for customized helm values. + # Please understand that the indentation is very important, inside the EOTs, as those are proper yaml helm values. + # We advise you to use the default values, and only change them if you know what you are doing! + + # You can inline the values here in heredoc-style (as the examples below with the <= 0.14 + type = string + description = "Cloudflare API Token for updating the DNS records (Zone.Zone.Read and Zone.DNS.Edit permissions required for the two following domains)" +} + +variable "cloudflare_dns_management" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the management part of the cluster" +} + +variable "cloudflare_dns_platform" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the platform part of the cluster" +} + +variable "cloudflare_dns_ctf" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the CTF challenges part of the cluster" +} + +variable "cluster_dns_management" { + type = string + description = "The specific domain name to use for the DNS records for the management part of the cluster. Must be the TLD or subdomain of `cloudflare_dns_management`" +} + +variable "cluster_dns_platform" { + type = string + description = "The domain name to use for the DNS records for the platform part of the cluster. 
Must be the TLD or subdomain of `cloudflare_dns_platform`" +} + +variable "cluster_dns_ctf" { + type = string + description = "The domain name to use for the DNS records for the CTF challenges part of the cluster. Must be the TLD or subdomain of `cloudflare_dns_ctf`" +} + +# Cluster configuration +variable "region_1" { + type = string + description = "Region to deploy nodes in subgroup 1" + default = "fsn1" + validation { + condition = contains(["fsn1", "hel1", "nbg1"], var.region_1) + error_message = "Region must be one of fsn1, hel1, or nbg1." + } +} + +variable "region_2" { + type = string + description = "Region to deploy nodes in subgroup 2" + default = "fsn1" + validation { + condition = contains(["fsn1", "hel1", "nbg1"], var.region_2) + error_message = "Region must be one of fsn1, hel1, or nbg1." + } +} + +variable "region_3" { + type = string + description = "Region to deploy nodes in subgroup 3" + default = "fsn1" + validation { + condition = contains(["fsn1", "hel1", "nbg1", "ash", "hil", "sin"], var.region_3) + error_message = "Region must be one of fsn1, hel1, nbg1, ash, hil, or sin." + } +} + +variable "network_zone" { + type = string + description = "The Hetzner network zone to deploy the cluster in" + default = "eu-central" + validation { + condition = contains(["eu-central", "us-east", "us-west", "ap-southeast"], var.network_zone) + error_message = "Network zone must be one of eu-central, us-east, us-west, or ap-southeast." 
+ } +} + +variable "control_plane_type_1" { + type = string + description = "Control plane group 1 server type" + default = "cx33" +} + +variable "control_plane_type_2" { + type = string + description = "Control plane group 2 server type" + default = "cx33" +} + +variable "control_plane_type_3" { + type = string + description = "Control plane group 3 server type" + default = "cx33" +} + +variable "agent_type_1" { + type = string + description = "Agent group 1 server type" + default = "cx33" +} + +variable "agent_type_2" { + type = string + description = "Agent group 2 server type" + default = "cx33" +} + +variable "agent_type_3" { + type = string + description = "Agent group 3 server type" + default = "cx33" +} + +variable "challs_type" { + type = string + description = "CTF challenge nodes server type" + default = "cx33" +} +variable "scale_type" { + type = string + description = "Scale group server type" + default = "cx33" +} + +variable "load_balancer_type" { + type = string + description = "Load balancer type" + default = "lb11" + validation { + condition = contains(["lb11", "lb21", "lb31"], var.load_balancer_type) + error_message = "Load balancer type must be one of lb11, lb21, or lb31." + } +} + +variable "control_plane_count_1" { + type = number + description = "Number of control plane nodes in group 1" + default = 1 + validation { + condition = var.control_plane_count_1 >= 0 + error_message = "Control plane count must be at least 0." + } +} + +variable "control_plane_count_2" { + type = number + description = "Number of control plane nodes in group 2" + default = 1 + validation { + condition = var.control_plane_count_2 >= 0 + error_message = "Control plane count must be at least 0." + } +} + +variable "control_plane_count_3" { + type = number + description = "Number of control plane nodes in group 3" + default = 1 + validation { + condition = var.control_plane_count_3 >= 0 + error_message = "Control plane count must be at least 0." 
+ } +} + +variable "agent_count_1" { + type = number + description = "Number of agent nodes in group 1" + default = 1 + validation { + condition = var.agent_count_1 >= 0 + error_message = "Agent count must be at least 0." + } +} + +variable "agent_count_2" { + type = number + description = "Number of agent nodes in group 2" + default = 1 + validation { + condition = var.agent_count_2 >= 0 + error_message = "Agent count must be at least 0." + } +} + +variable "agent_count_3" { + type = number + description = "Number of agent nodes in group 3" + default = 1 + validation { + condition = var.agent_count_3 >= 0 + error_message = "Agent count must be at least 0." + } +} + +variable "challs_count" { + type = number + description = "Number of CTF challenge nodes" + default = 0 + validation { + condition = var.challs_count >= 0 + error_message = "CTF challenge count must be at least 0." + } +} + +variable "scale_max" { + type = number + description = "Maximum number of scale nodes. Set to 0 to disable autoscaling (default: 0)" + default = 0 + validation { + condition = var.scale_max >= 0 + error_message = "Scale max must be at least 0." + } +} diff --git a/ctfp.py b/ctfp.py new file mode 100755 index 0000000..bc1b040 --- /dev/null +++ b/ctfp.py @@ -0,0 +1,1355 @@ +#!/usr/bin/env python3 + +# CTFp CLI tool +# Licensed under PolyForm Noncommercial License 1.0.0. +# See LICENSE file in the project root for full license information. +# This file must not be distributed without the LICENSE file. +# Required Notice: Copyright Mikkel Albrechtsen () + +import os +import sys +import argparse +import time +import subprocess +import shutil + +# Terraform parser - https://github.com/amplify-education/python-hcl2 +import hcl2 + +import backend.generate as backend_generate + +AUTO_APPLY = False +ENVIRONMENTS = ["test", "dev", "prod"] +FLAVOR = "tofu" # Can be "terraform" or "tofu". 
Only tested with "tofu" +COMPONENTS = ["cluster", "ops", "platform", "challenges"] + +CLUSTER_TFVARS = [ + # Hetzner + "hcloud_token", + + # SSH + "ssh_key_private_base64", + "ssh_key_public_base64", + + # Cloudflare variables + "cloudflare_api_token", + "cloudflare_dns_management", + "cloudflare_dns_platform", + "cloudflare_dns_ctf", + + # DNS information + "cluster_dns_management", + "cluster_dns_platform", + "cluster_dns_ctf", + + # Cluster configuration + "region_1", + "region_2", + "region_3", + "network_zone", + "control_plane_type_1", + "control_plane_type_2", + "control_plane_type_3", + "agent_type_1", + "agent_type_2", + "agent_type_3", + "challs_type", + "scale_type", + "control_plane_count_1", + "control_plane_count_2", + "control_plane_count_3", + "agent_count_1", + "agent_count_2", + "agent_count_3", + "challs_count", + "scale_max", + "load_balancer_type", + + # Versions + "kube_hetzner_version", +] +OPS_TFVARS = [ + # Generic information + "email", + "discord_webhook_url", + + # Cloudflare variables + "cloudflare_api_token", + "cloudflare_dns_management", + "cloudflare_dns_platform", + "cloudflare_dns_ctf", + "cluster_dns_management", + + # Filebeat configuration + "filebeat_elasticsearch_host", + "filebeat_elasticsearch_username", + "filebeat_elasticsearch_password", + + # Prometheus configuration + "prometheus_storage_size", + + # Management configuration + "argocd_github_secret", + "argocd_admin_password", + "grafana_admin_password", + "traefik_basic_auth", + + # GitHub variables + "ghcr_username", + "ghcr_token", + + # Docker images + "image_error_fallback", + "image_filebeat", + + # Versions + "argocd_version", + "cert_manager_version", + "descheduler_version", + "mariadb_operator_version", + "kube_prometheus_stack_version", + "redis_operator_version", +] +PLATFORM_TFVARS = [ + # Generic information + "cluster_dns_management", + "cluster_dns_platform", + + # GitHub variables + "ghcr_username", + "ghcr_token", + "git_token", + + # Filebeat 
configuration + "filebeat_elasticsearch_host", + "filebeat_elasticsearch_username", + "filebeat_elasticsearch_password", + + # CTF configuration + "kubectf_auth_secret", + + # DB configuration + "db_root_password", + "db_user", + "db_password", + # DB backup configuration + "s3_bucket", + "s3_region", + "s3_endpoint", + "s3_access_key", + "s3_secret_key", + + # CTFd Manager configuration + "ctfd_manager_password", + "ctfd_manager_github_repo", + "ctfd_manager_github_branch", + + # CTFd configuration + "ctfd_secret_key", + "ctf_name", + "ctf_description", + "ctf_start_time", + "ctf_end_time", + "ctf_user_mode", + "ctf_challenge_visibility", + "ctf_account_visibility", + "ctf_score_visibility", + "ctf_registration_visibility", + "ctf_verify_emails", + "ctf_team_size", + "ctf_brackets", + "ctf_theme", + "ctf_admin_name", + "ctf_admin_email", + "ctf_admin_password", + "ctf_registration_code", + "ctf_mail_server", + "ctf_mail_port", + "ctf_mail_username", + "ctf_mail_password", + "ctf_mail_tls", + "ctf_mail_from", + "ctf_logo_path", + "ctf_s3_bucket", + "ctf_s3_region", + "ctf_s3_endpoint", + "ctf_s3_access_key", + "ctf_s3_secret_key", + "ctf_s3_prefix", + "ctfd_plugin_first_blood_limit_url", + "ctfd_plugin_first_blood_limit", + "ctfd_plugin_first_blood_message", + "pages", + "pages_repository", + "pages_branch", + "ctfd_k8s_deployment_repository", + "ctfd_k8s_deployment_path", + "ctfd_k8s_deployment_branch", + + # Docker images + "image_ctfd_manager", + "image_error_fallback", + "image_filebeat", + "image_ctfd_exporter", + + # Versions + "mariadb_version", +] +CHALLENGES_TFVARS = [ + # Generic information + "cluster_dns_management", + "cluster_dns_ctf", + + # GitHub variables + "ghcr_username", + "ghcr_token", + "git_token", + + # CTF configuration + "kubectf_auth_secret", + "kubectf_container_secret", + + # Challenges configuration + "chall_whitelist_ips", + "challenges_static", + "challenges_shared", + "challenges_instanced", + "challenges_repository", + 
"challenges_branch", + + # Docker images + "image_instancing_fallback", + "image_kubectf", +] + +PATH = os.path.dirname(os.path.realpath(__file__)) + +class Logger: + RED = "\033[91m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + RESET = "\033[0m" + + @staticmethod + def error(message): + print(f"{Logger.RED}Error: {message}{Logger.RESET}") + exit(1) + + @staticmethod + def info(message): + print(f"{Logger.BLUE}Info: {message}{Logger.RESET}") + + @staticmethod + def success(message): + print(f"{Logger.GREEN}Success: {message}{Logger.RESET}") + + @staticmethod + def warning(message): + print(f"{Logger.YELLOW}Warning: {message}{Logger.RESET}") + + @staticmethod + def debug(message): + print(f"{Logger.BLUE}Debug: {message}{Logger.RESET}") + + @staticmethod + def space(): + print("") + + +# Validate PATH: reject if it contains special characters that may break shell commands +for char in [' ', '"', "'", '&', ';', '$', '>', '<', '|', '`', '!', '*', '?', '(', ')', '[', ']', '{', '}', '~']: + if char in PATH: + Logger.error(f"Path to script contains special character '{char}'. Please move the script to a path without special characters") + exit(1) + +# Load env from .env +if os.path.exists(".env"): + with open(".env", "r") as f: + for line in f: + if line.strip() and not line.startswith("#"): + key, value = line.strip().split("=", 1) + os.environ[key.strip()] = value.strip() + +def run(cmd, shell=True): + ''' + Run a subprocess in a new process group (where supported) and forward + KeyboardInterrupt (SIGINT) to it. Returns the process returncode. + ''' + import signal + # Use os.setsid only on platforms where it is available (POSIX). + preexec = os.setsid if hasattr(os, "setsid") else None + proc = subprocess.Popen( + cmd, + shell=shell, + preexec_fn=preexec + ) + try: + proc.wait() + except KeyboardInterrupt: + # On POSIX, if we created a new process group, send SIGINT to the group. 
+ if preexec is not None and hasattr(os, "killpg"): + os.killpg(proc.pid, signal.SIGINT) + else: + # Fallback for non-POSIX: send SIGINT directly to the child. + try: + proc.send_signal(signal.SIGINT) + except Exception: + # As a last resort, terminate the process. + proc.terminate() + proc.wait() + return proc.returncode + +class Args: + command = None + parser = None + + def __init__(self): + self.parser = argparse.ArgumentParser(description="CTFp CLI") + + def print_help(self): + if self.parser is None: + Logger.error("Parser is not initialized") + exit(1) + + self.parser.print_help() + +class TFBackend: + @staticmethod + def get_backend_filename(component): + return f"{component}.hcl" + + @staticmethod + def get_backend_path(component): + return f"{PATH}/backend/generated/{TFBackend.get_backend_filename(component)}" + + @staticmethod + def backend_exists(component): + return os.path.exists(TFBackend.get_backend_path(component)) + +''' +Subcommand pattern +''' +class Command: + name = "Command" + help = "Command" + description = "Command" + + def __init__(self, subparser): + self.subparser = subparser.add_parser(self.name, help=self.help, description=self.description) + self.subparser.set_defaults(func=self.run) + + def register_subcommand(self): + raise NotImplementedError + + def run(self, args): + raise NotImplementedError + +class GenerateImages(Command): + name = "generate-images" + help = "Generate server images" + description = "Generate server images" + + def register_subcommand(self): + # No arguments to register + return + + def run(self, args): + Logger.info("Generating server images") + try: + rc = run(f"cd \"{PATH}/cluster\" && tmp_script=$(mktemp) && curl -sSL -o \"${{tmp_script}}\" https://raw.githubusercontent.com/kube-hetzner/terraform-hcloud-kube-hetzner/master/scripts/create.sh && chmod +x \"${{tmp_script}}\" && \"${{tmp_script}}\" && rm \"${{tmp_script}}\"") + if rc != 0: + raise Exception + except Exception: + Logger.error("Failed to 
# Initialize automated.tfvars with the template
class InitializeTFVars(Command):
    """``init`` subcommand: copy ``template.automated.tfvars`` to the environment's tfvars file."""

    name = "init"
    help = "Initialize automated.tfvars"
    description = "Initialize automated.tfvars"
    environment = "test"  # Default environment

    def register_subcommand(self):
        self.subparser.add_argument("--force", action="store_true", help="Force overwrite automated.tfvars")
        self.subparser.add_argument("--test", action="store_true", help="Work with TEST cluster (default)")
        self.subparser.add_argument("--dev", action="store_true", help="Work with DEV cluster")
        self.subparser.add_argument("--prod", action="store_true", help="Work with PROD cluster")
        return

    def run(self, args):
        """
        Create the environment specific tfvars file from the template.

        Asks for confirmation before overwriting an existing file unless
        ``--force`` is given.

        :param args: Parsed argparse namespace (force, test, dev, prod)
        """
        if [args.test, args.dev, args.prod].count(True) > 1:
            # Logger.error terminates the process.
            Logger.error("Please specify only one environment: --test, --dev or --prod")

        self.environment = "test"
        if args.dev:
            self.environment = "dev"
        elif args.prod:
            self.environment = "prod"

        Logger.info(f"Initializing {self.get_filename_tfvars()} (ENV: {self.environment})")
        template = f"{PATH}/template.automated.tfvars"
        destination = f"{PATH}/{self.get_filename_tfvars()}"

        # Check if destination file already exists
        if os.path.exists(destination) and not args.force:
            Logger.warning(f"{self.get_filename_tfvars()} already exists")

            # Ask user if they want to overwrite the file
            try:
                response = input("Do you want to overwrite the file? (y/N): ")
            except EOFError:
                # No interactive stdin available: fall back to the default "no".
                response = ""
            if response.strip().lower() != "y":
                Logger.info("Exiting")
                exit(0)

        # Clone the template to the destination
        try:
            shutil.copyfile(template, destination)
        except Exception as e:
            # Include the cause (missing template, permissions, ...) in the message.
            Logger.error(f"Failed to initialize {self.get_filename_tfvars()}: {e}")
        Logger.success(f"{self.get_filename_tfvars()} initialized successfully")

    def get_filename_tfvars(self):
        """Return the tfvars filename for the currently selected environment."""
        return TFVARS.get_filename_tfvars(self.environment)


# Generate SSH keys
class GenerateKeys(Command):
    """``generate-keys`` subcommand: run ``keys/create.sh`` for the chosen environment."""

    name = "generate-keys"
    help = "Generate SSH keys"
    description = "Generate SSH keys"
    environment = "test"  # Default environment

    def register_subcommand(self):
        self.subparser.add_argument("--insert", action="store_true", help="Insert keys into automated.tfvars")
        self.subparser.add_argument("--test", action="store_true", help="Work with TEST cluster (default)")
        self.subparser.add_argument("--dev", action="store_true", help="Work with DEV cluster")
        self.subparser.add_argument("--prod", action="store_true", help="Work with PROD cluster")
        return

    def run(self, args):
        """
        Generate the SSH key pair and optionally insert it into the tfvars file.

        :param args: Parsed argparse namespace (insert, test, dev, prod)
        """
        if [args.test, args.dev, args.prod].count(True) > 1:
            # Logger.error terminates the process.
            Logger.error("Please specify only one environment: --test, --dev or --prod")

        self.environment = "test"
        if args.dev:
            self.environment = "dev"
        elif args.prod:
            self.environment = "prod"

        Logger.info("Generating SSH keys")
        try:
            # run() defaults to shell=True, so the command is passed as a
            # single string rather than a one-element list.
            rc = run(f"\"{PATH}\"/keys/create.sh \"{self.environment}\"")
            if rc != 0:
                raise Exception
        except Exception:
            Logger.error("Failed to generate keys")

        Logger.success("Keys generated successfully in keys/ using ed25519")
        Logger.info(f"Public key: keys/k8s-{self.environment}.pub")
        Logger.info(f"Private key: keys/k8s-{self.environment}")

        # Insert keys into automated.tfvars
        if args.insert:
            TFVARS.insert_keys(self.environment)
            Logger.success(f"Keys inserted successfully into {TFVARS.get_filename_tfvars(self.environment)}")
# Insert SSH keys into automated.tfvars
class InsertKeys(Command):
    """``insert-keys`` subcommand: write the generated SSH keys into the tfvars file."""

    name = "insert-keys"
    help = "Insert SSH keys into automated.tfvars"
    description = "Insert SSH keys into automated.tfvars"
    environment = "test"  # Default environment (consistent with the other commands)

    def register_subcommand(self):
        self.subparser.add_argument("--test", action="store_true", help="Works with TEST cluster (default)")
        self.subparser.add_argument("--dev", action="store_true", help="Works with DEV cluster")
        self.subparser.add_argument("--prod", action="store_true", help="Works with PROD cluster")
        return

    def run(self, args):
        """
        Insert the base64 encoded key pair of the chosen environment.

        :param args: Parsed argparse namespace (test, dev, prod)
        """
        if [args.test, args.dev, args.prod].count(True) > 1:
            # Logger.error terminates the process.
            Logger.error("Please specify only one environment: --test, --dev or --prod")

        self.environment = "test"
        if args.dev:
            self.environment = "dev"
        elif args.prod:
            self.environment = "prod"

        Logger.info(f"Inserting SSH keys into {TFVARS.get_filename_tfvars(self.environment)}")
        TFVARS.insert_keys(self.environment)
        Logger.success(f"Keys inserted successfully into {TFVARS.get_filename_tfvars(self.environment)}")


# Deploy the platform
class Deploy(Command):
    """``deploy`` subcommand: apply one component, or all of them in order."""

    name = "deploy"
    help = "Deploy the platform"
    description = "Deploy the platform"
    environment = "test"  # Default environment
    components = COMPONENTS + ["all"]

    def register_subcommand(self):
        self.subparser.add_argument("component", help="Component to deploy (cluster, ops, platform, challenges, all)", choices=self.components)
        self.subparser.add_argument("--test", action="store_true", help="Deploy TEST cluster (default)")
        self.subparser.add_argument("--dev", action="store_true", help="Deploy DEV cluster")
        self.subparser.add_argument("--prod", action="store_true", help="Deploy PROD cluster")
        self.subparser.add_argument("--auto-apply", action="store_true", help="Automatically apply Terraform changes without prompting")
        return

    def run(self, args):
        """
        Deploy the requested component(s) and report how long each step took.

        Components are deployed in the order cluster, ops, platform,
        challenges. ``all`` runs every step.

        :param args: Parsed argparse namespace (component, test, dev, prod, auto_apply)
        """
        global AUTO_APPLY

        # Check component is valid
        component = args.component.lower()
        if component not in COMPONENTS and component != "all":
            # Logger.error terminates the process.
            Logger.error(f"Invalid component. Please specify one of: {', '.join(self.components)}")

        if [args.test, args.dev, args.prod].count(True) > 1:
            Logger.error("Please specify only one environment: --test, --dev or --prod")

        if args.auto_apply:
            AUTO_APPLY = True

        deploy_all = component == "all"

        self.environment = "test"
        if args.dev:
            self.environment = "dev"
        elif args.prod:
            self.environment = "prod"

        start = time.time()
        Logger.info("Deploying " + self.environment.upper() + " environment")
        Logger.space()

        terraform = Terraform(self.environment)
        Logger.space()

        # Deployment order matters: each layer depends on the previous one.
        steps = (
            ("cluster", terraform.cluster_deploy),
            ("ops", terraform.ops_deploy),
            ("platform", terraform.platform_deploy),
            ("challenges", terraform.challenges_deploy),
        )

        durations = {}
        for step_name, deploy_step in steps:
            if not (deploy_all or component == step_name):
                continue
            step_start = time.time()
            deploy_step()
            durations[step_name] = time.time() - step_start
            Logger.space()
            Logger.info(f"Time taken: {round(durations[step_name], 2)} seconds")
            Logger.space()

        Logger.success("Platform deployed")
        total_time = time.time() - start

        Logger.info(f"Time taken: {round(total_time, 2)} seconds")

        # Per-component summary, e.g. "Cluster time: 12.3 seconds".
        for step_name, _ in steps:
            if step_name in durations:
                Logger.info(f"{step_name.capitalize()} time: {round(durations[step_name], 2)} seconds")
# Destroy the platform
class Destroy(Command):
    """``destroy`` subcommand: destroy one component, or all of them in reverse order."""

    name = "destroy"
    help = "Destroy the platform"
    description = "Destroy the platform"
    times = []  # Unused by run() (which keeps its own timings); kept for backward compatibility
    environment = "test"  # Default environment
    components = COMPONENTS + ["all"]

    def register_subcommand(self):
        # Only run listed parts of the destruction
        self.subparser.add_argument("component", help="Component to destroy (cluster, ops, platform, challenges, all)", choices=self.components)
        self.subparser.add_argument("--test", action="store_true", help="Destroy TEST cluster (default)")
        self.subparser.add_argument("--dev", action="store_true", help="Destroy DEV cluster")
        self.subparser.add_argument("--prod", action="store_true", help="Destroy PROD cluster")
        self.subparser.add_argument("--auto-apply", action="store_true", help="Automatically apply Terraform changes without prompting")
        return

    def run(self, args):
        """
        Destroy the requested component(s) and report how long each step took.

        Components are destroyed in the order challenges, platform, ops,
        cluster (the reverse of deployment). ``all`` runs every step.

        :param args: Parsed argparse namespace (component, test, dev, prod, auto_apply)
        """
        global AUTO_APPLY

        # Check component is valid
        component = args.component.lower()
        if component not in COMPONENTS and component != "all":
            # Logger.error terminates the process.
            Logger.error(f"Invalid component. Please specify one of: {', '.join(self.components)}")

        if [args.test, args.dev, args.prod].count(True) > 1:
            Logger.error("Please specify only one environment: --test, --dev or --prod")

        if args.auto_apply:
            AUTO_APPLY = True

        destroy_all = component == "all"

        self.environment = "test"
        if args.dev:
            self.environment = "dev"
        elif args.prod:
            self.environment = "prod"

        start = time.time()
        Logger.info("Destroying " + self.environment.upper() + " environment")
        Logger.space()

        terraform = Terraform(self.environment)

        # Tear down in reverse dependency order.
        steps = (
            ("challenges", terraform.challenges_destroy),
            ("platform", terraform.platform_destroy),
            ("ops", terraform.ops_destroy),
            ("cluster", terraform.cluster_destroy),
        )

        durations = {}
        for step_name, destroy_step in steps:
            if not (destroy_all or component == step_name):
                continue
            step_start = time.time()
            destroy_step()
            durations[step_name] = time.time() - step_start
            Logger.space()
            Logger.info(f"Time taken: {round(durations[step_name], 2)} seconds")
            Logger.space()

        Logger.success("Destroyed action")
        total_time = time.time() - start

        Logger.info(f"Time taken: {round(total_time, 2)} seconds")

        # The summary is printed in deployment order, as before.
        for step_name in ("cluster", "ops", "platform", "challenges"):
            if step_name in durations:
                Logger.info(f"{step_name.capitalize()} time: {round(durations[step_name], 2)} seconds")
# TFVars handler class
class TFVARS:
    """
    Reads, filters and writes Terraform ``.tfvars`` files.

    An instance couples a ``root`` tfvars file (the user maintained
    ``automated.<env>.tfvars``) with a ``destination`` file that receives a
    filtered copy for a single Terraform component.
    """

    root: str
    destination: str

    def __init__(self, root, destination):
        self.root = root
        self.destination = destination

    @staticmethod
    def get_filename_tfvars(environment="test"):
        '''
        Get the filename for the tfvars file based on the environment

        :param environment: The environment name (test, dev, prod)
        :return: The filename for the tfvars file
        '''

        return f"automated.{environment}.tfvars"

    @staticmethod
    def load_tfvars(file_path: str):
        '''
        Load a tfvars file and return its contents as a dictionary

        :param file_path: The path to the tfvars file
        :return: A dictionary containing the tfvars key-value pairs
        '''

        with open(file_path, "r") as tfvars_file:
            tfvars = hcl2.api.load(tfvars_file)
        return tfvars

    @staticmethod
    def safe_load_tfvars(file_path: str):
        '''
        Safely load a tfvars file and handle errors by exiting the program

        :param file_path: The path to the tfvars file
        :return: A dictionary containing the tfvars key-value pairs
        '''

        try:
            return TFVARS.load_tfvars(file_path)
        except FileNotFoundError:
            # Logger.error terminates the process.
            Logger.error("tfvars file not found. Please create the file and try again.")
        except Exception as e:
            Logger.error(f"Error loading tfvars file: {e}")

    @staticmethod
    def safe_write_tfvars(file_path: str, data: dict):
        '''
        Safely write a dictionary to a tfvars file and handle errors by exiting the program

        :param file_path: The path to the tfvars file
        :param data: A dictionary containing the tfvars key-value pairs
        :return: None
        '''

        try:
            tree = hcl2.api.reverse_transform(data)
            formatted_data = hcl2.api.writes(tree)

            with open(file_path, "w") as tfvars_file:
                tfvars_file.write(formatted_data)
        except Exception as e:
            Logger.error(f"Error writing tfvars file: {e}")

    def create(self, fields=None):
        '''
        (Re)create the destination file with only the given fields of the root file

        :param fields: Names of the root tfvars entries to copy (default: none)
        :return: None
        '''
        # A None default avoids sharing one mutable list between calls.
        if fields is None:
            fields = []

        # Check if destination exists
        exists = os.path.exists(self.destination)

        # Create the file or empty it
        with open(self.destination, "w") as file:
            if exists:
                Logger.info(f"Overwriting {self.destination}")
            else:
                Logger.info(f"Creating {self.destination}")

            file.write("")

        # Parse the root file into key-value pairs
        key_value_pairs = TFVARS.safe_load_tfvars(self.root)

        # Filter and write only the specified fields to the destination file
        filtered_values = {}
        for field in fields:
            if field in key_value_pairs:
                filtered_values[field] = key_value_pairs[field]
            else:
                Logger.warning(f"Field '{field}' not found in {self.root}")
        TFVARS.safe_write_tfvars(self.destination, filtered_values)

    def add(self, key, value):
        '''
        Add a key-value pair to the tfvars file

        :param key: The key to add
        :param value: The value to add
        :return: None
        '''

        # Check if destination exists
        if not os.path.exists(self.destination):
            Logger.error(f"{self.destination} does not exist")

        data = TFVARS.safe_load_tfvars(self.destination)
        data[key] = value
        TFVARS.safe_write_tfvars(self.destination, data)

    def add_dict(self, dict_data):
        '''
        Add multiple key-value pairs from a dictionary to the tfvars file

        :param dict_data: A dictionary containing the key-value pairs to add
        :return: None
        '''

        # Check if destination exists
        if not os.path.exists(self.destination):
            Logger.error(f"{self.destination} does not exist")

        data = TFVARS.safe_load_tfvars(self.destination)
        data.update(dict_data)
        TFVARS.safe_write_tfvars(self.destination, data)

    def destroy(self):
        '''
        Remove the destination file if it exists

        :return: None
        '''
        if os.path.exists(self.destination):
            Logger.info(f"Removing {self.destination}")
            os.remove(self.destination)
        else:
            Logger.info(f"{self.destination} does not exist")

    @staticmethod
    def _replace_key_lines(lines, public_key, private_key):
        '''
        Return a copy of ``lines`` with the SSH key assignments replaced

        Only lines whose left-hand side is exactly ``ssh_key_public_base64``
        or ``ssh_key_private_base64`` are rewritten; comments or other lines
        that merely mention those names are left untouched.

        :param lines: Lines of the tfvars file (with line endings)
        :param public_key: Base64 encoded public key
        :param private_key: Base64 encoded private key
        :return: The new list of lines
        '''
        replacements = {
            "ssh_key_public_base64": public_key,
            "ssh_key_private_base64": private_key,
        }
        result = []
        for line in lines:
            variable = line.split("=", 1)[0].strip() if "=" in line else None
            if variable in replacements:
                result.append(f'{variable} = "{replacements[variable]}"\n')
            else:
                result.append(line)
        return result

    @staticmethod
    def insert_keys(environment="test"):
        '''
        Insert the generated base64 SSH keys into the environment's tfvars file (in place)

        :param environment: The environment name (test, dev, prod)
        :return: None
        '''
        # Read the keys
        public_key = ""
        private_key = ""
        try:
            # strip() drops a trailing newline that would otherwise end up
            # inside the quoted tfvars value.
            with open(f"{PATH}/keys/k8s-{environment}.pub.b64", "r") as file:
                public_key = file.read().strip()
            with open(f"{PATH}/keys/k8s-{environment}.b64", "r") as file:
                private_key = file.read().strip()
        except FileNotFoundError:
            Logger.error("SSH keys not found. Please run 'generate-keys' first.")
        except OSError as e:
            Logger.error(f"Failed to read SSH key files: {e}")

        # Insert the keys into automated.tfvars (in place)
        tfvars_path = f"{PATH}/{TFVARS.get_filename_tfvars(environment)}"
        try:
            with open(tfvars_path, "r") as file:
                lines = file.readlines()
        except FileNotFoundError:
            Logger.error(f"{TFVARS.get_filename_tfvars(environment)} not found. Please run 'init' first.")
        except OSError as e:
            Logger.error(f"Failed to read {TFVARS.get_filename_tfvars(environment)}: {e}")

        with open(tfvars_path, "w") as file:
            file.writelines(TFVARS._replace_key_lines(lines, public_key, private_key))
# Terraform handler
class Terraform:
    """
    Drives Terraform/OpenTofu (the binary named by ``FLAVOR``) for the
    cluster, ops, platform and challenges components of one environment.
    The environment name doubles as the Terraform workspace name.
    """

    environment: str

    @staticmethod
    def is_installed():
        '''
        Check if Terraform is installed

        :return: True if installed, False otherwise
        '''
        try:
            rc = run(f"{FLAVOR} version")
            return rc == 0
        except Exception:
            return False

    def __init__(self, environment="test"):
        self.environment = environment

    def init_terraform(self, path, components: str = ""):
        '''
        Initialize Terraform to a given environment (workspace)

        :param path: Directory of the Terraform component
        :param components: Component name, used to locate its backend config
        :raises Exception: When backend init or workspace selection fails
        '''
        Logger.info("Initializing Terraform")
        current_dir = os.getcwd()
        os.chdir(path)

        try:
            # Check if tfvars file exists and is valid
            self.check_values()

            # Load backend connection credentials
            self.load_backend_credentials()

            # Check if backend config exists (Logger.error terminates the process)
            if not TFBackend.backend_exists(components):
                Logger.error(f"Backend configuration for {components} does not exist. Please generate it first.")

            # Initialize the backend (if not already done for this project)
            Logger.info("Running terraform init")
            rc = run(f"{FLAVOR} init -backend-config=\"{TFBackend.get_backend_path(components)}\"")
            if rc != 0:
                # Try to init with reconfigure
                response = input(f"The init of the backend for {components} failed. Do you want to try to reconfigure the backend? (y/N): ")
                if response.lower() != "y":
                    Logger.info("Exiting")
                    exit(0)

                Logger.warning("Reconfiguring backend")
                rc = run(f"{FLAVOR} init -reconfigure -backend-config=\"{TFBackend.get_backend_path(components)}\"")
                if rc != 0:
                    raise Exception(f"Terraform init -reconfigure failed for {components}, with return code: {rc}")

            # Create workspaces (a failure for an already existing workspace is expected)
            Logger.info("Creating workspaces if they do not exist")
            for env in ENVIRONMENTS:
                subprocess.run([FLAVOR, "workspace", "new", env], check=False)

            # Select the workspace based on the environment
            Logger.info(f"Selecting workspace: {self.environment}")
            rc = run(f"{FLAVOR} workspace select {self.environment}")
            if rc != 0:
                raise Exception(f"Failed to select workspace {self.environment}, with return code: {rc}")
        finally:
            os.chdir(current_dir)  # Always change back to the original directory
        Logger.success("Terraform initialized successfully")

    def get_filename_tfvars(self):
        """Return the tfvars filename of this environment."""
        return TFVARS.get_filename_tfvars(self.environment)

    def get_path_tfvars(self):
        """Return the path of this environment's tfvars file below ``PATH``."""
        return f"{PATH}/{self.get_filename_tfvars()}"

    def execute(self, component, generate_plan=True, action="apply"):
        '''
        Execute Terraform action (apply or destroy)

        :param component: The component to execute
        :param generate_plan: Whether to generate a plan before executing
        :param action: The action to execute (apply or destroy)
        :raises Exception: When plan, show or apply/destroy returns non-zero
        '''
        if action not in ["apply", "destroy"]:
            # Logger.error terminates the process.
            Logger.error("Invalid action. Must be 'apply' or 'destroy'")

        is_apply = action == "apply"

        # Initialize Terraform
        component_path = f"{PATH}/{component}"
        self.init_terraform(component_path, component)

        rc = 0
        if generate_plan:
            plan_dir = f"{PATH}/terraform"
            # The plan output directory may not exist on a fresh checkout.
            os.makedirs(plan_dir, exist_ok=True)
            plan_file = f"{plan_dir}/{component}-{self.environment}.tfplan"
            plan_text = f"{plan_dir}/{component}-{self.environment}.plan.txt"
            plan_flag = "" if is_apply else "-destroy"

            # Generate plan
            Logger.info("Generating Terraform plan")
            rc = run(f"cd \"{component_path}\" && {FLAVOR} workspace select {self.environment} && {FLAVOR} plan {plan_flag} -out=\"{plan_file}\"")
            if rc != 0:
                raise Exception(f"Terraform plan failed for {component} ({action}), with return code: {rc}")

            # Store the plan as human-readable output (Allowing user to review it)
            rc = run(f"cd \"{component_path}\" && {FLAVOR} show -no-color \"{plan_file}\" > \"{plan_text}\"")
            if rc != 0:
                raise Exception(f"Terraform show plan failed for {component} ({action}), with return code: {rc}")

            Logger.success(f"Terraform plan generated successfully - It can be found at terraform/{component}-{self.environment}.plan.txt")

            # Ask if user wants to proceed
            if not AUTO_APPLY:
                response = input(f"Do you want to apply this plan on {component} ({action} {component} in {self.environment})? (y/N): ")
                if response.lower() != "y":
                    Logger.info(f"Exiting without applying the plan on {component} ({action})")
                    exit(0)

            # Run apply (a destroy plan is also executed through "apply <planfile>")
            rc = run(f"cd \"{component_path}\" && {FLAVOR} workspace select {self.environment} && {FLAVOR} apply \"{plan_file}\"")

            # Remove the plan files
            os.remove(plan_file)

            # Move human readable plan to .old
            os.rename(plan_text, f"{plan_text}.old")
        else:
            # Run apply directly
            rc = run(f"cd \"{component_path}\" && {FLAVOR} {action} {'-auto-approve' if AUTO_APPLY else ''}")
        if rc != 0:
            raise Exception(f"Terraform {action} failed for {component}, with return code: {rc}")

    def apply(self, component, generate_plan=True):
        '''
        Run Terraform apply

        :param component: The component to apply
        :param generate_plan: Whether to generate a plan before applying
        '''
        self.execute(component, generate_plan, action="apply")

    def destroy(self, component, generate_plan=True):
        '''
        Run Terraform destroy

        :param component: The component to destroy
        :param generate_plan: Whether to generate a plan before destroying
        '''
        self.execute(component, generate_plan, action="destroy")

    @staticmethod
    def _contains_placeholder(value):
        '''
        Check whether a tfvars value still contains a template placeholder

        A string counts as placeholder when it ends with ">" and starts with
        "<" or "https://github.com/<". Dictionaries and lists are searched
        recursively.

        :param value: A value loaded from the tfvars file
        :return: True if a placeholder was found, False otherwise
        '''
        if isinstance(value, str):
            return (value.startswith("<") or value.startswith("https://github.com/<")) and value.endswith(">")
        if isinstance(value, dict):
            return any(Terraform._contains_placeholder(v) for v in value.values())
        if isinstance(value, list):
            return any(Terraform._contains_placeholder(item) for item in value)
        return False

    def check_values(self):
        '''
        Validate automated.tfvars is set, and values are set
        '''
        # Check if automated.tfvars exists (Logger.error terminates the process)
        tfvars_path = self.get_path_tfvars()
        if not os.path.exists(tfvars_path):
            Logger.error(f"{self.get_filename_tfvars()} not found. Please create the file and try again")

        # Load tfvars file
        tfvars_data = TFVARS.safe_load_tfvars(tfvars_path)

        # Reject the file as soon as one field still holds a placeholder
        for key, value in tfvars_data.items():
            if Terraform._contains_placeholder(value):
                Logger.error(f"{self.get_filename_tfvars()} does not seem to be filled out (see field '{key}'). Please fill out all fields and try again")

        Logger.info(f"{self.get_filename_tfvars()} is filled out correctly")

    def load_backend_credentials(self):
        '''
        Load S3 backend credentials from automated.tfvars, to set Terraform S3 connection credentials
        '''

        # Load tfvars file
        tfvars_data = TFVARS.safe_load_tfvars(self.get_path_tfvars())

        # Set environment variables for S3 backend
        os.environ["AWS_ACCESS_KEY_ID"] = tfvars_data.get("terraform_backend_s3_access_key", "")
        os.environ["AWS_SECRET_ACCESS_KEY"] = tfvars_data.get("terraform_backend_s3_secret_key", "")

        if os.environ["AWS_ACCESS_KEY_ID"] == "" or os.environ["AWS_SECRET_ACCESS_KEY"] == "":
            Logger.error("S3 backend credentials not found in automated.tfvars. Please fill out terraform_backend_s3_access_key and terraform_backend_s3_secret_key as they are required to run the Terraform components.")

        Logger.info("S3 backend credentials loaded")

    def _prepare_tfvars(self, component, fields, include_kubeconfig=True):
        '''
        Write <component>/data.auto.tfvars from the environment's tfvars file

        :param component: The component directory name
        :param fields: The tfvars fields to copy for this component
        :param include_kubeconfig: Also add "kubeconfig" and "environment"
        :return: The TFVARS handler of the written file
        '''
        tfvars = TFVARS(self.get_path_tfvars(), f"{PATH}/{component}/data.auto.tfvars")
        tfvars.create(fields)
        if include_kubeconfig:
            tfvars.add_dict({
                "kubeconfig": self.get_kubeconfig_b64(),
                "environment": self.environment
            })
        return tfvars

    def cluster_deploy(self):
        """Apply the cluster component and export its kubeconfig."""
        Logger.info("Deploying the cluster")

        # Configure tfvars file (the cluster needs neither kubeconfig nor environment)
        self._prepare_tfvars("cluster", CLUSTER_TFVARS, include_kubeconfig=False)
        Logger.space()

        # Deploy the cluster
        try:
            self.apply("cluster")
        except Exception as e:
            Logger.error(f"Cluster terraform failed: {e}")
        Logger.success("Cluster terraform applied successfully")
        # Export kubeconfig
        self.export_kubeconfig()
        Logger.success("Cluster deployed successfully")

    def export_kubeconfig(self):
        """Write the cluster's kubeconfig output as .yml and base64 .b64 files."""
        Logger.info("Exporting kubeconfig")

        # Export kubeconfig
        try:
            # The shell redirection below fails if the directory is missing.
            os.makedirs(f"{PATH}/kube-config", exist_ok=True)
            rc = run(f"cd \"{PATH}/cluster\" && {FLAVOR} output --raw kubeconfig > \"{PATH}\"/kube-config/kube-config.{self.environment}.yml")
            if rc != 0:
                raise Exception
            rc = run(f"cat \"{PATH}\"/kube-config/kube-config.{self.environment}.yml | base64 -w0 > \"{PATH}\"/kube-config/kube-config.{self.environment}.b64")
            if rc != 0:
                raise Exception
        except Exception:
            Logger.error("Failed to export kubeconfig")
        Logger.success("Kubeconfig exported")

    def get_kubeconfig_b64(self):
        """Return the base64 encoded kubeconfig written by export_kubeconfig."""
        try:
            with open(f"{PATH}/kube-config/kube-config.{self.environment}.b64", "r") as file:
                return file.read()
        except FileNotFoundError:
            Logger.error("Kubeconfig file not found. Please deploy the cluster first.")
        except OSError as e:
            Logger.error(f"Failed to read kubeconfig file: {e}")

    def ops_deploy(self):
        """Apply the ops component."""
        Logger.info("Deploying the ops on the cluster")

        # Configure tfvars file
        self._prepare_tfvars("ops", OPS_TFVARS)
        Logger.space()

        # Deploy the ops
        try:
            self.apply("ops")
        except Exception as e:
            Logger.error(f"Ops apply failed: {e}")
        Logger.success("Ops deployed successfully")

    def platform_deploy(self):
        """Apply the platform component."""
        Logger.info("Deploying the platform on the cluster")

        # Configure tfvars file
        self._prepare_tfvars("platform", PLATFORM_TFVARS)
        Logger.space()

        # Deploy the platform
        try:
            self.apply("platform")
        except Exception as e:
            Logger.error(f"Platform apply failed: {e}")
        Logger.success("Platform deployed successfully")

    def challenges_deploy(self):
        """Apply the challenges component."""
        Logger.info("Deploying the challenges on the cluster")

        # Configure tfvars file
        self._prepare_tfvars("challenges", CHALLENGES_TFVARS)
        Logger.space()

        # Deploy the challenges
        try:
            self.apply("challenges")
        except Exception as e:
            Logger.error(f"Challenges apply failed: {e}")
        Logger.success("Challenges deployed successfully")

    def cluster_destroy(self):
        """Destroy the cluster component and remove its local artefacts."""
        Logger.info("Destroying the cluster")

        # Configure tfvars file
        tfvars = self._prepare_tfvars("cluster", CLUSTER_TFVARS, include_kubeconfig=False)
        Logger.space()

        # Destroy the cluster
        try:
            self.destroy("cluster")
        except Exception as e:
            Logger.error(f"Cluster terraform destroy failed: {e}")

        # Remove the tfvars file
        tfvars.destroy()

        Logger.success("Cluster terraform destroy applied successfully")

        # remove kubeconfig
        self.remove_kubeconfig()

    def remove_kubeconfig(self):
        """Delete the exported kubeconfig files; already missing files are tolerated."""
        Logger.info("Removing kubeconfig")

        # Remove kubeconfig
        for suffix in ("yml", "b64"):
            kubeconfig_path = f"{PATH}/kube-config/kube-config.{self.environment}.{suffix}"
            try:
                os.remove(kubeconfig_path)
            except FileNotFoundError:
                # Nothing to clean up; do not abort a successful destroy.
                Logger.warning(f"{kubeconfig_path} does not exist")
            except OSError as e:
                Logger.error(f"Failed to remove kubeconfig: {e}")
        Logger.success("Kubeconfig removed")

    def ops_destroy(self):
        """Destroy the ops component."""
        Logger.info("Destroying the ops on the cluster")

        # Configure tfvars file
        tfvars = self._prepare_tfvars("ops", OPS_TFVARS)
        Logger.space()

        # Destroy the ops
        try:
            self.destroy("ops")
        except Exception as e:
            Logger.error(f"Ops destroy failed: {e}")

        # Remove the tfvars file
        tfvars.destroy()

        Logger.success("Ops destroyed successfully")

    def platform_destroy(self):
        """Destroy the platform component."""
        Logger.info("Destroying the platform on the cluster")

        # Configure tfvars file
        tfvars = self._prepare_tfvars("platform", PLATFORM_TFVARS)
        Logger.space()

        # Destroy the platform
        try:
            self.destroy("platform")
        except Exception as e:
            Logger.error(f"Platform destroy failed: {e}")

        # Remove the tfvars file
        tfvars.destroy()

        Logger.success("Platform destroyed successfully")

    def challenges_destroy(self):
        """Destroy the challenges component."""
        Logger.info("Destroying the challenges on the cluster")

        # Configure tfvars file
        tfvars = self._prepare_tfvars("challenges", CHALLENGES_TFVARS)
        Logger.space()

        # Destroy the challenges
        try:
            self.destroy("challenges")
        except Exception as e:
            Logger.error(f"Challenges destroy failed: {e}")

        # Remove the tfvars file
        tfvars.destroy()

        Logger.success("Challenges destroyed successfully")
# CLI tool
class CLI:
    """Entry point: registers all subcommands, checks prerequisites and dispatches."""

    def run(self):
        """
        Parse the command line and run the selected subcommand.

        Prints the help text and exits with status 1 when no subcommand is
        given. Any exception raised by the subcommand is reported through the
        fatal ``Logger.error``.
        """
        Logger.info("Starting CTF-Pilot CLI")

        args = Args()
        if args.parser is None:
            # Logger.error terminates the process.
            Logger.error("Failed to initialize argument parser")

        subparser = args.parser.add_subparsers(dest="command", help="Subcommand to run", title="subcommands")

        # Commands
        commands = [
            InitializeTFVars(subparser),
            GenerateImages(subparser),
            GenerateKeys(subparser),
            InsertKeys(subparser),
            Deploy(subparser),
            Destroy(subparser),
            backend_generate.Generator(subparser)
        ]
        for command in commands:
            command.register_subcommand()

        # Get arguments
        namespace = args.parser.parse_args()

        # Fallback to help if no subcommand is provided
        if not hasattr(namespace, "func"):
            args.print_help()
            exit(1)

        Logger.info("Checking availability of required tools")
        self.platform_check()
        self.tool_check()
        Logger.success("Required Tools are available")

        # Run the subcommand
        try:
            namespace.func(namespace)
        except Exception as e:
            Logger.error(f"Failed to run subcommand: {e}")

    def platform_check(self):
        """Abort unless running on Linux with /bin/bash available."""
        # Check if system is linux and if bash is available
        if sys.platform != "linux" or not os.path.exists("/bin/bash"):
            Logger.error("This script requires Linux and bash")

    def tool_check(self):
        """Abort unless Terraform, curl, base64 and ssh-keygen are available."""
        # Check if Terraform is installed
        if not Terraform.is_installed():
            Logger.error("Terraform is not installed. Please install Terraform and try again.")

        # shutil.which searches PATH directly: no dependency on the external
        # `which` binary and no stray path output on the console.
        for tool in ("curl", "base64", "ssh-keygen"):
            if shutil.which(tool) is None:
                Logger.error(f"{tool} is not installed. Please install {tool} and try again.")


if __name__ == "__main__":
    CLI().run()
ArgoCD
ArgoCD
Shared challenge
Shared challenge
Instanced challenge
Instanced challenge
Deploys
Deploys
KubeCTF
KubeCTF
Deploys
instanced challenge template
Deploys...
Deploys
Deploys
Master
Master
Container registry
Container registry
Generate
deployment files
Github actions
Generate...
Update deployment
Update dep...
Challenge updated
Challenge...
Push docker images
Github actions
Push docker images...
Pulls
Deployment templates
Pulls...
Pulls
Docker image
Pulls...
Chall dev
Chall...
Commit
Commit
Kubernetes
cluster
Kuberne...
Github
Github
Service / Deployment
Service / Deployment
Cluster
Cluster
Github
Github
Action
Action
Background
operation
Background...
Github branch
Github branch
Challenge deployment
Challenge deployment
CTFd
CTFd
CTFd manager
CTFd manager
Updates CTFd
Updates CTFd
Deploys
Chall information
Deploys...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/attachments/architecture/challenge-network-architecture.drawio b/docs/attachments/architecture/challenge-network-architecture.drawio new file mode 100644 index 0000000..725a121 --- /dev/null +++ b/docs/attachments/architecture/challenge-network-architecture.drawio @@ -0,0 +1,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/attachments/architecture/challenge-network-architecture.png b/docs/attachments/architecture/challenge-network-architecture.png new file mode 100644 index 0000000..125154d Binary files /dev/null and b/docs/attachments/architecture/challenge-network-architecture.png differ diff --git a/docs/attachments/architecture/challenge-network-architecture.svg b/docs/attachments/architecture/challenge-network-architecture.svg new file mode 100644 index 0000000..0dcd4d2 --- /dev/null +++ b/docs/attachments/architecture/challenge-network-architecture.svg @@ -0,0 +1 @@ +
User
User
Kubernetes
cluster
Kuberne...
Service / Deployment
Service / Deployment
Cluster
Cluster
Hetzner Cloud
Hetzner Cloud
Request
Request
Challenge Network architecture
Challenge Network architecture
Load balancer
Load balancer
Branching
Branching
Challenge
Challenge
Yes
Yes
No
No
TCP?
TCP?
Yes
Yes
No
No
Available?
Available?
Fallback
Fallback
Traefik
Traefik
Traefik
Traefik
Traefik
Traefik
Hetzner
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/attachments/architecture/cluster-network-architecture.drawio b/docs/attachments/architecture/cluster-network-architecture.drawio new file mode 100644 index 0000000..9c85d9f --- /dev/null +++ b/docs/attachments/architecture/cluster-network-architecture.drawio @@ -0,0 +1,214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/attachments/architecture/cluster-network-architecture.png b/docs/attachments/architecture/cluster-network-architecture.png new file mode 100644 index 0000000..8717a6a Binary files /dev/null and b/docs/attachments/architecture/cluster-network-architecture.png differ diff --git a/docs/attachments/architecture/cluster-network-architecture.svg b/docs/attachments/architecture/cluster-network-architecture.svg new file mode 100644 index 0000000..1f23851 --- /dev/null +++ b/docs/attachments/architecture/cluster-network-architecture.svg @@ -0,0 +1 @@ +
User
User
Service / Deployment
Service / Deployment
Private network
Private network
Hetzner Cloud
Hetzner Cloud
Request
Request
Cluster Network architecture
Cluster Network architecture
Load balancer
Load balancer
Server
Server
Control plane
Load balancer
Control planeLoad ba...
Control plane
Control plane
Control plane
Control plane
Control plane
Control plane
Agents
Agents
Challs
Challs
Scale
Scale
Scale
Scale
Scale
Scale
Challs
Challs
Challs
Challs
Agents
Agents
Agents
Agents
Cluster
Cluster
Kubernetes
cluster
Kuberne...
K8s
resources
K8s...
Hetzner
Platform
domain
Platform...
Management
domain
Management...
CTF
domain
CTF...
Cloudflare proxy
Cloudflare proxy
Cloudflare
Cloudflare
Traefik
Traefik
Traefik
Traefik
Traefik
Traefik
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/attachments/architecture/overview.drawio b/docs/attachments/architecture/overview.drawio new file mode 100644 index 0000000..d65521c --- /dev/null +++ b/docs/attachments/architecture/overview.drawio @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/attachments/architecture/overview.png b/docs/attachments/architecture/overview.png new file mode 100644 index 0000000..87a00ba Binary files /dev/null and b/docs/attachments/architecture/overview.png differ diff --git a/docs/attachments/architecture/overview.svg b/docs/attachments/architecture/overview.svg new file mode 100644 index 0000000..cd1b8db --- /dev/null +++ b/docs/attachments/architecture/overview.svg @@ -0,0 +1 @@ +
CTFd
CTFd
Redis
Redis
Redis
Redis
DB
DB
DB
DB
CTFd
CTFd
CTFd
CTFd
Chall
Chall
Chall
Chall
ArgoCD
ArgoCD
DB cluster
DB cluster
Redis
Redis
KubeCTF
KubeCTF
Instanced
Challenges
Instanced...
Prometheus
Grafana
Prometheus...
Logging
Logging
Chall
Chall
Chall
Chall
Shared
Challenges
Shared...
Kubernetes
cluster
Kuberne...
Uses
Uses
Deploys
Deploys
Deploys
Instanced challenges
templates
Deploys...
Ops
Ops
Deploys
Deploys
Platform
Platform
Deploys
Deploys
Challenges
Challenges
Cluster
Cluster
Deploys
Deploys
Deploys
Deploys
Configures
Configures
CTFd
CTFd
Pulls
deployment config
Pulls...
Orders instanced deployment
Orders instanced...
Deploys
Deploys
Deploys
Deploys
Challenges
Challenges
CTFp
CTFp
Service / Deployment
Service / Deployment
Repository
Repository
Terraform project
Terraform project
Cluster
Cluster
Github
Github
Action
Action
CTFd-manager
CTFd-manager
Deploys
Challs
Deploys...
Configures
Configures
Git
Git
Architecture overview
Architecture overview
Text is not SVG - cannot display
\ No newline at end of file diff --git a/keys/.gitignore b/keys/.gitignore new file mode 100644 index 0000000..b2e756f --- /dev/null +++ b/keys/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!create.sh diff --git a/keys/create.sh b/keys/create.sh new file mode 100755 index 0000000..69cb1de --- /dev/null +++ b/keys/create.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Usage: ./create.sh [test|dev|prod] +CTFP_EXECUTE=true +if [ -z "$1" ]; then + echo "Usage: $0 [test|dev|prod]" + CTFP_EXECUTE=false +fi + +if [ "$CTFP_EXECUTE" = true ]; then + CTFP_ENVIRONMENT=$1 + echo "Creating SSH keys for environment: $CTFP_ENVIRONMENT" + + # Get location of this file + DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" + + ssh-keygen -t ed25519 -f "$DIR/k8s-$CTFP_ENVIRONMENT" -q -N "" + + # base64 encode the keys (into single base64 string) + base64 "$DIR/k8s-$CTFP_ENVIRONMENT" -w0 > "$DIR/k8s-$CTFP_ENVIRONMENT.b64" + base64 "$DIR/k8s-$CTFP_ENVIRONMENT.pub" -w0 > "$DIR/k8s-$CTFP_ENVIRONMENT.pub.b64" +fi diff --git a/kube-config/.gitignore b/kube-config/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/kube-config/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/kubectl.sh b/kubectl.sh new file mode 100755 index 0000000..1fd9b1d --- /dev/null +++ b/kubectl.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Select environment between test, dev or prod +# Usage: source ./kubectl.sh [test|dev|prod] +CTFP_EXECUTE=true +if [ -z "$1" ]; then + echo "Usage: source $0 [test|dev|prod]" + CTFP_EXECUTE=false +fi + +if [ "$CTFP_EXECUTE" = true ]; then + CTFP_ENVIRONMENT=$1 + echo "Setting up kubectl for environment: $CTFP_ENVIRONMENT" + + # Check if the kube-config directory exists, if not create it + if [ ! 
-d "./kube-config" ]; then + mkdir -p ./kube-config + fi + + # Check if the kube-config file exists, if not then fail + if [ -f "./kube-config/kube-config.$CTFP_ENVIRONMENT.yml" ]; then + export KUBECONFIG=${KUBECONFIG:-~/.kube/config}:$(pwd)/kube-config/kube-config.$CTFP_ENVIRONMENT.yml + kubectl config use-context k3s + echo "KUBECONFIG set to k3s" + else + echo "Kube-config file not found!" + fi +fi diff --git a/ops/.env.example b/ops/.env.example new file mode 100644 index 0000000..5fe1f9d --- /dev/null +++ b/ops/.env.example @@ -0,0 +1,2 @@ +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/ops/.gitignore b/ops/.gitignore new file mode 100644 index 0000000..2faf43d --- /dev/null +++ b/ops/.gitignore @@ -0,0 +1,37 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. 
+*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc diff --git a/ops/.terraform.lock.hcl b/ops/.terraform.lock.hcl new file mode 100644 index 0000000..f710863 --- /dev/null +++ b/ops/.terraform.lock.hcl @@ -0,0 +1,131 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/alekc/kubectl" { + version = "2.1.3" + constraints = "~> 2.0, >= 2.0.2" + hashes = [ + "h1:AymCb0DCWzmyLqn1qEhVs2pcFUZGT/kxPK+I/BObFH8=", + "zh:0e601ae36ebc32eb8c10aff4c48c1125e471fa09f5668465af7581c9057fa22c", + "zh:1773f08a412d1a5f89bac174fe1efdfd255ecdda92d31a2e31937e4abf843a2f", + "zh:1da2db1f940c5d34e31c2384c7bd7acba68725cc1d3ba6db0fec42efe80dbfb7", + "zh:20dc810fb09031bcfea4f276e1311e8286d8d55705f55433598418b7bcc76357", + "zh:326a01c86ba90f6c6eb121bacaabb85cfa9059d6587aea935a9bbb6d3d8e3f3f", + "zh:5a3737ea1e08421fe3e700dc833c6fd2c7b8c3f32f5444e844b3fe0c2352757b", + "zh:5f490acbd0348faefea273cb358db24e684cbdcac07c71002ee26b6cfd2c54a0", + "zh:777688cda955213ba637e2ac6b1994e438a5af4d127a34ecb9bb010a8254f8a8", + "zh:7acc32371053592f55ee0bcbbc2f696a8466415dea7f4bc5a6573f03953fc926", + "zh:81f0108e2efe5ae71e651a8826b61d0ce6918811ccfdc0e5b81b2cfb0f7f57fe", + "zh:88b785ea7185720cf40679cb8fa17e57b8b07fd6322cf2d4000b835282033d81", + "zh:89d833336b5cd027e671b46f9c5bc7d10c5109e95297639bbec8001da89aa2f7", + "zh:df108339a89d4372e5b13f77bd9d53c02a04362fb5d85e1d9b6b47292e30821c", + 
"zh:e8a2e3a5c50ca124e6014c361d72a9940d8e815f37ae2d1e9487ac77c3043013", + ] +} + +provider "registry.opentofu.org/hashicorp/helm" { + version = "3.0.2" + constraints = ">= 2.16.1, ~> 3.0" + hashes = [ + "h1:17Ro1Gs9aCN5QGQ6RDvuianmNV3AxgegYqTJODlYdHI=", + "zh:100f75a700074568cfaee7884e4477c50b5468e086db5bb95d7d519581b65621", + "zh:578d09c7319d0dd0fee03a7fcb48bf68ac978c1fefaa0752cfcb9ecfb0a56a4e", + "zh:64e7cce303362b4bf132d1c61858ef0ada221af4a2ea0fdfd16ec43e562d459c", + "zh:7a64933e70733aeec44bf9b9b6ea3617fd075acb346b082197ded993cfa7d2be", + "zh:7caf4655a5bf72e6d212209ad5ea5c619269eca6e0d9930c85b59bbbdf57ce28", + "zh:a1e0208423445e2443516e52a4d72c556b1303705c90aaeb139fbb64a10d7c1c", + "zh:ac9e4417e9e0486bc60f6796da06356b59161c9923c56a7a5c9b4900a46ee52d", + "zh:b9588da386c17456b242bd18122836baeccdce3227aac4752e189ec9ad218da7", + "zh:d5b6ac3b0b6beb3d94886f45a5a96eb6d78ca2b657efd62b8e0650d8097ee60f", + "zh:db6761e7cf86825f13628e8f4e32818683efff61b0d909211e1096cc6ad84f83", + ] +} + +provider "registry.opentofu.org/hashicorp/http" { + version = "3.5.0" + hashes = [ + "h1:yvwvVZ0vdbsTUMru+7Cr0On1FVgDJHAaC6TNvy/OWzM=", + "zh:0a2b33494eec6a91a183629cf217e073be063624c5d3f70870456ddb478308e9", + "zh:180f40124fa01b98b3d2f79128646b151818e09d6a1a9ca08e0b032a0b1e9cb1", + "zh:3e29e1de149dc10bf78620526c7cb8c62cd76087f5630dfaba0e93cda1f3aa7b", + "zh:4420950200cf86042ec940d0e2c9b7c89966bf556bf8038ba36217eae663bca5", + "zh:5d1f7d02109b2e2dca7ec626e5563ee765583792d0fd64081286f16f9433bd0d", + "zh:8500b138d338b1994c4206aa577b5c44e1d7260825babcf43245a7075bfa52a5", + "zh:b42165a6c4cfb22825938272d12b676e4a6946ac4e750f85df870c947685df2d", + "zh:b919bf3ee8e3b01051a0da3433b443a925e272893d3724ee8fc0f666ec7012c9", + "zh:d13b81ea6755cae785b3e11634936cdff2dc1ec009dc9610d8e3c7eb32f42e69", + "zh:f1c9d2eb1a6b618ae77ad86649679241bd8d6aacec06d0a68d86f748687f4eb3", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = "~> 2.0, >= 2.32.0" + hashes = [ + 
"h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.7.2" + hashes = [ + "h1:yHMBbZOIHlXUuBQ8Mhioe0hwmhermuboq2eNNoCJaf8=", + "zh:2ffeb1058bd7b21a9e15a5301abb863053a2d42dffa3f6cf654a1667e10f4727", + "zh:519319ed8f4312ed76519652ad6cd9f98bc75cf4ec7990a5684c072cf5dd0a5d", + "zh:7371c2cc28c94deb9dba62fbac2685f7dde47f93019273a758dd5a2794f72919", + "zh:9b0ac4c1d8e36a86b59ced94fa517ae9b015b1d044b3455465cc6f0eab70915d", + "zh:c6336d7196f1318e1cbb120b3de8426ce43d4cacd2c75f45dba2dbdba666ce00", + "zh:c71f18b0cb5d55a103ea81e346fb56db15b144459123f1be1b0209cffc1deb4e", + "zh:d2dc49a6cac2d156e91b0506d6d756809e36bf390844a187f305094336d3e8d8", + "zh:d5b5fc881ccc41b268f952dae303501d6ec9f9d24ee11fe2fa56eed7478e15d0", + "zh:db9723eaca26d58c930e13fde221d93501529a5cd036b1f167ef8cff6f1a03cc", + "zh:fe3359f733f3ab518c6f85f3a9cd89322a7143463263f30321de0973a52d4ad8", + ] +} + +provider "registry.opentofu.org/hashicorp/time" { + version = "0.13.1" + hashes = [ + "h1:ueilLAoXlZPufdJYuPFeqznwP39ZwLsRcQtqow+NUiI=", + "zh:10f32af8b544a039f19abd546e345d056a55cb7bdd69d5bbd7322cbc86883848", + "zh:35dd5beb34a9f73de8d0fed332814c69acae69397c9c065ce63ccd8315442bef", + "zh:56545d1dd5f2e7262e0c0c124264974229ec9cc234d0d7a0e36e14b869590f4a", + 
"zh:8d7259c3f819fd3470ff933c904b6a549502a8351feb1b5c040a4560decaf7e0", + "zh:a40f26878826b142e26fe193f7e3e14fc97f615cd6af140e88ce5bc25f3fcf50", + "zh:b2e82f25fecff172a9a9e24ea37d37e4fc630ee9245617cb40b10e66a6b979c8", + "zh:d4b699850a40ed07ef83c6b827605d24050b2732646ee017bda278e4ddf01c91", + "zh:e4e6a5e5614b6a54557400aabb748ebd57e947cdbd21ad1c7602c51368a80559", + "zh:eb78fb97bca22931e730487a20a90f5a6221ddfb3138aaf070737ea2b7c9c885", + "zh:faba366a1352ee679bba2a5b09c073c6854721db94b191d49b620b60946a065f", + ] +} + +provider "registry.opentofu.org/loafoe/htpasswd" { + version = "1.2.1" + hashes = [ + "h1:W1euQGM6t+QlB6Rq4fDbRKRHmeCIyYdIYdHrxL97BeE=", + "zh:14460c85ddc40a9ecadf583c22a7de91b83798a8ca4843949d50c3288c6f5bdd", + "zh:1af9416e28dd0a77c5d2c685561c4f60e19e2d606df0477ebc18eaa110c77807", + "zh:2245325864faaf027701ab12a04d641359a0dc439dd23c6e8f768407b78a5c18", + "zh:3813ff98198405d7c467565b52c7f0ad4533f43957da6390477dc898f8ed02c2", + "zh:3c0658e132232a181223f7ff65678d99cd2e8431c317f72281b67464e5e16892", + "zh:43505c0f42bc7635ec7c1fe5043c502f9b00ae4b5e74b81464bc494936643fc1", + "zh:52efdabb0abba99a33fd3ed981610f13c99bb383f94e997f90d95441d8558177", + "zh:75b5d9b4a610dfd0ff4dfb4039f61e79a0e56338e0a4cd45e0bc0edec34dfa62", + "zh:7aee5df091672d29f29dda57382a41d771fa21740cef6bb9a1b15afc6d84ffa4", + "zh:7ff618706e2953a21a22c7555e11f5cbe8e95c171704fcfdc6beedb0c25e49c0", + "zh:94e8a15c83a1a5a60ff1b58938dd9692d800fe05c5d8269e0916b5de03d89d3a", + "zh:c1ace4f322f9ec4956e4f30086da5b6a73f4d05e1266047d629b14a485c5a76d", + "zh:d4570075de49e3ee98494f7c44eab12e964c9776029ed536fd9352c3203cc635", + "zh:d99403b843de5939ea2e54b3ca46fd901d5c5b7fe34f44b8aeb8b38f4f792df6", + ] +} diff --git a/ops/README.md b/ops/README.md new file mode 100644 index 0000000..78fd6d6 --- /dev/null +++ b/ops/README.md @@ -0,0 +1,67 @@ +# CTF Pilot's Kubernetes Operations (Ops) + +> [!IMPORTANT] +> You are leaving the automated CTF Pilot setup and entering a more advanced manual setup. 
+> This requires knowledge of Kubernetes, Terraform/OpenTofu, and cloud infrastructure management. +> If you are not comfortable with these technologies, it is recommended to use the automated setup provided by CTF Pilot. +> Learn more about the automated setup in the [CTFp main README](../README.md). + +This directory contains various operational applications, services and configurations, deployed as a base on top of the Kubernetes cluster. + +Ops contains elements that need to be properly configured and deployed before the CTF Platform can be correctly deployed within the cluster. + +The following applications/services are included in the Ops: + +- [ArgoCD](https://argo-cd.readthedocs.io/) - GitOps continuous delivery tool, used to deploy and manage applications within the Kubernetes cluster. +- [Cert manager](https://cert-manager.io/) - Certificate management +- [Descheduler](https://github.com/kubernetes-sigs/descheduler) - Continuously rebalance the cluster +- [Error fallback](https://github.com/ctfpilot/error-fallback) - CTF Pilot's Error Fallback page +- [Filebeat](https://www.elastic.co/beats/filebeat) - Log offload to Elasticsearch +- [MariaDB Operator](https://github.com/mariadb-operator/mariadb-operator) - Operator to manage MariaDB within the cluster +- [Prometheus & Grafana stack](https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack) - Prometheus and Grafana stack for monitoring +- [Redis operator](https://github.com/OT-CONTAINER-KIT/redis-operator) - Redis operator to manage Redis within the cluster +- [Traefik](https://traefik.io/traefik) - Configuration of Traefik. This project only deploys additional Helm chart configuration. 
+ +## Pre-requisites + +The following software needs to be installed on your local machine: + +- [Terraform](https://www.terraform.io/downloads.html) / [OpenTofu](https://opentofu.org) +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For interacting with the Kubernetes cluster) + +The following services are required, in order to deploy the services to the cluster: + +- A Kubernetes cluster (Deployed using the [CTF Pilot's Kubernetes Cluster on Hetzner Cloud](../cluster/README.md) guide or other means) +- [Cloudflare](https://www.cloudflare.com/) account +- [Cloudflare API Token](https://dash.cloudflare.com/profile/api-tokens) (For authenticating with the Cloudflare API) +- [Cloudflare controlled domain](https://dash.cloudflare.com/) (For allowing the system to do DNS challenges for TLS certificates) + +## Setup + +Copy the `tfvars/template.tfvars` file to `tfvars/data.tfvars` and edit the file with your own values. +The [`tfvars/template.tfvars`](tfvars/template.tfvars) file contains further information on each variable. + +> [!IMPORTANT] +> Make sure you generate the backend configuration file before creating the cluster. +> See the [backend generation instructions](../backend/README.md) for more information. +> +> You will also need to set the following environment variables for authentication to the S3 backend: +> - `AWS_ACCESS_KEY_ID` +> - `AWS_SECRET_ACCESS_KEY` +> +> See [OpenTofu backend S3 configuration](https://opentofu.org/docs/language/settings/backends/s3/) for more information. 
+ +Run the following command to apply the ressources to the Kubernetes cluster: + +```bash +tofu init -backend-config=../backend/generated/ops.hcl +tofu apply --var-file tfvars/data.tfvars +``` + +### Destroying the Ops + +To destroy the deployed ops, run the following command: + +```bash +tofu destroy --var-file tfvars/data.tfvars +``` diff --git a/ops/argocd.tf b/ops/argocd.tf new file mode 100644 index 0000000..101ec46 --- /dev/null +++ b/ops/argocd.tf @@ -0,0 +1,116 @@ +resource "kubernetes_namespace_v1" "argocd" { + metadata { + name = "argocd" + } +} + +resource "helm_release" "argocd" { + namespace = kubernetes_namespace_v1.argocd.metadata.0.name + create_namespace = false + name = "argocd" + repository = "https://argoproj.github.io/argo-helm" + chart = "argo-cd" + version = var.argocd_version + + # Helm chart deployment can sometimes take longer than the default 5 minutes + timeout = 800 + + # If values file specified by the var.values_file input variable exists then apply the values from this file + # else apply the default values from the chart + values = [ + yamlencode({ + # "redis-ha" = { + # enabled = true + # }, + controller = { + replicas : 1 + }, + server = { + replicas : 2 + }, + repoServer = { + replicas : 2 + }, + applicationSet = { + replicas : 2 + } + }), + ] + + set_sensitive = [{ + name = "configs.secret.argocdServerAdminPassword" + value = var.argocd_admin_password == "" ? 
"" : bcrypt(var.argocd_admin_password) + }, + { + name = "configs.secret.githubSecret" + value = var.argocd_github_secret + }] + + set = [ + { + name = "dex.enabled" + value = true + }, + { + name = "configs.params.server\\.insecure" + value = true + } + ] + + depends_on = [ + kubernetes_namespace_v1.argocd + ] +} + +resource "kubernetes_ingress_v1" "argocd-ingress" { + metadata { + name = "argocd-ingress" + namespace = kubernetes_namespace_v1.argocd.metadata.0.name + + annotations = { + "cert-manager.io/cluster-issuer" = module.cert_manager.cluster_issuer_name + "traefik.ingress.kubernetes.io/router.middlewares" = "errors-errors@kubernetescrd" + } + } + + spec { + default_backend { + service { + name = "argocd-server" + port { + number = 80 + } + } + } + + rule { + host = "argocd.${var.cluster_dns_management}" + http { + path { + backend { + service { + name = "argocd-server" + port { + number = 80 + } + } + } + } + } + } + + tls { + hosts = [ + "argocd.${var.cluster_dns_management}" + ] + + secret_name = "argocd-cert" + } + } + + depends_on = [ + kubernetes_namespace_v1.argocd, + helm_release.argocd, + module.cert_manager, + ] +} diff --git a/ops/cert-manager.tf b/ops/cert-manager.tf new file mode 100644 index 0000000..81594d9 --- /dev/null +++ b/ops/cert-manager.tf @@ -0,0 +1,90 @@ +resource "kubernetes_namespace_v1" "cert_manager" { + metadata { + name = "cert-manager" + } +} + +module "cert_manager" { + source = "terraform-iaac/cert-manager/kubernetes" + + cluster_issuer_email = var.email + cluster_issuer_name = "cert-manager-global" + cluster_issuer_private_key_secret_name = "cert-manager-private-key" + chart_version = var.cert_manager_version + + namespace_name = kubernetes_namespace_v1.cert_manager.metadata.0.name + create_namespace = false + + + solvers = [ + { + dns01 = { + cloudflare = { + email = var.email + apiTokenSecretRef = { + name = kubernetes_secret.cloudflare_api_key_secret.metadata.0.name + key = "API" + } + }, + }, + selector = { + dnsZones = 
[ + var.cloudflare_dns_management + ] + } + }, + { + dns01 = { + cloudflare = { + email = var.email + apiTokenSecretRef = { + name = kubernetes_secret.cloudflare_api_key_secret.metadata.0.name + key = "API" + } + }, + }, + selector = { + dnsZones = [ + var.cloudflare_dns_ctf + ] + } + }, + { + dns01 = { + cloudflare = { + email = var.email + apiTokenSecretRef = { + name = kubernetes_secret.cloudflare_api_key_secret.metadata.0.name + key = "API" + } + }, + }, + selector = { + dnsZones = [ + var.cloudflare_dns_platform + ] + } + } + ] + + depends_on = [ + kubernetes_namespace_v1.cert_manager, + kubernetes_secret.cloudflare_api_key_secret + ] +} + +# Cloudflare api token secret +resource "kubernetes_secret" "cloudflare_api_key_secret" { + metadata { + name = "cloudflare-api-key-secret" + namespace = kubernetes_namespace_v1.cert_manager.metadata.0.name + } + + data = { + API = var.cloudflare_api_token + } + + depends_on = [ + kubernetes_namespace_v1.cert_manager + ] +} diff --git a/ops/descheduler.tf b/ops/descheduler.tf new file mode 100644 index 0000000..ffde222 --- /dev/null +++ b/ops/descheduler.tf @@ -0,0 +1,24 @@ +resource "kubernetes_namespace_v1" "descheduler" { + metadata { + name = "descheduler" + } +} + +resource "helm_release" "descheduler" { + name = "descheduler" + repository = "https://kubernetes-sigs.github.io/descheduler/" + chart = "descheduler" + version = var.descheduler_version + + namespace = kubernetes_namespace_v1.descheduler.metadata.0.name + create_namespace = false + + values = [ + yamlencode({ + serviceMonitor = { + enabled = true + namespace = "prometheus" + } + }) + ] +} diff --git a/ops/errors.tf b/ops/errors.tf new file mode 100644 index 0000000..561e705 --- /dev/null +++ b/ops/errors.tf @@ -0,0 +1,150 @@ +resource "kubernetes_namespace" "errors" { + metadata { + name = "errors" + labels = { + role = "errors" + } + } +} + +module "errors-pull-secret" { + source = "../tf-modules/pull-secret" + + namespace = 
kubernetes_namespace.errors.metadata[0].name + ghcr_token = var.ghcr_token + ghcr_username = var.ghcr_username + + depends_on = [ + kubernetes_namespace.errors + ] +} + +resource "kubernetes_deployment_v1" "errors" { + metadata { + name = "errors" + namespace = "errors" + + labels = { + role = "errors" + } + } + + spec { + replicas = 2 + + selector { + match_labels = { + role = "errors" + } + } + + template { + metadata { + labels = { + role = "errors" + } + } + + spec { + enable_service_links = false + automount_service_account_token = false + + image_pull_secrets { + name = var.ghcr_token != "" ? module.errors-pull-secret.pull-secret : "" + } + + container { + name = "errors" + image = var.image_error_fallback + image_pull_policy = "Always" + + port { + container_port = 80 + } + + resources { + limits = { + cpu = "100m" + memory = "256Mi" + } + requests = { + cpu = "10m" + memory = "50Mi" + } + } + + liveness_probe { + http_get { + path = "/" + port = 80 + } + + initial_delay_seconds = 5 + period_seconds = 10 + } + } + } + } + } + + depends_on = [ + kubernetes_namespace.errors, + module.errors-pull-secret + ] +} + +resource "kubernetes_service_v1" "errors" { + metadata { + name = "errors" + namespace = "errors" + + labels = { + role = "errors" + } + } + + spec { + selector = { + role = "errors" + } + + port { + port = 80 + target_port = 80 + } + } + + depends_on = [ + kubernetes_deployment_v1.errors + ] +} + +resource "kubernetes_manifest" "traefik-errors-middleware" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = "errors" + namespace = "errors" + } + spec = { + errors = { + status = [ + "502", + "503", + "504" + ] + query = "/{status}.html" + service = { + name = "errors" + port = 80 + } + } + } + } + + depends_on = [ + kubernetes_namespace.errors, + kubernetes_service_v1.errors + ] +} diff --git a/ops/filebeat-values/values.yaml b/ops/filebeat-values/values.yaml new file mode 100644 index 0000000..c07a667 --- 
/dev/null +++ b/ops/filebeat-values/values.yaml @@ -0,0 +1,22 @@ +daemonset: + # Include the daemonset + enabled: true + extraEnvs: + - name: "ELASTICSEARCH_HOSTS" + valueFrom: + secretKeyRef: + name: es-credentials + key: hosts + - name: "ELASTICSEARCH_USERNAME" + valueFrom: + secretKeyRef: + name: es-credentials + key: username + - name: "ELASTICSEARCH_PASSWORD" + valueFrom: + secretKeyRef: + name: es-credentials + key: password + secretMounts: NULL +deployments: + secretMounts: NULL diff --git a/ops/filebeat.tf b/ops/filebeat.tf new file mode 100644 index 0000000..60a24bb --- /dev/null +++ b/ops/filebeat.tf @@ -0,0 +1,226 @@ +resource "kubernetes_namespace" "logging-namespace" { + metadata { + name = "logging" + } + lifecycle { + ignore_changes = [metadata] + } +} + +resource "kubernetes_secret" "es_credentials" { + metadata { + name = "es-credentials" + namespace = kubernetes_namespace.logging-namespace.metadata.0.name + } + data = { + "username" = var.filebeat_elasticsearch_username + "password" = var.filebeat_elasticsearch_password + } + type = "Opaque" +} + +resource "kubernetes_service_account" "filebeat_service_account" { + metadata { + name = "filebeat" + namespace = kubernetes_namespace.logging-namespace.metadata.0.name + } +} + +resource "kubernetes_cluster_role_v1" "filebeat_cluster_role" { + metadata { + name = "filebeat" + } + rule { + api_groups = [""] + resources = ["namespaces", "pods", "serviceaccounts", "nodes", "endpoints"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding_v1" "filebeat_cluster_role_binding" { + metadata { + name = "filebeat" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role_v1.filebeat_cluster_role.metadata.0.name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.filebeat_service_account.metadata.0.name + namespace = kubernetes_namespace.logging-namespace.metadata.0.name + } +} + +resource 
"kubernetes_config_map" "filebeat_config" { + metadata { + name = "filebeat-config" + namespace = kubernetes_namespace.logging-namespace.metadata.0.name + } + data = { + "filebeat.yml" = <<-EOF + filebeat.inputs: + - type: container + paths: + - /var/log/containers/*.log + processors: + - add_kubernetes_metadata: + host: $${NODE_NAME} + matchers: + - logs_path: + logs_path: "/var/log/containers/" + - add_fields: + target: '' + fields: + cluster_dns: "${var.cluster_dns_management}" + + output.elasticsearch: + hosts: ["https://${var.filebeat_elasticsearch_host}:443"] + username: "${var.filebeat_elasticsearch_username}" + password: "${var.filebeat_elasticsearch_password}" + protocol: https + ssl.verification_mode: "full" + index: filebeat-${var.environment}-logs + + setup: + template: + name: "filebeat-${var.environment}-logs" + pattern: "filebeat-${var.environment}-logs*" + overwrite: false + ilm: + enabled: true + policy_name: "filebeat" + EOF + } +} + +resource "kubernetes_daemonset" "filebeat_daemonset" { + metadata { + name = "filebeat" + namespace = kubernetes_namespace.logging-namespace.metadata.0.name + labels = { + k8s-app = "filebeat-logging" + version = "v1" + app = "filebeat" + } + } + + spec { + selector { + match_labels = { + k8s-app = "filebeat-logging" + version = "v1" + app = "filebeat" + } + } + + template { + metadata { + labels = { + k8s-app = "filebeat-logging" + version = "v1" + app = "filebeat" + } + } + + spec { + service_account_name = kubernetes_service_account.filebeat_service_account.metadata.0.name + + toleration { + key = "node-role.kubernetes.io/control-plane" + effect = "NoSchedule" + } + toleration { + key = "node-role.kubernetes.io/master" + effect = "NoSchedule" + } + toleration { + key = "cluster.ctfpilot.com/node" + value = "scaler" + effect = "PreferNoSchedule" + } + + container { + name = "filebeat" + image = var.image_filebeat + security_context { + privileged = true + } + env { + name = "ELASTICSEARCH_HOST" + value = 
"https://${var.filebeat_elasticsearch_host}:443" + } + env { + name = "ELASTICSEARCH_USERNAME" + value_from { + secret_key_ref { + name = "es-credentials" + key = "username" + } + } + } + env { + name = "ELASTICSEARCH_PASSWORD" + value_from { + secret_key_ref { + name = "es-credentials" + key = "password" + } + } + } + env { + name = "NODE_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + + resources { + requests = { + cpu = "10m" + memory = "100Mi" + } + limits = { + cpu = "200m" + memory = "200Mi" + } + } + + volume_mount { + name = "varlog" + mount_path = "/var/log" + } + volume_mount { + name = "filebeat-config" + mount_path = "/usr/share/filebeat/filebeat.yml" + sub_path = "filebeat.yml" + } + } + + termination_grace_period_seconds = 30 + + volume { + name = "varlog" + host_path { + path = "/var/log" + } + } + volume { + name = "filebeat-config" + config_map { + name = kubernetes_config_map.filebeat_config.metadata.0.name + } + } + } + } + } + + depends_on = [ + kubernetes_namespace.logging-namespace, + kubernetes_secret.es_credentials, + kubernetes_config_map.filebeat_config + ] +} diff --git a/ops/ingress.tf b/ops/ingress.tf new file mode 100644 index 0000000..a37f886 --- /dev/null +++ b/ops/ingress.tf @@ -0,0 +1,44 @@ +resource "htpasswd_password" "traefik_basic_auth" { + password = var.traefik_basic_auth.password + salt = random_password.salt.result + + depends_on = [ + random_password.salt + ] +} + +resource "kubernetes_secret" "traefik_basic_auth" { + metadata { + name = "admin-ui-basic-auth" + namespace = var.traefik_namespace + } + + data = { + "auth" = "${var.traefik_basic_auth.user}:${htpasswd_password.traefik_basic_auth.apr1}" + } + + depends_on = [ + htpasswd_password.traefik_basic_auth + ] +} + +# Traefik basic auth middleware +resource "kubernetes_manifest" "traefik_basic_auth" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = 
kubernetes_secret.traefik_basic_auth.metadata.0.name + namespace = var.traefik_namespace + } + spec = { + basicAuth = { + secret = kubernetes_secret.traefik_basic_auth.metadata.0.name + } + } + } + + depends_on = [ + kubernetes_secret.traefik_basic_auth + ] +} diff --git a/ops/mariadb-operator.tf b/ops/mariadb-operator.tf new file mode 100644 index 0000000..99ac19f --- /dev/null +++ b/ops/mariadb-operator.tf @@ -0,0 +1,52 @@ +resource "kubernetes_namespace_v1" "mariadb" { + metadata { + name = "mariadb-operator" + } +} + +resource "helm_release" "mariadb-operator-crds" { + name = "mariadb-operator-crds" + repository = "https://helm.mariadb.com/mariadb-operator" + namespace = kubernetes_namespace_v1.mariadb.metadata.0.name + create_namespace = false + + chart = "mariadb-operator-crds" + version = var.mariadb_operator_version + + // timeout 10min + timeout = 600 + + // Force use of longhorn storage class + # set = [{ + # name = "mariadb-operator.storageClass" + # value = "longhorn" + # }] + + depends_on = [ + kubernetes_namespace_v1.mariadb + ] +} + +resource "helm_release" "mariadb-operator" { + name = "mariadb-operator" + repository = "https://helm.mariadb.com/mariadb-operator" + namespace = kubernetes_namespace_v1.mariadb.metadata.0.name + create_namespace = false + + chart = "mariadb-operator" + version = var.mariadb_operator_version + + # timeout 10min + timeout = 600 + + // Force use of longhorn storage class + # set = [{ + # name = "mariadb-operator.storageClass" + # value = "longhorn" + # }] + + depends_on = [ + helm_release.mariadb-operator-crds, + kubernetes_namespace_v1.mariadb + ] +} diff --git a/ops/prod-default-web.tf b/ops/prod-default-web.tf new file mode 100644 index 0000000..743b310 --- /dev/null +++ b/ops/prod-default-web.tf @@ -0,0 +1,132 @@ +# ---------------------- +# Default web entrypoint +# ---------------------- + +# Namespace +resource "kubernetes_namespace" "prod-default-web" { + metadata { + name = "prod-default-web" + } +} + +# Ingress 
+resource "kubernetes_ingress_v1" "prod-default-web" { + metadata { + name = "prod-default-web-ingress" + namespace = kubernetes_namespace.prod-default-web.metadata.0.name + + annotations = { + "cert-manager.io/cluster-issuer" = module.cert_manager.cluster_issuer_name + "traefik.ingress.kubernetes.io/router.middlewares" = "errors-errors@kubernetescrd" + } + } + + spec { + default_backend { + service { + name = kubernetes_service_v1.prod-default-web.metadata.0.name + port { + number = 80 + } + } + } + + rule { + host = var.cluster_dns_management + http { + path { + path = "/" + backend { + service { + name = kubernetes_service_v1.prod-default-web.metadata.0.name + port { + number = 80 + } + } + } + } + } + } + + tls { + hosts = [ + "${var.cluster_dns_management}" + ] + + secret_name = "prod-default-web-cert" + } + } + + depends_on = [ + kubernetes_namespace.prod-default-web, + kubernetes_service_v1.prod-default-web, + module.cert_manager, + ] +} + +# Service +resource "kubernetes_service_v1" "prod-default-web" { + metadata { + name = "prod-default-web" + namespace = kubernetes_namespace.prod-default-web.metadata.0.name + } + + spec { + selector = { + app = "prod-default-web" + } + + port { + port = 80 + target_port = 5678 + } + } + + depends_on = [ + kubernetes_deployment_v1.prod-default-web + ] +} + +# Deployment +resource "kubernetes_deployment_v1" "prod-default-web" { + metadata { + name = "prod-default-web" + namespace = kubernetes_namespace.prod-default-web.metadata.0.name + } + + spec { + replicas = 3 + + selector { + match_labels = { + app = "prod-default-web" + } + } + + template { + metadata { + labels = { + app = "prod-default-web" + } + } + + spec { + container { + name = "prod-default-web" + image = "hashicorp/http-echo" + args = [ + "-text=Welcome to CTF Pilot!" 
+ ] + + port { + container_port = 5678 + } + } + } + } + } + + depends_on = [ + kubernetes_namespace.prod-default-web + ] +} diff --git a/ops/prometheus.tf b/ops/prometheus.tf new file mode 100644 index 0000000..2ff5fc3 --- /dev/null +++ b/ops/prometheus.tf @@ -0,0 +1,214 @@ +resource "kubernetes_namespace_v1" "prometheus" { + metadata { + name = "prometheus" + } +} + +# --- Grafana Dashboards ConfigMaps --- +resource "kubernetes_config_map" "grafana-dashboards-k8s" { + metadata { + name = "grafana-dashboards-k8s" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_dashboard = 1 + } + + annotations = { + k8s-sidecar-target-directory = "/tmp/dashboards/k8s" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/dashboards/k8s", "*.json") : file => file("${path.module}/prometheus/grafana/dashboards/k8s/${file}") + } +} + +resource "kubernetes_config_map" "grafana-dashboards-redis" { + metadata { + name = "grafana-dashboards-redis" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_dashboard = 1 + } + + annotations = { + k8s-sidecar-target-directory = "/tmp/dashboards/redis" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/dashboards/redis", "*.json") : file => file("${path.module}/prometheus/grafana/dashboards/redis/${file}") + } +} + +resource "kubernetes_config_map" "grafana-dashboards-traefik" { + metadata { + name = "grafana-dashboards-traefik" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_dashboard = 1 + } + + annotations = { + k8s-sidecar-target-directory = "/tmp/dashboards/traefik" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/dashboards/traefik", "*.json") : file => file("${path.module}/prometheus/grafana/dashboards/traefik/${file}") + } +} + +resource "kubernetes_config_map" "grafana-dashboards-ctf" { + metadata { + name = "grafana-dashboards-ctf" + 
namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_dashboard = 1 + } + + annotations = { + k8s-sidecar-target-directory = "/tmp/dashboards/ctf" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/dashboards/ctf", "*.json") : file => file("${path.module}/prometheus/grafana/dashboards/ctf/${file}") + } +} + +# --- Grafana Alerting Rules and Contacts --- +resource "kubernetes_secret" "grafana-alerts-contact-rules" { + metadata { + name = "grafana-alerts-contact-rules" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_alert = "1" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/contact", "*.yaml") : file => templatefile("${path.module}/prometheus/grafana/contact/${file}", { + cluster_dns_management = var.cluster_dns_management, + discord_webhook_url = var.discord_webhook_url, + }) + } + type = "Opaque" +} + +resource "kubernetes_config_map" "grafana-alerts-notification-rules" { + metadata { + name = "grafana-alerts-notification-rules" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + labels = { + grafana_alert = "1" + } + } + + data = { + for file in fileset("${path.module}/prometheus/grafana/notification", "*.yaml") : file => templatefile("${path.module}/prometheus/grafana/notification/${file}", { + cluster_dns_management = var.cluster_dns_management, + discord_webhook_url = var.discord_webhook_url, + }) + } +} + +# --- Prometheus Helm Release --- +resource "helm_release" "prometheus" { + name = "prometheus" + + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + create_namespace = false + + repository = "https://prometheus-community.github.io/helm-charts" + chart = "kube-prometheus-stack" + version = var.kube_prometheus_stack_version + + # Set password for grafana dashboard + set_sensitive = [{ + name = "grafana.adminPassword" + value = var.grafana_admin_password + }] + + # Use PVC for prometheus data + 
set = [ + # { + # name = "prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.storageClassName" + # value = "longhorn" + # }, + { + name = "prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage" + value = var.prometheus_storage_size + } + ] + + values = [ + templatefile("${path.module}/prometheus/kube_prometheus_custom_values.yaml", { + cluster_dns_management = var.cluster_dns_management, + discord_webhook_url = var.discord_webhook_url, + }) + ] + + depends_on = [ + kubernetes_namespace_v1.prometheus, + kubernetes_config_map.grafana-dashboards-k8s, + kubernetes_config_map.grafana-dashboards-redis + ] +} + +# --- Grafana Ingress --- +resource "kubernetes_ingress_v1" "grafana-ingress" { + metadata { + name = "grafana-ingress" + namespace = kubernetes_namespace_v1.prometheus.metadata.0.name + + annotations = { + "cert-manager.io/cluster-issuer" = module.cert_manager.cluster_issuer_name + "traefik.ingress.kubernetes.io/router.middlewares" = "errors-errors@kubernetescrd" + } + } + + spec { + default_backend { + service { + name = "prometheus-grafana" + port { + number = 80 + } + } + } + + rule { + host = "grafana.${var.cluster_dns_management}" + http { + path { + backend { + service { + name = "prometheus-grafana" + port { + number = 80 + } + } + } + } + } + } + + tls { + hosts = [ + "grafana.${var.cluster_dns_management}" + ] + secret_name = "grafana-ingress-tls-cert" + } + } + + depends_on = [ + helm_release.prometheus + ] +} diff --git a/ops/prometheus/README.md b/ops/prometheus/README.md new file mode 100644 index 0000000..198ea89 --- /dev/null +++ b/ops/prometheus/README.md @@ -0,0 +1,7 @@ +# Dashboards + +Custom dashboards can be added in [`/grafana/dashboards`](./grafana/dashboards/). 
+ +There are a number of external dashboards: + +- diff --git a/ops/prometheus/grafana/alerts/.gitkeep b/ops/prometheus/grafana/alerts/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ops/prometheus/grafana/contact/notificationpolicy.yaml b/ops/prometheus/grafana/contact/notificationpolicy.yaml new file mode 100644 index 0000000..f6e81aa --- /dev/null +++ b/ops/prometheus/grafana/contact/notificationpolicy.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +policies: + - orgId: 1 + receiver: discord + matchers: + - severity = critical + group_wait: 30s + group_interval: 5m + repeat_interval: 4h diff --git a/ops/prometheus/grafana/contact/notifiers.yaml b/ops/prometheus/grafana/contact/notifiers.yaml new file mode 100644 index 0000000..cb78b3f --- /dev/null +++ b/ops/prometheus/grafana/contact/notifiers.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +contactPoints: + - name: "discord" + org_id: 1 + receivers: + - uid: "discord-contact" + name: "discord" + type: "discord" + is_default: true + settings: + url: "${discord_webhook_url}" diff --git a/ops/prometheus/grafana/dashboards/ctf/container-usage.json b/ops/prometheus/grafana/dashboards/ctf/container-usage.json new file mode 100644 index 0000000..15f9c02 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/container-usage.json @@ -0,0 +1,1096 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 13, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count(kube_deployment_labels{namespace=~\"$namespaces\", label_ctfpilot_com_name!=\"\"})", + "instant": false, + "legendFormat": "Deployments", + "range": true, + "refId": "A" + } + ], + "title": "Total CTF Pilot deployments", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "vCPU" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Total vCPU usage", + "Total vCPU available", + "Node available vCPU" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespaces\", container!=\"\", image!=\"\", node=~\"$nodes\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "Total vCPU usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(node:node_num_cpu:sum{node=~\"$nodes\"})", + "hide": false, + "instant": false, + "legendFormat": "Total vCPU available", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node:node_num_cpu:sum{node=~\"$nodes\"})\r\n/\r\ncount(count(node:node_num_cpu:sum{node=~\"$nodes\"}) by (node))\r\n", + "hide": false, + "instant": false, + "legendFormat": "Node available vCPU", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "sum(node:node_num_cpu:sum)", + "hide": false, + "instant": false, + "legendFormat": "Cluster vCPU available", + "range": true, + "refId": "D" + } + ], + "title": "CPU usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": -3, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Total memory usage", + "Total available memory", + "Node available memory" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum(container_memory_working_set_bytes{namespace=~\"$namespaces\", container!=\"\", image!=\"\", instance=~\"$internalIp:10250\"})", + "hide": false, + "instant": false, + "legendFormat": "Total memory usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"})", + "hide": false, + "instant": false, + "legendFormat": "Total available memory", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"}) /\r\ncount(count(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"}) by (instance))", + "hide": false, + "instant": false, + "legendFormat": "Node available memory", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemTotal_bytes)", + "hide": false, + "instant": false, + "legendFormat": "Cluster memory available", + "range": true, + "refId": "D" + } + ], + "title": "RAM usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 0, + "y": 24 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": 
{ + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_node_info{node=~\"$nodes\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Node", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "vCPU" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 3, + "y": 24 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node:node_num_cpu:sum{node=~\"$nodes\"}) / count(count(node:node_num_cpu:sum{node=~\"$nodes\"}) by (node))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Node vCPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 6, + "y": 24 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"}) /\r\ncount(count(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"}) by (instance))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Node memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "vCPU" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 9, + "y": 24 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node:node_num_cpu:sum{node=~\"$nodes\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total vCPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 12, + "y": 24 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$internalIp:9100\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 15, + "y": 24 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_node_info)", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Cluster node count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "vCPU" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 18, + "y": 24 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node:node_num_cpu:sum)", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Cluster vCPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 24 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes)", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Cluster memory", + "type": "stat" + } + ], + 
"refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": [ + "ctfpilot-challenges", + "ctfpilot-challenges-instanced" + ], + "value": [ + "ctfpilot-challenges", + "ctfpilot-challenges-instanced" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespaces", + "multi": true, + "name": "namespaces", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(container_cpu_usage_seconds_total{namespace=~\"$namespaces\"},node)", + "hide": 0, + "includeAll": true, + "label": "Nodes", + "multi": true, + "name": "nodes", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(container_cpu_usage_seconds_total{namespace=~\"$namespaces\"},node)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(kube_node_info{node=~\"$nodes\"},internal_ip)", + "hide": 2, + "includeAll": true, + "label": "Internal IP", + "multi": true, + "name": "internalIp", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info{node=~\"$nodes\"},internal_ip)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + 
] + }, + "time": { + "from": "2025-08-22T11:30:00.000Z", + "to": "2025-08-24T12:15:00.000Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Container usage", + "uid": "bew212y4i2sqoe", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/ctf/ctfd-challenges.json b/ops/prometheus/grafana/dashboards/ctf/ctfd-challenges.json new file mode 100644 index 0000000..f516d0d --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/ctfd-challenges.json @@ -0,0 +1,116 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 55, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 21, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "interval": "1", + "maxDataPoints": 999999999, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "titleSize": 16, + "valueSize": 22 + }, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "ctfd_challenge_solves{}", + "instant": false, + 
"legendFormat": "{{name}}", + "range": true, + "refId": "A" + } + ], + "title": "Challenge Solves", + "type": "stat" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "CTFd Challenges", + "uid": "devryawzpgav4b", + "version": 4, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/ctf/ctfd.json b/ops/prometheus/grafana/dashboards/ctf/ctfd.json new file mode 100644 index 0000000..545f386 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/ctfd.json @@ -0,0 +1,1550 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 23095, + "graphTooltip": 0, + "id": 12, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "panels": [], + "title": "Core information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Top 10 teams in brackets", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 11, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "topk(10, sum by(name) (ctfd_team_score{bracket=~\"$brackets\"}))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Scoreboard", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "User" + }, + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Score" + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 
5, + "x": 11, + "y": 1 + }, + "id": 1, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(15, sum by(name) (last_over_time(ctfd_team_score[$__interval])))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Top 15 teams", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "color-text" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 17, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(15, sum(ctfd_submission_solves) by(name))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Most solves", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "name", + "Value" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Solves", + "name": "Name" + } + } + } + ], + "type": "table" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "color-text" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "category" + }, + "properties": [ + { + "id": "custom.width", + "value": 122 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "category" + }, + "properties": [ + { + "id": "custom.width" + }, + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + }, + { + "id": "displayName", + "value": "Category" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "Name" + }, + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Solves" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Name" + }, + "properties": [ + { + "id": "custom.width", + "value": 237 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Solves" + }, + "properties": [ + { + "id": "custom.width", + "value": 89 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Category" + }, + "properties": [ + { + "id": "custom.width", + "value": 111 + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": 
[] + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "bottomk(\n 15,\n sum(ctfd_challenge_solves) by(name) > 0\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Least solved (> 0)", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "displayLabels": [ + "name", + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "values": [] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(ctfd_challenge_solves) by (category)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Solves per category", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "color-text" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": 
"custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "category" + }, + "properties": [ + { + "id": "custom.width", + "value": 122 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "category" + }, + "properties": [ + { + "id": "custom.width" + }, + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + }, + { + "id": "displayName", + "value": "Category" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "Name" + }, + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Solves" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Name" + }, + "properties": [ + { + "id": "custom.width", + "value": 237 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Solves" + }, + "properties": [ + { + "id": "custom.width", + "value": 89 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Category" + }, + "properties": [ + { + "id": "custom.width", + "value": 111 + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 6, + "x": 5, + "y": 18 + }, + "id": 19, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "bottomk(\n 10,\n sum(ctfd_challenge_solves) by(name, category) == 0\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Unsolved challenges", + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "color-text" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 5, + "x": 11, + "y": 18 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "topk(10, sum(ctfd_submission_fails) by(name))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Most fails", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Fails", + "name": "Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 16, + "y": 18 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "ctfd_users_total", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total users", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 19, + "y": 18 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "ctfd_teams_total", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total teams", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 22, + "y": 18 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "ctfd_challenges_total", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total challenges", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "displayName": "Alive containers", + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 16, + "y": 23 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active instances", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 2, + "x": 22, + "y": 23 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total instances", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 12, + "panels": [], + "title": "Scoreboard", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto", + "wrapText": true + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 13, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + 
"pluginVersion": "11.1.5", + "repeat": "brackets", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "topk(20, ctfd_team_score{bracket=\"$brackets\"})", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Top 20 teams for $brackets", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "name", + "Value" + ] + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Value" + } + ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Score", + "name": "Team name" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 15, + "panels": [], + "title": "Additional data", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "ctfd_users_total", + "instant": false, + "legendFormat": "Users", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "ctfd_teams_total", + "hide": false, + "instant": false, + "legendFormat": "Teams", + "range": true, + "refId": "B" + } + ], + "title": "Panel Title", + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": "", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(ctfd_team_score,bracket)", + "hide": 0, + "includeAll": true, + "label": "Brackets", + "multi": true, + "name": "brackets", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(ctfd_team_score,bracket)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "CTFd", + "uid": "ctfdcet4q2i3du29", + 
"version": 7, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/ctf/kubectf.json b/ops/prometheus/grafana/dashboards/ctf/kubectf.json new file mode 100644 index 0000000..7867aa0 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/kubectf.json @@ -0,0 +1,1577 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 13, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Running challenges", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "instant": true, + "legendFormat": "Total Instances", + "refId": "A" + } + ], + "title": "Total Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 4, + "y": 1 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": true + }, + "showHeader": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count by (label_ctfpilot_com_name) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Instances by Challenge Name", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Count", + "label_ctfpilot_com_name": "Challenge Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 9, + "y": 1 + }, + "id": 15, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": true + }, + "showHeader": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count by (label_ctfpilot_com_type) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Instances by Challenge Type", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Count", + "label_ctfpilot_com_name": "Challenge Name", + "label_ctfpilot_com_type": "Challenge Type" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 14, + "y": 1 + }, + "id": 16, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": true + }, + "showHeader": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count by (label_instanced_challenges_ctfpilot_com_owner) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Instances by Challenge Owner", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Count", + 
"label_instanced_challenges_ctfpilot_com_owner": "Challenge Owner", + "label_ctfpilot_com_name": "Challenge Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "custom.width", + "value": 363 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": true + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count by (deployment) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_instanced_challenges_ctfpilot_com_deployment!=\"\"})", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Instances by Deployment ID", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Count", + "deployment": "Challenge Deployment", + "label_instanced_challenges_ctfpilot_com_deployment": "Deployment ID" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + 
{ + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 6 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(kube_configmap_labels{namespace=\"challenge-config\", label_challenges_ctfpilot_com_name!=\"\", label_challenges_ctfpilot_com_enabled!=\"false\"})", + "instant": true, + "legendFormat": "Enabled Challenges", + "refId": "A" + } + ], + "title": "Total Enabled Challenges", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "options": { + "legend": { + 
"calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "legendFormat": "Total Instances", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"}) by (label_ctfpilot_com_name)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Total Instances Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count by (label_ctfpilot_com_name) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"})", + "legendFormat": "{{label_ctfpilot_com_name}}", + "refId": "A" + } + ], + "title": "Instances by Challenge Name Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count by (label_instanced_challenges_ctfpilot_com_owner) 
(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_instanced_challenges_ctfpilot_com_owner!=\"\"})", + "legendFormat": "{{label_instanced_challenges_ctfpilot_com_owner}}", + "refId": "A" + } + ], + "title": "Instances by Owner Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count by (label_ctfpilot_com_type) (kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_type!=\"\"})", + "legendFormat": "{{label_ctfpilot_com_type}}", + "refId": "A" + } + ], + "title": "Instances by Type Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count by (label_challenges_ctfpilot_com_type) (kube_configmap_labels{namespace=\"challenge-config\", label_challenges_ctfpilot_com_name!=\"\", label_challenges_ctfpilot_com_enabled!=\"false\"})", + "instant": false, + "legendFormat": "{{label_challenges_ctfpilot_com_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Enabled Challenges by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum (rate(container_cpu_usage_seconds_total{namespace=~\"ctfpilot-challenges|ctfpilot-challenges-instanced\"}[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "Total CPU", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(namespace) (rate(container_cpu_usage_seconds_total{namespace=~\"ctfpilot-challenges|ctfpilot-challenges-instanced\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "{{label_challenges_ctfpilot_com_type}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU usage of instances", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decgbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum(rate(container_memory_usage_bytes{namespace=~\"ctfpilot-challenges|ctfpilot-challenges-instanced\"}[$__rate_interval])) / 1000000000", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Total CPU", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(namespace) (rate(container_memory_usage_bytes{namespace=~\"ctfpilot-challenges|ctfpilot-challenges-instanced\"}[$__rate_interval])) / 1000000000", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{label_challenges_ctfpilot_com_type}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory usage of instances", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 0, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 0, + "y": 52 + }, + "id": 18, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count by (label_challenges_ctfpilot_com_category) (kube_configmap_labels{namespace=\"challenge-config\", label_challenges_ctfpilot_com_name!=\"\", label_challenges_ctfpilot_com_enabled!=\"false\"})", + "instant": true, + "legendFormat": "{{label_challenges_ctfpilot_com_type}}", + "range": false, + "refId": "A" + } + ], + "title": "Enabled Challenges by Category", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 0, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 5, + "y": 52 + }, + "id": 17, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count by (label_challenges_ctfpilot_com_enabled) (kube_configmap_labels{namespace=\"challenge-config\", label_challenges_ctfpilot_com_name!=\"\", label_challenges_ctfpilot_com_enabled!=\"false\"})", + "instant": true, + "legendFormat": "{{label_challenges_ctfpilot_com_type}}", + "range": false, + "refId": "A" + } + ], + "title": "Challenges by Enabled", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, + "x": 10, + "y": 52 + }, + "id": 19, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"}) by (label_ctfpilot_com_name)", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Instances deployed by name", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 5, 
+ "x": 15, + "y": 52 + }, + "id": 22, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(kube_deployment_labels{namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\", label_ctfpilot_com_name!=\"\"}) by (label_ctfpilot_com_type)", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Instances deployed by type", + "type": "piechart" + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": [ + "ctfpilot", + "kubernetes" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "1h", + "value": "1h" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Granularity", + "options": [ + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "15m", + "value": "15m" + }, + { + "selected": true, + "text": "1h", + "value": "1h" + } + ], + "query": "1m,5m,15m,1h", + "queryValue": "", + "refresh": 1, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": 
"Challenges Dashboard", + "uid": "ctfpilot-overview", + "version": 10, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/ctf/node-usage.json b/ops/prometheus/grafana/dashboards/ctf/node-usage.json new file mode 100644 index 0000000..2151df2 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/node-usage.json @@ -0,0 +1,788 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 13, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [], + "title": "Cluster Totals", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "System uptime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "GB" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { 
+ "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}) / (1024*1024*1024)", + "legendFormat": "Memory Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by(instance) (node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}) / (1024*1024*1024)", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Uptime", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cores" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\", instance=~\"$node\"}[5m]))", + "legendFormat": "CPU Usage", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by(instance) (rate(node_cpu_seconds_total{mode!=\"idle\", instance=~\"$node\"}[5m]))", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Total CPU Usage (Cores)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 6, + "panels": [], + "repeat": "node", + "repeatDirection": "h", + "title": "Node - $node", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 15 + }, + "id": 25, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^nodename$/", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by (nodename) (node_uname_info{instance=~\"$node\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + 
} + ], + "title": "Node name", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "k3s-agents-1-dxg": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 4, + "y": 15 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count by (cluster, node) (node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 7, + "y": 15 + }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "none", 
+ "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (cluster) (node_memory_MemTotal_bytes{instance=~\"$node\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total RAM", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "System uptime", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 10, + "y": 15 + }, + "id": 64, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "node_time_seconds{instance=~\"$node\"} - node_boot_time_seconds{instance=~\"$node\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cores" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by (instance)(rate(node_cpu_seconds_total{mode!=\"idle\", instance=~\"$node\"}[5m]))", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "CPU Usage per Node (Cores)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "GB" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "(node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}) / (1024*1024*1024)", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Memory Usage per Node (GB)", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "nodes", + "resources" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(node_cpu_seconds_total,instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "node", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total,instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes Nodes - CPU & Memory (by Label)", + "uid": "k8s-nodes-extended2", + "version": 12, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/ctf/team-instances.json b/ops/prometheus/grafana/dashboards/ctf/team-instances.json new file mode 100644 index 0000000..38121cd --- /dev/null +++ b/ops/prometheus/grafana/dashboards/ctf/team-instances.json @@ -0,0 +1,178 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": 
"grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 14, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count by (deployment) (\r\n kube_deployment_labels{\r\n namespace=~\"ctfpilot-challenges-instanced|ctfpilot-challenges\",\r\n label_instanced_challenges_ctfpilot_com_deployment!=\"\",\r\n label_instanced_challenges_ctfpilot_com_owner=\"$teamid\"\r\n }\r\n)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": 
"A" + } + ], + "title": "Team instances", + "transformations": [ + { + "id": "rowsToFields", + "options": { + "mappings": [] + } + } + ], + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(label_instanced_challenges_ctfpilot_com_owner)", + "hide": 0, + "includeAll": true, + "label": "Team id", + "multi": true, + "name": "teamid", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(label_instanced_challenges_ctfpilot_com_owner)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Team instances", + "uid": "aevzf9eqwc2kga", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-addons-prometheus.json b/ops/prometheus/grafana/dashboards/k8s/k8s-addons-prometheus.json new file mode 100644 index 0000000..b2dbb65 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-addons-prometheus.json @@ -0,0 +1,3187 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + 
"name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern 'Prometheus' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. 
GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 89, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "?", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 78, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "name" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "prometheus_build_info{pod=~\"$pod\", cluster=~\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ version }}", + "range": false, + "refId": "A" + } + ], + "title": "Prometheus version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 92, + "options": { + "colorMode": "value", + "graphMode": "none", + 
"justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "up{pod=~\"$pod\", cluster=~\"$cluster\"} < 1", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Instance Down", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 72, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(prometheus_tsdb_head_series{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "TSDB Head Series", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 94, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + 
"calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(prometheus_sd_discovered_targets{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Discovered Targets", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 64, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Prometheus", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 93, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + 
"showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "up{pod=~\"$pod\", cluster=~\"$cluster\"}", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Liveness by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 96, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(prometheus_config_last_reload_successful{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": 
"A" + } + ], + "title": "Config - Last Successful Reload by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 74, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_target_scrapes_exceeded_body_size_limit_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }} - Exceeded body size limit", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_target_scrapes_exceeded_sample_limit_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by 
(pod)", + "hide": false, + "legendFormat": "{{ pod }} - Exceeded sample limit", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Duplicate timestamp", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_target_scrapes_sample_out_of_bounds_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Sample out of bounds", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_target_scrapes_sample_out_of_order_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Sample out of order", + "range": true, + "refId": "E" + } + ], + "title": "Target Scrapes Errors by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(prometheus_sd_discovered_targets{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Number of Targets by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 75, + "options": { + "legend": { + "calcs": [ + "min", + 
"max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod, scrape_job) * 1000", + "interval": "", + "legendFormat": "{{ pod }} - {{ scrape_job }}", + "range": true, + "refId": "A" + } + ], + "title": "Target Sync by pod, scrape_job", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "round(sum(rate(prometheus_target_interval_length_seconds_sum{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval]) / rate(prometheus_target_interval_length_seconds_count{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod))", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Average Scrape Interval by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 98, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Prometheus TSDB / Query Engine", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 59, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": 
"right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(prometheus_tsdb_head_series{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }} - Head Series", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(prometheus_tsdb_head_chunks{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Head Chunks", + "range": true, + "refId": "B" + } + ], + "title": "TSDB Head Series & Chunks by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 60, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": 
false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "TSDB Head samples appended - rate by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 101, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(prometheus_tsdb_blocks_loaded{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }} - Head Series", + "range": true, + "refId": "A" + } + ], + "title": "TSDB Blocks Loaded by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 102, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_tsdb_compactions_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }} - Total Compactions", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + 
"expr": "sum(rate(prometheus_tsdb_compactions_triggered_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Triggered Compactions", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_tsdb_compactions_skipped_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Skipped Compactions", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_tsdb_compactions_failed_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Failed Compactions", + "range": true, + "refId": "D" + } + ], + "title": "TSDB Rate of Compactions by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 47 + }, + "id": 90, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_tsdb_reloads_failures_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "TSDB Reload Failures by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 95, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_tsdb_head_series_created_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }} - Created series", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_tsdb_head_series_removed_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "hide": false, + "legendFormat": "{{ pod }} - Deleted series", + "range": true, + "refId": "B" + } + ], + "title": "TSDB Created & Deleted series by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 55 + }, + "id": 73, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(prometheus_engine_query_duration_seconds_count{pod=~\"$pod\", slice=\"inner_eval\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Engine Query Count by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 55 + }, + "id": 86, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max(prometheus_engine_query_duration_seconds{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod, slice) * 
1000", + "interval": "", + "legendFormat": "{{ pod }} - {{ slice }}", + "range": true, + "refId": "A" + } + ], + "title": "Engine Query Duration by pod, slice", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 47, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$pod\", image!=\"\", container!=\"\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod, container)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - {{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage by pod, container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "id": 51, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(container_memory_working_set_bytes{pod=~\"$pod\", image!=\"\", container!=\"\", cluster=~\"$cluster\"}) by (pod, container)", + "interval": "", + "legendFormat": "{{ pod }} - {{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Usage by container", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 72 + }, + "id": 66, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 62, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Capacity and usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 87, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(kubelet_volume_stats_used_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Used", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim)", + "hide": false, + "legendFormat": "{{ persistentvolumeclaim }} - Capacity", + "range": true, + "refId": "B" + } + ], + "title": "Persistent Volumes - Capacity and usage in bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 81 + }, + "id": 68, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.4", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "1 - sum(kubelet_volume_stats_inodes_used{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_inodes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\"}) by (persistentvolumeclaim)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Persistent Volumes - Inodes", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 89 + }, + "id": 45, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 90 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Bandwidth by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 90 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": 
"list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets rate by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_dropped_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Dropped by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "id": 37, + "options": { 
+ "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{pod=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Errors by pod", + "type": "timeseries" + } + ], + "refresh": "30s", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + 
"type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(prometheus_build_info{cluster=\"$cluster\"}, pod)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "label_values(prometheus_build_info{cluster=\"$cluster\"}, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Prometheus", + "uid": "k8s_addons_prometheus", + "version": 7, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-addons-trivy-operator.json b/ops/prometheus/grafana/dashboards/k8s/k8s-addons-trivy-operator.json new file mode 100644 index 0000000..803bc0f --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-addons-trivy-operator.json @@ -0,0 +1,2733 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": 
"Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern dashboard for the Trivy Operator from Aqua Security. Made to take advantage of the latest Grafana features. 
GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 43, + "panels": [], + "title": "Vulnerabilities", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 51, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{severity=\"Critical\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "CRITICAL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 50, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{severity=\"High\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "HIGH", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 49, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{severity=\"Medium\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "MEDIUM", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + 
"unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 60, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{severity=\"Low\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "LOW", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "purple", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 52, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{severity=\"Unknown\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "UNKNOWN", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "text", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 39, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "TOTAL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 58, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (namespace)", + "instant": false, + "interval": "$resolution", + "legendFormat": "{{namespace}}", + "range": true, + "refId": "A" + } + ], + "title": "Total vulnerabilities by namespaces", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "High" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Medium" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] 
+ }, + { + "matcher": { + "id": "byName", + "options": "Low" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unknown" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 61, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{cluster=~\"$cluster\"}) by (severity)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total vulnerabilities by severity in selected namespace(s)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 85, + "panels": [], + "title": "Vulnerability Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "severity" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "Critical": { + "color": "red", + "index": 0 + }, + "High": { + "color": "orange", + "index": 1 + }, + "Low": { + "color": "blue", + "index": 3 + }, + "Medium": { + "color": 
"yellow", + "index": 2 + }, + "Unknown": { + "color": "purple", + "index": 4 + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.displayMode", + "value": "color-text" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 83, + "options": { + "footer": { + "enablePagination": true, + "fields": [ + "Value" + ], + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_vulnerabilities{namespace=~\"$namespace\", cluster=~\"$cluster\"}) by (namespace, image_registry, image_repository, image_tag, severity) > 0", + "format": "table", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Vulnerability count per image and severity in $namespace namespace(s)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": false + }, + "indexByName": { + "Time": 0, + "Value": 6, + "image_registry": 2, + "image_repository": 3, + "image_tag": 4, + "namespace": 1, + "severity": 5 + }, + "renameByName": { + "Value": "Nb of vulnerabilities", + "image_registry": "Image Registry", + "image_repository": "Image Repository", + "image_tag": "Image Tag", + "namespace": "Namespace", + "severity": "Severity" + } + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "All values": { + "aggregations": [], + "operation": "groupby" + }, + "Count": { + "aggregations": [], + "operation": "groupby" + }, + "Image Registry": { + "aggregations": [], + "operation": "groupby" + }, + "Image Repository": { + "aggregations": [], + "operation": "groupby" + }, + "Image Tag": { + "aggregations": [], + "operation": "groupby" + }, + "Namespace": { + "aggregations": [], + "operation": "groupby" + }, + "Nb of vulnerabilities": { + "aggregations": 
[], + "operation": "groupby" + }, + "Severity": { + "aggregations": [], + "operation": "groupby" + }, + "Value": { + "aggregations": [], + "operation": "groupby" + }, + "image_registry": { + "aggregations": [], + "operation": "groupby" + }, + "image_repository": { + "aggregations": [], + "operation": "groupby" + }, + "image_tag": { + "aggregations": [], + "operation": "groupby" + }, + "namespace": { + "aggregations": [], + "operation": "groupby" + }, + "severity": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Require operator.metricsVulnIdEnabled: true", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": true, + "inspect": false + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "severity" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "Critical": { + "color": "red", + "index": 0 + }, + "High": { + "color": "orange", + "index": 1 + }, + "Low": { + "color": "blue", + "index": 3 + }, + "Medium": { + "color": "yellow", + "index": 2 + }, + "Unknown": { + "color": "purple", + "index": 4 + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.displayMode", + "value": "color-text" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "vuln_id" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "https://nvd.nist.gov/vuln/detail/${__value.text}", + "url": "https://nvd.nist.gov/vuln/detail/${__value.text}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 78, + "options": { + 
"footer": { + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_vulnerability_id{vuln_id=~\"CVE.*\", namespace=~\"$namespace\", cluster=~\"$cluster\"}) by (namespace, image_registry, image_repository, image_tag, vuln_id, severity)", + "format": "table", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Detailed CVE vulnerabilities in $namespace namespace(s)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": false, + "service": true + }, + "indexByName": { + "Time": 0, + "Value": 7, + "image_registry": 2, + "image_repository": 3, + "image_tag": 4, + "namespace": 1, + "severity": 6, + "vuln_id": 5 + }, + "renameByName": { + "image_namespace": "namespace", + "image_registry": "Image Registry", + "image_repository": "Image Repository", + "image_tag": "Image Tag", + "namespace": "Namespace", + "severity": "Severity", + "vuln_id": "Vulnerability", + "vulnerability_id": "" + } + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Image Registry": { + "aggregations": [], + "operation": "groupby" + }, + "Image Repository": { + "aggregations": [], + "operation": "groupby" + }, + "Image Tag": { + "aggregations": [], + "operation": "groupby" + }, + "Namespace": { + "aggregations": [], + "operation": "groupby" + }, + "Severity": { + "aggregations": [], + "operation": "groupby" + }, + "Value": { + "aggregations": [ + "lastNotNull" + ] + }, + "Vulnerability": { + "aggregations": [], + "operation": "groupby" + }, + "image_namespace": { + "aggregations": 
[], + "operation": "groupby" + }, + "namespace": { + "aggregations": [], + "operation": "groupby" + }, + "severity": { + "aggregations": [], + "operation": "groupby" + }, + "vuln_id": { + "aggregations": [], + "operation": "groupby" + }, + "vulnerability_id": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 47, + "panels": [], + "title": "Config Audit Reports", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 39 + }, + "id": 56, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{severity=\"Critical\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "CRITICAL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + }, + 
"unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 39 + }, + "id": 55, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{severity=\"High\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "HIGH", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 39 + }, + "id": 54, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{severity=\"Medium\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "MEDIUM", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 39 + }, + "id": 53, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{severity=\"Low\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "LOW", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "text", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 39 + }, + "id": 65, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "TOTAL", + "type": "stat" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (namespace)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total config audit report by namespaces", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": 
"opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "High" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Medium" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Low" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 63, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_resource_configaudits{cluster=~\"$cluster\"}) by (severity)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total config audit report by severity", + "type": "timeseries" + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 68, + "panels": [], + "title": "RBAC Assessments", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 52 + }, + "id": 72, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{severity=\"Critical\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "CRITICAL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 52 + }, + "id": 71, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" 
+ }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{severity=\"High\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "HIGH", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 52 + }, + "id": 70, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{severity=\"Medium\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "MEDIUM", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 52 + }, + "id": 69, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", 
+ "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{severity=\"Low\", namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "LOW", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "text", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 52 + }, + "id": 73, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{namespace=~\"$namespace\", cluster=~\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "__auto", + "refId": "A" + } + ], + "title": "TOTAL", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 74, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{cluster=~\"$cluster\"}) by (namespace)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total RBAC Assessments by namespaces", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", 
+ "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "High" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Medium" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Low" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 75, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_role_rbacassessments{cluster=~\"$cluster\"}) by (severity)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total RBAC Assessments by severity", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 81, + "panels": [], + "title": "Exposed Secrets", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": 
"opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 76, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(trivy_image_exposedsecrets{cluster=~\"$cluster\"}) by (namespace)", + "instant": false, + "interval": "$resolution", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Exposed Secrets by namespaces", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "Prometheus", + "Addons", + "Trivy", + "Trivy-operator" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": 
false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Trivy Operator - Vulnerabilities", + "uid": "security_trivy_operator", + "version": 15, + "weekStart": "" +} diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-system-api-server.json b/ops/prometheus/grafana/dashboards/k8s/k8s-system-api-server.json new file mode 100644 index 0000000..4c520f3 --- /dev/null +++ 
b/ops/prometheus/grafana/dashboards/k8s/k8s-system-api-server.json @@ -0,0 +1,1399 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.4.4" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern API Server dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and takes advantage of the latest Grafana features. 
GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 42, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "up{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}", + "interval": "", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - Health Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.width", + "value": 188 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 60, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + 
"reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "removed_release" + } + ] + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "apiserver_requested_deprecated_apis{cluster=~\"$cluster\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Deprecated Kubernetes Resources", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "group", + "job", + "removed_release", + "resource", + "version", + "name" + ], + "mode": "columns" + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "job": true + }, + "indexByName": { + "Time": 6, + "Value": 7, + "group": 1, + "job": 5, + "namespace": 0, + "removed_release": 4, + "resource": 3, + "version": 2 + }, + "renameByName": {} + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "group": { + "aggregations": [ + "lastNotNull" + ], + "operation": "groupby" + }, + "job": { + "aggregations": [], + "operation": "groupby" + }, + "namespace": { + "aggregations": [ + "lastNotNull" + ], + "operation": "groupby" + }, + "removed_release": { + "aggregations": [], + "operation": "groupby" + }, + "resource": { + "aggregations": [ + "lastNotNull" + ], + "operation": "groupby" + }, + "version": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + 
}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum by (code) (rate(apiserver_request_total{cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "title": "API Server - HTTP Requests by code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } 
+ ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum by (verb) (rate(apiserver_request_total{cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "{{ verb}}", + "refId": "A" + } + ], + "title": "API Server - HTTP Requests by verb", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"exemplar": true, + "expr": "sum(rate(apiserver_request_duration_seconds_sum{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)\n/\nsum(rate(apiserver_request_duration_seconds_count{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - HTTP Requests Latency by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 54, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(apiserver_request_duration_seconds_sum{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])) by 
(verb)\n/\nsum(rate(apiserver_request_duration_seconds_count{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])) by (verb)", + "interval": "$resolution", + "legendFormat": "{{ verb }}", + "refId": "A" + } + ], + "title": "API Server - HTTP Requests Latency by verb", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum by(instance) (rate(apiserver_request_total{code=~\"5..\", job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval]))\n / sum by(instance) (rate(apiserver_request_total{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "{{ instance 
}}", + "refId": "A" + } + ], + "title": "API Server - Errors by Instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 51, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum by(verb) (rate(apiserver_request_total{code=~\"5..\",job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval]))\n / sum by(verb) (rate(apiserver_request_total{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "{{ verb }}", + "refId": "A" + } + ], + "title": "API Server - Errors by verb", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(apiserver_request_total{cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - Stacked HTTP Requests by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(workqueue_depth{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - Work Queue by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "rate(process_cpu_seconds_total{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}[$__rate_interval])", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - CPU Usage by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 48, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, + "exemplar": true, + "expr": "process_resident_memory_bytes{job=~\"kubernetes-apiservers|apiserver\", cluster=~\"$cluster\"}", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "API Server - Memory Usage by instance", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { 
+ "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / System / API Server", + "uid": "k8s_system_apisrv", + "version": 19, + "weekStart": "" +} diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-system-coredns.json b/ops/prometheus/grafana/dashboards/k8s/k8s-system-coredns.json new file mode 100644 index 0000000..266ec29 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-system-coredns.json @@ -0,0 +1,1600 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.4.4" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern CoreDNS dashboard for your 
Kubernetes cluster(s). Made for kube-prometheus-stack and takes advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 25, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "up{job=~\"$job\", instance=~\"$instance\", cluster=~\"$cluster\"}", + "interval": "", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "CoreDNS - Health Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "CoreDNS - CPU Usage by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\", cluster=~\"$cluster\"}", + "interval": "", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "title": "CoreDNS - Memory Usage by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_dns_requests_total{instance=~\"$instance\",proto=\"$protocol\", cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "total $protocol requests", + "refId": "A" + } + ], + "title": "CoreDNS - Total DNS Requests ($protocol)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_dns_request_size_bytes_sum{instance=~\"$instance\",proto=\"$protocol\", cluster=~\"$cluster\"}[$__rate_interval])) by (proto) / 
sum(rate(coredns_dns_request_size_bytes_count{instance=~\"$instance\",proto=\"$protocol\", cluster=~\"$cluster\"}[$__rate_interval])) by (proto)", + "interval": "$resolution", + "legendFormat": "average $protocol packet size", + "refId": "A" + } + ], + "title": "CoreDNS - Average Packet Size ($protocol)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_dns_requests_total{instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])) by (type)", + "interval": "$resolution", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "title": "CoreDNS - Requests by type", + "type": "timeseries" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_dns_responses_total{instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])) by (rcode)", + "interval": "$resolution", + "legendFormat": "{{ rcode }}", + "refId": "A" + } + ], + "title": "CoreDNS - Requests by return code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + 
"gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_forward_requests_total{cluster=~\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "total forward requests", + "refId": "A" + } + ], + "title": "CoreDNS - Total Forward Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { 
+ "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_forward_responses_total{rcode=~\"SERVFAIL|REFUSED\", cluster=~\"$cluster\"}[$__rate_interval])) by (rcode)", + "interval": "$resolution", + "legendFormat": "{{ rcode }}", + "refId": "A" + } + ], + "title": "CoreDNS - DNS Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 17, + "options": { + "legend": { + "calcs": 
[], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_cache_hits_total{instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])) by (type)", + "interval": "$resolution", + "legendFormat": "{{ type }}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(coredns_cache_misses_total{instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])) by (type)", + "interval": "$resolution", + "legendFormat": "misses", + "refId": "B" + } + ], + "title": "CoreDNS - Cache Hits / Misses", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", 
+ "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(coredns_cache_entries{cluster=~\"$cluster\"}) by (type)", + "interval": "", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "title": "CoreDNS - Cache Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 27, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdYlBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(coredns_dns_request_duration_seconds_bucket{instance=~\"$instance\", cluster=~\"$cluster\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CoreDNS - DNS request duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 28, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdYlBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "decbytes" + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\", le!=\"0\", cluster=~\"$cluster\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CoreDNS - DNS request size", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 29, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdYlBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + 
"yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "decbytes" + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(increase(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\", le!=\"0\", cluster=~\"$cluster\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "CoreDNS - DNS response size", + "type": "heatmap" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(up{job=\"$job\", cluster=\"$cluster\"},instance)", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(up{job=\"$job\", cluster=\"$cluster\"},instance)", 
+ "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "udp,tcp", + "current": { + "selected": false, + "text": "udp", + "value": "udp" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(coredns_dns_requests_total{cluster=\"$cluster\"}, proto)", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "protocol", + "options": [], + "query": { + "query": "label_values(coredns_dns_requests_total{cluster=\"$cluster\"}, proto)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": true, + "text": [ + "coredns" + ], + "value": [ + "coredns" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(coredns_build_info{cluster=\"$cluster\"},job)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coredns_build_info{cluster=\"$cluster\"},job)", + "refId": 
"PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / System / CoreDNS", + "uid": "k8s_system_coredns", + "version": 20, + "weekStart": "" +} diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-views-global.json b/ops/prometheus/grafana/dashboards/k8s/k8s-views-global.json new file mode 100644 index 0000000..eb7e016 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-views-global.json @@ -0,0 +1,3561 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.3.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, 
+ "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern 'Global View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 67, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 77, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (core) (rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval])))", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"}) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"}) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global CPU Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "indexByName": { + "Limits": 5, + "Real": 1, + "Real Linux": 2, + "Real Windows": 3, + "Requests": 4, + "Time": 0 + }, + "renameByName": {} + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 78, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "hide": false, + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_memory_available_bytes{cluster=\"$cluster\"} + windows_memory_cache_bytes{cluster=\"$cluster\"}) / sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global RAM 
Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Limits": 5, + "Real": 3, + "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0 + }, + "renameByName": {} + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 1 + }, + "id": 63, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + 
"gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 10, + "x": 14, + "y": 1 + }, + "id": 52, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kube_namespace_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Namespaces", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_running{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Containers", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Pods", + "refId": "O" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_service_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Services", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_endpoint_info{cluster=\"$cluster\"})", + 
"interval": "", + "legendFormat": "Endpoints", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_ingress_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ingresses", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Deployments", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_statefulset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Statefulsets", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_daemonset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Daemonsets", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Persistent Volume Claims", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_hpa_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Horizontal Pod Autoscalers", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_configmap_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Configmaps", + "refId": "K" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_secret_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Secrets", + "refId": "L" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_networkpolicy_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Network Policies", + "refId": "M" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "hide": false, + "interval": "", + "legendFormat": "Nodes", + "refId": "N" + } + ], + "title": "Kubernetes Resource Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 5 + }, + "id": 59, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(kube_namespace_created{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 37, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + 
}, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "CPU Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true, + "Total Linux": true, + "Total Windows": true + }, + "indexByName": { + "Limits": 5, + "Real": 3, 
+ "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0, + "Total": 8, + "Total Linux": 6, + "Total Windows": 7 + }, + "renameByName": {} + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 39, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - windows_memory_available_bytes{cluster=\"$cluster\"} - windows_memory_cache_bytes{cluster=\"$cluster\"})", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + 
"refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "RAM Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Limits": 5, + "Real": 3, + "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0, + "Total": 6 + }, + "renameByName": {} + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 9 + }, + "id": 62, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", 
cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Pods", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 71, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 72, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) 
(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "$resolution", + "legendFormat": "Linux", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "1 - avg(rate(windows_cpu_time_total{cluster=\"$cluster\",mode=\"idle\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows", + "range": true, + "refId": "Windows" + } + ], + "title": "Cluster CPU Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "CPU usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + 
"mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "$resolution", + "legendFormat": "Linux", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - windows_memory_available_bytes{cluster=\"$cluster\"}) / sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows", + "range": true, + "refId": "Windows" + } + ], + "title": "Cluster Memory Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Memory usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, 
+ "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace)\n+ on (namespace)\n(sum(rate(windows_container_cpu_usage_seconds_total{container_id!=\"\", cluster=\"$cluster\"}[$__rate_interval]) * on (container_id) group_left (container, pod, namespace) max by ( container, container_id, pod, namespace) (kube_pod_container_info{container_id!=\"\", cluster=\"$cluster\"}) OR kube_namespace_created{cluster=\"$cluster\"} * 0) by (namespace))", + "format": "time_series", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": 
"line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 50, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{image!=\"\", cluster=\"$cluster\"}) by (namespace)\n+ on (namespace)\n(sum(windows_container_memory_usage_commit_bytes{container_id!=\"\", cluster=\"$cluster\"} * on (container_id) group_left (container, pod, namespace) max by ( container, container_id, pod, namespace) (kube_pod_container_info{container_id!=\"\", cluster=\"$cluster\"}) OR kube_namespace_created{cluster=\"$cluster\"} * 0) by (namespace))", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 54, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ node }}", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance,core) (rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ node }}", + "range": true, + "refId": "Windows" + } + ], + "title": "CPU Utilization by instance", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 73, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - 
windows_memory_available_bytes{cluster=\"$cluster\"}) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "Windows" + } + ], + "title": "Memory Utilization by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by 
(namespace) > 0", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "NB", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 83, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": 
true, + "refId": "A" + } + ], + "title": "CPU Core Throttled by instance", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 86, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_qos_class{cluster=\"$cluster\"}) by (qos_class)", + "interval": "", + "legendFormat": "{{ qos_class }} pods", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, 
+ "editorMode": "code", + "expr": "sum(kube_pod_info{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total pods", + "range": true, + "refId": "B" + } + ], + "title": "Kubernetes Pods QoS classes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "interval": "", + "legendFormat": "{{ reason }}", + "range": true, + "refId": "A" + } + ], + "title": "Kubernetes Pods Status Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 87, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" 
+ }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 88, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by namespace", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 69, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for 
readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Received : {{ device }}", + "range": true, + "refId": "Linux Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Transmitted : {{ device }}", + 
"range": true, + "refId": "Linux Transmitted" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_bytes_received_total{cluster=\"$cluster\"}[$__rate_interval])) by (nic)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received : {{ nic }}", + "range": true, + "refId": "Windows Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_bytes_sent_total{cluster=\"$cluster\"}[$__rate_interval])) by (nic)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted : {{ nic }}", + "range": true, + "refId": "Windows Transmitted" + } + ], + "title": "Global Network Utilization by device", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "DROPPED PACKETS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 53, + "options": { + "legend": { + "calcs":
[], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Linux Packets dropped (receive)", + "range": true, + "refId": "Linux Packets dropped (receive)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Linux Packets dropped (transmit)", + "range": true, + "refId": "Linux Packets dropped (transmit)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_packets_received_discarded_total{cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows Packets dropped (receive)", + "range": true, + "refId": "Windows Packets dropped (receive)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_packets_outbound_discarded_total{cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows Packets dropped (transmit)", + "range": true, + "refId": "Windows Packets dropped (transmit)" + } + ], + "title": "Network Saturation - Packets dropped", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Packets dropped (receive)", + "mode": "reduceRow", + "reduce": { + "include": [ + "Linux Packets dropped 
(receive)", + "Windows Packets dropped (receive)" + ], + "reducer": "mean" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "Packets dropped (transmit)", + "mode": "reduceRow", + "reduce": { + "include": [ + "Linux Packets dropped (transmit)", + "Windows Packets dropped (transmit)" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Linux Packets dropped (receive)": true, + "Linux Packets dropped (transmit)": true, + "Time": false, + "Windows Packets dropped (receive)": true, + "Windows Packets dropped (transmit)": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 79, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)\n+ on (namespace)\n(sum(rate(windows_container_network_receive_bytes_total{container_id!=\"\", cluster=\"$cluster\"}[$__rate_interval]) * on (container_id) group_left (container, pod, namespace) max by ( container, container_id, pod, namespace) (kube_pod_container_info{container_id!=\"\", cluster=\"$cluster\"}) OR kube_namespace_created{cluster=\"$cluster\"} * 0) by (namespace))", + "interval": "$resolution", + "legendFormat": "Received : {{ namespace }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- (sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)\n+ on (namespace)\n(sum(rate(windows_container_network_transmit_bytes_total{container_id!=\"\", cluster=\"$cluster\"}[$__rate_interval]) * on (container_id) group_left (container, pod, namespace) max by ( container, container_id, pod, namespace) (kube_pod_container_info{container_id!=\"\", cluster=\"$cluster\"}) OR kube_namespace_created{cluster=\"$cluster\"} * 0) by (namespace)))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted : {{ namespace }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 80, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "Linux Received bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "Linux Transmitted bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_bytes_received_total{cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + 
"refId": "Windows Received bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(windows_net_bytes_sent_total{cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "Windows Transmitted bytes" + } + ], + "title": "Total Network Received (with all virtual devices) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 82 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + 
"exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_bytes_received_total{nic!~\".*Virtual.*\",cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_bytes_sent_total{nic!~\".*Virtual.*\",cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "D" + } + ], + "title": "Network Received (without loopback) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + 
"drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 82 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received (loopback only) by instance", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + 
}, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + 
"timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Global", + "uid": "k8s_views_global", + "version": 43, + "weekStart": "" +} diff --git a/ops/prometheus/grafana/dashboards/k8s/k8s-views-namespaces.json b/ops/prometheus/grafana/dashboards/k8s/k8s-views-namespaces.json new file mode 100644 index 0000000..8b8efca --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-views-namespaces.json @@ -0,0 +1,3035 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.3.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a 
modern 'Namespaces View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and takes advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 38, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + "value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 46, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "Namespace(s) usage on total cluster CPU in %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, +
"mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + "value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 48, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespace(s) usage on total cluster RAM in %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Pods", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_service_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Services", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_ingress_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ingresses", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Deployments", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_statefulset_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Statefulsets", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_daemonset_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Daemonsets", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sum(kube_persistentvolumeclaim_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Persistent Volume Claims", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_hpa_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Horizontal Pod Autoscalers", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_configmap_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Configmaps", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_secret_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Secrets", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_networkpolicy_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Network Policies", + "refId": "K" + } + ], + "title": "Kubernetes Resource Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 62, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.1", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\", resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\", resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Cluster Total", + "range": true, + "refId": "D" + } + ], + "title": "Namespace(s) CPU Usage in cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 64, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.1", + "targets": [ + 
{ + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\", resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\", resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Cluster Total", + "range": true, + "refId": "D" + } + ], + "title": "Namespace(s) RAM Usage in bytes", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 40, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 
false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU usage by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory usage by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, 
+ "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 68, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod) > 0", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 73, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + 
"unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 70, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_qos_class{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (qos_class)", + "interval": "", + "legendFormat": "{{ qos_class }} pods", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total pods", + "range": true, + "refId": "B" + } + ], + "title": "Kubernetes Pods QoS classes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 72, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "interval": "", + "legendFormat": "{{ reason }}", + "range": true, + "refId": "A" + } + ], + "title": "Kubernetes Pods Status Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 74, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + 
], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{namespace=~\"${namespace}\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace, pod) > 0", + "interval": "", + "legendFormat": "namespace: {{ namespace }} - pod: {{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by namespace, pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 75, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true 
+ }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=~\"${namespace}\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace, pod) > 0", + "interval": "", + "legendFormat": "namespace: {{ namespace }} - pod: {{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by namespace, pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", 
+ "expr": "sum(kube_pod_container_status_ready{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ready", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_running{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Waiting", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_restarts_total{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Restarts Total", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_terminated{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Terminated", + "refId": "E" + } + ], + "title": "Nb of pods by state", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_info{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Nb of containers by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + 
}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_deployment_status_replicas_available{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (deployment)", + "interval": "", + "legendFormat": "{{ deployment }}", + "range": true, + "refId": "A" + } + ], + "title": "Replicas available by deployment", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_deployment_status_replicas_unavailable{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}) by (deployment)", + "interval": "", + "legendFormat": "{{ deployment }}", + "range": true, + "refId": "A" + } + ], + "title": "Replicas unavailable by deployment", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "List of pods that are not in Running or Succeeded status.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "id": 83, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"sum(kube_pod_status_phase{phase!~\"Running|Succeeded\", namespace=~\"$namespace\", cluster=\"$cluster\"}) by (pod) > 0", + "interval": "", + "legendFormat": "{{ deployment }}", + "range": true, + "refId": "A" + } + ], + "title": "Pods with unexpected status", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (image)", + "interval": "", + "legendFormat": "{{ image }}", + "range": true, + "refId": "A" + } + ], 
+ "title": "Container Image Used", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 72 + }, + "id": 42, + "panels": [], + "title": "Kubernetes Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ 
persistentvolumeclaim }}", + "refId": "A" + } + ], + "title": "Persistent Volumes - Capacity and usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 66, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + 
"hide": false, + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Capacity", + "refId": "B" + } + ], + "title": "Persistent Volumes - Capacity and usage in bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 81 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "1 - sum(kubelet_volume_stats_inodes_used{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_inodes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }}", + "refId": "A" + } + ], + "title": "Persistent Volumes - Inodes", + 
"type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 89 + }, + "id": 76, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 90 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + 
"exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Bandwidth by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 90 + }, + "id": 79, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + 
"legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Rate by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "id": 80, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Dropped by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Errors by pod", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "hide": 0, + 
"includeAll": true, + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"},created_by_name)", + "description": "Can be used to filter on a specific deployment, statefulset or deamonset (only relevant panels).", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "created_by", + "options": [], + "query": { + "query": "label_values(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"},created_by_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Namespaces", + "uid": "k8s_views_ns", + "version": 43, + "weekStart": "" +} diff --git 
a/ops/prometheus/grafana/dashboards/k8s/k8s-views-nodes.json b/ops/prometheus/grafana/dashboards/k8s/k8s-views-nodes.json new file mode 100644 index 0000000..d4e3040 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-views-nodes.json @@ -0,0 +1,4019 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.3.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "terraform" + ], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + "oncall" + ], + "type": "tags" + } + } + ] + }, + "description": "This is a modern 'Nodes View' dashboard for your Kubernetes cluster(s). 
Made for kube-prometheus-stack and takes advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 40, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + "value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "avg(sum by (cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))", + "instant": true, + "interval": "$resolution", + "legendFormat": "", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color":
"orange", + "value": 50 + }, + { + "color": "red", + "value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 13, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"} - node_memory_MemAvailable_bytes{instance=\"$instance\", cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "", + "refId": "A" + } + ], + "title": "RAM Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 24, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kube_pod_info{node=\"$node\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Pods on 
node", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "links": [ + { + "targetBlank": true, + "title": "Pod details", + "url": "/d/k8s_views_pods/kubernetes-views-pods?${datasource:queryparam}&var-namespace=${__data.fields.namespace}&${cluster:queryparam}&var-pod=${__data.fields.pod}&${resolution:queryparam}&${__url_time_range}" + } + ], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 416 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "priority_class" + }, + "properties": [ + { + "id": "custom.width", + "value": 176 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod_ip" + }, + "properties": [ + { + "id": "custom.width", + "value": 157 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "created_by_kind" + }, + "properties": [ + { + "id": "custom.width", + "value": 205 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "custom.width", + "value": 263 + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "kube_pod_info{node=\"$node\", cluster=\"$cluster\"}", + "format": "table", + "interval": "", + "legendFormat": "", + "refId": 
"A" + } + ], + "title": "List of pods on node ($node)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "container": true, + "created_by_kind": false, + "created_by_name": true, + "endpoint": true, + "env": true, + "host_ip": true, + "host_network": true, + "instance": true, + "job": true, + "node": true, + "project": true, + "prometheus_replica": true, + "service": true, + "uid": true + }, + "indexByName": { + "Time": 6, + "Value": 20, + "__name__": 7, + "container": 8, + "created_by_kind": 2, + "created_by_name": 9, + "endpoint": 10, + "env": 11, + "host_ip": 5, + "host_network": 12, + "instance": 13, + "job": 14, + "namespace": 1, + "node": 15, + "pod": 0, + "pod_ip": 3, + "priority_class": 4, + "project": 16, + "prometheus_replica": 17, + "service": 18, + "uid": 19 + }, + "renameByName": {} + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "created_by_kind": { + "aggregations": [], + "operation": "groupby" + }, + "host_ip": { + "aggregations": [], + "operation": "groupby" + }, + "namespace": { + "aggregations": [ + "last" + ], + "operation": "groupby" + }, + "pod": { + "aggregations": [], + "operation": "groupby" + }, + "pod_ip": { + "aggregations": [], + "operation": "groupby" + }, + "priority_class": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 3, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 0, + "y": 9 + }, + "id": 9, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "instant": true, + "interval": "$resolution", + "legendFormat": "", + "refId": "A" + } + ], + "title": "CPU Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 2, + "y": 9 + }, + "id": 11, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(machine_cpu_cores{node=\"$node\", cluster=\"$cluster\"})", + "interval": "$resolution", + "legendFormat": "", + "refId": "A" + } + ], + "title": "CPU Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + 
"y": 9 + }, + "id": 15, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"} - node_memory_MemAvailable_bytes{instance=\"$instance\", cluster=\"$cluster\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "RAM Used", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 6, + "y": 9 + }, + "id": 17, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "machine_memory_bytes{node=\"$node\", cluster=\"$cluster\"}", + "instant": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + 
"noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 25228800 + }, + { + "color": "red", + "value": 31536000 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 9 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_time_seconds{instance=\"$instance\", cluster=\"$cluster\"} - node_boot_time_seconds{instance=\"$instance\", cluster=\"$cluster\"}", + "instant": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "uptime", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 38, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "avg(rate(node_cpu_seconds_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]) * 100) by (mode)", + "hide": false, + "instant": false, + "interval": "$resolution", + "legendFormat": "{{ mode }}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_MemTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"} - node_memory_MemFree_bytes{instance=\"$instance\", cluster=\"$cluster\"} - (node_memory_Cached_bytes{instance=\"$instance\", cluster=\"$cluster\"} + node_memory_Buffers_bytes{instance=\"$instance\", cluster=\"$cluster\"})", + "instant": false, + "interval": "$resolution", + "legendFormat": "RAM Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_MemTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "RAM Total", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_Cached_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "RAM Cache", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_Buffers_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "RAM Buffer", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_MemFree_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "RAM Free", + "refId": "E" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_SwapTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"} - node_memory_SwapFree_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "SWAP Used", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "node_memory_SwapTotal_bytes{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "SWAP Total", + "refId": "G" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{node=\"$node\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "refId": "A" + } + ], + "title": "CPU usage by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{node=\"$node\", image!=\"\", cluster=\"$cluster\"}) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "refId": "A" + } + ], + "title": "Memory usage by Pod", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of times a CPU core has been throttled on an instance.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 66, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_core_throttles_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Nb of cpu core throttles", + "range": true, + "refId": "A" + } + ], + "title": "Number of CPU Core Throttled", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 44, + "panels": [], + "title": "System", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 48, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_load1{instance=\"$instance\", cluster=\"$cluster\"}", + "interval": "$resolution", + "legendFormat": "1m", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_load5{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "5m", + "range": 
true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_load15{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "15m", + "range": true, + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_context_switches_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "interval": "$resolution", + "intervalFactor": 1, + "legendFormat": "Context switches", + "range": true, + "refId": "A" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_intr_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "hide": false, + "interval": "$resolution", + "legendFormat": "Interrupts", + "range": true, + "refId": "B" + } + ], + "title": "Context Switches & Interrupts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 49, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_filefd_maximum{instance=\"$instance\", cluster=\"$cluster\"}", + "instant": false, + "interval": "$resolution", + "legendFormat": "Maximum file descriptors", + "refId": "A" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_filefd_allocated{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "instant": false, + "interval": "$resolution", + "legendFormat": "Allocated file descriptors", + "refId": "B" + } + ], + "title": "File Descriptors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_timex_estimated_error_seconds{instance=\"$instance\", cluster=\"$cluster\"}", + "instant": false, + "interval": "$resolution", + "intervalFactor": 1, + 
"legendFormat": "Estimated error in seconds", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_timex_maxerror_seconds{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "intervalFactor": 1, + "legendFormat": "Maximum error in seconds", + "range": true, + "refId": "B" + } + ], + "title": "Time Sync", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 55 + }, + "id": 36, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", 
+ "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "In", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_bytes_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Out", + "range": true, + "refId": "B" + } + ], + "title": "Network usage (bytes/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 61, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + 
}, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_network_receive_errs_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_errs_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Out", + "range": true, + "refId": "B" + } + ], + "title": "Network errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_packets_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "In", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_packets_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Out", + "range": true, + "refId": "B" + } + ], + "title": "Network usage (packet/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "id": 64, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": 
"none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(node_network_receive_drop_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "In", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_drop_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Out", + "refId": "B" + } + ], + "title": "Network total drops", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 60, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$instance\", cluster=\"$cluster\"}", + "instant": false, + "interval": "$resolution", + "legendFormat": "TCP Currently Established", + "refId": "A" + } + ], + "title": "TCP Currently Established", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "NF Conntrack limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 63, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"editorMode": "code", + "exemplar": true, + "expr": "node_nf_conntrack_entries{instance=\"$instance\", cluster=\"$cluster\"}", + "instant": false, + "interval": "$resolution", + "legendFormat": "NF Conntrack entries", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_nf_conntrack_entries_limit{instance=\"$instance\", cluster=\"$cluster\"}", + "hide": false, + "interval": "$resolution", + "legendFormat": "NF Conntrack limit", + "range": true, + "refId": "B" + } + ], + "title": "NF Conntrack", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 80 + }, + "id": 54, + "panels": [], + "title": "Kubernetes Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 81 + }, + "id": 30, + "options": { + "legend": { + 
"calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{node=\"$node\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_capacity_bytes{node=\"$node\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "$resolution", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "custom.width", + "value": 146 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "custom.width", + "value": 167 + } + ] + } + ] + }, + "gridPos": { + "h": 16, + "w": 12, + "x": 12, + "y": 81 + }, + "id": 34, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{node=\"$node\", cluster=\"$cluster\"}) by 
(persistentvolumeclaim)", + "format": "table", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_capacity_bytes{node=\"$node\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "format": "table", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "title": "Persistent Volumes - Usage in GB", + "transformations": [ + { + "id": "groupBy", + "options": { + "fields": { + "Value": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #A": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #B": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "persistentvolumeclaim": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "seriesToColumns", + "options": { + "byField": "persistentvolumeclaim" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": {}, + "renameByName": { + "Value #A (lastNotNull)": "Used", + "Value #B (lastNotNull)": "Total", + "persistentvolumeclaim": "Persistent Volume Claim" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 89 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_inodes_used{node=\"$node\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_inodes{node=\"$node\", cluster=\"$cluster\"}) by (persistentvolumeclaim) * 100", + "interval": "$resolution", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Inodes", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 97 + }, + "id": 42, + "panels": [], + "title": "Node Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$instance\", cluster=\"$cluster\"} * 100) / node_filesystem_size_bytes{instance=\"$instance\", cluster=\"$cluster\"})", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ mountpoint }}", + "range": true, + "refId": "A" + } + ], + "title": "FS usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "id": 59, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "100 - (node_filesystem_files_free{instance=\"$instance\", cluster=\"$cluster\"} / node_filesystem_files{instance=\"$instance\", cluster=\"$cluster\"} * 100)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ mountpoint }}", + "range": true, + "refId": "A" + } + ], + "title": "FS inode usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, 
+ "y": 106 + }, + "id": 52, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_disk_read_bytes_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "interval": "$resolution", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Reads by disk (bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 106 + }, + "id": 57, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_disk_written_bytes_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Writes by disk (bytes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "read/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 114 + }, + "id": 51, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_disk_reads_completed_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "interval": "$resolution", + "legendFormat": "{{device}}", + 
"range": true, + "refId": "A" + } + ], + "title": "Completed reads by disk", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "write/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 114 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_disk_writes_completed_total{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Completed writes by disk", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "io/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 122 + }, + "id": 58, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(node_disk_io_now{instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval]) ", + "interval": "$resolution", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Disk(s) io/s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 122 + }, + "id": 55, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_filesystem_device_error{instance=\"$instance\", cluster=\"$cluster\"}) by (mountpoint)", + "interval": "$resolution", + "legendFormat": "{{ mountpoint }}", + "range": true, + "refId": "A" + } + ], + "title": "FS - Device Errors", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + 
"refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info{cluster=\"$cluster\"}, node)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(kube_node_info{cluster=\"$cluster\"}, node)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_uname_info{nodename=~\"(?i:($node)(.[a-z0-9.]+)?)\", cluster=\"$cluster\"}, instance)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_uname_info{nodename=~\"(?i:($node)(.[a-z0-9.]+)?)\", cluster=\"$cluster\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Nodes", + "uid": "k8s_views_nodes", + "version": 37, + "weekStart": "" +} diff 
--git a/ops/prometheus/grafana/dashboards/k8s/k8s-views-pods.json b/ops/prometheus/grafana/dashboards/k8s/k8s-views-pods.json new file mode 100644 index 0000000..83a1d31 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/k8s/k8s-views-pods.json @@ -0,0 +1,2717 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.3.4" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "#5c4ee5", + "name": "terraform", + "target": { + "limit": 100, + "matchAny": false, + "tags": ["terraform"], + "type": "tags" + } + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": false, + "iconColor": "red", + "name": "oncall", + "target": { + "limit": 100, + "matchAny": false, + "tags": ["oncall"], + "type": "tags" + } + } + ] + }, + "description": "This is a modern 'Pods View' dashboard for your Kubernetes cluster(s). 
Made for kube-prometheus-stack and takes advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 43, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ created_by_kind }}: {{ created_by_name }}", + "refId": "A" + } + ], + "title": "Created by", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "links": [ + { + "title": "", +
"url": "/d/k8s_views_nodes/kubernetes-views-nodes?var-datasource=${datasource}&var-node=${__field.labels.node}&${cluster:queryparam}" + } + ], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 33, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ node }}", + "refId": "A" + } + ], + "title": "Running on", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 41, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + 
"exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ pod_ip }}", + "refId": "A" + } + ], + "title": "Pod IP", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 0, + "y": 3 + }, + "id": 52, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", priority_class!=\"\", cluster=\"$cluster\"}", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "{{ priority_class }}", + "range": false, + "refId": "A" + } + ], + "title": "Priority Class", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Burstable" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": 
"fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "BestEffort" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 7, + "x": 5, + "y": 3 + }, + "id": 53, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_status_qos_class{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"} > 0", + "instant": true, + "interval": "", + "legendFormat": "{{ qos_class }}", + "refId": "A" + } + ], + "title": "QOS Class", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 3 + }, + "id": 56, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"kube_pod_container_status_last_terminated_reason{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ reason }}", + "refId": "A" + } + ], + "title": "Last Terminated Reason", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 1 + }, + { + "color": "#EAB839", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 3 + }, + "id": 57, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_container_status_last_terminated_exitcode{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Last Terminated Exit Code", + "type": "stat" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 47, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"fixedColor": "blue", + "mode": "fixed" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 0, + "y": 6 + }, + "id": 39, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["last"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Requests", + "refId": "A" + } + ], + "title": "Total pod CPU Requests usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 3, + "y": 6 + }, + "id": 48, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["last"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Limits", + "refId": "A" + } + ], + "title": "Total pod CPU Limits usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "red", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 6, + "y": 6 + }, + "id": 40, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["last"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Requests", + "refId": 
"A" + } + ], + "title": "Total pod RAM Requests usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 9, + "y": 6 + }, + "id": 49, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": ["last"], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) ", + "instant": true, + "interval": "$resolution", + "legendFormat": "Limits", + "refId": "B" + } + ], + "title": "Total pod RAM Limits usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false, + "minWidth": 100 + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": 
"decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Limits" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Used" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 38, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "hide": false, + "instant": true, + "range": false, + "refId": "F" + } + ], + "title": "Resources by container", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "container" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 4": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "container": false, + "endpoint": true, + "endpoint 2": true, + "endpoint 3": true, + "endpoint 4": true, + "instance": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "job": true, + "job 2": true, + "job 3": true, + "job 4": true, + "namespace": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "node": true, + "node 2": true, + "node 3": true, + "node 4": true, + "pod": true, + "pod 2": true, + "pod 3": true, + "pod 4": true, + "resource 1": 
true, + "resource 2": true, + "resource 3": true, + "resource 4": true, + "service": true, + "service 2": true, + "service 3": true, + "service 4": true, + "uid 1": true, + "uid 2": true, + "uid 3": true, + "uid 4": true, + "unit 1": true, + "unit 2": true, + "unit 3": true, + "unit 4": true + }, + "indexByName": { + "Time 1": 7, + "Time 2": 8, + "Time 3": 9, + "Time 4": 10, + "Time 5": 11, + "Time 6": 12, + "Value #A": 2, + "Value #B": 3, + "Value #C": 5, + "Value #D": 6, + "Value #E": 1, + "Value #F": 4, + "container": 0 + }, + "renameByName": { + "Value #A": "CPU Requests", + "Value #B": "CPU Limits", + "Value #C": "Memory Requests", + "Value #D": "Memory Limits", + "Value #E": "CPU Used", + "Value #F": "Memory Used", + "container": "Container" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 0, + "y": 14 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "interval": "$resolution", + "legendFormat": "{{ container }} REQUESTS", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "hide": false, + "legendFormat": "{{ container }} LIMITS", + "range": true, + "refId": "B" + } + ], + "title": "CPU Usage / Requests & Limits by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + 
"lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) by (container) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "interval": "", + "legendFormat": "{{ container }} REQUESTS", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) by (container) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "hide": false, + "legendFormat": "{{ container }} LIMITS", + "range": true, + "refId": "B" + } + ], + "title": "Memory Usage / 
Requests & Limits by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 29, + "options": { + "legend": { + "calcs": ["min", "max", "mean"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container, id)", + "interval": "$resolution", + "legendFormat": "{{ container 
}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 51, + "options": { + "legend": { + "calcs": ["min", "max", "mean"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}) by (container, id)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Usage by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 59, + "options": { + "legend": { + "calcs": ["min", "max", "mean"], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{namespace=~\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "$resolution", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by container", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 62, + "panels": [], + 
"title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 60, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{namespace=\"${namespace}\", pod=\"${pod}\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by container", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 61, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=~\"${namespace}\", pod=\"${pod}\", container!=\"\", job=~\"$job\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by container", + 
"type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 45, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Bandwidth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Packets Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Packets Dropped", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + 
"refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Errors", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["Kubernetes", "Prometheus"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "monitoring", + "value": "monitoring" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refId": "Prometheus-namespace-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + 
}, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"}, pod)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"}, pod)", + "refId": "Prometheus-pod-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": false, + "text": "kube-state-metrics", + "value": "kube-state-metrics" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"},job)", + "hide": 0, + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"},job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": 
"now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Pods", + "uid": "k8s_views_pods", + "version": 36, + "weekStart": "" +} diff --git a/ops/prometheus/grafana/dashboards/redis/redis-dashboard.json b/ops/prometheus/grafana/dashboards/redis/redis-dashboard.json new file mode 100644 index 0000000..ebab195 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/redis/redis-dashboard.json @@ -0,0 +1,1543 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "count(redis_up{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "sum(redis_connected_clients{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Connected Clients", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 40 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": 
"avg((redis_config_maxclients{env=~\"$env\"} - redis_connected_clients{env=~\"$env\"}) / redis_config_maxclients{env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Connection Usage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "increase(redis_commands_processed_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Commands Executed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "sum(redis_cluster_slots_fail{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Slots Failed", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "from": 0, + "result": { + "index": 0, + "text": "Master" + }, + "to": 100 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "index": 1, + "text": "Slave" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-orange", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 4 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 30 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "count(redis_instance_info{namespace=~\"$namespace\",service=~\"$service\",role=\"master\",env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Role", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "avg(redis_rdb_changes_since_last_save{env=~\"$env\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Redis Last Changes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, 
+ "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "topk(5, increase(redis_commands_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"} [$__rate_interval]))", + "interval": "", + "legendFormat": "{{cmd}}", + "refId": "A" + } + ], + "title": "Redis Executed Commands", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "avg(irate(redis_commands_duration_seconds_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}[1m])) by (cmd)\n /\navg(irate(redis_commands_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}[1m])) by (cmd)", + "interval": "", + "legendFormat": "{{cmd}}", + "refId": "A" + } + ], + "title": "Redis Command Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "redis_db_keys{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}", + "interval": "", + "legendFormat": "{{db}}", + "refId": "A" + } + ], + "title": "Redis Keys", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "rate(redis_net_input_bytes_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Input Network", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "rate(redis_net_output_bytes_total{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Output Network", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "redis_memory_used_bytes{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Used Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + 
}, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "redis_connected_clients{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": 
{ + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "redis_master_repl_offset{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"} - redis_slave_repl_offset{namespace=~\"$namespace\",service=~\"$service\",env=~\"$env\"}", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Redis Replication Lag", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "sum(redis_cluster_slots_ok{env=~\"$env\"})", + "interval": "", + "legendFormat": "Ok", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": 
"sum(redis_cluster_slots_fail{env=~\"$env\"})", + "hide": false, + "interval": "", + "legendFormat": "fail", + "refId": "B" + } + ], + "title": "Redis Cluster Slot", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "ctfd", + "value": "ctfd" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\"},namespace)", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\"},namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "redis-standalone", + "value": "redis-standalone" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values({namespace=\"$namespace\", endpoint=\"redis-exporter\"},service)", + "hide": 0, + "includeAll": false, + "label": "Service", + "multi": false, + "name": "service", + "options": [], + "query": { + "qryType": 1, + "query": "label_values({namespace=\"$namespace\", endpoint=\"redis-exporter\"},service)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "definition": "", + "hide": 0, + 
"includeAll": false, + "label": "Env", + "multi": false, + "name": "env", + "options": [], + "query": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Redis Operator | Cluster Dashboard", + "uid": "OsAINfZnk", + "version": 11, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/redis/redis-operator.json b/ops/prometheus/grafana/dashboards/redis/redis-operator.json new file mode 100644 index 0000000..bf51150 --- /dev/null +++ b/ops/prometheus/grafana/dashboards/redis/redis-operator.json @@ -0,0 +1,874 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 406435, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "Cluster", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rediscluster_healthy{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "{{namespace}}/{{instance}}", + "refId": "A" + } + ], + "title": "Cluster Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rate(rediscluster_rebalance_total{namespace=~\"$namespace\",instance=~\"$instance\"}[5m])", + "legendFormat": "Rebalance", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rate(rediscluster_reshard_total{namespace=~\"$namespace\",instance=~\"$instance\"}[5m])", + "legendFormat": "Reshard", + "refId": "B" + } + ], + 
"title": "Cluster Rebalance / Reshard (per 5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rate(rediscluster_adding_node_attempt{namespace=~\"$namespace\",instance=~\"$instance\"}[5m])", + "legendFormat": "Add Node", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rate(rediscluster_remove_follower_attempt{namespace=~\"$namespace\",instance=~\"$instance\"}[5m])", + "legendFormat": "Remove Follower", + "refId": "B" + } + ], + "title": "Cluster Add / Remove Node Attempts (per 5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { 
+ "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "rediscluster_skipreconcile{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "Cluster - {{namespace}}/{{instance}}", + "refId": "A" + } + ], + "title": "Skip Reconcile Cluster", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 10, + "panels": [], + "title": "Replication", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": 
"redisreplication_has_master{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "{{namespace}}/{{instance}}", + "refId": "A" + } + ], + "title": "Replication Master Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "redisreplication_replicas_size_desired{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "Desired - {{namespace}}/{{instance}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "redisreplication_replicas_size_current{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": 
"Current - {{namespace}}/{{instance}}", + "range": true, + "refId": "B" + } + ], + "title": "Replication Replica Counts (Desired vs Current)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "redisreplication_connected_slaves_total{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "{{namespace}}/{{instance}}", + "refId": "A" + } + ], + "title": "Replication Connected Slaves", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "redisreplication_replicas_size_mismatch{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "{{namespace}}/{{instance}}", + "refId": "A" + } + ], + "title": "Replication Replica Size Mismatch", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + 
"showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "redisreplication_skipreconcile{namespace=~\"$namespace\",instance=~\"$instance\"}", + "legendFormat": "Replication - {{namespace}}/{{instance}}", + "refId": "B" + } + ], + "title": "Skip Reconcile Cluster", + "type": "stat" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 40, + "tags": [ + "redis", + "operator", + "kubernetes" + ], + "templating": { + "list": [ + { + "current": { + "text": "Beholder-Mars", + "value": "cc291c92-8dba-4776-bb65-2ef00118db13" + }, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(rediscluster_healthy, namespace)", + "refresh": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(rediscluster_healthy{namespace=~\"$namespace\"}, instance)", + "refresh": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Redis Operator Controller", + "uid": "redis-operator-controller", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/dashboards/traefik/traefik-custom.json b/ops/prometheus/grafana/dashboards/traefik/traefik-custom.json new file mode 100644 index 0000000..3b9b3cf --- /dev/null +++ b/ops/prometheus/grafana/dashboards/traefik/traefik-custom.json @@ 
-0,0 +1,1482 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Official dashboard for Standalone Traefik", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 17346, + "graphTooltip": 0, + "id": 12, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count(traefik_config_reloads_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Traefik Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" 
+ }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 5, + "y": 1 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\"}[1m])) by (entrypoint)", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests per Entrypoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "https://medium.com/@tristan_96324/prometheus-apdex-alerting-d17a065e39d0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"0.3\",code=\"200\",entrypoint=~\"$entrypoint\"}[5m])) by (method) + \n sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"1.2\",code=\"200\",entrypoint=~\"$entrypoint\"}[5m])) by (method)) / 2 / \n sum(rate(traefik_entrypoint_request_duration_seconds_count{code=\"200\",entrypoint=~\"$entrypoint\"}[5m])) by (method)\n", + "legendFormat": "{{method}}", + "range": true, + "refId": "A" + } + ], + "title": "Apdex score", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Mean Distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "displayMode": "list", + 
"placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[1m])) by (method, code)", + "legendFormat": "{{method}}[{{code}}]", + "range": true, + "refId": "A" + } + ], + "title": "Http Code ", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n traefik_service_request_duration_seconds_sum{service=~\"$service.*\",protocol=\"http\"} / \n traefik_service_request_duration_seconds_count{service=~\"$service.*\",protocol=\"http\"},\n \"service\", \"$2\", \"exported_service\", \"(([^-]+)-[^-]+).*\")\n)\n\n", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Top slow services", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,code) \n 
(rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[5m])) > 0,\n \"service\", \"$2\", \"exported_service\", \"(([^-]+)-[^-]+).*\")\n)", + "legendFormat": "[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Most requested services", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 11, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"1.2\",service=~\"$service.*\"}[5m])) / sum by (service) \n 
(rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[5m]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Services failing SLO of 1200ms", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"0.3\",service=~\"$service.*\"}[5m])) / sum by (service) \n (rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[5m]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + 
"refId": "A" + } + ], + "title": "Services failing SLO of 300ms", + "type": "timeseries" + } + ], + "title": "SLO", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 16, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"2..\",protocol=\"http\"}[5m])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "2xx over 5 min", + "type": "timeseries" + }, + 
{ + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"5..\",protocol=\"http\"}[5m])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "5xx over 5 min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code!~\"2..|5..\",protocol=\"http\"}[5m])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Other codes over 5 min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_requests_bytes_total{service=~\"$service.*\",protocol=\"http\"}[1m])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_responses_bytes_total{service=~\"$service.*\",protocol=\"http\"}[1m])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Responses Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(traefik_open_connections{entrypoint=~\"$entrypoint\"}) by (entrypoint)\n", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Connections per Entrypoint", + "type": "timeseries" + } + ], + "title": "HTTP Details", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(traefik_open_connections, entrypoint)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "entrypoint", + "options": [], + "query": { + "query": "label_values(traefik_open_connections, entrypoint)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(traefik_service_requests_total, service)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "service", + "options": [], + "query": { + 
"query": "label_values(traefik_service_requests_total, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Traefik Dashboard", + "uid": "n5bu_kv46", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/prometheus/grafana/notification/.gitkeep b/ops/prometheus/grafana/notification/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ops/prometheus/kube_prometheus_custom_values.yaml b/ops/prometheus/kube_prometheus_custom_values.yaml new file mode 100644 index 0000000..ad2ad19 --- /dev/null +++ b/ops/prometheus/kube_prometheus_custom_values.yaml @@ -0,0 +1,147 @@ +grafana: + enabled: true + grafana.ini: + server: + root_url: "https://grafana.${cluster_dns_management}" + sidecar: + dashboards: + enabled: true + label: grafana_dashboard + folder: /tmp/dashboards + provider: + allowUiUpdates: true + foldersFromFilesStructure: true + + annotations: + k8s-sidecar-target-directory: "/tmp/dashboards/kubernetes" + alerts: + enabled: true + label: grafana_alert + labelValue: "1" + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: "db" + orgId: 1 + folder: "db" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/db + - name: "redis" + orgId: 1 + folder: "redis" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/redis + - name: "k8s" + orgId: 1 + folder: "k8s" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/k8s + - name: "traefik" + orgId: 1 + folder: "traefik" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/traefik + - name: "ctf" + orgId: 1 + folder: "ctf" + type: file + disableDeletion: false + 
editable: true + options: + path: /var/lib/grafana/dashboards/ctf + dashboards: + db: + mysql-overview: + gnetId: 7362 + revision: 5 + datasource: Prometheus + mysql-replication: + gnetId: 7371 + revision: 1 + datasource: Prometheus + mariadb-galera: + gnetId: 13106 + revision: 3 + datasource: Prometheus + mysql-quickstart: + gnetId: 14057 + revision: 1 + datasource: Prometheus + redis: + redis-overview: + gnetId: 11145 + revision: 1 + datasource: Prometheus + k8s: + kubernetes-cluster: + gnetId: 6417 + revision: 1 + datasource: Prometheus + +prometheus: + prometheusSpec: + serviceMonitorSelector: + matchLabels: null + podMonitorSelector: + matchLabels: null + ruleSelector: + matchLabels: null + scrapeConfigSelector: + matchLabels: null + +kube-state-metrics: + enabled: true + metricLabelsAllowlist: + - pods=[ctfpilot.com/component,instanced.challenges.ctfpilot.com/deployment,instanced.challenges.ctfpilot.com/owner,challenges.ctfpilot.com/type,challenges.ctfpilot.com/name,challenges.ctfpilot.com/version,challenges.ctfpilot.com/configmap,challenges.ctfpilot.com/category,challenges.ctfpilot.com/enabled,challenges.ctfpilot.com/version] + - services=[ctfpilot.com/component,instanced.challenges.ctfpilot.com/deployment,instanced.challenges.ctfpilot.com/owner,challenges.ctfpilot.com/type,challenges.ctfpilot.com/name,challenges.ctfpilot.com/version,challenges.ctfpilot.com/configmap,challenges.ctfpilot.com/category,challenges.ctfpilot.com/enabled,challenges.ctfpilot.com/version] + - deployments=[ctfpilot.com/component,instanced.challenges.ctfpilot.com/deployment,instanced.challenges.ctfpilot.com/owner,challenges.ctfpilot.com/type,challenges.ctfpilot.com/name,challenges.ctfpilot.com/version,challenges.ctfpilot.com/configmap,challenges.ctfpilot.com/category,challenges.ctfpilot.com/enabled,challenges.ctfpilot.com/version] + - 
configmaps=[ctfpilot.com/component,instanced.challenges.ctfpilot.com/deployment,instanced.challenges.ctfpilot.com/owner,challenges.ctfpilot.com/type,challenges.ctfpilot.com/name,challenges.ctfpilot.com/version,challenges.ctfpilot.com/configmap,challenges.ctfpilot.com/category,challenges.ctfpilot.com/enabled,challenges.ctfpilot.com/version,page.ctfpilot.com/slug,page.ctfpilot.com/version,page.ctfpilot.com/enabled,page.ctfpilot.com/configmap] + - ingresses=[ctfpilot.com/component,instanced.challenges.ctfpilot.com/deployment,instanced.challenges.ctfpilot.com/owner,challenges.ctfpilot.com/type,challenges.ctfpilot.com/name,challenges.ctfpilot.com/version,challenges.ctfpilot.com/configmap,challenges.ctfpilot.com/category,challenges.ctfpilot.com/enabled,challenges.ctfpilot.com/version] + +alertmanager: + config: + route: + group_by: ["alertname"] + group_wait: 30s + group_interval: 5m + repeat_interval: 48h + # Default alerter + receiver: "null" + routes: + # "Muted" alerts: "Watchdog" and "InfoInhibitor" are routed to the "null" receiver + - receiver: "null" + matchers: + - alertname =~ "Watchdog|InfoInhibitor" + # Noisy alerts + - receiver: "null" + matchers: + - alertname =~ "UnusedCpu|UnusedMemory|KubeSchedulerDown|KubeControllerManagerDown|KubeProxyDown" + + # Matches anything + - receiver: "discord" + receivers: + - name: discord + discord_configs: + - webhook_url: "${discord_webhook_url}" + - name: "null" +# prometheus-node-exporter: +# hostRootFsMount: +# enabled: true +# tolerations: [] +# resources: {} +# securityContext: +# runAsUser: 0 +# privileged: true diff --git a/ops/providers.tf b/ops/providers.tf new file mode 100644 index 0000000..61e00d5 --- /dev/null +++ b/ops/providers.tf @@ -0,0 +1,81 @@ +# ---------------------- +# Terraform Configuration +# ---------------------- + +terraform { + required_version = ">= 1.9.5" + + backend "s3" {} + + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.32.0" + } + + kubectl = { + source = 
"alekc/kubectl" + version = ">= 2.0.2" + } + + helm = { + source = "hashicorp/helm" + version = ">= 3.0.2" + } + + htpasswd = { + source = "loafoe/htpasswd" + } + + http = { + source = "hashicorp/http" + } + } +} + +# ---------------------- +# Providers +# ---------------------- + +locals { + kube_config = yamldecode(base64decode(var.kubeconfig)) +} + +provider "kubernetes" { + host = local.kube_config.clusters[0].cluster.server + cluster_ca_certificate = base64decode(local.kube_config.clusters[0].cluster.certificate-authority-data) + + client_certificate = base64decode(local.kube_config.users[0].user.client-certificate-data) + client_key = base64decode(local.kube_config.users[0].user.client-key-data) +} + +provider "kubectl" { + load_config_file = false + + host = local.kube_config.clusters[0].cluster.server + cluster_ca_certificate = base64decode(local.kube_config.clusters[0].cluster.certificate-authority-data) + + client_certificate = base64decode(local.kube_config.users[0].user.client-certificate-data) + client_key = base64decode(local.kube_config.users[0].user.client-key-data) +} + +provider "helm" { + kubernetes = { + host = local.kube_config.clusters[0].cluster.server + cluster_ca_certificate = base64decode(local.kube_config.clusters[0].cluster.certificate-authority-data) + + client_certificate = base64decode(local.kube_config.users[0].user.client-certificate-data) + client_key = base64decode(local.kube_config.users[0].user.client-key-data) + } +} + +provider "http" { +} + +provider "htpasswd" { +} + +resource "random_password" "salt" { + length = 8 + special = true +} diff --git a/ops/redis-operator.tf b/ops/redis-operator.tf new file mode 100644 index 0000000..5ca4adb --- /dev/null +++ b/ops/redis-operator.tf @@ -0,0 +1,25 @@ +resource "kubernetes_namespace_v1" "redis" { + metadata { + name = "redis-operator" + } +} + +resource "helm_release" "redis-operator" { + name = "redis-operator" + repository = "https://ot-container-kit.github.io/helm-charts/" + 
namespace = kubernetes_namespace_v1.redis.metadata.0.name + create_namespace = false + + chart = "redis-operator" + version = var.redis_operator_version + + // Force use of longhorn storage class + # set = [{ + # name = "redis-operator.storageClass" + # value = "longhorn" + # }] + + depends_on = [ + kubernetes_namespace_v1.redis + ] +} diff --git a/ops/tfvars/.gitignore b/ops/tfvars/.gitignore new file mode 100644 index 0000000..8147f77 --- /dev/null +++ b/ops/tfvars/.gitignore @@ -0,0 +1 @@ +!template.tfvars diff --git a/ops/tfvars/template.tfvars b/ops/tfvars/template.tfvars new file mode 100644 index 0000000..1ae78fe --- /dev/null +++ b/ops/tfvars/template.tfvars @@ -0,0 +1,80 @@ +# ------------------------ +# Kubernetes variables +# ------------------------ +kubeconfig = "AA==" # The base64 encoded kubeconfig file (base64 -w 0 ) + +# ------------------------ +# Generic information +# ------------------------ +environment = "test" # Deployment environment name for the CTF (i.e. prod, staging, dev, test) +email = "" # Email to use for the ACME certificate +discord_webhook_url = "" # Discord webhook URL for sending alerts and notifications + +# ------------------------ +# Cloudflare variables +# ------------------------ +cloudflare_api_token = "" # Cloudflare API Token for updating the DNS records (Zone.Zone.Read and Zone.DNS.Edit permissions required for the two following domains) +cloudflare_dns_management = "" # The top level domain (TLD) to use for the DNS records for the management part of the cluster +cloudflare_dns_platform = "" # The top level domain (TLD) to use for the DNS records for the platform part of the cluster +cloudflare_dns_ctf = "" # The top level domain (TLD) to use for the DNS records for the CTF challenges part of the cluster +cluster_dns_management = "" # The specific domain name to use for the DNS records for the management part of the cluster. 
Must be the TLD or subdomain of `cloudflare_dns_management` + +# ---------------------- +# Filebeat configuration +# ---------------------- +filebeat_elasticsearch_host = "" # The hostname of the Elasticsearch instance for Filebeat to send logs to. Must be a https 443 endpoint. +filebeat_elasticsearch_username = "" # The username for the Elasticsearch instance +filebeat_elasticsearch_password = "" # The password for the Elasticsearch instance + +# ------------------------ +# GitHub variables +# ------------------------ +ghcr_username = "" # GitHub Container Registry username +ghcr_token = "" # GitHub Container Registry token. This token is used to pull images from the GitHub Container Registry. Only let this token have registry read access + +# ---------------------- +# Prometheus configuration +# ---------------------- +prometheus_storage_size = "15Gi" # The size of the persistent volume claim for Prometheus data storage. Format: (e.g., 20Gi, 100Gi) + +# ---------------------- +# Management configuration +# ---------------------- +# The following is the configuration for the management part of the cluster. + +# ArgoCD password +argocd_admin_password = "" # The password for the ArgoCD admin user +argocd_github_secret = "" # The GitHub secret for ArgoCD webhooks - Send webhook to /api/webhook with this secret as the secret header. This is used to trigger ArgoCD to sync the repositories. + +# Grafana password +grafana_admin_password = "" # The password for the Grafana admin user + +# Alert endpoints +# discord_webhook_url is assigned once, in the "Generic information" section above; assigning it a second time here makes the file fail to parse ("Attribute redefined"). + +# Username and password for basic auth (used for some management services) +# user: The username for the basic auth +# password: The password for the basic auth +traefik_basic_auth = { user = "", password = "" } + +# ---------------------- +# Docker images +# ---------------------- +# Values are maintained in the variables.tf file. 
+# You can override these values by uncommenting and setting your own images here. + +# image_error_fallback = "ghcr.io/ctfpilot/error-fallback:1.2.1" # The docker image for the error fallback deployment. See https://github.com/ctfpilot/error-fallback +# image_filebeat = "docker.elastic.co/beats/filebeat:8.19.0" # The docker image for Filebeat + +# ---------------------- +# Versions +# ---------------------- +# Values are maintained in the variables.tf file. +# You can override these values by uncommenting and setting your own versions here. + +# argocd_version = "8.2.5" # The version of the ArgoCD Helm chart to deploy. More information at https://github.com/argoproj/argo-helm +# cert_manager_version = "1.17.1" # The version of the Cert-Manager Helm chart to deploy. More information at https://github.com/cert-manager/cert-manager +# descheduler_version = "0.34.0" # The version of descheduler Helm chart to deploy. More information at https://github.com/kubernetes-sigs/descheduler +# mariadb_operator_version = "25.8.1" # The version of the MariaDB Operator Helm chart to deploy. More information at https://github.com/mariadb-operator/mariadb-operator +# kube_prometheus_stack_version = "62.3.1" # The version of the kube-prometheus-stack Helm chart to deploy. More information at https://github.com/prometheus-community/helm-charts/ +# redis_operator_version = "0.22.2" # The version of the Redis Operator Helm chart to deploy. 
More information at https://github.com/OT-CONTAINER-KIT/redis-operator diff --git a/ops/traefik.tf b/ops/traefik.tf new file mode 100644 index 0000000..4e39a81 --- /dev/null +++ b/ops/traefik.tf @@ -0,0 +1,246 @@ +resource "kubernetes_service" "traefik_dashboard" { + metadata { + name = "traefik-dashboard" + namespace = var.traefik_namespace + labels = { + app = "traefik" + release = "traefik" + role = "dashboard" + } + } + + spec { + selector = { + "app.kubernetes.io/name" = "traefik" + } + + port { + name = "dashboard" + port = 8080 + target_port = 8080 + } + } +} + +resource "kubernetes_ingress_v1" "traefik-dashboard-ingress" { + metadata { + name = "traefik-dashboard-ingress" + namespace = var.traefik_namespace + + # Basic auth + annotations = { + "cert-manager.io/cluster-issuer" = module.cert_manager.cluster_issuer_name + "ingress.kubernetes.io/auth-realm" = "traefik" + "ingress.kubernetes.io/auth-type" = "basic" + "ingress.kubernetes.io/auth-secret" = kubernetes_secret.traefik_basic_auth.metadata.0.name + "traefik.ingress.kubernetes.io/router.middlewares" = "${var.traefik_namespace}-${kubernetes_secret.traefik_basic_auth.metadata.0.name}@kubernetescrd,errors-errors@kubernetescrd" + } + } + + spec { + default_backend { + service { + name = "traefik-dashboard" + port { + number = 8080 + } + } + } + + rule { + host = "traefik.${var.cluster_dns_management}" + http { + path { + backend { + service { + name = "traefik-dashboard" + port { + number = 8080 + } + } + } + } + } + } + + tls { + hosts = [ + "traefik.${var.cluster_dns_management}" + ] + secret_name = "traefik-dashboard-tls-cert" + } + } + + depends_on = [ + kubernetes_secret.traefik_basic_auth, + kubernetes_service.traefik_dashboard + ] +} + +resource "kubernetes_service" "traefik_metrics" { + metadata { + name = "traefik-metrics" + namespace = var.traefik_namespace + labels = { + app = "traefik" + role = "metrics" + release = "prometheus" + } + } + + spec { + selector = { + "app.kubernetes.io/name" = 
"traefik" + } + + port { + name = "metrics" + port = 9100 + target_port = 9100 + } + } +} + + + +resource "kubernetes_config_map_v1" "ctfd_filebeat_config" { + metadata { + name = "ctfd-filebeat-config" + namespace = var.traefik_namespace + } + + data = { + "filebeat.yml" = <<-EOF + filebeat.inputs: + - type: filestream + paths: + - /var/log/traefik/*.log + processors: + - add_fields: + target: '' + fields: + cluster_dns: "${var.cluster_dns_management}" + - decode_json_fields: + fields: ["message"] + process_array: false + max_depth: 1 + target: "traefik" + overwrite_keys: false + - drop_fields: + fields: ["ecs.version"] + + output.elasticsearch: + hosts: ["https://${var.filebeat_elasticsearch_host}:443"] + username: "${var.filebeat_elasticsearch_username}" + password: "${var.filebeat_elasticsearch_password}" + protocol: https + ssl.verification_mode: "full" + index: filebeat-${var.environment}-access + + setup: + template: + name: "filebeat-${var.environment}-access" + pattern: "filebeat-${var.environment}-access*" + overwrite: false + ilm: + enabled: true + policy_name: "filebeat" + EOF + } +} + +resource "kubernetes_manifest" "traefik-additional-config" { + manifest = { + apiVersion = "helm.cattle.io/v1" + kind = "HelmChartConfig" + metadata = { + name = "traefik" + namespace = "kube-system" + } + + # This amends the Helm chart for the traefik ingress controller which is included with k3s. 
+ # https://github.com/traefik/traefik-helm-chart/blob/master/traefik/values.yaml + spec = { + valuesContent = <<-EOF + autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 50 + resources: + requests: + cpu: "500m" + memory: "100Mi" + limits: + cpu: "2000m" + memory: "1Gi" + tolerations: + - key: "cluster.ctfpilot.com/node" + value: "scaler" + effect: "PreferNoSchedule" + logs: + access: + enabled: true + format: json + filePath: "/var/log/traefik/access.log" + bufferingSize: 1000 + fields: + headers: + defaultmode: keep + names: + Accept: drop + Connection: drop + Authorization: redact + env: + - name: TZ + value: "Europe/Copenhagen" + deployment: + initContainers: + - name: fix-permissions + image: busybox:latest + command: ["sh", "-c", "mkdir -p /usr/share/filebeat/data"] + securityContext: + fsGroup: 1000 + volumeMounts: + - name: filebeat-data + mountPath: /usr/share/filebeat/data + additionalContainers: + - image: ${var.image_filebeat} + imagePullPolicy: Always + name: traefik-stream-accesslog + volumeMounts: + - name: logs + mountPath: /var/log/traefik + - name: ctfd-filebeat-config + mountPath: /usr/share/filebeat/filebeat.yml + subPath: filebeat.yml + - name: filebeat-data + mountPath: /usr/share/filebeat/data + resources: + requests: + cpu: "10m" + memory: "56M" + limits: + cpu: "100m" + memory: "256M" + additionalVolumes: + - name: logs + - name: ctfd-filebeat-config + configMap: + name: ctfd-filebeat-config + - name: filebeat-data + emptyDir: {} + additionalVolumeMounts: + - name: logs + mountPath: /var/log/traefik + hub: + redis: + cluster: true + endpoints: redis-cluster-leaders:6379 + EOF + } + } + + depends_on = [ + kubernetes_config_map_v1.ctfd_filebeat_config + ] +} diff --git a/ops/variables.tf b/ops/variables.tf new file mode 100644 index 0000000..57222d8 --- /dev/null +++ b/ops/variables.tf @@ -0,0 +1,169 @@ +# ------------------------ +# Variables +# ------------------------ + +variable "kubeconfig" { + type = string + description = 
"Base64 encoded kubeconfig file" + sensitive = true +} + +variable "environment" { + type = string + description = "Deployment environment name for the CTF (i.e. prod, staging, dev, test)" + default = "test" +} + +variable "email" { + description = "Email to use for the ACME certificate" +} + +variable "cloudflare_api_token" { + sensitive = true # Requires terraform >= 0.14 + type = string + description = "Cloudflare API Token for updating the DNS records (Zone.Zone.Read and Zone.DNS.Edit permissions required for the two following domains)" +} + +variable "cloudflare_dns_management" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the management part of the cluster" +} + +variable "cloudflare_dns_platform" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the platform part of the cluster" +} + +variable "cloudflare_dns_ctf" { + type = string + description = "The top level domain (TLD) to use for the DNS records for the CTF challenges part of the cluster" +} + +variable "cluster_dns_management" { + type = string + description = "The specific domain name to use for the DNS records for the management part of the cluster. Must be the TLD or subdomain of `cloudflare_dns_management`" +} + +variable "traefik_namespace" { + type = string + default = "traefik" + description = "The Kubernetes namespace where Traefik is deployed" +} + +variable "traefik_basic_auth" { + type = map(string) + default = { + "user" = "admin" + "password" = "admin" + } + sensitive = true + description = "Username and password for basic auth. Format: { user = \"username\", password = \"password\" }" +} + +variable "filebeat_elasticsearch_host" { + type = string + nullable = false + description = "The hostname of the Elasticsearch instance for Filebeat to send logs to. Must be a https 443 endpoint." 
+} + +variable "filebeat_elasticsearch_username" { + type = string + nullable = false + description = "The username for Elasticsearch authentication." +} + +variable "filebeat_elasticsearch_password" { + type = string + nullable = false + description = "The password for Elasticsearch authentication." +} + +variable "prometheus_storage_size" { + type = string + default = "15Gi" + description = "The size of the persistent volume claim for Prometheus data storage. Format: (e.g., 20Gi, 100Gi)" +} + +variable "discord_webhook_url" { + type = string + description = "Discord webhook URL for notifications" + sensitive = true +} + +variable "ghcr_username" { + description = "GitHub Container Registry username" + type = string +} + +variable "ghcr_token" { + description = "GitHub Container Registry token. This token is used to pull images from the GitHub Container Registry. Only let this token have registry read access" + type = string + sensitive = true +} + +variable "argocd_admin_password" { + sensitive = true + type = string + description = "The password for the ArgoCD admin user" +} + +variable "argocd_github_secret" { + sensitive = true + type = string + description = "The GitHub secret for ArgoCD webhooks - Send webhook to /api/webhook with this secret as the secret header. This is used to trigger ArgoCD to sync the repositories." +} + +variable "grafana_admin_password" { + sensitive = true + type = string + description = "The password for the Grafana admin user" +} + +variable "image_error_fallback" { + type = string + description = "The docker image for the error fallback deployment. See https://github.com/ctfpilot/error-fallback" + default = "ghcr.io/ctfpilot/error-fallback:1.2.1" +} + +variable "image_filebeat" { + type = string + description = "The docker image for Filebeat" + default = "docker.elastic.co/beats/filebeat:8.19.0" +} + +# Variables +variable "argocd_version" { + type = string + description = "The version of ArgoCD Helm chart to deploy. 
More information at https://github.com/argoproj/argo-helm" + default = "8.2.5" +} + +variable "cert_manager_version" { + type = string + description = "The version of cert-manager Helm chart to deploy. More information at https://github.com/cert-manager/cert-manager" + default = "1.17.1" +} + +variable "descheduler_version" { + type = string + description = "The version of descheduler Helm chart to deploy. More information at https://github.com/kubernetes-sigs/descheduler" + default = "0.34.0" +} + +variable "mariadb_operator_version" { + type = string + description = "The version of the MariaDB Operator Helm chart to deploy. More information at https://github.com/mariadb-operator/mariadb-operator" + default = "25.8.1" +} + +variable "kube_prometheus_stack_version" { + type = string + description = "The version of the kube-prometheus-stack Helm chart to deploy. More information at https://github.com/prometheus-community/helm-charts/" + default = "62.3.1" +} + +variable "redis_operator_version" { + type = string + description = "The version of the Redis Operator Helm chart to deploy. More information at https://github.com/OT-CONTAINER-KIT/redis-operator" + default = "0.22.2" +} diff --git a/platform/.env.example b/platform/.env.example new file mode 100644 index 0000000..5fe1f9d --- /dev/null +++ b/platform/.env.example @@ -0,0 +1,2 @@ +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/platform/.gitignore b/platform/.gitignore new file mode 100644 index 0000000..12c7fe8 --- /dev/null +++ b/platform/.gitignore @@ -0,0 +1,41 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. 
These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. +*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc + +# CTFd deployment +ctfd_config.json +**/ctfd_config.json \ No newline at end of file diff --git a/platform/.terraform.lock.hcl b/platform/.terraform.lock.hcl new file mode 100644 index 0000000..ee17999 --- /dev/null +++ b/platform/.terraform.lock.hcl @@ -0,0 +1,136 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/alekc/kubectl" { + version = "2.1.3" + constraints = ">= 2.0.2" + hashes = [ + "h1:AymCb0DCWzmyLqn1qEhVs2pcFUZGT/kxPK+I/BObFH8=", + "zh:0e601ae36ebc32eb8c10aff4c48c1125e471fa09f5668465af7581c9057fa22c", + "zh:1773f08a412d1a5f89bac174fe1efdfd255ecdda92d31a2e31937e4abf843a2f", + "zh:1da2db1f940c5d34e31c2384c7bd7acba68725cc1d3ba6db0fec42efe80dbfb7", + "zh:20dc810fb09031bcfea4f276e1311e8286d8d55705f55433598418b7bcc76357", + "zh:326a01c86ba90f6c6eb121bacaabb85cfa9059d6587aea935a9bbb6d3d8e3f3f", + "zh:5a3737ea1e08421fe3e700dc833c6fd2c7b8c3f32f5444e844b3fe0c2352757b", + "zh:5f490acbd0348faefea273cb358db24e684cbdcac07c71002ee26b6cfd2c54a0", + "zh:777688cda955213ba637e2ac6b1994e438a5af4d127a34ecb9bb010a8254f8a8", + "zh:7acc32371053592f55ee0bcbbc2f696a8466415dea7f4bc5a6573f03953fc926", + "zh:81f0108e2efe5ae71e651a8826b61d0ce6918811ccfdc0e5b81b2cfb0f7f57fe", + "zh:88b785ea7185720cf40679cb8fa17e57b8b07fd6322cf2d4000b835282033d81", + "zh:89d833336b5cd027e671b46f9c5bc7d10c5109e95297639bbec8001da89aa2f7", + "zh:df108339a89d4372e5b13f77bd9d53c02a04362fb5d85e1d9b6b47292e30821c", + "zh:e8a2e3a5c50ca124e6014c361d72a9940d8e815f37ae2d1e9487ac77c3043013", + ] +} + +provider "registry.opentofu.org/hashicorp/http" { + version = "3.5.0" + hashes = [ + "h1:yvwvVZ0vdbsTUMru+7Cr0On1FVgDJHAaC6TNvy/OWzM=", + "zh:0a2b33494eec6a91a183629cf217e073be063624c5d3f70870456ddb478308e9", + "zh:180f40124fa01b98b3d2f79128646b151818e09d6a1a9ca08e0b032a0b1e9cb1", + "zh:3e29e1de149dc10bf78620526c7cb8c62cd76087f5630dfaba0e93cda1f3aa7b", + "zh:4420950200cf86042ec940d0e2c9b7c89966bf556bf8038ba36217eae663bca5", + "zh:5d1f7d02109b2e2dca7ec626e5563ee765583792d0fd64081286f16f9433bd0d", + "zh:8500b138d338b1994c4206aa577b5c44e1d7260825babcf43245a7075bfa52a5", + "zh:b42165a6c4cfb22825938272d12b676e4a6946ac4e750f85df870c947685df2d", + "zh:b919bf3ee8e3b01051a0da3433b443a925e272893d3724ee8fc0f666ec7012c9", + "zh:d13b81ea6755cae785b3e11634936cdff2dc1ec009dc9610d8e3c7eb32f42e69", + 
"zh:f1c9d2eb1a6b618ae77ad86649679241bd8d6aacec06d0a68d86f748687f4eb3", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = ">= 2.32.0" + hashes = [ + "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} + +provider "registry.opentofu.org/hashicorp/local" { + version = "2.5.3" + hashes = [ + "h1:mC9+u1eaUILTjxey6Ivyf/3djm//RNNze9kBVX/trng=", + "zh:32e1d4b0595cea6cda4ca256195c162772ddff25594ab4008731a2ec7be230bf", + "zh:48c390af0c87df994ec9796f04ec2582bcac581fb81ed6bb58e0671da1c17991", + "zh:4be7289c969218a57b40902e2f359914f8d35a7f97b439140cb711aa21e494bd", + "zh:4cf958e631e99ed6c8b522c9b22e1f1b568c0bdadb01dd002ca7dffb1c927764", + "zh:7a0132c0faca4c4c96aa70808effd6817e28712bf5a39881666ac377b4250acf", + "zh:7d60de08fac427fb045e4590d1b921b6778498eee9eb16f78c64d4c577bde096", + "zh:91003bee5981e99ec3925ce2f452a5f743827f9d0e131a86613549c1464796f0", + "zh:9fe2fe75977c8149e2515fb30c6cc6cfd57b225d4ce592c570d81a3831d7ffa3", + "zh:e210e6be54933ce93e03d0994e520ba289aa01b2c1f70e77afb8f2ee796b0fe3", + "zh:e8793e5f9422f2b31a804e51806595f335b827c9a38db18766960464566f21d5", + ] +} + +provider "registry.opentofu.org/hashicorp/null" { + version = "3.2.4" + hashes = [ + "h1:jsKjBiLb+v3OIC3xuDiY4sR0r1OHUMSWPYKult9MhT0=", + 
"zh:1769783386610bed8bb1e861a119fe25058be41895e3996d9216dd6bb8a7aee3", + "zh:32c62a9387ad0b861b5262b41c5e9ed6e940eda729c2a0e58100e6629af27ddb", + "zh:339bf8c2f9733fce068eb6d5612701144c752425cebeafab36563a16be460fb2", + "zh:36731f23343aee12a7e078067a98644c0126714c4fe9ac930eecb0f2361788c4", + "zh:3d106c7e32a929e2843f732625a582e562ff09120021e510a51a6f5d01175b8d", + "zh:74bcb3567708171ad83b234b92c9d63ab441ef882b770b0210c2b14fdbe3b1b6", + "zh:90b55bdbffa35df9204282251059e62c178b0ac7035958b93a647839643c0072", + "zh:ae24c0e5adc692b8f94cb23a000f91a316070fdc19418578dcf2134ff57cf447", + "zh:b5c10d4ad860c4c21273203d1de6d2f0286845edf1c64319fa2362df526b5f58", + "zh:e05bbd88e82e1d6234988c85db62fd66f11502645838fff594a2ec25352ecd80", + ] +} + +provider "registry.opentofu.org/loafoe/htpasswd" { + version = "1.2.1" + hashes = [ + "h1:W1euQGM6t+QlB6Rq4fDbRKRHmeCIyYdIYdHrxL97BeE=", + "zh:14460c85ddc40a9ecadf583c22a7de91b83798a8ca4843949d50c3288c6f5bdd", + "zh:1af9416e28dd0a77c5d2c685561c4f60e19e2d606df0477ebc18eaa110c77807", + "zh:2245325864faaf027701ab12a04d641359a0dc439dd23c6e8f768407b78a5c18", + "zh:3813ff98198405d7c467565b52c7f0ad4533f43957da6390477dc898f8ed02c2", + "zh:3c0658e132232a181223f7ff65678d99cd2e8431c317f72281b67464e5e16892", + "zh:43505c0f42bc7635ec7c1fe5043c502f9b00ae4b5e74b81464bc494936643fc1", + "zh:52efdabb0abba99a33fd3ed981610f13c99bb383f94e997f90d95441d8558177", + "zh:75b5d9b4a610dfd0ff4dfb4039f61e79a0e56338e0a4cd45e0bc0edec34dfa62", + "zh:7aee5df091672d29f29dda57382a41d771fa21740cef6bb9a1b15afc6d84ffa4", + "zh:7ff618706e2953a21a22c7555e11f5cbe8e95c171704fcfdc6beedb0c25e49c0", + "zh:94e8a15c83a1a5a60ff1b58938dd9692d800fe05c5d8269e0916b5de03d89d3a", + "zh:c1ace4f322f9ec4956e4f30086da5b6a73f4d05e1266047d629b14a485c5a76d", + "zh:d4570075de49e3ee98494f7c44eab12e964c9776029ed536fd9352c3203cc635", + "zh:d99403b843de5939ea2e54b3ca46fd901d5c5b7fe34f44b8aeb8b38f4f792df6", + ] +} + +provider "registry.opentofu.org/mastercard/restapi" { + version = "2.0.1" + constraints = 
"2.0.1" + hashes = [ + "h1:B9x7Fql5sPqIHYSjEvQRXGOcOIUhvjV6RHKfPBUvSK8=", + "zh:09438372b8569003dabaf2fc3a98591bb9ec2505a599a37383e908432be8bed7", + "zh:0f6008de6fdbc92ee2408a34c485bf4de4bf8f46b80f9c54947c9ab89a195704", + "zh:1c3e89cf19118fc07d7b04257251fc9897e722c16e0a0df7b07fcd261f8c12e7", + "zh:2171088aca38b049705bf7052c1cc0a370dddbe1850f2efee88304b819e8966f", + "zh:2a249e06ccbd13c652676f200de6dc9347d6319fd888476e6a807e11bad8c8bd", + "zh:2a306c68bca64dd63e7269de0d4131dd8de5f5f34f9958c0cf10a937ceb89757", + "zh:36c35b155157cffe590d8acd02d6540c2171f02995d7aa7c9802d5a57973ac2c", + "zh:401d28cad51efdf1b8e1b8fdbb91b0e905eea5dfc4a96baf0e270dcd84cf7a03", + "zh:6db051e5ff4b947bdd1428f555d50b7b5157e47bc72a489f8e7b60c31cb233ef", + "zh:791cac45de5b056babcc78c8ec1996666be5fbaabd770cf619ddc7679533c003", + "zh:a0ab80133a55ec19369841d82285c6603c7b140acfd5298eb3e535444c971055", + "zh:bd72f18bcf74fcfce132dc45e4cb372bbdf7a4459cc55c29aa51b5511c8985ea", + "zh:c6b96d5b075cbbd62274a69f625f0371f3c93604b8358d18be66c4b4063bef1b", + "zh:d275ba2d17d3cac3f4b55829fffe25257f89449459c44b058a58d4521f2a481e", + "zh:f38998efd8e051e433e5aee941e835418e24bd2dc02c85be9cd7cee8455f9b9d", + ] +} diff --git a/platform/README.md b/platform/README.md new file mode 100644 index 0000000..2f9b57d --- /dev/null +++ b/platform/README.md @@ -0,0 +1,55 @@ +# CTF Pilot's Kubernetes Platform + +> [!IMPORTANT] +> You are leaving the automated CTF Pilot setup and entering a more advanced manual setup. +> This requires knowledge of Kubernetes, Terraform/OpenTofu, and cloud infrastructure management. +> If you are not comfortable with these technologies, it is recommended to use the automated setup provided by CTF Pilot. +> Learn more about the automated setup in the [CTFp main README](../README.md). 
+ +This directory contains deployment configuration for the scoreboard and related services, such as [ctfd](https://github.com/ctfpilot/ctfd) and [ctfd-manager](https://github.com/ctfpilot/ctfd-manager). + +## Pre-requisites + +The following software needs to be installed on your local machine: + +- [Terraform](https://www.terraform.io/downloads.html) / [OpenTofu](https://opentofu.org) +- [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (For interacting with the Kubernetes cluster) + +The following services are required, in order to deploy the services to the cluster: + +- A Kubernetes cluster (Deployed using the [CTF Pilot's Kubernetes Cluster on Hetzner Cloud](../cluster/README.md) guide or other means) +- Correctly deployed [ArgoCD](https://argo-cd.readthedocs.io/) within the Kubernetes cluster. + +> [!NOTE] +> The platform has only been tested within the CTFp system. +> We recommend at least having [the Ops](../ops/README.md) deployed, as this project relies on those configurations. + +## Setup + +Copy the `tfvars/template.tfvars` file to `tfvars/data.tfvars` and edit the file with your own values. +The [`tfvars/template.tfvars`](tfvars/template.tfvars) file contains further information on each variable. + +> [!IMPORTANT] +> Make sure you generate the backend configuration file before creating the cluster. +> See the [backend generation instructions](../backend/README.md) for more information. +> +> You will also need to set the following environment variables for authentication to the S3 backend: +> - `AWS_ACCESS_KEY_ID` +> - `AWS_SECRET_ACCESS_KEY` +> +> See [OpenTofu backend S3 configuration](https://opentofu.org/docs/language/settings/backends/s3/) for more information. 
+ +Run the following command to apply the resources to the Kubernetes cluster: + +```bash +tofu init -backend-config=../backend/generated/platform.hcl +tofu apply --var-file tfvars/data.tfvars +``` + +### Destroying the platform + +To destroy the deployed platform, run the following command: + +```bash +tofu destroy --var-file tfvars/data.tfvars +``` diff --git a/platform/configure-ctfd.tf b/platform/configure-ctfd.tf new file mode 100644 index 0000000..1dc5b1c --- /dev/null +++ b/platform/configure-ctfd.tf @@ -0,0 +1,102 @@ +# Wait for the CTFd URL to be reachable +resource "null_resource" "wait_for_url" { + provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = < 0 ? var.ctf_team_size : null + brackets = length(var.ctf_brackets) > 0 ? var.ctf_brackets : null + mail_server = var.ctf_mail_server + mail_port = var.ctf_mail_port + mail_username = var.ctf_mail_username + mail_password = var.ctf_mail_password + mail_tls = var.ctf_mail_tls + mail_from = var.ctf_mail_from + registration_code = var.ctf_registration_code + ctf_logo = { + name = "logo.png" + data = base64encode(filebase64("${path.module}/${var.ctf_logo_path}")) + } + } +} + +# Write payload to local file +resource "local_file" "ctfd_config" { + content = jsonencode(local.configure_ctfd_payload) + filename = "${path.module}/ctfd_config.json" + + depends_on = [ + null_resource.wait_for_url, + module.ctfd-ingress, + module.ctfd-manager-ingress, + ] +} + +resource "null_resource" "configure-ctfd" { + depends_on = [ + null_resource.wait_for_url, + module.ctfd-ingress, + module.ctfd-manager-ingress, + local_file.ctfd_config, + ] + + provisioner "local-exec" { + command = <= 7.3.1 diff --git a/template.automated.tfvars b/template.automated.tfvars new file mode 100644 index 0000000..0ee1eb3 --- /dev/null +++ b/template.automated.tfvars @@ -0,0 +1,273 @@ +# Template for the automated setup process. +# Clone this file to `automated.tfvars` and fill in the values. 
+# This file (`template.automated.tfvars`) is git tracked, and MUST NOT be changed in the repository to include sensitive information. + +# ------------------------ +# CLI Tool configuration +# ------------------------ +# The following variables are used by the CLI tool to configure the backend connection. +# Specifically setting the credentials to access the Terraform S3 backend. +terraform_backend_s3_access_key = "" # Access key for the S3 backend +terraform_backend_s3_secret_key = "" # Secret key for the S3 backend + +# ------------------------ +# Cluster configuration +# ------------------------ +# WARNING: Changing region while the cluster is running will cause all servers in the group to be destroyed and recreated. +# For optimal performance, it is recommended to use the same region for all servers. If you want redundancy, use different regions for each group. +# Region 1 is used for challs nodes, scale nodes and loadbalancer. +# Possible values: fsn1, hel1, nbg1, ash, hil, sin - See https://docs.hetzner.com/cloud/general/locations/ +region_1 = "nbg1" # Region for group 1, challs nodes, scale nodes and loadbalancer +region_2 = "nbg1" # Region for group 2 +region_3 = "nbg1" # Region for group 3 +network_zone = "eu-central" # Hetzner network zone. Possible values: "eu-central", "us-east", "us-west", "ap-southeast". Regions must be within the network zone. + +# Servers +# Server definitions are split into four groups: Control Plane, Agents, Challs and Scale. Control plane and agents has three groups each, while challs and scale is one group each. +# Each group can be scaled and defined independently, to allow for smooth transitions between different server types and sizes. +# Control planes are the servers that run the Kubernetes control plane, and are responsible for managing the cluster. +# Agents are the servers that run the workloads, and scale is used to scale the cluster up or down dynamically. +# Challs are the servers that run the CTF challenges. 
+# Scale is automatically scaled agent nodes, which is handled by the cluster autoscaler. It is optional, and can be used to scale the cluster up or down dynamically if there is not enough resources in the cluster. +# Challs and scale nodes are placed in region_1, and are tainted to make normal resources prefer agent nodes, but allow scheduling on challs and scale nodes if needed. + +# Server types. See https://www.hetzner.com/cloud +# Control plane nodes - Nodes that run the Kubernetes control plane components. +control_plane_type_1 = "cx23" # Control plane group 1 +control_plane_type_2 = "cx23" # Control plane group 2 +control_plane_type_3 = "cx23" # Control plane group 3 +# Agent nodes - Nodes that run general workloads, excluding CTF challenges. +agent_type_1 = "cx33" # Agent group 1 +agent_type_2 = "cx33" # Agent group 2 +agent_type_3 = "cx33" # Agent group 3 +# Challenge nodes - Nodes dedicated to running CTF challenges. +challs_type = "cx33" # CTF challenge nodes +# Scale nodes - Nodes that are automatically scaled by the cluster autoscaler. These nodes are used to scale the cluster up or down dynamically. +scale_type = "cx33" # Scale group + +# Server count +# Control plane nodes - Nodes that run the Kubernetes control plane components. +# Minimum of 1 control plane across all groups. 1 in each group is recommended for HA. +control_plane_count_1 = 1 # Number of control plane nodes in group 1 +control_plane_count_2 = 1 # Number of control plane nodes in group 2 +control_plane_count_3 = 1 # Number of control plane nodes in group 3 +# Agent nodes - Nodes that run general workloads, excluding CTF challenges. +# Minimum of 1 agent across all groups. 1 in each group is recommended for HA. +agent_count_1 = 1 # Number of agent nodes in group 1 +agent_count_2 = 1 # Number of agent nodes in group 2 +agent_count_3 = 1 # Number of agent nodes in group 3 +# Challenge nodes - Nodes dedicated to running CTF challenges. 
These nodes are tainted to only run challenge workloads. +challs_count = 1 # Number of challenge nodes. +# Scale nodes - Nodes that are automatically scaled by the cluster autoscaler. These nodes are used to scale the cluster up or down dynamically. +scale_max = 0 # Maximum number of scale nodes. Set to 0 to disable autoscaling. + +load_balancer_type = "lb11" # Load balancer type, see https://www.hetzner.com/cloud/load-balancer + +# ------------------------ +# Hetzner +# ------------------------ +hcloud_token = "" # Hetzner cloud project token (obtained from a specific project in Hetzner cloud) + +# ------------------------ +# SSH +# ------------------------ +# The following tokens are base64 encoded public and private keys. +# To generate these, leave the template as is, and run the following command to fill in the values: +# $ python3 cli.py generate-keys --insert +ssh_key_private_base64 = "" # The private key to use for SSH access to the servers (base64 encoded) +ssh_key_public_base64 = "" # The public key to use for SSH access to the servers (base64 encoded) + +# ------------------------ +# Cloudflare variables +# ------------------------ +# The cluster uses three domains for the management, platform, and CTF parts of the cluster. +# This is to separate the parts of the cluster, and to allow for different DNS records for each part. They may all be the same domain. The specific subdomains are set later.
+cloudflare_api_token = "" # Cloudflare API Token for updating the DNS records (Zone.Zone.Read and Zone.DNS.Edit permissions required for the three following domains) +cloudflare_dns_management = "" # The top level domain (TLD) to use for the DNS records for the management part of the cluster +cloudflare_dns_platform = "" # The top level domain (TLD) to use for the DNS records for the platform part of the cluster +cloudflare_dns_ctf = "" # The top level domain (TLD) to use for the DNS records for the CTF part of the cluster + +# ------------------------ +# DNS information +# ------------------------ +# The cluster uses three domains for the management, platform, and CTF parts of the cluster. +# The following are the domains actually used for each part of the cluster. They may be either TLDs or subdomains. +cluster_dns_management = "" # The specific domain name to use for the DNS records for the management part of the cluster +cluster_dns_platform = "" # The domain name to use for the DNS records for the platform part of the cluster +cluster_dns_ctf = "" # The domain name to use for the DNS records for the CTF part of the cluster + +# The following is used for the ACME certificate (https) for the cluster. +email = "" # Email to use for the ACME certificate + + +# ---------------------- +# Management configuration +# ---------------------- +# The following is the configuration for the management part of the cluster. + +# ArgoCD password +argocd_admin_password = "" # The password for the ArgoCD admin user +argocd_github_secret = "" # The GitHub secret for ArgoCD webhooks - Send webhook to /api/webhook with this secret as the secret header. This is used to trigger ArgoCD to sync the repositories.
+ +# Grafana password +grafana_admin_password = "" # The password for the Grafana admin user + +# Alert endpoints +discord_webhook_url = "" # Discord webhook URL for notifications + +# Username and password for basic auth (used for some management services) +# user: The username for the basic auth +# password: The password for the basic auth +traefik_basic_auth = { user = "", password = "" } + +# ---------------------- +# Filebeat configuration +# ---------------------- +filebeat_elasticsearch_host = "" # The hostname of the Elasticsearch instance for Filebeat to send logs to. Must be a https 443 endpoint. +filebeat_elasticsearch_username = "" # The username for the Elasticsearch instance +filebeat_elasticsearch_password = "" # The password for the Elasticsearch instance + +# ---------------------- +# Prometheus configuration +# ---------------------- +prometheus_storage_size = "15Gi" # The size of the persistent volume claim for Prometheus data storage. Format: (e.g., 20Gi, 100Gi) + +# ---------------------- +# Github configuration +# ---------------------- +# The following configures the cluster access to Github and needed Github repositories. +ghcr_username = "" # GitHub Container Registry username +ghcr_token = "" # GitHub Container Registry token. This token is used to pull images from the GitHub Container Registry. Only let this token have registry read access +git_token = "" # GitHub repo token. Only let this token have read access to the needed repositories. + +# ---------------------- +# CTF configuration +# ---------------------- +# The following is the configuration for the instanced challenge management system. +# They should be unique and strong passwords. 
+kubectf_auth_secret = "" # The secret to use for the authSecret in the CTF configuration +kubectf_container_secret = "" # The secret to use for the containerSecret in the CTF configuration + +# ------------------------ +# DB configuration +# ------------------------ +# DB configuration for the MariaDB cluster, used for the CTFd instance. +db_root_password = "" # Root password for the MariaDB cluster +db_user = "" # Database user +db_password = "" # Database password + +# S3 backup +s3_bucket = "" # S3 bucket name for backups +s3_region = "" # S3 region for backups +s3_endpoint = "" # S3 endpoint for backups +s3_access_key = "" # Access key for S3 for backups +s3_secret_key = "" # Secret key for S3 for backups + +# ------------------------ +# CTFd Manager configuration +# ------------------------ +# The CTFd manager is used to manage the CTFd instance, and is not used for the CTFd instance itself. +ctfd_manager_password = "" # Password for the CTFd Manager +ctfd_manager_github_repo = "" # Github repository used in the CTFd Manager. Env variable GITHUB_REPO. See https://github.com/ctfpilot/ctfd-manager +ctfd_manager_github_branch = "" # Github branch used in the CTFd Manager. Leave empty for environment based branch (environment == prod ? main : develop). Env variable GITHUB_BRANCH. 
See https://github.com/ctfpilot/ctfd-manager + +# ------------------------ +# CTFd configuration +# ------------------------ +ctf_name = "" # Name of the CTF event +ctf_description = "" # Description of the CTF event +ctf_start_time = "" # Start time of the CTF event (ISO 8601 format, e.g., "2023-10-01T00:00:00Z") +ctf_end_time = "" # End time of the CTF event +ctf_user_mode = "" # User mode for CTFd (e.g., "teams") +ctf_challenge_visibility = "" # Challenge visibility (e.g., "public") +ctf_account_visibility = "" # Account visibility (e.g., "private") +ctf_score_visibility = "" # Score visibility (e.g., "public") +ctf_registration_visibility = "" # Registration visibility (e.g., "public") +ctf_verify_emails = true # Whether to verify emails +ctf_team_size = 0 # Team size for the CTF. 0 means no limit +ctf_brackets = [] # List of brackets, optional. +ctf_theme = "" # Theme for CTFd +ctf_admin_name = "" # Name of the admin user +ctf_admin_email = "" # Email of the admin user +ctf_admin_password = "" # Password for the admin user +ctf_registration_code = "" # Registration code for the CTF + +ctf_mail_server = "" # Mail server for CTFd +ctf_mail_port = 465 # Mail server port +ctf_mail_username = "" # Mail server username +ctf_mail_password = "" # Mail server password +ctf_mail_tls = true # Whether to use TLS for the mail server +ctf_mail_from = "" # From address for the mail server + +ctf_logo_path = "data/logo.png" # Path to the CTF logo file (e.g., "ctf-logo.png"). Path from `platform/` directory. 
+ +ctfd_secret_key = "" # Secret key for CTFd + +# CTFd S3 Configuration +ctf_s3_bucket = "" # S3 bucket name for CTFd files +ctf_s3_region = "" # S3 region for CTFd files +ctf_s3_endpoint = "" # S3 endpoint for CTFd files +ctf_s3_access_key = "" # Access key for S3 for CTFd files +ctf_s3_secret_key = "" # Secret key for S3 for CTFd files +ctf_s3_prefix = "ctfd/" # S3 prefix for CTFd files, e.g., 'ctfd/dev/' + +# CTFd Plugin Configuration +ctfd_plugin_first_blood_limit_url = "" # Webhook URL for the First Blood plugin +ctfd_plugin_first_blood_limit = "1" # Limit configuration for the First Blood plugin +ctfd_plugin_first_blood_message = ":drop_of_blood: First blood for **{challenge}** goes to **{user}**! :drop_of_blood:" # Message configuration for the First Blood plugin + +# Pages Configuration +pages = [] # List of pages to deploy to CTFd +pages_repository = "https://github.com/" # Repository URL for pages +pages_branch = "" # Git branch for pages. Leave empty for environment based branch (environment == prod ? main : develop) + +# CTFd Deployment Configuration +ctfd_k8s_deployment_repository = "https://github.com/" # Repository URL for CTFd deployment files +ctfd_k8s_deployment_path = "k8s" # Path for CTFd deployment files within the git repository +ctfd_k8s_deployment_branch = "" # Git branch for CTFd deployment files. Leave empty for environment based branch (environment == prod ? main : develop) + +# ------------------------ +# Challenges configuration +# ------------------------ +chall_whitelist_ips = ["", ""] # List of IPs to whitelist for challenge access + +challenges_static = { + "" = ["", ""], +} # List of static challenges to deploy. Needs to be the slugs of the challenges +challenges_shared = { + "" = ["", ""], +} # List of shared challenges to deploy. Needs to be the slugs of the challenges +challenges_instanced = { + "" = ["", ""], +} # List of instanced challenges to deploy. 
Needs to be the slugs of the challenges + +challenges_repository = "https://github.com/" # URL of the Git repository containing the challenge definitions +challenges_branch = "" # Branch of the Git repository to use for the challenge definitions. Leave empty for environment based branch (environment == prod ? main : develop) + +# ---------------------- +# Docker images +# ---------------------- +# Values are maintained within each component as defaults. +# You can override these values by uncommenting and setting your own images here. + +# image_error_fallback = "ghcr.io/ctfpilot/error-fallback:1.2.1" # The docker image for the error fallback deployment. See https://github.com/ctfpilot/error-fallback +# image_filebeat = "docker.elastic.co/beats/filebeat:8.19.0" # The docker image for Filebeat +# image_ctfd_manager = "ghcr.io/ctfpilot/ctfd-manager:1.0.1" # Docker image for the CTFd Manager deployment +# image_ctfd_exporter = "ghcr.io/the0mikkel/ctfd-exporter:1.1.1" # Docker image for the CTFd Exporter +# image_instancing_fallback = "ghcr.io/ctfpilot/instancing-fallback:1.0.2" # The docker image for the instancing fallback deployment. See https://github.com/ctfpilot/instancing-fallback +# image_kubectf = "ghcr.io/ctfpilot/kube-ctf:1.0.1" # The docker image for the kube-ctf deployment. See https://github.com/ctfpilot/kube-ctf + +# ---------------------- +# Versions +# ---------------------- +# Values are maintained within each component as defaults. +# You can override these values by uncommenting and setting your own versions here. + +# kube_hetzner_version = "2.18.2" # The version of the Kube-Hetzner module to use. More information at https://github.com/mysticaltech/terraform-hcloud-kube-hetzner +# argocd_version = "8.2.5" # The version of the ArgoCD Helm chart to deploy. More information at https://github.com/argoproj/argo-helm +# cert_manager_version = "1.17.1" # The version of the Cert-Manager Helm chart to deploy. 
More information at https://github.com/cert-manager/cert-manager +# descheduler_version = "0.34.0" # The version of the descheduler Helm chart to deploy. More information at https://github.com/kubernetes-sigs/descheduler +# mariadb_operator_version = "25.8.1" # The version of the MariaDB Operator Helm chart to deploy. More information at https://github.com/mariadb-operator/mariadb-operator +# kube_prometheus_stack_version = "62.3.1" # The version of the kube-prometheus-stack Helm chart to deploy. More information at https://github.com/prometheus-community/helm-charts/ +# redis_operator_version = "0.22.2" # The version of the Redis Operator Helm chart to deploy. More information at https://github.com/OT-CONTAINER-KIT/redis-operator +# mariadb_version = "25.8.1" # The version of MariaDB to deploy. More information at https://github.com/mariadb-operator/mariadb-operator diff --git a/terraform/.gitignore b/terraform/.gitignore new file mode 100644 index 0000000..e57f689 --- /dev/null +++ b/terraform/.gitignore @@ -0,0 +1,5 @@ +# This directory contains Terraform plans when deploying CTFp.
+ +* +!.gitignore +!create.sh diff --git a/tf-modules/argocd/application/argocd-application.tf b/tf-modules/argocd/application/argocd-application.tf new file mode 100644 index 0000000..d32f40c --- /dev/null +++ b/tf-modules/argocd/application/argocd-application.tf @@ -0,0 +1,98 @@ +variable "argocd_namespace" { + description = "The namespace where ArgoCD is installed" + default = "argocd" +} + +variable "application_namespace" { + description = "The namespace where the application will be deployed" +} + +variable "application_name" { + description = "The name of the application" +} + +variable "application_repo_url" { + description = "The URL of the repository where the application manifests are stored" +} + +variable "application_repo_path" { + description = "The path within the repository where the application manifests are stored" +} + +variable "application_repo_revision" { + description = "The revision of the repository to use" +} + +variable "application_project" { + description = "The ArgoCD project to use" + default = "default" + + validation { + error_message = "The project name must be lowercase" + condition = can(regex("^[a-z0-9-]*$", var.application_project)) + } +} + +variable "argocd_labels" { + description = "The labels to apply to the ArgoCD Application" + type = map(string) + default = {} + +} + +variable "argocd_finalizers" { + description = "The finalizers to apply to the ArgoCD Application" + type = list(string) + default = ["resources-finalizer.argocd.argoproj.io"] +} + +variable "helm" { + description = "Helm chart configuration" + type = any + default = null +} + +locals { + argocd_source = merge( + { + repoURL = var.application_repo_url + path = var.application_repo_path + targetRevision = var.application_repo_revision + }, + var.helm != null ? 
{ helm = var.helm } : {} + ) +} + +resource "kubernetes_manifest" "application" { + manifest = { + apiVersion = "argoproj.io/v1alpha1" + kind = "Application" + metadata = { + name = var.application_name + namespace = var.argocd_namespace + + labels = merge( + { + "managed-by" = "terraform" + }, + var.argocd_labels + ) + finalizers = length(var.argocd_finalizers) > 0 ? var.argocd_finalizers : null + } + spec = { + project = var.application_project + source = local.argocd_source + destination = { + namespace = var.application_namespace + server = "https://kubernetes.default.svc" + } + + syncPolicy = { + automated = { + prune = true + selfHeal = true + } + } + } + } +} diff --git a/tf-modules/argocd/project/argocd-project.tf b/tf-modules/argocd/project/argocd-project.tf new file mode 100644 index 0000000..c353e1f --- /dev/null +++ b/tf-modules/argocd/project/argocd-project.tf @@ -0,0 +1,46 @@ +variable "argocd_namespace" { + description = "The namespace where ArgoCD is installed" + default = "argocd" +} + +variable "project_name" { + description = "The name of the project" + + validation { + error_message = "The project name must be lowercase" + condition = can(regex("^[a-z0-9-]*$", var.project_name)) + } +} + +variable "project_destinations" { + description = "The destinations for the project" + type = list(object({ + namespace = string + server = string + })) + + default = [{ + namespace = "*" + server = "*" + }] +} + +resource "kubernetes_manifest" "project" { + manifest = { + apiVersion = "argoproj.io/v1alpha1" + kind = "AppProject" + metadata = { + name = var.project_name + namespace = var.argocd_namespace + } + spec = { + destinations = var.project_destinations + clusterResourceWhitelist = [ + { + group = "*" + kind = "*" + } + ] + } + } +} diff --git a/tf-modules/database/database-user/database-user.tf b/tf-modules/database/database-user/database-user.tf new file mode 100644 index 0000000..11594d2 --- /dev/null +++ 
b/tf-modules/database/database-user/database-user.tf @@ -0,0 +1,159 @@ +# ---------- +# Required +# ---------- + +variable "db_name" { + type = string + description = "The name of the database to create" +} + +variable "namespace" { + type = string + description = "The namespace to deploy the database to" +} + +variable "mariadb_cluster" { + type = string + description = "The name of the MariaDB cluster to connect to" +} + +variable "mariadb_cluster_namespace" { + type = string + description = "The namespace of the MariaDB cluster to connect to" +} + +variable "password" { + type = string + description = "The password for the user" + sensitive = true +} + +# ---------- +# Optional +# ---------- + +variable "db_connection_secret_name" { + type = string + description = "The name of the secret to create for the database connection" + default = "db-connection" +} + +variable "db_connection_format" { + type = string + description = "The format of the connection string" + default = "mysql://{{ .Username }}:{{ .Password }}@{{ .Host }}:{{ .Port }}/{{ .Database }}{{ .Params }}" +} + +# ---------- +# Resources +# ---------- + +resource "null_resource" "replace-trigger" { + triggers = { + "db_name" = var.db_name + "namespace" = var.namespace + "maria_db_cluster" = var.mariadb_cluster + "maria_db_cluster_namespace" = var.mariadb_cluster_namespace + } +} + + +module "database" { + source = "../database" + + db_name = var.db_name + mariadb_cluster = var.mariadb_cluster + mariadb_cluster_namespace = var.mariadb_cluster_namespace + namespace = var.namespace +} + +module "user" { + source = "../user" + + user_name = var.db_name + password = var.password + mariadb_cluster = var.mariadb_cluster + mariadb_cluster_namespace = var.mariadb_cluster_namespace + namespace = var.namespace + + depends_on = [ + module.database + ] +} + +resource "kubernetes_manifest" "connection" { + manifest = { + apiVersion = "k8s.mariadb.com/v1alpha1" + kind = "Connection" + metadata = { + name = 
"connection" + namespace = var.namespace + } + + spec = { + mariaDbRef = { + name = var.mariadb_cluster + namespace = var.mariadb_cluster_namespace + } + username = var.db_name + passwordSecretKeyRef = { + name = "user-${var.db_name}" + key = "password" + } + database = var.db_name + secretName = var.db_connection_secret_name + + secretTemplate = { + key = "dsn" + format = var.db_connection_format + usernameKey = "username" + passwordKey = "password" + hostKey = "host" + portKey = "port" + databaseKey = "database" + } + + serviceName = var.mariadb_cluster + } + } + + lifecycle { + replace_triggered_by = [ + null_resource.replace-trigger + ] + } +} + +resource "kubernetes_manifest" "grant" { + manifest = { + apiVersion = "k8s.mariadb.com/v1alpha1" + kind = "Grant" + metadata = { + name = "${var.db_name}" + namespace = var.namespace + } + spec = { + mariaDbRef = { + name = var.mariadb_cluster + namespace = var.mariadb_cluster_namespace + } + privileges = [ + "ALL PRIVILEGES" + ] + database = var.db_name + table = "*" + username = var.db_name + grantOption = true + host = "%" + cleanupPolicy = "Delete" + requeueInterval = "30s" + retryInterval = "5s" + } + } + + lifecycle { + replace_triggered_by = [ + null_resource.replace-trigger + ] + } +} diff --git a/tf-modules/database/database/database.tf b/tf-modules/database/database/database.tf new file mode 100644 index 0000000..8c50282 --- /dev/null +++ b/tf-modules/database/database/database.tf @@ -0,0 +1,97 @@ +# ---------- +# Required +# ---------- + +variable "db_name" { + type = string + description = "The name of the database to create" +} + +variable "namespace" { + type = string + description = "The namespace to deploy the database to" +} + +variable "mariadb_cluster" { + type = string + description = "The name of the MariaDB cluster to connect to" +} + +variable "mariadb_cluster_namespace" { + type = string + description = "The namespace of the MariaDB cluster to connect to" +} + +# ---------- +# Optional +# 
---------- + +variable "db_character_set" { + type = string + description = "The character set to use for the database" + default = "utf8mb4" +} + +variable "db_collate" { + type = string + description = "The collation to use for the database" + default = "utf8mb4_0900_ai_ci" +} + +variable "db_cleanup_policy" { + type = string + description = "The cleanup policy to use for the database - Is it deleted or retained when this resource is deleted" + default = "Delete" +} + +variable "db_requeue_interval" { + type = string + description = "The requeue interval to use for the database" + default = "30s" +} + +variable "db_retry_interval" { + type = string + description = "The retry interval to use for the database" + default = "5s" +} + +# ---------- +# Resources +# ---------- + +resource "null_resource" "replace-trigger" { + triggers = { + "db_name" = var.db_name + "namespace" = var.namespace + "maria_db_cluster" = var.mariadb_cluster + "maria_db_cluster_namespace" = var.mariadb_cluster_namespace + } +} + +resource "kubernetes_manifest" "database" { + manifest = { + apiVersion = "k8s.mariadb.com/v1alpha1" + kind = "Database" + metadata = { + name = "${var.db_name}" + namespace = var.namespace + } + spec = { + name = var.db_name + mariaDbRef = { + name = var.mariadb_cluster + namespace = var.mariadb_cluster_namespace + } + characterSet = var.db_character_set + collate = var.db_collate + cleanupPolicy = var.db_cleanup_policy + requeueInterval = var.db_requeue_interval + retryInterval = var.db_retry_interval + } + } + + depends_on = [ + null_resource.replace-trigger + ] +} diff --git a/tf-modules/database/user/user.tf b/tf-modules/database/user/user.tf new file mode 100644 index 0000000..452f711 --- /dev/null +++ b/tf-modules/database/user/user.tf @@ -0,0 +1,85 @@ +# ---------- +# Required +# ---------- + +variable "user_name" { + type = string + description = "The name of the user to create" +} + +variable "namespace" { + type = string + description = "The namespace to 
deploy the user to" +} + +variable "mariadb_cluster" { + type = string + description = "The name of the MariaDB cluster to connect to" +} + +variable "mariadb_cluster_namespace" { + type = string + description = "The namespace of the MariaDB cluster to connect to" +} + +variable "password" { + type = string + description = "The password for the user" + sensitive = true +} + +# ---------- +# Resources +# ---------- + +resource "kubernetes_secret" "database_user_password" { + metadata { + name = "user-${var.user_name}" + namespace = var.namespace + } + + data = { + password = var.password + } +} + +resource "null_resource" "replace-trigger" { + triggers = { + "user_name" = var.user_name + "namespace" = var.namespace + "maria_db_cluster" = var.mariadb_cluster + "maria_db_cluster_namespace" = var.mariadb_cluster_namespace + } +} + + +resource "kubernetes_manifest" "database_user" { + manifest = { + apiVersion = "k8s.mariadb.com/v1alpha1" + kind = "User" + metadata = { + name = "${var.user_name}" + namespace = var.namespace + } + spec = { + mariaDbRef = { + name = var.mariadb_cluster + namespace = var.mariadb_cluster_namespace + } + passwordSecretKeyRef = { + name = kubernetes_secret.database_user_password.metadata[0].name + key = "password" + } + maxUserConnections = 2000 + host = "%" # Allow connections from any host + } + } + + lifecycle { + # Replace if any variables change + replace_triggered_by = [ + null_resource.replace-trigger, + kubernetes_secret.database_user_password + ] + } +} diff --git a/tf-modules/kubectf/challenge-manager.tf b/tf-modules/kubectf/challenge-manager.tf new file mode 100644 index 0000000..d8e0ca8 --- /dev/null +++ b/tf-modules/kubectf/challenge-manager.tf @@ -0,0 +1,361 @@ +resource "kubernetes_secret_v1" "challenge-manager" { + metadata { + name = "challenge-manager" + namespace = local.management_namespace + } + + data = { + "auth" = var.management_auth_secret + "container" = var.container_secret + } + + depends_on = [ + 
kubernetes_namespace.management + ] +} + +resource "kubernetes_cluster_role_binding_v1" "challenge-management" { + metadata { + name = "kubectf-challenge-manager-read-instanced-challenges" + } + + role_ref { + kind = "ClusterRole" + name = kubernetes_cluster_role_v1.challenge-management.metadata.0.name + api_group = "rbac.authorization.k8s.io" + } + + subject { + kind = "ServiceAccount" + name = kubernetes_service_account_v1.challenge-manager.metadata.0.name + namespace = local.management_namespace + } + + depends_on = [ + kubernetes_cluster_role_v1.challenge-management, + kubernetes_service_account_v1.challenge-manager + ] +} + +resource "kubernetes_role_binding_v1" "challenge-management" { + metadata { + name = "challenge-manager" + namespace = local.instanced_challenge_namespace + } + + role_ref { + kind = "Role" + name = kubernetes_role_v1.challenge-management.metadata.0.name + api_group = "rbac.authorization.k8s.io" + } + + subject { + kind = "ServiceAccount" + name = kubernetes_service_account_v1.challenge-manager.metadata.0.name + namespace = local.management_namespace + } + + depends_on = [ + kubernetes_role_v1.challenge-management + ] +} + +resource "kubernetes_cluster_role_v1" "challenge-management" { + metadata { + name = "kubectf-read-instanced-challenges" + } + + rule { + api_groups = ["kube-ctf.${var.org_name}"] + resources = ["instanced-challenges"] + verbs = ["get", "list"] + } +} + +resource "kubernetes_role_v1" "challenge-management" { + metadata { + name = "challenge-manager" + namespace = local.instanced_challenge_namespace + } + + rule { + api_groups = ["*"] + resources = [ + "ingresses", + "ingressroutes", + "ingressroutetcps", + "pods", + "deployments", + "services", + "namespaces", + "secrets", + "networkpolicies" + ] + verbs = [ + "create", + "delete", + "get", + "list", + "patch", + "update", + "watch" + ] + } + + depends_on = [ + kubernetes_namespace.management + ] +} + +resource "kubernetes_service_account_v1" "challenge-manager" { + 
metadata { + name = "challenge-manager" + namespace = local.management_namespace + + labels = { + system = "kube-ctf" + org = var.org_name + } + } + + depends_on = [ + kubernetes_namespace.management + ] +} + +locals { + management_dns = "manager.${var.management_dns}" +} + +resource "kubernetes_deployment_v1" "challenge-manager" { + metadata { + name = "challenge-manager" + namespace = local.management_namespace + + labels = { + system = "kube-ctf" + org = var.org_name + + "app.kubernetes.io/name" = "kube-ctf-challenge-manager" + "app.kubernetes.io/instance" = "kubectf" + "app.kubernetes.io/component" = "challenge-manager" + + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + } + + spec { + replicas = var.services_replicas + + selector { + match_labels = { + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + } + + template { + metadata { + labels = { + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + } + + spec { + service_account_name = kubernetes_service_account_v1.challenge-manager.metadata.0.name + + image_pull_secrets { + name = var.ghcr_token != "" ? 
module.pull-secret[local.management_namespace].pull-secret : "" + } + + container { + name = "challenge-manager" + image = var.image_challenge_manager + image_pull_policy = "Always" + + + port { + container_port = 3000 + } + + readiness_probe { + http_get { + path = "/healthz" + port = 3000 + } + initial_delay_seconds = 10 + period_seconds = 10 + } + + liveness_probe { + http_get { + path = "/healthz" + port = 3000 + } + initial_delay_seconds = 30 + period_seconds = 10 + } + + env { + name = "KUBECTF_BASE_DOMAIN" + value = local.challenges_host + } + + env { + name = "KUBECTF_API_DOMAIN" + value = local.management_dns + } + + env { + name = "KUBECTF_NAMESPACE" + value = local.instanced_challenge_namespace + } + + env { + name = "KUBECTF_MAX_OWNER_DEPLOYMENTS" + value = var.max_instances + } + + env { + name = "KUBECTF_REGISTRY_PREFIX" + value = var.registry_prefix + } + + env { + name = "KUBECTF_AUTH_SECRET" + value_from { + secret_key_ref { + name = kubernetes_secret_v1.challenge-manager.metadata.0.name + key = "auth" + } + } + } + + env { + name = "KUBECTF_CONTAINER_SECRET" + value_from { + secret_key_ref { + name = kubernetes_secret_v1.challenge-manager.metadata.0.name + key = "container" + } + } + } + + resources { + limits = { + cpu = "250m" + memory = "512Mi" + } + requests = { + cpu = "10m" + memory = "128Mi" + } + } + } + } + } + } + + depends_on = [ + kubernetes_service_account_v1.challenge-manager, + kubernetes_secret_v1.challenge-manager, + kubernetes_cluster_role_binding_v1.challenge-management, + kubernetes_role_binding_v1.challenge-management, + kubernetes_cluster_role_v1.challenge-management, + kubernetes_role_v1.challenge-management, + module.pull-secret, + local.challenges_host + ] +} + +resource "kubernetes_service_v1" "challenge-manager" { + metadata { + name = "challenge-manager" + namespace = local.management_namespace + + labels = { + system = "kube-ctf" + org = var.org_name + + "app.kubernetes.io/name" = "kube-ctf-challenge-manager-service" + 
"app.kubernetes.io/instance" = "kubectf" + "app.kubernetes.io/component" = "challenge-manager" + + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + } + + spec { + selector = { + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + + port { + port = 3000 + } + } + + depends_on = [ + kubernetes_deployment_v1.challenge-manager + ] +} + +resource "kubernetes_ingress_v1" "challenge-manager" { + metadata { + name = "challenge-manager" + namespace = local.management_namespace + + labels = { + system = "kube-ctf" + org = var.org_name + + "app.kubernetes.io/name" = "kube-ctf-challenge-manager-ingress" + "app.kubernetes.io/instance" = "kubectf" + "app.kubernetes.io/component" = "challenge-manager" + + "kube-ctf.${var.org_name}/service" = "challenge-manager" + } + + annotations = { + "cert-manager.io/cluster-issuer" = var.cert_manager + "traefik.ingress.kubernetes.io/router.priority" = "10" + "traefik.ingress.kubernetes.io/router.middlewares" = "errors-errors@kubernetescrd" + } + } + + spec { + tls { + hosts = [ + local.management_dns + ] + + secret_name = "kubectf-cert-challenge-manager" + } + + rule { + host = local.management_dns + + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = kubernetes_service_v1.challenge-manager.metadata.0.name + port { + number = 3000 + } + } + } + } + } + } + } + + depends_on = [ + kubernetes_service_v1.challenge-manager + ] +} + +output "challenge_manager_host" { + value = local.management_dns +} diff --git a/tf-modules/kubectf/crd.tf b/tf-modules/kubectf/crd.tf new file mode 100644 index 0000000..ff0f500 --- /dev/null +++ b/tf-modules/kubectf/crd.tf @@ -0,0 +1,54 @@ +resource "kubernetes_manifest" "crd" { + manifest = { + apiVersion = "apiextensions.k8s.io/v1" + kind = "CustomResourceDefinition" + metadata = { + name = "instanced-challenges.kube-ctf.${var.org_name}" + } + spec = { + group = "kube-ctf.${var.org_name}" + names = { + + plural = "instanced-challenges" + singular = 
"instanced-challenge" + kind = "InstancedChallenge" + shortNames = [ + "instanced-challenge" + ] + } + versions = [ + { + name = "v1" + served = true + storage = true + schema = { + openAPIV3Schema = { + type = "object" + properties = { + spec = { + type = "object" + properties = { + expires = { + type = "integer" + } + available_at = { + type = "integer" + } + type = { + type = "string" + } + template = { + type = "string" + type = "string" + } + } + } + } + } + } + } + ] + scope = "Cluster" + } + } +} diff --git a/tf-modules/kubectf/kube-janitor.tf b/tf-modules/kubectf/kube-janitor.tf new file mode 100644 index 0000000..f9b331a --- /dev/null +++ b/tf-modules/kubectf/kube-janitor.tf @@ -0,0 +1,187 @@ +resource "kubernetes_config_map_v1" "kube-janitor" { + metadata { + name = "kube-janitor" + namespace = local.management_namespace + + labels = { + system = "kube-ctf" + org = var.org_name + + "app.kubernetes.io/name" = "kube-ctf-kube-janitor-config" + "app.kubernetes.io/instance" = "kubectf" + "app.kubernetes.io/component" = "kube-janitor" + } + } + + data = { + "rules.yaml" = <