diff --git a/.codesandbox/tasks.json b/.codesandbox/tasks.json new file mode 100644 index 0000000..e8190fd --- /dev/null +++ b/.codesandbox/tasks.json @@ -0,0 +1,16 @@ +{ + // These tasks will run in order when initializing your CodeSandbox project. + "setupTasks": [ + { + "name": "Install httpie", + "command": "sudo apt update -y && sudo apt install -y httpie" + }, + { + "name": "Install latest compose version", + "command": "./scripts/download-compose.sh" + } + ], + + // These tasks can be run from CodeSandbox. Running one will open a log in the app. + "tasks": {} +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2d27207..a2bb04f 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -39,8 +39,6 @@ ] } } - - // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" } diff --git a/.github/workflows/claat-prod.yml b/.github/workflows/claat-prod.yml deleted file mode 100644 index 428a6d1..0000000 --- a/.github/workflows/claat-prod.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Export Codelab (PROD) - -on: - push: - branches: ["main"] - workflow_dispatch: -jobs: - publish-codelab: - runs-on: ubuntu-latest - permissions: - contents: write - pages: write - id-token: write - concurrency: - group: "pages" - cancel-in-progress: false - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-go@v5 - with: - go-version: '1.22.2' # The Go version to download (if necessary) and use. - - run: go install github.com/googlecodelabs/tools/claat@latest - - name: Export with CLAAT - run: mkdir -p ./public/ && claat export -f html -o public docs/workshop.md - - name: Setup Pages - uses: actions/configure-pages@v5 - - name: Upload artifact - if: github.ref == 'refs/heads/main' # Authorize the deployment for the main branch only - uses: actions/upload-pages-artifact@v3 - with: - # Upload entire repository - path: './public/observability-workshop' - - name: Deploy to GitHub Pages - if: github.ref == 'refs/heads/main' # Authorize the deployment for the main branch only - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.github/workflows/deploy-pages.yml b/.github/workflows/deploy-pages.yml new file mode 100644 index 0000000..d93a0b1 --- /dev/null +++ b/.github/workflows/deploy-pages.yml @@ -0,0 +1,124 @@ +name: Export and Deploy Sites + +on: + push: + branches: ["main"] + pull_request: + workflow_dispatch: + +defaults: + run: + shell: bash + +permissions: + contents: read + pages: write + id-token: write + +jobs: + build-codelab: + runs-on: ubuntu-latest + permissions: + contents: write + pages: write + id-token: write + concurrency: + group: "pages" + cancel-in-progress: false + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.22.2' # The Go version to download (if necessary) and use. + - run: go install github.com/googlecodelabs/tools/claat@latest + - name: Export with CLAAT + run: mkdir -p ./public/ && claat export -f html -o public docs/workshop.md + - name: Move all files from observability-workshop to public + run: | + shopt -s dotglob nullglob + mv ./public/observability-workshop/* ./public/ + rmdir ./public/observability-workshop + shopt -u dotglob nullglob + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + # Upload entire repository + name: claat-site + path: ./public + + build-hugo-devoxxpl: + runs-on: ubuntu-latest + env: + HUGO_VERSION: '0.147.2' + HUGO_ENVIRONMENT: 'production' + TZ: 'Europe/Paris' + steps: + - name: Install Hugo CLI + run: | + wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ + && sudo dpkg -i ${{ runner.temp }}/hugo.deb + - name: Install Dart Sass + run: sudo snap install dart-sass + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + - name: Setup Pages + id: pages + uses: actions/configure-pages@v5 + - name: Install Node.js dependencies + run: "[[ -f package-lock.json || -f npm-shrinkwrap.json ]] && npm ci || true" + - name: Configure Git + run: git config core.quotepath false + - name: Build with Hugo + run: | + hugo \ + --gc \ + --minify \ + --baseURL "${{ steps.pages.outputs.base_url }}/devoxxpl/" \ + -s ./docs \ + -d ../public/devoxxpl + - name: Upload Hugo artifact + uses: actions/upload-artifact@v4 + with: + name: hugo-site + path: public/devoxxpl + + deploy: + runs-on: ubuntu-latest + needs: [build-codelab, build-hugo-devoxxpl] + if: github.ref == 'refs/heads/main' # Ensure deployment only occurs on the main branch + steps: + - uses: actions/checkout@v4 + + # Download CLAAT artifact + - name: Download CLAAT artifact + uses: actions/download-artifact@v4 + with: + name: claat-site + path: ./public + + # Download Hugo artifact + - name: Download Hugo artifact + uses: actions/download-artifact@v4 + with: + name: hugo-site + path: ./public/devoxxpl + + # Deploy to GitHub Pages + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Upload artifact + if: github.ref == 'refs/heads/main' # Authorize the deployment for the main branch only + uses: actions/upload-pages-artifact@v3 + with: + path: ./public + - name: Deploy to GitHub Pages + if: github.ref == 'refs/heads/main' # Authorize the deployment for the main branch only + id: deployment + uses: actions/deploy-pages@v4 \ No newline at end of file diff --git a/.github/workflows/hugo-noprod.yml b/.github/workflows/hugo-noprod.yml new file mode 100644 index 0000000..cd0381c --- /dev/null +++ b/.github/workflows/hugo-noprod.yml @@ -0,0 +1,48 @@ +name: Export Hugo (NO PROD) + +on: + push: + branches: ["!main"] + pull_request: + workflow_dispatch: + +defaults: + run: + shell: bash + +permissions: + contents: read + pages: write + id-token: write + +jobs: + test-hugo-devoxxpl: + runs-on: ubuntu-latest + env: + HUGO_VERSION: '0.147.2' + HUGO_ENVIRONMENT: 'production' + TZ: 'Europe/Paris' + steps: + - name: Install Hugo CLI + run: | + wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ + && sudo dpkg -i ${{ runner.temp }}/hugo.deb + - name: Install Dart Sass + run: sudo snap install dart-sass + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + - name: Install Node.js dependencies + run: "[[ -f package-lock.json || -f npm-shrinkwrap.json ]] && npm ci || true" + - name: Configure Git + run: git config core.quotepath false + - name: Build with Hugo + run: | + hugo \ + --gc \ + --minify \ + --baseURL "${{ steps.pages.outputs.base_url }}/devoxxpl/" \ + -s ./docs \ + -d ../public/devoxxpl \ No newline at end of file diff --git a/.gitignore b/.gitignore index bac3848..009ce5c 100644 --- a/.gitignore +++ b/.gitignore @@ -63,9 +63,6 @@ out/ /nbdist/ /.nb-gradle/ -### VS Code ### -.vscode/ - ### Project ### logs/*.log logs/*.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c263dbe --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/themes/hugo-theme-relearn"] + path = docs/themes/hugo-theme-relearn + url = https://github.com/McShelby/hugo-theme-relearn.git diff --git a/.vscode/recommendations.json b/.vscode/recommendations.json new file mode 100644 index 0000000..3b6a03e --- /dev/null +++ b/.vscode/recommendations.json @@ -0,0 +1,3 @@ +{ + "recommendations": ["vscjava.vscode-java-pack"] +} \ No newline at end of file diff --git a/docs/.hugo_build.lock b/docs/.hugo_build.lock new file mode 100644 index 0000000..e69de29 diff --git a/docs/archetypes/default.md b/docs/archetypes/default.md new file mode 100644 index 0000000..25b6752 --- /dev/null +++ b/docs/archetypes/default.md @@ -0,0 +1,5 @@ ++++ +date = '{{ .Date }}' +draft = true +title = '{{ replace .File.ContentBaseName "-" " " | title }}' ++++ diff --git a/docs/assets/css/theme-worldline-dark.css b/docs/assets/css/theme-worldline-dark.css new file mode 100644 index 0000000..c09960b --- /dev/null +++ b/docs/assets/css/theme-worldline-dark.css @@ -0,0 +1,5 @@ +@import "theme-relearn-dark.css"; + +:root { + --PRIMARY-color: rgb(90, 190, 170); /* brand primary color */ +} \ No newline at end of file diff --git a/docs/assets/css/theme-worldline-light.css b/docs/assets/css/theme-worldline-light.css new file mode 100644 index 0000000..c33f94c --- /dev/null +++ b/docs/assets/css/theme-worldline-light.css @@ -0,0 +1,5 @@ +@import "theme-relearn-light.css"; + +:root { + --PRIMARY-color: rgb(90, 190, 170); /* brand primary color */ +} \ No newline at end of file diff --git a/docs/content/_index.md b/docs/content/_index.md new file mode 100644 index 0000000..00e59a6 --- /dev/null +++ b/docs/content/_index.md @@ -0,0 +1,28 @@ ++++ +title = "Observability Worshop" +type = "home" ++++ + +> Observability from Development: Master Your Java Applications in Production with Open Telemetry + +This workshop aims to introduce how to make a Java application fully observable with: + +* Logs with insightful information +* Metrics with [Prometheus](https://prometheus.io/) +* [Distributed Tracing](https://blog.touret.info/2023/09/05/distributed-tracing-opentelemetry-camel-artemis/) + +During this workshop we will use OpenTelemetry and a Grafana stack: + +* [Grafana](https://grafana.com/): for dashboards +* [Loki](https://grafana.com/oss/loki/): for storing our logs +* [Tempo](https://grafana.com/oss/tempo/): for storing traces +* [Prometheus](https://prometheus.io/): for gathering and storing metrics. + +We will also cover the: + +* OpenTelemetry Protocol (OTLP) to send our telemetry data over the network, +* [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) which gathers & broadcasts + then the data coming from our microservices, +* And obviously the OpenTelemetry Java instrumentation to collect all the telemetry data of our application. + +## Start with [Workshop Overview](./workshop-overview/_index.md) \ No newline at end of file diff --git a/docs/content/correlation/_index.md b/docs/content/correlation/_index.md new file mode 100644 index 0000000..eba9f7f --- /dev/null +++ b/docs/content/correlation/_index.md @@ -0,0 +1,18 @@ ++++ +title = "Correlation" +type = "chapter" +weight = 6 ++++ + +Grafana allows correlation between all our telemetry data: + +* Logs with Traces, +* Metrics with Traces, +* Traces with Logs, +* Traces with Metrics, +* โ€ฆ + +When discussing observability, correlation is essential. It enables you to diagnose the root cause of an issue quickly +by using all the telemetry data involved. + +{{% children containerstyle="div" style="h3" description=false %}} \ No newline at end of file diff --git a/docs/content/correlation/logs-and-traces.md b/docs/content/correlation/logs-and-traces.md new file mode 100644 index 0000000..186f045 --- /dev/null +++ b/docs/content/correlation/logs-and-traces.md @@ -0,0 +1,60 @@ ++++ +date = '2025-06-07T16:16:10+02:00' +title = 'Logs and Traces' +weight = 2 ++++ + +๐Ÿ› ๏ธ Let's go back to the Grafana `Explore` dashboard: + +* Select the ``Loki`` data source, +* Add a label filter to select logs coming from ``easypay-service``, +* Run a query and select a log entry corresponding to a payment query. + +๐Ÿ‘€ Now check to see if there are +``trace_id``](https://www.w3.org/TR/trace-context/#trace-id) and [ +``span_id``](https://www.w3.org/TR/trace-context/#parent-id) attributes. +These will help us correlate our different request logs and traces. + +> [!INFO] +> +> These concepts are part of the [W3C Trace Context Specification](https://www.w3.org/TR/trace-context/). + +#### Enable correlation + +๐Ÿ› ๏ธ In Grafana, go to `Connections` > `Data sources`: + +* Select the `Loki` data source, +* Create a `Derived fields` configuration: + * `Name`: `TraceID` + * `Type`: `Label` + * `Label`: `trace_id` + * `Query`: `${__value.raw}` + * `URL Label`: `View Trace` + * Enable `Internal Link` and select `Tempo`. + + +๐Ÿ› ๏ธ Go back to the Grafana `Explore` dashboard and try to find the same kind of log message: + +* Expand the log, +* At the bottom of the log entry, you should find the `Fields` and `Links` sections, +* If the log contains a trace ID, you should see a button labeled `View Trace`, +* Click on this button! + +๐Ÿ‘€ Grafana should open a pane with the corresponding trace from Tempo! + +Now you can correlate logs and traces! +If you encounter any exceptions in your error logs, you can now see where it happens and get the bigger picture of the +transaction from the customer's point of view. + +#### How was it done? + +First of all, logs should contain the `trace_id` information. +Most frameworks handle this for you. Whenever a request generates a trace or span, the value is placed in the MDC ( +Mapped Diagnostic Context) and can be printed in the logs. + +On the other hand, Grafana has the ability to parse logs to extract certain values for the Loki data source. This is the +purpose of `Derived fields`. + +When configuring the Loki data source, we provided Grafana with the trace ID label to use from logs and linked +it to the Tempo data source. Behind the scenes, Grafana creates the bridge between the two telemetry data sources. And +thatโ€™s all ๐Ÿ˜Ž \ No newline at end of file diff --git a/docs/content/correlation/metrics-and-traces.md b/docs/content/correlation/metrics-and-traces.md new file mode 100644 index 0000000..d138eb5 --- /dev/null +++ b/docs/content/correlation/metrics-and-traces.md @@ -0,0 +1,87 @@ ++++ +date = '2025-06-07T16:17:35+02:00' +title = 'Metrics and Traces' +weight = 3 ++++ + +Exemplars are annotations used in metrics that link specific occurrences, like logs or traces, to data points within a metric time series. +They provide direct insights into system behaviors at moments captured by the metric, aiding quick diagnostics by showing related trace data where anomalies occur. +This feature is valuable for debugging, offering a clearer understanding of metrics in relation to system events. + +๐Ÿ› ๏ธ Generate some load towards the `easypay-service`: + +```bash +$ k6 run -u 1 -d 2m k6/01-payment-only.js +``` + +โ„น๏ธ Exemplars are tied to a metric and contain a trace ID and a span ID. They are especially available in the OpenMetrics +format, but also OpenTelemetry support them. In OpenMetrics format, they appear as follows: + +``` +http_server_requests_seconds_bucket{application="easypay-service",error="none",exception="none",instance="easypay-service:39a9ae31-f73a-4a63-abe5-33049b8272ca",method="GET",namespace="local",outcome="SUCCESS",status="200",uri="/actuator/prometheus",le="0.027962026"} 1121 # {span_id="d0cf53bcde7b60be",trace_id="969873d828346bb616dca9547f0d9fc9"} 0.023276118 1719913187.631 +``` + +The interesting part starts after the `#` character, this is the so-called exemplar: + +``` + SPAN ID TRACE ID VALUE TIMESTAMP +# {span_id="d0cf53bcde7b60be",trace_id="969873d828346bb616dca9547f0d9fc9"} 0.023276118 1719913187.631 +``` + +That could be translated by: + +* `easypay-service` handled an HTTP request, +* Which generated trace ID id `969873d828346bb616dca9547f0d9fc9`, +* Request duration was `0.023276118` second, +* At timestamp `1719913187.631` + +๐Ÿ‘€ Exemplars can be analyzed in Grafana: + +* Go to the Grafana `Explore` view, +* Select the `Prometheus` data source, +* Switch to the `Code` mode (button on the right), +* Paste the following PromQL query: + +``` +http_server_request_duration_seconds_bucket{http_route="/payments",service_name="easypay-service"} +``` + +* Unfold the `Options` section and enable `Exemplars`, +* Click on `Run query`. + +๐Ÿ‘€ In addition to the line graph, you should see square dots at the bottom of the graph: + +* Hover over a dot, +* It should display useful information for correlation, particularly a `trace_id`. + +#### Enable correlation + +๐Ÿ› ๏ธ In Grafana, go to the `Connections` > `Data sources` section: + +* Select the `Prometheus` data source, +* Click on `Exemplars`: + * Enable `Internal link` and select the Tempo data source, + * `URL Label`: `Go to Trace`, + * `Label name:`: `trace_id` (as displayed in the exemplar values), +* Click on `Save & test`. + +๐Ÿ› ๏ธ Go back to the Grafana `Explore` dashboard and try to find the same exemplar as before: + +* Hover over it, +* You should see a new button `Go to Trace` next to the `trace_id` label, +* Click on the button. + +๐Ÿ‘€ Grafana should open a new pane with the corresponding trace from Tempo! + +We have added a new correlation dimension to our system between metrics and traces! + +#### How was it done? + +Meters such as histograms are updated in the context of a request, and thus of a recorded trace. + +When histogram is updated, the OpenTelemetry instrumentation library gets the current trace and span ids in the context, +and adds them to the corresponding bucket _as an example_. + +Exemplars are then propagated with the metric to the time series database, which should support them. +Hopefully Prometheus can support them: we just had to start the server with the `--enable-feature=exemplar-storage` +flag (see `compose.infrastructure.yml`) to enable their storage. \ No newline at end of file diff --git a/docs/content/correlation/traces-to-logs.md b/docs/content/correlation/traces-to-logs.md new file mode 100644 index 0000000..5ac363b --- /dev/null +++ b/docs/content/correlation/traces-to-logs.md @@ -0,0 +1,51 @@ ++++ +date = '2025-06-07T16:19:18+02:00' +title = 'Traces to Logs' +weight = 4 ++++ + +We are able to link logs to traces thanks to Lokiโ€™s data source `Derived fields`, but what about traces to logs? + +Fortunately, the Tempo data source can be configured the same way! + +#### Enable correlation + +๐Ÿ› ๏ธ In Grafana, go to `Connections` > `Data sources`: + +* Select the `Tempo` data source, +* Configure `Trace to logs` as follows: + * Data source: `Loki`, + * Span start time shift: `-5m`, + * Span end time shift: `5m`, + * Tags: `service.name` as `service_name`, + * Enable `Filter by trace ID` if and only if you want to show logs that match the trace ID. +* Click on `Save & test`. + +โ„น๏ธ Just to give you some context about this configuration: + +* *Span start time shift* and *Span end time shift* allow retrieving logs within the specified interval, as log + timestamps and trace timestamps may not match exactly. +* *Tags* is required: we should have a common tag between Tempo and Loki to bridge the gap between traces and logs. + Here, the application name (defined as `service.name` in Tempo and `service_name` in Loki) serves this purpose. +* Using filters will remove logs that do not match trace or span identifiers. + +#### Test correlation + +๐Ÿ› ๏ธ Hit the easypay payment endpoint with curl or k6 to generate some traces (whichever you prefer): + +* `http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000` +* `k6 run -u 1 -d 2m k6/01-payment-only.js` + +๐Ÿ› ๏ธ Open the Grafana `Explore` dashboard to find a trace: + +* You can use the following TraceQL directly: `{name="POST easypay-service"}` + * It is equivalent to filtering on Span Name: `POST easypay-service` + * More information about TraceQL: [Grafana TraceQL documentation](https://grafana.com/docs/tempo/latest/traceql/) +* Drill down a trace (you can widen the new pane). + +๐Ÿ‘€ A new **LOG** icon should appear for each line of the trace (middle of the screen): + +* Click on the icon for several spans (for the various services involved in the transaction), +* You can try toggling the `Filter by trace ID` option in the Tempo data source configuration to see the difference. + +Yeah, we have added a new dimension to the correlation of our telemetry data, further improving our observability. \ No newline at end of file diff --git a/docs/content/correlation/traces-to-metrics.md b/docs/content/correlation/traces-to-metrics.md new file mode 100644 index 0000000..4f017c4 --- /dev/null +++ b/docs/content/correlation/traces-to-metrics.md @@ -0,0 +1,60 @@ ++++ +date = '2025-06-07T16:20:08+02:00' +title = 'Traces to Metrics' +weight = 5 ++++ + +The last correlation we will explore in todayโ€™s workshop is between traces and metrics. + +Sometimes, we are interested in knowing the state of our application when inspecting traces, such as JVM heap or system +CPU usage. + +In this section, we will configure the Tempo data source to link our traces to these metrics. + +#### Enable correlation + +๐Ÿ› ๏ธ In Grafana, go to `Connections` > `Data sources`: + +* Select the `Tempo` data source, +* Configure `Trace to metrics` as follows: + * Data source: `Prometheus`, + * Span start time shift: `-2m`, + * Span end time shift: `2m`, + * Tags: `service.name` as `service_name` + +๐Ÿ› ๏ธ Now, we will add some metric queries (click on `+ Add query` for each query): + +* Heap usage as ratio: + * Link Label: `Heap Usage (ratio)`, + * Query: `sum(jvm_memory_used_bytes{$__tags})/sum(jvm_memory_limit_bytes{$__tags})` +* System CPU usage: + * Link Label: `System CPU Usage`, + * Query: `jvm_cpu_recent_utilization_ratio{$__tags}` + +> [!INFO] +> `$__tags` will be expanded by the tags defined in the `Tags` section. +> For `easypay-service`, the query becomes `jvm_cpu_recent_utilization_ratio{application=easypay-service}` + +๐Ÿ› ๏ธ Finally click on `Save & test` and go back to Grafana `Explore` dashboard to test our new setup. + +#### Test correlation + +๐Ÿ› ๏ธ Hit the easypay payment endpoint with curl or k6 to generate some traces (whichever you prefer): + +* `http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000` +* `k6 run -u 1 -d 2m k6/01-payment-only.js` + +๐Ÿ› ๏ธ Open the Grafana `Explore` dashboard to find a trace: + +* You can use the following TraceQL directly: `{name="POST easypay-service"}` + * It is equivalent to filtering on Span Name: `POST easypay-service` + * More information about TraceQL: [Grafana TraceQL documentation](https://grafana.com/docs/tempo/latest/traceql/) +* Drill down a trace (you can widen the new pane). + +๐Ÿ‘€ The previous **LOG** icon has been replaced by a link icon (middle of the screen): + +* Click on the icon for several spans (for the various services involved in the transaction), + * You have now three choices: `Heap Usage (ratio)`, `System CPU Usage` and `Related logs`, + * `Related logs` behaves the same way as the previous **LOG** button. + +This was the last correlation dimension we wanted to show you! \ No newline at end of file diff --git a/docs/content/logs/_index.md b/docs/content/logs/_index.md new file mode 100644 index 0000000..398f6da --- /dev/null +++ b/docs/content/logs/_index.md @@ -0,0 +1,7 @@ ++++ +title = "Logs" +type = "chapter" +weight = 3 ++++ + +{{% children containerstyle="div" style="h3" description=false %}} \ No newline at end of file diff --git a/docs/content/logs/introduction/_index.md b/docs/content/logs/introduction/_index.md new file mode 100644 index 0000000..860a8f7 --- /dev/null +++ b/docs/content/logs/introduction/_index.md @@ -0,0 +1,238 @@ ++++ +date = '2025-06-06T23:21:18+02:00' +title = 'Introduction' +weight = 1 ++++ + +## Some functional issues + +๐Ÿ› ๏ธ Itโ€™s time to have a look to our `easypay-service` logs! +Service is started in a Docker container. To get its output you can use the following command: + +```bash +$ docker compose logs -f easypay-service +``` + +One of our customers raised an issue: + +> ยซ When I reach your API, I usually either an ``AMOUNT_EXCEEDED`` or ``INVALID_CARD_NUMBER`` error. ยป + +Normally the first thing to do is checking the logs. +Before that, we will reproduce this behavior. + +๐Ÿ› ๏ธ You can check the API as following: + +* For the ``AMOUNT_EXCEEDED`` error (any ``amount`` above 50000 would give the same result): + +```bash +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=51000 + +HTTP/1.1 201 Created +Content-Type: application/json +Date: Wed, 05 Jun 2024 13:45:40 GMT +Location: http://172.19.25.95:44523/payments/5459b20a-ac91-458f-9578-019c05483bb3 +transfer-encoding: chunked + +{ + "amount": 51000, + "authorId": "6ace318f-b669-4e4a-b366-3f09048becb7", + "authorized": false, + "bankCalled": true, + "cardNumber": "5555567898780008", + "cardType": "MASTERCARD", + "expiryDate": "789456123", + "paymentId": "5459b20a-ac91-458f-9578-019c05483bb3", + "posId": "POS-01", + "processingMode": "STANDARD", + "responseCode": "AUTHORIZATION_DENIED", + "responseTime": 25 +} +``` + +* And for the ``INVALID_CARD_NUMBER`` error: + +```bash +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780007 expiryDate=789456123 amount:=25000 + +HTTP/1.1 201 Created +Content-Type: application/json +Date: Wed, 05 Jun 2024 13:46:09 GMT +Location: http://172.19.25.95:44523/payments/2dbf3823-fb11-4c63-a540-ab43ac663e68 +transfer-encoding: chunked + +{ + "amount": 51000, + "authorId": null, + "authorized": false, + "bankCalled": false, + "cardNumber": "5555567898780007", + "cardType": null, + "expiryDate": "789456123", + "paymentId": "2dbf3823-fb11-4c63-a540-ab43ac663e68", + "posId": "POS-01", + "processingMode": "STANDARD", + "responseCode": "INVALID_CARD_NUMBER", + "responseTime": 5 +} + +``` + +๐Ÿ‘€ Look into the console `logs` to pinpoint these issues. + +> [!INFO] +> As you can see, logs can give you quickly some information. +> +> They are easy to implement as most of our frameworks provides a logging mechanism (here logback, but could be log4j, +> log4j2, etc.). Unfortunately, they are also dependent of the work of the developer to be insightful, who can also miss +> some important information. +> +> If you want to dig into this particular topic, you can check out this +> article: [Back to basics - Logging](https://blog.worldline.tech/2020/01/22/back-to-basics-logging.html). + +### Additional logs + +It's time to add more contextual information into our code! + +We will use in this workshop SLF4J. It is a logging facade that provides a simple API to log messages, and which can be +bind to different logging frameworks (Logback, Log4j, etc.). + +The logger can be created by adding a static class variable such as: + +```java + private static final Logger log = LoggerFactory.getLogger(PaymentResource.class); +``` + +Think to use the corresponding class to instantiate it! + +#### *What about log levels?* + +Use the most appropriate log level + +The log level is a fundamental concept in logging. Whether the logging framework you use, it allows you to tag log +records according to their severity or importance. +For instance, [SLF4J](https://www.slf4j.org/) offers +the [following log levels by default](https://www.slf4j.org/apidocs/org/slf4j/event/Level.html): + +* ``TRACE`` : typically used to provide detailed diagnostic information that can be used for troubleshooting and + debugging. Compare to DEBUG messages, TRACE messages are more fine-grained and verbose. +* ``DEBUG``: used to provide information that can be used to diagnose issues especially those related to program state. +* ``INFO``: used to record events that indicate that program is functioning normally. +* ``WARN``: used to record potential issues in your application. They may not be critical but should be investigated. +* ``ERROR``: records unexpected errors that occur during the operation of your application. In most cases, the error + should be addressed as soon as possible to prevent further problems or outages. + +We can also log payment requests and responses to provide even more context, which could be helpful for following +requests in this workshop. + +#### *Add logs* + +๐Ÿ“ Modify the `easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java` class by +uncommenting all `// LOG.โ€ฆ` lines (keep MDC lines for later ๐Ÿ˜‰). + +## The technical issue + +Another issue was raised for the POS (Point of Sell) ``POS-02`` (but we didnโ€™t know yet!). + +๐Ÿ› ๏ธ When you reach the API using this command: + +```bash +http POST :8080/api/easypay/payments posId=POS-02 cardNumber=5555567898780008 expiryDate=789456123 amount:=25000 +``` + +๐Ÿ‘€ You should get the following output: + +````bash + +HTTP/1.1 500 +[...] + +{ + "error":"Internal Server Error", + "path": "/payments", + "status": 500, + [...] +} +```` + +๐Ÿ‘€ You then get the following log message: + +```bash +2024-06-05T15:45:35.215+02:00 ERROR 135386 --- [easypay-service] [o-auto-1-exec-7] o.a.c.c.C.[.[.[/].[dispatcherServlet] : Servlet.service() for servlet [dispatcherServlet] in context with path [] threw exception [Request processing failed: java.lang.NullPointerException: Cannot invoke "java.lang.Boolean.booleanValue()" because "java.util.List.get(int).active" is null] with root cause + +java.lang.NullPointerException: Cannot invoke "java.lang.Boolean.booleanValue()" because "java.util.List.get(int).active" is null + at com.worldline.easypay.payment.control.PosValidator.isActive(PosValidator.java:34) ~[main/:na] + at com.worldline.easypay.payment.control.PaymentService.process(PaymentService.java:46) ~[main/:na] + at com.worldline.easypay.payment.control.PaymentService.accept(PaymentService.java:108) ~[main/:na] + at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) ~[na:na] + at java.base/java.lang.reflect.Method.invoke(Method.java:580) ~[na:na] + at org.springframework.aop.support.AopUtils.invokeJoinpointUsingReflection(AopUtils.java:354) ~[spring-aop-6.1.6.jar:6.1.6] + at org.springframework.aop.framework.ReflectiveMethodInvocation.invokeJoinpoint(ReflectiveMethodInvocation.java:196) ~[spring-aop-6.1.6.jar:6.1.6] + at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:163) ~[spring-aop-6.1.6.jar:6.1.6] + [...] +``` + +### Let's fix it! + +๐Ÿ“ To find the root cause, add first a _smart_ log entry in the +``easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java`` class. In the ``isActive()`` +method, catch the exception and trace the error: + +```java +public boolean isActive(String posId) { + PosRef probe = new PosRef(); + probe.posId = posId; + try { + List posList = posRefRepository.findAll(Example.of(probe)); + + if (posList.isEmpty()) { + LOG.warn("checkPosStatus NOK, unknown posId {}", posId); + return false; + } + + boolean result = posList.get(0).active; + + if (!result) { + LOG.warn("checkPosStatus NOK, inactive posId {}", posId); + } + return result; + } catch (NullPointerException e) { + LOG.warn("Invalid value for this POS: {}", posId); + throw e; + } +} +``` + +๐Ÿ› ๏ธ You can redeploy the `easypay-service` with the following commands: + +```bash +$ docker compose up -d --build easypay-service +``` + +๐Ÿ› ๏ธ Now you can run the same commands ran earlier and check again the logs (`http POSTโ€ฆ`). + +This is most likely an issue with some data in the databaseโ€ฆ Thanks to logs, we may quickly get the idea that the +problem comes from the point of sale with ID `POS-02`. + +It is easy to identify the issue if we donโ€™t have traffic against our application, but what if we have more realistic +traffic? + +๐Ÿ› ๏ธ Generate some load with `k6` (a Grafana tool for load testing): + +```bash +$ k6 run -u 5 -d 5s k6/01-payment-only.js +``` + +๐Ÿ‘€ Check again logs: + +```bash +$ docker compose logs -f easypay-service +``` + +Logs are now interleaved due to several requests being executed concurrently. It becomes harder to identify which point +of sale is causing this error... + +๐Ÿ‘€ If you look into the SQL import file (`easypay-service/src/main/resources/db/postgresql/data.sql`), you'll notice a +`NULL` value instead of a boolean for the `active` column. + +> [!CAUTION] +> Letโ€™s keep the issue as it is for now. \ No newline at end of file diff --git a/docs/content/logs/logs-management/_index.md b/docs/content/logs/logs-management/_index.md new file mode 100644 index 0000000..e1afcef --- /dev/null +++ b/docs/content/logs/logs-management/_index.md @@ -0,0 +1,387 @@ ++++ +date = '2025-06-07T12:06:34+02:00' +title = 'Logs Management' +weight = 3 ++++ + +> Let's dive into our logs with Grafana! + +There are several ways to export logs to a log collector such as Loki or an ElasticSearch instance: + +* Writing logs into a file and exporting its content with an external process (such as FileBeat or Promtail), +* Sending logs directly to the collector. + +โ„น๏ธ We target Loki as our log storage backend. + +## Exporting content of a log file & Structured Logging + +This approach consists in letting your logging framework write your logs into a file, which is read by another process +to send them to a log collector. + +It is a common way to export logs. You just have to configure a log file processor such as FileBeat or Promtail to +read the file and send its content in the corresponding log backend. + +It also requires to format your logs in a structured way, such as JSON, to ease its parsing and ingestion in the +backend. +This is known as Structured Logging. + +With Spring Boot, you can write your logs in Elastic Common Schema (ECS), Graylog Extended Log Format (GELF) or Logstash +JSON formats: +[Structured Logging with Spring Boot Logging](https://docs.spring.io/spring-boot/reference/features/logging.html#features.logging.structured). + +๐Ÿ“ **OPTIONAL:** You can try to change the console log format to one of ``ecs``, ``gelf`` or ``logstash`` in +``easypay-service``, by adding to the application.yaml file: + +```yaml +# Structured Logging: could be either ecs, gelf or logstash +logging: + # ... + structured: + format: + console: ecs +`````` + +๐Ÿ› ๏ธ Rebuild and redeploy the `easypay-service`: + +```bash +$ docker compose up -d --build easypay-service +``` + +๐Ÿ‘€ Check logs in the console to see the new format. + +``` +(...) +easypay-service | {"@timestamp":"2025-04-11T21:02:40.171457832Z","log.level":"INFO","process.pid":1,"process.thread.name":"http-nio-8080-exec-1","service.name":"easypay-service","log.logger":"org.apache.catalina.core.ContainerBase.[Tomcat].[localhost].[\/]","message":"Initializing Spring DispatcherServlet 'dispatcherServlet'","ecs.version":"8.11"} +easypay-service | {"@timestamp":"2025-04-11T21:02:40.171819865Z","log.level":"INFO","process.pid":1,"process.thread.name":"http-nio-8080-exec-1","service.name":"easypay-service","log.logger":"org.springframework.web.servlet.DispatcherServlet","message":"Initializing Servlet 'dispatcherServlet'","ecs.version":"8.11"} +easypay-service | {"@timestamp":"2025-04-11T21:02:40.175088520Z","log.level":"INFO","process.pid":1,"process.thread.name":"http-nio-8080-exec-1","service.name":"easypay-service","log.logger":"org.springframework.web.servlet.DispatcherServlet","message":"Completed initialization in 3 ms","ecs.version":"8.11"} +(...) +``` + +> [!IMPORTANT] +> Structured logging may be less readable for humans, but it is perfect for log concentrators as they are easy to parse! + +## Our choice: sending logs directly to the log collector + +It is also possible to send logs directly to a log collector by configuring an appender in your logging framework. + +This approach offers a more real-time experience compared to the previous method. + +Loki can ingest logs using its own API or using the OpenTelemetry protocol. So we have several options: + +* Using the [Loki Logback appender](https://loki4j.github.io/loki-logback-appender/), +* Using + the [OpenTelemetry Logback appender](https://github.com/open-telemetry/opentelemetry-java-instrumentation/tree/main/instrumentation/logback/logback-appender-1.0/library) + provided by the OpenTelemetry project, +* Or even better, a zero code instrumentation approach with + the [OpenTelemetry Java Agent](https://opentelemetry.io/docs/zero-code/java/agent/). + +We will use the latter as we focus on what OpenTelemetry can bring to us for the observability of our Java applications +๐Ÿ˜‰, and not only Spring Boot ones! + +### 2 ways of instrumentation: Pros & Cons + +There is two ways to instrument the byte code and broadcast telemetry : Using a library or through a Java Agent +Here is a short summary of the pros & cons + +**Java Agent** + +* It is the default choice for instrumenting a Java program +* Enable loose coupling between the the artifact & the agent + +**Library / Starter** + +* Faster than using an agent +* Mandatory with native mode + +If you want to know more about this topic, you can [check out this documentation](https://opentelemetry.io/docs/zero-code/java/spring-boot-starter/). + +### Target Architecture + +![Logs Architecture](./archi-logs.png) + +The OpenTelemetry Collector is an important component in our architecture: it acts as an ETL (Extract, Transform, Load) +process for +telemetry data (logs, metrics and traces). +It will receive logs from the application, transform them into a format that can be ingested by the log storage backend, +and send them to the backend. + +A practice is to install a collector on each host where your application is running, or kube node. +It will then collect logs from all applications running on the host. + +In the next steps, we will attach the OpenTelemetry Agent to the `easypay-service`, and configure it to send logs +to the OpenTelemetry Collector. + +### OpenTelemetry Java Agent + +The [OpenTelemetry Java Agent](https://opentelemetry.io/docs/zero-code/java/agent/) is a Java agent that can be attached +to a JVM to automatically instrument your application. + +It is able to collect and send all your application telemetry: logs, metrics and traces, for most of the frameworks and +libraries you may use in your application. + +๐Ÿ‘€ You can have a look to +[all the supported libraries, frameworks, applications servers and JVMs](https://github.com/open-telemetry/opentelemetry-java-instrumentation/blob/main/docs/supported-libraries.md) +supported by the Agent. + +โ„น๏ธ To attach an agent to a JVM, you just have to add the `-javaagent` option to the JVM command line. + +โ„น๏ธ `opentelemetry-javaagent.jar` is already available as `/opentelemetry-javaagent.jar` in the container (`easypay-service/src/main/docker/Dockerfile`): + +```Dockerfile +ENV OTEL_AGENT_VERSION "v2.14.0" +ENV OTEL_AGENT_URL "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/${OTEL_AGENT_VERSION}/opentelemetry-javaagent.jar" + +ADD --chown=$UID:$GID ${OTEL_AGENT_URL} /opentelemetry-javaagent.jar +``` + +๐Ÿ“ Modify the `entrypoint` definition in the `compose.yml` file to attach the OpenTelemetry Java Agent to the +`easypay-service`: + +```yaml +services: + easypay-service: + # ... + entrypoint: + - java + - -javaagent:/opentelemetry-javaagent.jar # < Add this line + - -cp + - app:app/lib/* + - com.worldline.easypay.EasypayServiceApplication +``` + +By default, the OpenTelemetry Agent target endpoint is configured to `localhost:4317`. +It is overridable by setting the system property `otel.exporter.otlp.endpoint` or by using +the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. + +โ„น๏ธ Our collector is listening on `http://opentelemetry-collector:4317`. + +๐Ÿ“ Add the following environment variables to the `easypay-service` service in the `compose.yml` file such as: + +```yaml +services: + easypay-service: + # ... + environment: + # ... + OTEL_RESOURCE_ATTRIBUTES: "service.name=easypay-service,deployment.environment=dev,service.namespace=service,service.version=1.0.0,service.instance.id=easypay-service:8080" # (1) + OTEL_EXPORTER_OTLP_PROTOCOL: grpc # (2) + OTEL_EXPORTER_OTLP_ENDPOINT: http://opentelemetry-collector:4317 # (3) + # ... +``` + +1. The `OTEL_RESOURCE_ATTRIBUTES` environment variable is used to define the service name, the deployment environment, + the + service namespace, the service version and the service instance id, + * OpenTelemetryโ€™s [Open Agent Management Protocol specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) + defines some [expected attributes](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#agentdescriptionidentifying_attributes) + for telemetry data. +2. The `OTEL_EXPORTER_OTLP_PROTOCOL` environment variable is used to define the protocol used to send telemetry data to + the collector. +3. The `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable is used to define the endpoint of the collector. + +### MDC support + +By default, exporting MDC values with the OpenTelemetry Java Agent is experimental and requires an opt-in configuration. + +But that does not prevent us from using it in our workshop! We should set the following properties: + +* `otel.instrumentation.logback-appender.experimental-log-attributes=true` +* `otel.instrumentation.logback-appender.experimental.capture-mdc-attributes=*` + * Wildcard means that we want all the MDC attributes. + +Agent can be configured using either: + +* System properties, +* Environment variables, +* Or configuration file. + +๐Ÿ“ Modify the `entrypoint` definition in the `compose.yml` file to add the following system properties: + +```yaml +services: + easypay-service: + # ... + entrypoint: + - java + - -javaagent:/opentelemetry-javaagent.jar + - -Dotel.instrumentation.logback-appender.experimental-log-attributes=true # < Add this line + - -Dotel.instrumentation.logback-appender.experimental.capture-mdc-attributes=* # < Add this line + - -cp + - app:app/lib/* + - com.worldline.easypay.EasypayServiceApplication +``` + +### OpenTelemetry Collector + +> [!TIP] +> Utilizing collectors offers several advantages for managing telemetry data: +> - Reduces the need for complicated application configurations: just send data to `localhost`, +> - Centralizes configuration to a single point: the collector, +> - Acts as a buffer to prevent resource overuse, +> - Can transform data before ingestion, +> - Supports data intake from various protocols and can relay them to any backend, +> - ... + +โ„น๏ธ The OpenTelemetry collector is already configured to receive logs and forward metrics to the Loki backend. + +๐Ÿ‘€ You can check the collector configuration located in the `docker/otelcol/otelcol.yaml` file. + +```yaml +receivers: + # Listen for telemetry data via OpenTelemetry protocol + otlp: # (1) + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: # (2) + +exporters: + # Configure an exporter using the OpenTelemetry protocol over HTTP to send logs to Loki + otlphttp/loki: # (3) + endpoint: http://loki:3100/otlp + +service: + pipelines: + # Declare the pipeline for logs processing: + # 1. Receive logs via the OpenTelemetry protocol + # 2. Optimize data by batching them (optional but recommended) + # 3. Export logs to Loki + logs: # (4) + receivers: [ otlp ] + processors: [ batch ] + exporters: [ otlphttp/loki ] +``` + +1. Declare an input receiver to listen for telemetry data via the OpenTelemetry protocol on ports 4317 (GRPC) and 4318 (HTTP/PROTOBUF), +2. Declare a processor of type batch to optimize data by batching them, +3. Declare an exporter to send logs to Loki using the OpenTelemetry protocol over HTTP, + * Exporter name here contains its type (otlphttp) and a key (loki), + * The key is optional, and is used to differentiate exporters of the same type, + * You should use the full exporter name in the pipeline configuration. +4. Configure the logs pipeline by defining the receivers, processors, and exporters to use. + +๐Ÿ› ๏ธ Redeploy the `easypay-service`: + +```bash +$ docker compose up -d easypay-service +``` + +โœ… To ensure `easypay-service` has started up correctly, check its logs with: + +```bash +$ docker compose logs -f easypay-service +``` + +โœ… If Java agent was correctly taken into account, logs should start with: + +``` +easypay-service | OpenJDK 64-Bit Server VM warning: Sharing is only supported for boot loader classes because bootstrap classpath has been appended +easypay-service | [otel.javaagent 2025-01-16 15:41:46:550 +0000] [main] INFO io.opentelemetry.javaagent.tooling.VersionLogger - opentelemetry-javaagent - version: 2.11.0 +``` + +## Explore logs with Grafana + +> [!INFO] +> For this workshop, we have already configured the Loki datasource in Grafana. +> You can take a look at its configuration in Grafana (port `3000`) by navigating to the `Connections` > `Data sources` +> section. +> We only set up the Loki server url. + +๐Ÿ› ๏ธ Go to Grafana (port `3000`): + +* Open an `Explore` dashboard, +* Select the Loki datasource. + +Grafana offers a form to help you build your queries: + +* Filtering labels, +* Finding text in logs, +* Parsing logs to extract and filter values, +* ... + +You can also use a dedicated query language to make your queries directly: this is +named [LogQL](https://grafana.com/docs/loki/latest/query/). + +๐Ÿ› ๏ธ Letโ€™s get logs from the `easypay-service`: + +* In the `Label filter`, select the application with ``service_name`` equal to ``easypay-service``, +* Click on ``Run Query``, +* Check out logs on the bottom of the view and unfold some of them. + +๐Ÿ› ๏ธ Do not hesitate to hit the easypay payment endpoint with curl/httpie or k6 to generate some logs (whichever you +prefer): + +```bash +http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +# OR +k6 run -u 1 -d 1m k6/01-payment-only.js +``` + +๐Ÿ‘€ You can also view logs for the other services (e.g., ``api-gateway``). + +Maybe another issue? Do you see the card numbers? ๐Ÿ˜จ + +## Personal Identifiable Information (PII) obfuscation + +For compliance and to prevent personal data loss, we will obfuscate the card number in the logs. + +The OpenTelemetry collector in its contrib flavor provides +a [redaction processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/redactionprocessor) +we can use to obfuscate sensitive data. +The processor can be declared and attached to the log pipeline in order to mask all attributes containing a sensitive +value. + +๐Ÿ“ Letโ€™s add the redaction processor to the OpenTelemetry collector configuration: + +``` +(...) + +processors: + batch: + + redaction/card-numbers: # (1) + allow_all_keys: true + blocked_values: + - "4[0-9]{12}(?:[0-9]{3})?" ## VISA + - "(5[1-5][0-9]{14}|2(22[1-9][0-9]{12}|2[3-9][0-9]{13}|[3-6][0-9]{14}|7[0-1][0-9]{13}|720[0-9]{12}))" ## MasterCard + - "3(?:0[0-5]|[68][0-9])[0-9]{11}" ## Diners Club + - "3[47][0-9]{13}" ## American Express + - "65[4-9][0-9]{13}|64[4-9][0-9]{13}|6011[0-9]{12}|(622(?:12[6-9]|1[3-9][0-9]|[2-8][0-9][0-9]|9[01][0-9]|92[0-5])[0-9]{10})" ## Discover + - "(?:2131|1800|35[0-9]{3})[0-9]{11}" ## JCB + - "62[0-9]{14,17}" ## UnionPay + summary: debug + +(...) + +service: + pipelines: + logs: + receivers: [otlp] + processors: [batch,redaction/card-numbers] # (2) + exporters: [otlphttp/loki] + +(...) +``` + +1. We declare a new processor named `redaction/card-numbers` that will obfuscate all attributes containing a card + number, +2. We attach the processor to the logs pipeline. + +๐Ÿ› ๏ธ Restart the collector to take into account the new configuration: + +```bash +docker compose up -d --build opentelemetry-collector +``` + +๐Ÿ› ๏ธ Generate some logs with curl/httpie or k6. + +โœ… Check the card numbers are now obfuscated with the ``****`` content. + +> [!TIP] +> Having a collector located near to your application provides several benefits: +> * It reduces latency between the application and the collector, +> * You can have a collector configuration tailored to your application needs (here by redacting sensitive data). \ No newline at end of file diff --git a/docs/content/logs/logs-management/archi-logs.png b/docs/content/logs/logs-management/archi-logs.png new file mode 100755 index 0000000..b0d96c4 Binary files /dev/null and b/docs/content/logs/logs-management/archi-logs.png differ diff --git a/docs/content/logs/mdc/_index.md b/docs/content/logs/mdc/_index.md new file mode 100644 index 0000000..1c5b0d5 --- /dev/null +++ b/docs/content/logs/mdc/_index.md @@ -0,0 +1,103 @@ ++++ +date = '2025-06-06T23:24:44+02:00' +title = 'Mapped Diagnostic Context (MDC)' +weight = 2 ++++ + +Mapped Diagnostic Context (MDC) will help us add more context on every log output. For more information, refer to this +web page: [https://logback.qos.ch/manual/mdc.html](https://logback.qos.ch/manual/mdc.html). +It is a kind of Map attached to the Thread context and maintained by the logging framework where you can put values and +use them in your log layout. + +As it is attached to a Thread, we can put values at the beginning of a request and print them in all the logs related to +this request. +That can give you a lot of information about the context of the request, without having to add them to all logs +manually. + +## Implement MDC + +A good sketch is better than a long speech. In the next section, we will use MDC to add the card number and the POS ID +to all the logs related to a request. + +๐Ÿ“ Go to the ``easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource`` class and modify +the method ``processPayment()`` to instantiate the [MDC](https://logback.qos.ch/manual/mdc.html): + +```java +public ResponseEntity processPayment(PaymentRequest paymentRequest) { + // Add cardNumber to SLF4J MDC + MDC.put("cardNumber",paymentRequest.cardNumber()); + // Add Point Of Sale identifier to SLF4J MDC + MDC.put("pos",paymentRequest.posId()); + + try { // Add a try-finally construct and wrap the initial code here + //... + return httpResponse; + catch (Exception e) { + // Catch any exception to log it with MDC value + LOG.error(e.getMessage()); + throw e; + } finally { + // Clear MDC at the end + MDC.clear(); + } +} +``` + +> [!CAUTION] +> Donโ€™t forget to clear the MDC at the end of the method to avoid any data leak. + +Now, we want to print these values when a log line is printed in the console. + +๐Ÿ“ Modify to the spring configuration file (``easypay-service/src/main/resources/application.yaml``) and add the +`logging.level.pattern` property to add both the `cardNumber` & `pos` fields to all logs: + +```yaml +logging: + pattern: + level: "%5p [%mdc]" +``` + +> [!TIP] +> `%mdc` prints the full content of the MDC Map attached to the current thread. +> +> If you want to print a single value, you can use `%X{key}` where `key` is the key of the value you want to print. + +```yaml +# Alternative to print specific fields instead +logging: + pattern: + level: "%5p [%X{cardNumber} - %X{pos}]" +``` + +> [!INFO] +> Using the Spring Boot ``logging.pattern.level`` property is just a way to configure the logback pattern. You can also +> use a logback configuration file to do the same thing. +> You can also use the ``logging.pattern.correlation`` property (used by tracing) or a logback configuration file to do +> the same thing. + +๐Ÿ› ๏ธ Rebuild and redeploy the `easypay-service`: + +```bash +$ docker compose up -d --build easypay-service +``` + +## Adding more content in our logs + +๐Ÿ› ๏ธ To have more logs, we will run several HTTP requests using [K6](https://k6.io/). Run the following command: + +```bash +$ k6 run -u 5 -d 5s k6/01-payment-only.js +``` + +๐Ÿ‘€ Check then the logs to pinpoint some exceptions. + +### Logs Correlation + +> [!TIP] +> You are probably wondering how to smartly debug in production when you have plenty of logs for several users and by +> the way different transactions? +> +> One approach would be to correlate all of your logs using a correlation Id. +> If an incoming request has no correlation id header, the API creates it. If there is one, it uses it instead. +> +> โ„น๏ธ This is a topic we will address in the tracing section. \ No newline at end of file diff --git a/docs/content/metrics/_index.md b/docs/content/metrics/_index.md new file mode 100644 index 0000000..1898409 --- /dev/null +++ b/docs/content/metrics/_index.md @@ -0,0 +1,14 @@ ++++ +title = "Metrics" +type = "chapter" +weight = 4 ++++ + +Letโ€™s take control of our applicationโ€™s metrics! + +We target to collect metrics from our services and forward them to the Prometheus time-series database. +As with logs, we will use the OpenTelemetry collector as a gateway to collect and forward these metrics. + +![Metrics architecture](./archi-metrics.png) + +{{% children containerstyle="div" style="h3" description=false %}} \ No newline at end of file diff --git a/docs/content/metrics/archi-metrics.png b/docs/content/metrics/archi-metrics.png new file mode 100755 index 0000000..e540f68 Binary files /dev/null and b/docs/content/metrics/archi-metrics.png differ diff --git a/docs/content/metrics/business-metrics.md b/docs/content/metrics/business-metrics.md new file mode 100644 index 0000000..d2c41be --- /dev/null +++ b/docs/content/metrics/business-metrics.md @@ -0,0 +1,272 @@ ++++ +date = '2025-06-07T15:59:02+02:00' +title = 'Business Metrics' +weight = 4 ++++ + +Observability is not only about incidents. You can also define your own metrics. + +[OpenTelemetry API](https://opentelemetry.io/docs/languages/java/) provides an API to create your + +* [Counters](https://opentelemetry.io/docs/languages/java/api/#counter): value which can only increment ( + such as the number of processed requests), +* [Gauges](https://opentelemetry.io/docs/languages/java/api/#gauge): represents the current value (such as + the speed gauge of a car), +* [Histograms](https://opentelemetry.io/docs/languages/java/api/#histogram): to record values with large distribution + such as latencies. + +Letโ€™s go back to our code! + +## Objectives + +We want to add new metrics to the easypay service to measure they payment processing and store time. +So we target a metric of **Histogram** type. + +In order to achieve this goal, we will measure the time spent in the two methods `process` and `store` of the +`com.worldline.easypay.payment.control.PaymentService` class of the `easypay-service` module. +This class is the central component responsible for processing payments: it provides the ``accept`` public method, which +delegates its responsibility to two private ones: + +* ``process``: which does all the processing of the payment: validation, calling third partiesโ€ฆ +* ``store``: to save the processing result in database. + +We also want to count the number of payment requests processed by our system. We will use a metric of **Counter** type. + +## 1. Add the opentelemetry-api dependency + +We need to add the `opentelemetry-api` dependency to `easypay-service` in order to use the OpenTelemetry API to create +custom metrics. + +๐Ÿ“ Add the following dependency to the `easypay-service` `build.gradle.kts` file: + +```kotlin +dependencies { + // ... + implementation("io.opentelemetry:opentelemetry-api") +} +``` + +## 2. Declare the histogram + +We need to declare two timers in our code: + +* ``processTimer`` to record the ``easypay.payment.process`` metric: it represents the payment processing time and + record the time spent in the `process` method, +* ``storeTimer`` to record the ``easypay.payment.store`` metric: it represents the time required to store a payment + in database by recording the time spent in the `store` method. + +๐Ÿ“ Letโ€™s modify the ``com.worldline.easypay.payment.control.PaymentService`` class to declare them: + +```java +// ... + +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.GlobalOpenTelemetry; + + +@Service +public class PaymentService { + // ... + private LongHistogram processHistogram; // (1) + private LongHistogram storeHistogram; + + public PaymentService(/* ... */) { + // ... + + OpenTelemetry openTelemetry = GlobalOpenTelemetry.get(); // (2) + + processHistogram = openTelemetry.getMeter(EasypayServiceApplication.class.getName()) //(3) + .histogramBuilder("easypay.payment.process") // (4) + .setDescription("Payment processing time") // (5) + .setUnit("ms") // (6) + .ofLongs() // (7) + .build(); + storeHistogram = openTelemetry.getMeter(EasypayServiceApplication.class.getName()) + .histogramBuilder("easypay.payment.store") + .setDescription("Payment storing time") + .setUnit("ms") + .ofLongs() + .build(); + } +} +``` + +1. Declare the two timers, +2. Inject the OpenTelemetry instance to get the Meter object to create the histograms, +3. Initialize the two histograms by giving them a name (4), a description (5), a unit (6) and setting the type of the + values (7). + +## 3. Record time spent in the methods + +๐Ÿ“ Letโ€™s modify our `process` and `store` methods to record our latency with the new metrics. +We can simply wrap our original code in a `try-finally` construct such as: + +```java + // ... +private void process(PaymentProcessingContext context) { + long startTime = System.currentTimeMillis(); // (1) + try { // (2) + if (!posValidator.isActive(context.posId)) { + context.responseCode = PaymentResponseCode.INACTIVE_POS; + return; + } + // ... + } finally { + long duration = System.currentTimeMillis() - startTime; // (3) + processHistogram.record(duration); // (4) + } +} + +private void store(PaymentProcessingContext context) { + long startTime = System.currentTimeMillis(); // (5) + try { + Payment payment = new Payment(); + // ... + } finally { + long duration = System.currentTimeMillis() - startTime; + storeHistogram.record(duration); + } +} +``` + +1. Get the start time, +2. Wrap the original code in a `try-finally` construct, +3. Compute the duration, +4. Record the duration in the `processHistogram` histogram, +5. Do the same for the `store` method. + +#### 4. Add counter + +๐Ÿ“ Letโ€™s do the same for the counter: + +```java +// ... + +import io.micrometer.core.instrument.Counter; +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.api.metrics.LongHistogram; + + +@Service +public class PaymentService { + //... + private LongCounter requestCounter; // (1) + + public PaymentService(/* ... */) { + // ... + requestCounter = openTelemetry.getMeter(EasypayServiceApplication.class.getName()) // (2) + .counterBuilder("easypay.payment.requests") + .setDescription("Payment requests counter") + .build(); + } +} +``` + +1. Declares the counter, +2. Initializes the counter. + +๐Ÿ“ The method ``accept`` of the ``PaymentService`` class is invoked for each payment request, it is a good candidate to +increment our counter: + +```java + +@Transactional(Transactional.TxType.REQUIRED) +public void accept(PaymentProcessingContext paymentContext) { + requestCounter.add(1); // < Add this (1) + process(paymentContext); + store(paymentContext); + paymentTracker.track(paymentContext); +} +``` + +1. Increment the counter each time the method is invoked. + +## 5. Redeploy easypay + +๐Ÿ› ๏ธ Rebuild and redeploy `easypay-service`: + +```bash +$ docker compose up -d --build easypay-service +``` + +๐Ÿ› ๏ธ Once easypay is started (you can check logs with the ``docker compose logs -f easypay-service`` command and wait for +an output like ``Started EasypayServiceApplication in 32.271 seconds``): + +* Execute some queries: + +```bash +$ k6 run -u 1 -d 1m k6/01-payment-only.js +``` + +๐Ÿ› ๏ธ Then go to Grafana and explore Metrics to find your newly created metrics: + +* Search for metric with base name `easypay_payment_process`, +* ๐Ÿ‘€ You should get 3 new metrics: + * `easypay_payment_process_milliseconds_bucket`, + * `easypay_payment_process_milliseconds_count`, + * `easypay_payment_process_milliseconds_sum`. + +๐Ÿ‘€ Explore them, especially the `_bucket` one. + +When using a `Histogram` you get several metrics by default, suffixed with: + +* `_bucket`: contains the number of event which lasts less than the value defined in the `le` tag, +* `_count`: the number of hits, +* `_sum`: the sum of time spent in the method. + +Especially: + +* We can get the average time spent in the method by dividing the `sum` by the `count`, +* We can calculate the latency percentile thanks to the buckets. + +Finally, our ``Counter`` becomes a metric suffixed with ``_total``: `easypay_payment_requests_total`. + +## 6. Compute percentiles + +Letโ€™s compute percentiles for the `process` and `store` methods. + +As we have seen, the `Histogram` metric provides the necessary data to compute percentiles, we can +query Prometheus to display the percentiles of our application: + +๐Ÿ› ๏ธ Go to Grafana, to explore Metrics again. + +๐Ÿ› ๏ธ To compute the percentiles for the `easypay_payment_process` histogram we have created: + +* Select the `easypay_payment_process_milliseconds_bucket` metric, +* Click on `Operations` and select `Aggregations` > `Histogram quantile`, +* Select a Quantile value, +* Click on `Run query`. + +## 7. Visualization + +๐Ÿ› ๏ธ Go back to Grafana (`port 3000`), and go into the ``Dashboards`` section. + +๐Ÿ› ๏ธ We will import the dashboard defined in the ``docker/grafana/dashboards/easypay-monitoring.json`` file: + +* Click on ``New`` (top right), and select ``Import``, +* In the ``Import via dashboard JSON model`` field, paste the content of the ``easypay-monitoring.json`` file and click + on ``Load``, +* Select Prometheus as a data source. + +You should be redirected to the ``Easypay Monitoring`` dashboard. + +It provides some dashboards we have created from the new metrics you exposed in your application: + +* `Payment request count total (rated)`: represents the number of hit per second in our application computed from our + counter, +* ``Payment Duration distribution``: represents the various percentiles of our application computed from the + ``easypay_payment_process`` histogram, +* ``Requests process performance`` and ``Requests store performance``: are a visualization of the buckets of the two + histograms we created previously. + +๐Ÿ› ๏ธ You can generate some load to view your dashboards evolving live: + +```bash +$ k6 run -u 2 -d 2m k6/01-payment-only.js +``` + +> [!NOTE] +> Do not hesitate to explore the way the panels are created, and the queries we used! +> Just hover the panel you are interested in, click on the three dots and select Edit. \ No newline at end of file diff --git a/docs/content/metrics/explore.md b/docs/content/metrics/explore.md new file mode 100644 index 0000000..8673ff9 --- /dev/null +++ b/docs/content/metrics/explore.md @@ -0,0 +1,110 @@ ++++ +date = '2025-06-07T15:56:26+02:00' +title = 'Explore' +weight = 3 ++++ + +## Let's explore the metrics + +> [!INFO] +> For this workshop, we have already configured in Grafana the Prometheus datasource. +> You can have a look at its configuration in Grafana (``port 3000``) in the ``Connections`` > ``Data sources`` +> section. +> It is pretty straightforward as we have only set the Prometheus server URL. + +๐Ÿ› ๏ธ Go to Grafana and start again an ``Explore`` dashboard. + +๐Ÿ› ๏ธ Select the ``Prometheus`` datasource instead of the ``Loki`` one. + +In this section you will hand on the metrics query builder of Grafana. + +The ``Metric`` field lists all the metrics available in the Prometheus server: take time to explore them. + +๐Ÿ› ๏ธ For example, you can select the metric named ``jvm_memory_used_bytes``, and click on the ``Run query`` button to plot +the memory usage of all your services by memory area, + +๐Ÿ› ๏ธ If you want to plot the total memory usage of your services: + +* Click on ``Operations`` and select ``Aggregations`` > ``Sum``, and ``Run query``: you obtain the whole memory + consumption of all your JVMs, +* To split the memory usage per service, you can click on the ``By label`` button and select the label named + ``service-name`` (do not forget to click on ``Run query`` afterthat). + +๐Ÿ› ๏ธ You can also filter metrics to be displayed using ``Label filters``: try to create a filter to display only the +metric related to the service named easypay-service. + +> [!TIP] +> At the bottom of the query builder, you should see something like: +> `sum by(service_name) (jvm_memory_used_bytes{service_name="easypay-service"})`. +> This is the effective query raised by Grafana to Prometheus in order to get its metrics. +> This query language is named [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/). + +### Dashboards + +With Grafana, you can either create your own dashboards or import some provided by the community from +[Grafanaโ€™s dashboards collection](https://grafana.com/grafana/dashboards/). + +We will choose the second solution right now and import the following dashboards: + +* [JMX Overview (Opentelemetry)](https://grafana.com/grafana/dashboards/12271-jvm-micrometer/), which ID is `17582`, +* [OpenTelemetry JDBC Dashboard](https://grafana.com/grafana/dashboards/20729-spring-boot-jdbc-hikaricp/), which ID is + `19732`. + +๐Ÿ› ๏ธ To import these dashboards: + +* Go to Grafana (``port 3000``), and select the ``Dashboards`` section on the left, +* Then click on ``New`` (top right), and click on ``Import``, +* In the ``Find and importโ€ฆ`` field, just paste the ID of the dashboard and click on ``Load``, +* In the ``Select a Prometheus data source``, select ``Prometheus`` and click on ``Import``, +* You should be redirected to the newly imported dashboard. + +> [!TIP] +> Imported dashboards are available directly from the ``Dashboards`` section of Grafana. + +๐Ÿ‘€ Explore the ``JMX Overview`` dashboard: it works almost out of box. +It contains a lot of useful information about JVMs running our services. + +The ``job`` filter (top of the dashboard) let you select the service you want to explore metrics. + +### Incident! + +๐Ÿ› ๏ธ Now let's simulate some traffic using [Grafana K6](https://k6.io/). Run the following command: + +```bash +$ k6 run -u 5 -d 2m k6/02-payment-smartbank.js +``` + +๐Ÿ‘€ Go back to the Grafana dashboard, click on ``Dashboards`` and select ``JVM Micrometer``: + +* Explore the dashboard for the ``easypay-service``, especially the Garbage collector and CPU statistics. + +* Look around the other ``OpenTelemetry JDBC`` dashboard then and see what happens on the database connection + pool for ``easypay-service``. + +We were talking about an incident, isnโ€™t it? + +๐Ÿ‘€ Let's go back to the Explore view of Grafana, select Loki as a data source and see what happens! + +๐Ÿ› ๏ธ Create a query with the following parameters to get error logs of the ``smartbank-gateway`` service: + +* Label filters: ``service_name`` = ``smartbank-gateway`` +* label filter expression: ``label`` = ``detected_level ; ``operator`` = ``=~`` ; ``value`` = ``warn|error`` + +๐Ÿ› ๏ธ Click on ``Run query`` and check out the logs. + +Normally you would get a ``java.lang.OutOfMemoryError`` due to a saturated Java heap space. + +๐Ÿ‘€ To get additional insights, you can go back to the `JMX Overview` dashboard and select the ``smartbank-gateway`` application. + +Normally you will see the used JVM Heap reaching the maximum allowed. + +> [!INFO] +> Grafana and Prometheus allows you to generate alerts based on metrics, +> using [Grafana Alertmanager](https://grafana.com/docs/grafana/latest/alerting/set-up/configure-alertmanager/). +> For instance, if CPU usage is greater than 80%, free memory is less than 1GB, used heap is greater than 80%, etc. + +๐Ÿ› ๏ธ You may have to restart smartbank: + +```bash +$ docker compose restart smartbank-gateway +``` \ No newline at end of file diff --git a/docs/content/metrics/export-metrics.md b/docs/content/metrics/export-metrics.md new file mode 100644 index 0000000..e85782e --- /dev/null +++ b/docs/content/metrics/export-metrics.md @@ -0,0 +1,91 @@ ++++ +date = '2025-06-07T15:53:56+02:00' +title = 'Export Metrics' +weight = 2 ++++ + +## Expose metrics of our Java application + +With Spring Boot, there are several ways to expose application metrics: + +* The Spring Boot-way using the Actuator module relying on Micrometer, +* Using the Spring Boot OpenTelemetry Starter, but metric support is not as advanced as with Micrometer, +* Using OpenTelemetry library directly, but it needs more configuration, +* Using the OpenTelemetry Agent. + +We will continue to use the OpenTelemetry Agent, as it is a straightforward way to collect metrics, and we already +configured it! + +### Configure the OpenTelemetry Agent + +The Agent should already collect metrics of the application, sending data every 60s. +Thatโ€™s a bit long for our workshop, so we will reduce this frequency to 5s. + +๐Ÿ“ Modify the `entrypoint` definition in the `compose.yml` file to add the following system properties: + +```yaml +services: + easypay-service: + # ... + entrypoint: + - java + - -javaagent:/opentelemetry-javaagent.jar + - -Dotel.instrumentation.logback-appender.experimental-log-attributes=true + - -Dotel.instrumentation.logback-appender.experimental.capture-mdc-attributes=* + - -Dotel.metric.export.interval=5000 # <-- Add this line + - -cp + - app:app/lib/* + - com.worldline.easypay.EasypayServiceApplication +``` + +โ„น๏ธ The `otel.metric.export.interval` system property is used to define the frequency at which metrics are sent to the +target endpoint in milliseconds. As a reminder, you can also use environment variables to configure the Agent +(`otel.metric.export.interval` becomes `OTEL_METRIC_EXPORT_INTERVAL` environment variable name). + +๐Ÿ› ๏ธ Restart the easypay-service to take into account the new configuration: + +```bash +docker compose up -d --build easypay-service +``` + + +## Export metrics to Prometheus + +Prometheus is a well-known time-series database and monitoring system that scrapes metrics from instrumented +applications. It even supports the OTLP protocol to ingest metrics. + +Instead of sending them directly, we will keep to use the OpenTelemetry collector to collect metrics +and forward them to the target database. + +๐Ÿ‘€ For this workshop, the OpenTelemetry collector is already configured to receive metrics and forward them to +Prometheus: + +```yaml +# Input +receivers: + # OpenTelemetry Protocol: logs, metrics and traces + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +#... +# Output +exporters: + #... + # Export to Prometheus via HTTP using the OpenTelemetry Protocol + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp + +# Telemetry processing pipelines +service: + pipelines: + # ... + # Receive metrics using the OpenTelemetry Protocol and export to Prometheus + metrics: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ otlphttp/prometheus ] +``` \ No newline at end of file diff --git a/docs/content/prerequisites/_index.md b/docs/content/prerequisites/_index.md new file mode 100644 index 0000000..4fc3a3f --- /dev/null +++ b/docs/content/prerequisites/_index.md @@ -0,0 +1,9 @@ ++++ +title = "Prerequisites" +type = "chapter" +weight = 2 ++++ + +Defines prerequisites to follow this handsโ€™on and instruction to setup a local or CDE environment. + +{{% children containerstyle="div" style="h3" description=false %}} \ No newline at end of file diff --git a/docs/content/prerequisites/environment/_index.md b/docs/content/prerequisites/environment/_index.md new file mode 100644 index 0000000..0151674 --- /dev/null +++ b/docs/content/prerequisites/environment/_index.md @@ -0,0 +1,33 @@ ++++ +date = '2025-06-05T23:18:03+02:00' +title = 'Environment Setup' +weight = '2' ++++ + +You can start the workshop using a Cloud Development Environment, a DevContainer or a local setup. + +## Cloud Development Environment (CDE)? + +A **Cloud Development Environment** is an online platform that provides all the tools and resources needed for software development directly in the cloud. Instead of installing and configuring software on your local machine, you can access a ready-to-use development workspace through your web browser. This allows you to code, build, and test applications from anywhere, collaborate easily with others, and ensures a consistent setup for all team members. + +{{% notice warning %}} +GitHub Codespaces does not work well on a shared public Wifi (you can use your mobile phone data plan to overcome this issue) +{{% /notice %}} + +{{% notice tip %}} +Most of the CDEโ€ฏprovides monthly free credits which are largely sufficient for this workshop. +{{% /notice %}} + +## DevContainer + +A **DevContainer** is a development environment defined by configuration files that specify the tools, extensions, and settings needed for a project. It uses containers (such as Docker) to create a consistent and isolated workspace, ensuring that everyone on the team works with the same setup regardless of their local machine. DevContainers are especially useful for onboarding, collaboration, and avoiding "it works on my machine" issues. + +## Which environment? + +* **CDE**: You donโ€™t want to bother to setup your environment, +* **DevContainer**: You canโ€™t use a CDE or prefer to have code available locally, +* **Local**: You prefer to master everything, we provide instructions to setup a local installation. + +Follow the instructions which suits you the most: + +{{% children containerstyle="div" description=false %}} \ No newline at end of file diff --git a/docs/content/prerequisites/environment/build_codespace.png b/docs/content/prerequisites/environment/build_codespace.png new file mode 100644 index 0000000..2671ccc Binary files /dev/null and b/docs/content/prerequisites/environment/build_codespace.png differ diff --git a/docs/content/prerequisites/environment/codesandbox.md b/docs/content/prerequisites/environment/codesandbox.md new file mode 100644 index 0000000..8a4cfd8 --- /dev/null +++ b/docs/content/prerequisites/environment/codesandbox.md @@ -0,0 +1,68 @@ ++++ +date = '2025-06-05T23:19:40+02:00' +title = 'CodeSandbox CDE' +weight = 1 ++++ + +> You want to use CodeSandbox Cloud Development Environment to follow this workshop. + +> [!IMPORTANT] +> A GitHub, Google or Apple account is required to register to CodeSandbox. + +## Registration + +* Go to [CodeSandbox.io](https://codesandbox.io) and click on the [Sign In](https://codesandbox.io/signin) link, +* Use your GitHub, Google or Apple account to register. + +## Import the repository + +* Click on the {{% button icon="github" color="black" %}}Import{{% /button %}} button (top right of your screen), +* In the `New Repository` view, select `Find by URL` and enter `worldline/observability-workshop`, +* Select the proposed repository and click on `+Import` + +![Import Repository](../codesandbox_import.png) + +* Click on the {{% button color="green" %}}Fork repository{{% /button %}} button. + +After some minutes, CodeSandbox should have opened your code in a VSCode Web view. + +![CodeSandbox VSCode View](../codesandbox_vscode.png) + +> [!TIP] +> You have the choice to continue to work in the VSCode web view, or use the `Open in VS Code` button to use your Desktop instance instead. + +## Virtual Machine settings + +We recommend to use a stronger Virtual Machine for this workshop as it involves lot of services and components. + +๐Ÿ› ๏ธ Click on the square button on the top left of the screen and select Virtual Machine: + +![CodeSandbox Virtual Machine property](../codesandbox_vm.png) + +๐Ÿ› ๏ธ Select the `Micro` sizing and click on {{% button color="green" %}}Apply{{% /button %}} + +New Virtual Machine specifications should have been applied without restart. + +> [!IMPORTANT] +> During the first startup, the gradle build is automatically started. Please wait until it is completely finished. + +## Wait for startup! + +It will take some time to the project start, so please be patient. + +Before starting the infrastructure, you should wait that VSCode has correctly imported all the Java modules and packages. + +![VSCode is ready](../vscode_ready.png) + +### Troubleshooting: Gradle or Java error? + +If Java state is in "Error" or see some Gradle issues, you can try the following VSCode commands (`CTRL-SHIFT-P`): + +- `Gradle: Refresh Gradle Projects View` +- `Java: Clean Java Language Server Workspace` + +### Environment Error: Failed to load DevContainer or Workspace? + +You can reload your CodeSandbox environment with the following VSCode command (`CTRL-SHIFT-P`): + +- `Developer: Reload Window` \ No newline at end of file diff --git a/docs/content/prerequisites/environment/codesandbox_import.png b/docs/content/prerequisites/environment/codesandbox_import.png new file mode 100644 index 0000000..eff92b6 Binary files /dev/null and b/docs/content/prerequisites/environment/codesandbox_import.png differ diff --git a/docs/content/prerequisites/environment/codesandbox_vm.png b/docs/content/prerequisites/environment/codesandbox_vm.png new file mode 100644 index 0000000..d13243b Binary files /dev/null and b/docs/content/prerequisites/environment/codesandbox_vm.png differ diff --git a/docs/content/prerequisites/environment/codesandbox_vscode.png b/docs/content/prerequisites/environment/codesandbox_vscode.png new file mode 100644 index 0000000..db62bc1 Binary files /dev/null and b/docs/content/prerequisites/environment/codesandbox_vscode.png differ diff --git a/docs/content/prerequisites/environment/codespaces.md b/docs/content/prerequisites/environment/codespaces.md new file mode 100644 index 0000000..6dc7855 --- /dev/null +++ b/docs/content/prerequisites/environment/codespaces.md @@ -0,0 +1,22 @@ ++++ +date = '2025-06-05T23:19:30+02:00' +title = 'GitHub Codespaces CDE' +weight = 2 ++++ + +> You want to use GitHub Codespaces Cloud Development Environment to follow this workshop. + +* Log on [GitHub](https://github.com/) and + [fork this repository](https://github.com/worldline/observability-workshop/fork). +* Click on ``Code>Codespaces>Create a codespace`` on the ``main`` branch + +![start codespace](../start_build_codespace.png) + +When a message invites you making a URL public, select and validate it. + +Wait until the Codespace is ready. + +![build codespace](../build_codespace.png) + +> [!IMPORTANT] +> During the first startup, the gradle build is automatically started. Please wait until it is completely finished. diff --git a/docs/content/prerequisites/environment/devcontainer.md b/docs/content/prerequisites/environment/devcontainer.md new file mode 100644 index 0000000..be6eb40 --- /dev/null +++ b/docs/content/prerequisites/environment/devcontainer.md @@ -0,0 +1,63 @@ ++++ +date = '2025-06-05T23:19:44+02:00' +title = 'DevContainer' +weight = 3 ++++ + +> You want to execute this workshop on your desktop with [DevContainers](https://containers.dev/). + +A configuration to set the project up in DevContainer is available. You can check it out in the project [``.devcontainer/devcontainer.json``](https://github.com/worldline/observability-workshop/tree/main/.devcontainer) file. + +If you want to know more about DevContainers, you can check out this [documentation](https://containers.dev/). + +You **MUST** have set up these tools first: + +* [Docker](https://docs.docker.com/), +* An IDE: ([IntelliJ IDEA](https://www.jetbrains.com/idea) or [VSCode](https://code.visualstudio.com/)). + +๐Ÿ› ๏ธ You can validate your environment running these commands: + +**Docker** + +```jshelllanguage +$ docker version + Client: + Docker Engine -Community + Version: + 27.4.1 + API version:1.47 + Go version:go1.22.10 + Git commit:b9d17ea + Built:Tue Dec 17 15:45:46 2024 + OS/Arch:linux/amd64 + Context:default + +``` + +{{% notice tip %}} +This workshop was tested with: +* [Rancher Desktop](https://rancherdesktop.io/) as Docker environment, +* IDEs: VSCode. +{{% /notice %}} + +## VSCode Open Project with DevContainer + +You can use this command in VSCode (`CTRL-SHIFT-P`): + +- `Dev Containers: Clone Repository in Container Volume...` +- Select `GitHub`, +- Repository name: `worldline/observability-workshop` + +## Wait for startup! + +It will take some time to the project start, so please be patient. + +Before starting the infrastructure, you should wait that VSCode has correctly imported all the Java modules and packages. + +![VSCode is ready](../vscode_ready.png) + +## Troubleshooting: Gradle or Java error? +If Java state is in โ€œErrorโ€ or see some Gradle issues, you can try the following VSCode commands (CTRL-SHIFT-P): + +- `Gradle: Refresh Gradle Projects View` +- `Java: Clean Java Language Server Workspace` diff --git a/docs/content/prerequisites/environment/local.md b/docs/content/prerequisites/environment/local.md new file mode 100644 index 0000000..865da23 --- /dev/null +++ b/docs/content/prerequisites/environment/local.md @@ -0,0 +1,59 @@ ++++ +date = '2025-06-05T23:19:47+02:00' +title = 'Local' +weight = 4 ++++ + +> You prefer to run your environment locally and install all tools. + +You **MUST** have set up these tools: + +* [Java 21+](https://adoptium.net/temurin/releases/?version=21) +* [Gradle 8.7+](https://gradle.org/) +* [Docker](https://docs.docker.com/) & [Docker compose](https://docs.docker.com/compose/) +* Any + IDE ([IntelliJ IDEA](https://www.jetbrains.com/idea), [VSCode](https://code.visualstudio.com/), [Netbeans](https://netbeans.apache.org/),...) + you want +* [cURL](https://curl.se/), [jq](https://stedolan.github.io/jq/), [HTTPie](https://httpie.io/) or any tool to call your + REST APIs + +๐Ÿ› ๏ธ Here are commands to validate your environment: + +**Java** + +```bash +$ java -version + +openjdk version "21.0.3" 2024-04-16 LTS +OpenJDK Runtime Environment Temurin-21.0.3+9 (build 21.0.3+9-LTS) +OpenJDK 64-Bit Server VM Temurin-21.0.3+9 (build 21.0.3+9-LTS, mixed mode, sharing) +``` + +**Gradle** + +๐Ÿ› ๏ธ If you use the wrapper, you won't have troubles. Otherwise...: + +```bash +$ gradle -version + +------------------------------------------------------------ +Gradle 8.7 +------------------------------------------------------------ + +Build time: 2024-03-22 15:52:46 UTC +Revision: 650af14d7653aa949fce5e886e685efc9cf97c10 + +Kotlin: 1.9.22 +Groovy: 3.0.17 +Ant: Apache Ant(TM) version 1.10.13 compiled on January 4 2023 +JVM: 21.0.3 (Eclipse Adoptium 21.0.3+9-LTS) +OS: Linux 5.15.146.1-microsoft-standard-WSL2 amd64 +``` + +**Docker Compose** + +``` bash +$ docker compose version + +Docker Compose version v2.24.7 +``` \ No newline at end of file diff --git a/docs/content/prerequisites/environment/start_build_codespace.png b/docs/content/prerequisites/environment/start_build_codespace.png new file mode 100644 index 0000000..795ea48 Binary files /dev/null and b/docs/content/prerequisites/environment/start_build_codespace.png differ diff --git a/docs/content/prerequisites/environment/vscode_ready.png b/docs/content/prerequisites/environment/vscode_ready.png new file mode 100644 index 0000000..e5acbf8 Binary files /dev/null and b/docs/content/prerequisites/environment/vscode_ready.png differ diff --git a/docs/content/prerequisites/skills/_index.md b/docs/content/prerequisites/skills/_index.md new file mode 100644 index 0000000..25c5573 --- /dev/null +++ b/docs/content/prerequisites/skills/_index.md @@ -0,0 +1,20 @@ ++++ +date = '2025-06-05T23:12:58+02:00' +title = 'Skills' +weight = 1 ++++ + +This workshop expects the following skills: + +| Skill | Level | +|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------| +| [REST API](https://google.aip.dev/general) | proficient | +| [Java](https://www.oracle.com/java/) | proficient | +| [Gradle](https://gradle.org/) | novice | +| [Spring Framework](https://spring.io/projects/spring-framework), [Boot](https://spring.io/projects/spring-boot), [Cloud Config](https://docs.spring.io/spring-cloud-config/docs/current/reference/html/#_quick_start), [Cloud Gateway](https://spring.io/projects/spring-cloud-gateway)[^1] | proficient | +| [Docker](https://docs.docker.com/) | novice | +| [Grafana stack](https://grafana.com/) | novice | +| [Prometheus](https://prometheus.io/) | novice | +| [Kafka](https://kafka.apache.org/) | novice | + +[^1] EasyPay was written using Spring Boot framework, but content can apply to other Java frameworks. \ No newline at end of file diff --git a/docs/content/prerequisites/start-infrastructure/_index.md b/docs/content/prerequisites/start-infrastructure/_index.md new file mode 100644 index 0000000..c87d91a --- /dev/null +++ b/docs/content/prerequisites/start-infrastructure/_index.md @@ -0,0 +1,68 @@ ++++ +date = '2025-06-06T22:23:20+02:00' +title = 'Start Infrastructure' +weight = 3 ++++ + +The "infrastructure stack" is composed of the following components: + +* One [PostgreSQL](https://www.postgresql.org/) instance per micro service +* One [Kafka broker](https://kafka.apache.org/) +* One [Service Discovery](https://spring.io/guides/gs/service-registration-and-discovery) microservice to enable load + balancing & loose coupling. +* One [Configuration server](https://docs.spring.io/spring-cloud-config/) is also used to centralise the configuration + of our microservices. +* The following microservices: API Gateway, Merchant BO, Fraud Detect, Smart Bank Gateway + +๐Ÿ› ๏ธ Execute the following commands + +``` bash +$ ./gradlew tasks +``` + +``` bash +$ docker compose up -d --build --remove-orphans +``` + +โœ… To check if all the services are up, you can run this command: + +``` bash +$ docker compose ps -a +``` + +And check the status of every service. + +For instance: + +```bash +โฏ docker compose ps +NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS +api-gateway api-gateway:latest "java -javaagent:/opโ€ฆ" api-gateway 3 minutes ago Up 2 minutes (healthy) 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +config-server config-server:latest "java -javaagent:/opโ€ฆ" config-server 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp +discovery-server discovery-server:latest "java -javaagent:/opโ€ฆ" discovery-server 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:8761->8761/tcp, :::8761->8761/tcp +easypay-service easypay-service:latest "java -cp app:app/liโ€ฆ" easypay-service 3 minutes ago Up 2 minutes (healthy) +fraudetect fraudetect-service:latest "java -javaagent:/opโ€ฆ" fraudetect-service 3 minutes ago Up 2 minutes (healthy) +kafka confluentinc/cp-kafka:7.6.1 "/etc/confluent/dockโ€ฆ" kafka 3 minutes ago Up 3 minutes (healthy) 9092/tcp, 0.0.0.0:19092->19092/tcp, :::19092->19092/tcp +merchant-backoffice merchant-backoffice:latest "java -javaagent:/opโ€ฆ" merchant-backoffice 3 minutes ago Up 2 minutes (healthy) +observability-workshop-grafana-1 grafana/grafana:latest "sh -xeuc 'mkdir -p โ€ฆ" grafana 3 minutes ago Up 3 minutes 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp +observability-workshop-loki-1 grafana/loki:latest "/usr/bin/loki -confโ€ฆ" loki 3 minutes ago Up 3 minutes 0.0.0.0:3100->3100/tcp, :::3100->3100/tcp +observability-workshop-opentelemetry-collector-1 otel/opentelemetry-collector-contrib:latest "/otelcol-contrib --โ€ฆ" opentelemetry-collector 3 minutes ago Up 3 minutes 0.0.0.0:4317-4318->4317-4318/tcp, :::4317-4318->4317-4318/tcp, 55678-55679/tcp +observability-workshop-postgres-easypay-1 postgres:16 "docker-entrypoint.sโ€ฆ" postgres-easypay 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:5432->5432/tcp, :::5432->5432/tcp +observability-workshop-postgres-fraudetect-1 postgres:16 "docker-entrypoint.sโ€ฆ" postgres-fraudetect 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:5434->5432/tcp, [::]:5434->5432/tcp +observability-workshop-postgres-merchantbo-1 postgres:16 "docker-entrypoint.sโ€ฆ" postgres-merchantbo 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:5435->5432/tcp, [::]:5435->5432/tcp +observability-workshop-postgres-smartbank-1 postgres:16 "docker-entrypoint.sโ€ฆ" postgres-smartbank 3 minutes ago Up 3 minutes (healthy) 0.0.0.0:5433->5432/tcp, [::]:5433->5432/tcp +observability-workshop-prometheus-1 prom/prometheus:latest "/bin/prometheus --cโ€ฆ" prometheus 3 minutes ago Up 3 minutes 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp +observability-workshop-tempo-1 grafana/tempo:latest "/tempo -config.fileโ€ฆ" tempo 3 minutes ago Up 3 minutes 0.0.0.0:3200->3200/tcp, :::3200->3200/tcp, 0.0.0.0:9095->9095/tcp, :::9095->9095/tcp, 0.0.0.0:9411->9411/tcp, :::9411->9411/tcp +smartbank-gateway smartbank-gateway:latest "java -Xmx4g -cp appโ€ฆ" smartbank-gateway 3 minutes ago Up 2 minutes (healthy) +``` + +## Troubleshooting + +### Gradle build error + +You may encounter Gradle build errors due to a timeout when trying to get the lock on some files (as all the services are built concurrently). You can either: + +- Execute the `docker compose up -d --build` command each time you have the error until the environment is started, +- Or disable concurrency by executing the two following commands: + - `./scripts/build-services.sh` (takes a long time as all services are built one by one), + - `docker compose up -d --build` (should work this time as all services are built by the previous command). \ No newline at end of file diff --git a/docs/content/prerequisites/validation/_index.md b/docs/content/prerequisites/validation/_index.md new file mode 100644 index 0000000..bae398d --- /dev/null +++ b/docs/content/prerequisites/validation/_index.md @@ -0,0 +1,52 @@ ++++ +date = '2025-06-06T23:15:06+02:00' +title = 'Validation' +weight = 4 ++++ + +โœ… Open the [Eureka](https://cloud.spring.io/spring-cloud-netflix/) website started during the infrastructure setup. The +following instances should be registered with Eureka: + +* API-GATEWAY +* EASYPAY-SERVICE +* FRAUDETECT-SERVICE +* MERCHANT-BACKOFFICE +* SMARTBANK-GATEWAY + +> [!TIP] +> If you run this workshop on your desktop, you can go to this URL: [http://localhost:8761](http://localhost:8761). +> If you run it on a CDE, you can go to the corresponding URL instead by going into the `PORTS` view and +> select the url next to the port `8761`. You may have to `Add Port` manually if not detected by VSCode. + +โœ… All services should be registered before continuingโ€ฆ + +๐Ÿ› ๏ธ You can now access our platform to initiate a payment: + +```bash +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=25000 +``` + +โœ… You should get the following content: + +```bash +HTTP/1.1 201 Created +Content-Type: application/json +Date: Wed, 05 Jun 2024 13:42:12 GMT +Location: http://172.19.25.95:44523/payments/3cd8df14-8c39-460b-a429-dc113d003aed +transfer-encoding: chunked + +{ + "amount": 25000, + "authorId": "5d364f1a-569c-4c1d-9735-619947ccbea6", + "authorized": true, + "bankCalled": true, + "cardNumber": "5555567898780008", + "cardType": "MASTERCARD", + "expiryDate": "789456123", + "paymentId": "3cd8df14-8c39-460b-a429-dc113d003aed", + "posId": "POS-01", + "processingMode": "STANDARD", + "responseCode": "ACCEPTED", + "responseTime": 414 +} +``` \ No newline at end of file diff --git a/docs/content/profiling/_index.md b/docs/content/profiling/_index.md new file mode 100644 index 0000000..eaf5d64 --- /dev/null +++ b/docs/content/profiling/_index.md @@ -0,0 +1,45 @@ ++++ +title = "Profiling (Bonus)" +type = "chapter" +weight = 7 ++++ + +The OpenTelemetry project standardizes telemetry signals, particularly the logs, metrics, and traces we have seen so +far. +However, last year [they announced their work on a fourth signal: profiling](https://opentelemetry.io/blog/2024/profiling/). + +Profiling involves measuring the performance characteristics of your application, such as execution time, CPU +utilization, or memory usage. It helps identify bottlenecks and optimize resource usage in your application. + +If you're familiar with Java, you may already know [async_profiler](https://github.com/async-profiler/async-profiler) +for HotSpot JVMs. It defines itself as a "low overhead sampling profiler for Java." + +You may have also heard about eBPF, a technology embedded in the Linux kernel that allows running code in a sandbox +within the kernel space. This technology is gaining traction in service meshes and in continuous profiling. + +Continuous profiling is an ongoing area of interest in the observability field, aimed at finding additional performance +improvements. + +### (Grafana) Pyroscope + +Pyroscope was an open-source project for continuous profiling. It consists of a server that receives profiling samples, +which can then be analyzed and displayed as a [flamegraph](https://www.brendangregg.com/flamegraphs.html). + +In the Java landscape, it offers a Java Agent based on *async_profiler*, compatible with other agents such as the +OpenTelemetry agent. Phew! + +In 2023, Grafana acquired Pyroscope and merged it with its own solution, Phlare. Welcome +to [Grafana Pyroscope](https://grafana.com/docs/pyroscope/latest/)! + +If you want to know more about Continuous Profiling and what it can bring to you, you may want to check out +the [Grafana Pyroscope documentation](https://grafana.com/docs/pyroscope/latest/introduction/profiling/). + +### Objectives + +In this section, we aim to show you: + +* What profiling is, +* What a flamegraph is, +* How it integrates in Grafana. + +{{% children containerstyle="div" style="h3" description=false %}} \ No newline at end of file diff --git a/docs/content/profiling/enable-continuous-profiling.md b/docs/content/profiling/enable-continuous-profiling.md new file mode 100644 index 0000000..6cd54f6 --- /dev/null +++ b/docs/content/profiling/enable-continuous-profiling.md @@ -0,0 +1,107 @@ ++++ +date = '2025-06-07T16:24:31+02:00' +title = 'Enable Continuous Profiling' +weight = 2 ++++ + +## Start the Pyroscope server + +๐Ÿ› ๏ธ We will use the `grafana/pyroscope` container image: we already defined a pyroscope service in our `compose.yml` +file, but it is not yet enabled. You can start it by enabling the `profiling` profile: + +```bash +$ docker compose --profile=profiling up -d +``` + +โœ… It should start a new service on port `4040`. + +๐Ÿ› ๏ธ Go to Grafana Pyroscope dashboard UI on port `4040`: + +* You should see Pyroscope self-profiling and a new graph type: a flamegraph. +* On the top of the dashboard you can select the type of information you want to display: + * CPU profiling, + * Memory, + * Goroutines (itโ€™s a Go process), + * Etc. +* You can also filter your data by tagsโ€ฆ +* And there is a query language: you should be used to this by now! ๐Ÿ˜‰ + +## Setup easypay-service for Continuous Profiling + +Letโ€™s use an agent again to profile our application, and the Pyroscope extension for OpenTelemetry agent +to match span with profiling data. + +โœ… Both `pyroscope.jar` agent and `pyroscope-otel.jar` were downloaded for you in the `/` of the container (`easypay-service/src/main/docker/Dockerfile`): + +```Dockerfile +ENV PYROSCOPE_VERSION=v2.0.0 +ENV PYROSCOPE_URL="https://github.com/grafana/pyroscope-java/releases/download/${PYROSCOPE_VERSION}/pyroscope.jar" +ENV PYROSCOPE_OTEL_VERSION=v1.0.1 +ENV PYROSCOPE_OTEL_URL="https://github.com/grafana/otel-profiling-java/releases/download/${PYROSCOPE_OTEL_VERSION}/pyroscope-otel.jar" + +ADD --chown=$UID:$GID ${PYROSCOPE_URL} /pyroscope.jar +ADD --chown=$UID:$GID ${PYROSCOPE_OTEL_URL} /pyroscope-otel.jar +``` + +๐Ÿ“ Just like for logs and metrics, we should modify the `compose.yml` deployment file for the `easypay-service` to enable +and configure profiling with Pyroscope: + +```yaml +services: + easypay-service: + # ... + environment: + # ... + # Pyroscope agent configuration --vv + PYROSCOPE_APPLICATION_NAME: easypay-service # (1) + PYROSCOPE_FORMAT: jfr # (2) + PYROSCOPE_PROFILING_INTERVAL: 10ms + PYROSCOPE_PROFILER_EVENT: itimer # (3) + PYROSCOPE_PROFILER_LOCK: 10ms # (4) + PYROSCOPE_PROFILER_ALLOC: 512k # (5) + PYROSCOPE_UPLOAD_INTERVAL: 5s + OTEL_JAVAAGENT_EXTENSIONS: /pyroscope-otel.jar # (6) + OTEL_PYROSCOPE_ADD_PROFILE_URL: false + OTEL_PYROSCOPE_ADD_PROFILE_BASELINE_URL: false + OTEL_PYROSCOPE_START_PROFILING: true + PYROSCOPE_SERVER_ADDRESS: http://pyroscope:4040 # (7) + # ... + entrypoint: + - java + - -javaagent:/pyroscope.jar # < Add + - -javaagent:/opentelemetry-javaagent.jar + - -Dotel.instrumentation.logback-appender.experimental-log-attributes=true + - -Dotel.instrumentation.logback-appender.experimental.capture-mdc-attributes=* + - -Dotel.metric.export.interval=5000 + - -cp + - app:app/lib/* + - com.worldline.easypay.EasypayServiceApplication +``` + +1. Define an application name (this will create the `service_name` label), +2. Set format: JFR allows to have multiple events to be recorded, +3. Type of event to profile: `wall` allows to record the time spent in methods. Other valid values are `itimer` and + `cpu`. +4. Threshold to record lock events, +5. Threshold to record memory events, +6. Declare the Pyroscope extension for the OpenTelemetry agent, +7. Server address. + +๐Ÿ› ๏ธ Redeploy `easypay-service`: + +```bash +$ docker compose up -d easypay-service +``` + +โœ… Check logs for correct startup: + +```bash +docker compose logs -f easypay-service +``` + +You should see additional logs related to Pyroscope. + +๐Ÿ‘€ Go back to the Pyroscope dashboard (port `4040`): + +* In the top menu, you should be able to select the `easypay-service` application, +* Try to display TPU profiling. \ No newline at end of file diff --git a/docs/content/profiling/explore.md b/docs/content/profiling/explore.md new file mode 100644 index 0000000..99a7dc9 --- /dev/null +++ b/docs/content/profiling/explore.md @@ -0,0 +1,23 @@ ++++ +date = '2025-06-07T16:25:15+02:00' +title = 'Explore' +weight = 3 ++++ + +> [!INFO] +> We already configured the Pyroscope data source in Grafana. +> You can take a look at its configuration in the `Connections` > `Data sources` section. + +๐Ÿ‘€ Letโ€™s go to the Grafana `Explore` dashboard: + +* Select the `Pyroscope` data source, +* For the profiling type, select `process_cpu` > `cpu`, +* In the field next to the profiling type, enter a filter to get profiling of the `easypay-service`: + `{service_name="easypay-service"}`, +* Select another profiling type (such as memory allocation in TLAB). + +๐Ÿ› ๏ธ Generate some load with `k6`: + +```bash +$ k6 run -u 1 -d 5m k6/02-payment-smartbank.js +``` \ No newline at end of file diff --git a/docs/content/profiling/trace-to-profile-correlation.md b/docs/content/profiling/trace-to-profile-correlation.md new file mode 100644 index 0000000..452521a --- /dev/null +++ b/docs/content/profiling/trace-to-profile-correlation.md @@ -0,0 +1,36 @@ ++++ +date = '2025-06-07T16:26:14+02:00' +title = 'Trace to Profile Correlation' +weight = 4 ++++ + +It is possible to link traces to profiles in Grafana Pyroscope, thanks to the Pyroscope extension for the OpenTelemetry Agent. +This extensions attaches span context to profiles making possible to correlate traces with profiles. + +#### Configure correlation + +๏ธ๐Ÿ› ๏ธ In Grafana, go to `Connections` > `Data sources`: + +* Select the `Tempo` data source, +* Configure `Trace to profiles` as follows: + * Data source: `Pyroscope`, + * Tags: `service.name` as `application`, + * Profile type: `process_cpu` > `cpu`. + +๐Ÿ› ๏ธ Finally click on `Save & test` and go back to Grafana `Explore` dashboard to test our new setup. + +#### Test correlation + +๐Ÿ› ๏ธ Hit the easypay payment endpoint with curl or k6 to generate some traces (whichever you prefer): + +* `http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000` +* `k6 run -u 1 -d 2m k6/01-payment-only.js` + +๐Ÿ› ๏ธ Open the Grafana `Explore` dashboard to find a trace: + +* You can use the following TraceQL directly: `{name="POST easypay-service"}` + * It is equivalent to filtering on Span Name: `POST easypay-service` + * More information about TraceQL: [Grafana TraceQL documentation](https://grafana.com/docs/tempo/latest/traceql/) +* Drill down a trace (you can widen the new pane). + +๐Ÿ‘€ In the link icon, at the root of a service, select `Related Profile`. \ No newline at end of file diff --git a/docs/content/traces/_index.md b/docs/content/traces/_index.md new file mode 100644 index 0000000..b109c83 --- /dev/null +++ b/docs/content/traces/_index.md @@ -0,0 +1,42 @@ ++++ +title = "Traces" +type = "chapter" +weight = 5 ++++ + +In this section, we'll explore **distributed tracing**, the third pillar of application observability. + +Distributed tracing is an essential tool for monitoring and analyzing the performance of complex applications. It tracks +the flow of requests across multiple services and components, helping to identify bottlenecks and improve efficiency โ€” +particularly useful for intricate systems like Easypay. + +With Spring Boot, there are a couple of approaches to incorporate distributed tracing into your application: + +* Utilize + the [Spring Boot Actuator integration](https://docs.spring.io/spring-boot/docs/current/reference/html/actuator.html#actuator.tracing) + with support from [Micrometer Tracing](https://docs.micrometer.io/docs/tracing), +* The Spring Boot Starter for OpenTelemetry, which provides an easy way to instrument your application using + OpenTelemetry instrumentation libraries (but limited to a subset of all available libraries), +* Or adopt a broader [Java Agent approach](https://github.com/open-telemetry/opentelemetry-java-instrumentation) + provided by the OpenTelemetry project, which automatically instruments our code when attached to our JVM regardless of + the framework you use. + +**For this workshop, we'll keep to use the Java Agent approach, as it's the most straightforward way to instrument our application +and independent of the libraries we use.** + +The OpenTelemetry Collector will be used once again, tasked with receiving traces and forwarding them to the Tempo +backend. + +![Architecture with traces](./archi-traces.png) + +> [!TIP] +> Utilizing collectors offers several advantages for managing telemetry data: +> - Reduces the need for complicated application configurations: just send data to `localhost`, +> - Centralizes configuration to a single point: the collector, +> - Acts as a buffer to prevent resource overuse, +> - Can transform data before ingestion, +> - Supports data intake from various protocols and can relay them to any backend, +> - ... + +Lastly, we will use Grafana to examine and interpret these traces, allowing us to better understand and optimize our +application's performance. \ No newline at end of file diff --git a/docs/content/traces/archi-traces.png b/docs/content/traces/archi-traces.png new file mode 100644 index 0000000..3097a9a Binary files /dev/null and b/docs/content/traces/archi-traces.png differ diff --git a/docs/content/traces/business-traces.md b/docs/content/traces/business-traces.md new file mode 100644 index 0000000..62d0095 --- /dev/null +++ b/docs/content/traces/business-traces.md @@ -0,0 +1,120 @@ ++++ +date = '2025-06-07T16:12:49+02:00' +title = 'Business Traces' +weight = 5 ++++ + +Just like metrics, it is also possible to add your own spans on arbitrary methods to provide more business value to the +observability of your application. + +Letโ€™s return to our code! + +## Objectives + +We want to add new spans to the traces generated in the `easypay-service` application to track payment processing and +store events. + +To achieve this goal, we will create new spans when the `process` and `store` methods of the +`com.worldline.easypay.payment.control.PaymentService` class in the `easypay-service` module are invoked. + +As a reminder, this class is the central component responsible for processing payments. It provides the public method +`accept`, which delegates its responsibilities to two private methods: + +* `process`: which handles all the processing of the payment, including validation and calling third parties. +* `store`: which saves the processing result in the database. + +### 1. Add Required Dependencies + +We need to add the `io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations` dependency to our module +to access some useful annotations. + +๐Ÿ‘€ This has already been done in advance for this workshop. The following dependencies were added to the Gradle build +file (`build.gradle.kts`) of the `easypay-service` module: + +```kotlin +dependencies { + //... + // Add opentelemetry Annotations support + implementation("io.opentelemetry.instrumentation:opentelemetry-instrumentation-annotations") + + // ... +} +``` + +### 2. Add Custom Spans + +๐Ÿ“ To add new spans based on methods, we can simply use the `@WithSpan` Java annotation. When a traced transaction +invokes the annotated method, a new span will be created. Hereโ€™s how to do it: + +```java +// ... + +import io.opentelemetry.instrumentation.annotations.WithSpan; + +@Service +public class PaymentService { + // ... + + @WithSpan("easypay: Payment processing method") + private void process(PaymentProcessingContext context) { + //... + } + + @WithSpan("easypay: Payment store method") + private void store(PaymentProcessingContext context) { + //... + } + + // ... +} +``` + +๐Ÿ“ We can also provide additional information to the span, such as method parameters using the ``@SpanAttribute`` +annotation: + +```java +// ... + +import io.opentelemetry.instrumentation.annotations.SpanAttribute; + +@Service +public class PaymentService { + // ... + + @WithSpan("easypay: Payment processing method") + private void process(@SpanAttribute("context") PaymentProcessingContext context) { // <-- HERE + // ... + } + + @WithSpan("easypay: Payment store method") + private void store(@SpanAttribute("context") PaymentProcessingContext context) { // <-- HERE + // ... + } + // ... +} +``` + +This will provide the whole PaymentProcessingContext into the trace. + +### 3. Build and redeploy + +๐Ÿ› ๏ธ As we did before: + +```bash +$ docker compose up -d --build easypay-service +``` + +### 4. Test it! + +๐Ÿ› ๏ธ Generate some payments: + +```bash +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +``` + +๐Ÿ‘€ Go back to Grafana and try to find your new traces using what you've learned previously. Observe the spans you added. + +> [!CAUTION] +> It may take some time for `easypay-service` to be registered in the service discovery and be available from the API +> gateway. +> Similarly, your traces being ingested by Tempo might also take some time. Patience is key ๐Ÿ˜… \ No newline at end of file diff --git a/docs/content/traces/explore-traces.md b/docs/content/traces/explore-traces.md new file mode 100644 index 0000000..aad87c3 --- /dev/null +++ b/docs/content/traces/explore-traces.md @@ -0,0 +1,67 @@ ++++ +date = '2025-06-07T16:06:26+02:00' +title = 'Explore' +weight = 3 ++++ + +## Explore Traces with Grafana + +> [!INFO] +> For this workshop, we've already configured the Tempo datasource in Grafana. +> You can take a look at its configuration in Grafana (available on port ``3000``) by navigating to the `Connections` > +`Data sources` section. +> Similar to Prometheus, the configuration is quite straightforward as we only need to set up the Tempo server URL. + +๐Ÿ› ๏ธ Generate some load on the application to produce traces: + +```bash +$ k6 run -u 1 -d 5m k6/01-payment-only.js +``` + +๐Ÿ› ๏ธ Letโ€™s explore your first traces in Grafana: + +* Go to Grafana and open an ``Explore`` dashboard, +* Select the `Tempo` data source and click on ``Run query`` to refresh the view. + +> [!CAUTION] +> You may need to wait one or two minutes to allow Tempo to ingest some tracesโ€ฆ + +๐Ÿ‘€ Click on `Service Graph` and explore the `Node graph`: this view is extremely helpful for visualizing and +understanding how our services communicate with each other. + +๐Ÿ‘€ Go back to `Search` and click on `Run query`. You should see a table named `Table - Traces`. +By default, this view provides the most recent traces available in *Tempo*. + +๐Ÿ› ๏ธ Let's find an interesting trace using the query builder: + +* Look at all traces corresponding to a POST to `easypay-service` with a duration greater than 50 ms: + * Span Name: `POST easypay-service` + * Duration: `trace` `>` `50ms` + * You can review the generated query, which uses a syntax called TraceQL. +* Click on `Run query`. +* Sort the table by `Duration` (click on the column name) to find the slowest trace. +* Drill down a `Trace ID`. + +You should see the full stack of the corresponding transaction. + +๐Ÿ‘€ Grafana should open a new view (you can enlarge it by clicking on the three vertical dots and selecting `Widen pane`): + +* Pinpoint the different nodes and their corresponding response times: + * Each line is a span and corresponds to the time spent in a method/event. +* Examine the SQL queries and their response times. +* Discover that distributed tracing can link transactions through: + * HTTP (`api-gateway` to `easypay-service` and `easypay-service` to `smartbank-gateway`). + * Kafka (`easypay-service` to `fraudetect-service` and `merchant-backoffice`). + +๐Ÿ› ๏ธ Grafana allows to display a graph of spans as interconnected nodes: + +* Modify the Tempo data source: + * Go to `Additional settings`, + * Check the `Enable node graph` option. +* Go back to the same kind of trace, +* Click on `Node graph` to get a graphical view of all the spans participating in the trace. + +๐Ÿ› ๏ธ Continue your exploration in the `Search` pane: + +* For example, you can add the `Status` `=` `error` filter to see only traces that contain errors, +* Try to find our requests with a `NullPointerException`. diff --git a/docs/content/traces/export-traces.md b/docs/content/traces/export-traces.md new file mode 100644 index 0000000..3f7e430 --- /dev/null +++ b/docs/content/traces/export-traces.md @@ -0,0 +1,57 @@ ++++ +date = '2025-06-07T16:05:12+02:00' +title = 'Export' +weight = 2 ++++ + +## Enable distributed tracing + +To capture the entire transaction across all services in a trace, it's essential to instrument all the services in our +application. + +> [!IMPORTANT] +> In this workshop, our primary focus keeps to be on the `easypay` service. +> For efficiency, we have already instrumented the other services beforehand. + +We have already configured the OpenTelemetry Java Agent to transmit telemetry data directly to the collector, so our +application have already sent traces to the collector. + +#### OpenTelemetry Collector + +OpenTelemetry Collector is already configured to accept traces through the +OpenTelemetry GRPC protocol (OTLP) on port `4317`, and then forward them to *Grafana Tempo*, which listens on the host +`tempo` on the same port `4317` (this setup specifically handles OTLP traces). + +๐Ÿ‘€ You can have a look at the collector configuration located in the `docker/otelcol/otelcol.yaml` file: + +```yaml +# Input +receivers: + # OpenTelemetry Protocol: logs, metrics and traces + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +#... +# Output +exporters: + # ... + # Export to Tempo via GRPC using the OpenTelemetry Protocol + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + +# Telemetry processing pipelines +service: + pipelines: + # ... + # Receive traces using the OpenTelemetry Protocol and export to Tempo + traces: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ otlp/tempo ] +``` \ No newline at end of file diff --git a/docs/content/traces/sampling.md b/docs/content/traces/sampling.md new file mode 100644 index 0000000..0232dac --- /dev/null +++ b/docs/content/traces/sampling.md @@ -0,0 +1,71 @@ ++++ +date = '2025-06-07T16:11:48+02:00' +title = 'Sampling' +weight = 4 ++++ + +When we instrument our services using the agent, every interaction, including Prometheus calls to the +`actuator/prometheus` endpoint, is recorded. + +In the `Service Graph` you should have seen a link between the `User` and services other than the `api-gateway` it seems +not normal for us: we only created payments through the `api-gateway`! + +๐Ÿ‘€ If you click on the link and select `View traces` you should see a lot of traces regarding `actuator/health`. + +To avoid storing unnecessary data in Tempo, we can sample the data in two ways: + +* [Head Sampling](https://opentelemetry.io/docs/concepts/sampling/#head-sampling) +* [Tail Sampling](https://opentelemetry.io/docs/concepts/sampling/#tail-sampling) + +In this workshop, we will implement Tail Sampling, using the +[tail_sampling processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/tailsamplingprocessor/README.md). + +Modify the OpenTelemetry Collector configuration file (``docker/otelcol/otelcol.yaml``) as follows: + +```yaml +# ... +# Transform +processors: + # ... + tail_sampling/actuator: # Add a tail sampling definition for + policies: + [ + { + name: "filter-http-url", + type: "string_attribute", + string_attribute: { + key: "http.url", + values: [ "/actuator/health" ], + enabled_regex_matching: true, + invert_match: true + } + }, + { + name: "filter-url-path", + type: "string_attribute", + string_attribute: { + key: "url.path", + values: [ "/actuator/health" ], + enabled_regex_matching: true, + invert_match: true + } + } + ] + +# ... +# Telemetry processing pipelines +service: + # Receive traces using the OpenTelemetry Protocol and export to Tempo + traces: + receivers: [ otlp ] + processors: [ batch, tail_sampling/actuator ] # < add the tail_sampling processor here + exporters: [ otlp/tempo ] +``` + +๐Ÿ› ๏ธ Restart the collector: + +```bash +$ docker compose up -d --build opentelemetry-collector +``` + +Starting from this moment, you should no longer see traces related to `actuator/health` endpoints. \ No newline at end of file diff --git a/docs/content/workshop-overview/_index.md b/docs/content/workshop-overview/_index.md new file mode 100644 index 0000000..ea427c6 --- /dev/null +++ b/docs/content/workshop-overview/_index.md @@ -0,0 +1,9 @@ ++++ +title = "Workshop Overview" +description = "Application components and Workshop structure" +type = "chapter" +weight = 1 ++++ + +{{% children containerstyle="div" style="h3" description=true %}} + diff --git a/docs/content/workshop-overview/application/_index.md b/docs/content/workshop-overview/application/_index.md new file mode 100644 index 0000000..3be78a9 --- /dev/null +++ b/docs/content/workshop-overview/application/_index.md @@ -0,0 +1,43 @@ ++++ +date = '2025-06-05T22:52:13+02:00' +draft = false +title = 'Our Application: EasyPay' +description = "Describes EasyPay architecture and components." +weight = 1 ++++ + +## High Level Design + +![The Easy Pay System](architecture.svg) + +### API Gateway + +Centralises all API calls. + +### Easy Pay Service + +Payment microservices which accepts (or not) payments. + +This is how it validates every payment: + +1. Check the POS (Point of Sell) number +2. Check the credit card number +3. Check the credit card type +4. Check the payment threshold, it calls the Smart Bank Gateway for authorization + +If the payment is validated, it stores it and broadcasts it to all the other microservices through Kafka. + +### Fraud detection Service + +After fetching a message from the Kafka topic, this service search in its database if the payment's card number is +registered for fraud. + +In this case, only a ``WARN`` log is thrown. + +### Merchant Back Office Service + +For this lab, it only simulates the subscription of messages. + +### Smart Bank Gateway + +This external service authorizes the payment. \ No newline at end of file diff --git a/docs/content/workshop-overview/application/architecture.svg b/docs/content/workshop-overview/application/architecture.svg new file mode 100644 index 0000000..493ad87 --- /dev/null +++ b/docs/content/workshop-overview/application/architecture.svg @@ -0,0 +1 @@ +Easy Pay System[Container]API Gateway[Spring Cloud Gateway] Exposes the APIsEasyPay[Spring Boot] Exposes an API, checks POS &card, authorizes the paymentMerchant Back OfficeFraud DetectionPostgreSQL[PostgreSQL]kafka[kafka]Discovery ServerConfiguration ServerCustomer A customerSmart Bank GatewayCalls an APIHTTPSJDBCkafka[Broadcasts paymentinformation]kafka[Gets payments]kafka[Gets payments]Sends payment \ No newline at end of file diff --git a/docs/content/workshop-overview/icons/_index.md b/docs/content/workshop-overview/icons/_index.md new file mode 100644 index 0000000..6aba9e5 --- /dev/null +++ b/docs/content/workshop-overview/icons/_index.md @@ -0,0 +1,13 @@ ++++ +date = '2025-06-05T23:02:34+02:00' +title = 'Icons' +description = "Defines the icons you will meet during this workshop." ++++ + +You will meet the following icons during the workshop: + +๐Ÿ› ๏ธ An action to perform, +๐Ÿ“ A file to modify, +๐Ÿ‘€ Something to observe, +โœ… Validate something, +โ„น๏ธ Some information. \ No newline at end of file diff --git a/docs/content/workshop-overview/target/_index.md b/docs/content/workshop-overview/target/_index.md new file mode 100644 index 0000000..27a7b12 --- /dev/null +++ b/docs/content/workshop-overview/target/_index.md @@ -0,0 +1,23 @@ ++++ +date = '2025-06-05T22:56:29+02:00' +draft = false +title = 'Target Platform' +weight = 2 +description = "Defines the target architecture with Observability enabled." ++++ + +## To a fully observable platform + +![The Easy Pay observable System](architecture_observable.png) + +### Short explanation + +As mentioned earlier, our observability stack is composed of : + +* [Prometheus](https://prometheus.io/) for gathering & storing the metrics +* [Loki](https://grafana.com/oss/loki/) for storing the logs +* [Tempo](https://grafana.com/oss/tempo/) for storing the traces +* [Grafana](https://grafana.com/) for the dashboards +* [OTEL collector](https://opentelemetry.io/docs/collector/) which gathers all the data to send it then to + +In addition, the microservices are started with an agent to broadcast their telemetry to the collector. \ No newline at end of file diff --git a/docs/content/workshop-overview/target/architecture_observable.png b/docs/content/workshop-overview/target/architecture_observable.png new file mode 100644 index 0000000..3fd330f Binary files /dev/null and b/docs/content/workshop-overview/target/architecture_observable.png differ diff --git a/docs/hugo.toml b/docs/hugo.toml new file mode 100644 index 0000000..cd5eba7 --- /dev/null +++ b/docs/hugo.toml @@ -0,0 +1,9 @@ +baseURL = 'https://example.org/' +languageCode = 'en-us' +title = 'Observability Workshop' + +theme = 'hugo-theme-relearn' + +[params] +showVisitedLinks = true +themeVariant = ['worldline-light', 'worldline-dark'] diff --git a/docs/themes/hugo-theme-relearn b/docs/themes/hugo-theme-relearn new file mode 160000 index 0000000..e59b0bd --- /dev/null +++ b/docs/themes/hugo-theme-relearn @@ -0,0 +1 @@ +Subproject commit e59b0bd36c54529a48bd1a28625d9e34354bb169 diff --git a/scripts/build-services.sh b/scripts/build-services.sh new file mode 100755 index 0000000..765c3f5 --- /dev/null +++ b/scripts/build-services.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +for service in {"config-server","discovery-server","api-gateway","smartbank-gateway","fraudetect-service","merchant-backoffice"}; do + echo "Building service: $service" + docker compose build "$service" +done \ No newline at end of file diff --git a/scripts/download-compose.sh b/scripts/download-compose.sh new file mode 100755 index 0000000..43d04e8 --- /dev/null +++ b/scripts/download-compose.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +VERSION=2.37.0 + +DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker} +mkdir -p $DOCKER_CONFIG/cli-plugins +curl -SL https://github.com/docker/compose/releases/download/v${VERSION}/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose +chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose \ No newline at end of file