diff --git a/.github/workflows/service-registration.yml b/.github/workflows/service-registration.yml index 287224ee4e..e72ea7705d 100644 --- a/.github/workflows/service-registration.yml +++ b/.github/workflows/service-registration.yml @@ -34,9 +34,27 @@ jobs: run: > ./gradlew runStartUpCheck --info --scan -Denvironment.startServices=true + - name: Start OpenTelemetry containers + run: | + cd otel + sh/start_containers.sh + - name: Run startup check for modulith - run: > - ./gradlew runStartUpCheck --info --scan -Denvironment.startServices=true -Denvironment.modulith=true + run: | + export OTEL_SDK_DISABLED=false + export OTEL_RESOURCE_ATTRIBUTES_DEPLOYMENT_ENVIRONMENT=dev + export OTEL_RESOURCE_ATTRIBUTES_SERVICE_NAME=apiml + export OTEL_RESOURCE_ATTRIBUTES_ZOS_SMF_ID=SYS1 + export OTEL_RESOURCE_ATTRIBUTES_ZOS_SYSPLEX_NAME=SYSPLEX1 + export OTEL_RESOURCE_ATTRIBUTES_MAINFRAME_LPAR_NAME=LPAR01 + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + ./gradlew runStartUpCheckWithOpenTelemetry --info --scan -Denvironment.startServices=true -Denvironment.modulith=true + + - name: Validate telemetry data and stop containers + if: always() + run: | + cd otel + sh/validate_and_stop.sh - name: Store results uses: actions/upload-artifact@v4 @@ -45,5 +63,6 @@ jobs: name: BuildAndTest-${{ env.JOB_ID }} path: | */build/reports/** + otel/** - uses: ./.github/actions/teardown diff --git a/integration-tests/build.gradle b/integration-tests/build.gradle index a323367d7c..ab91e9073b 100644 --- a/integration-tests/build.gradle +++ b/integration-tests/build.gradle @@ -101,6 +101,18 @@ task runStartUpCheck(type: Test) { group 'integration tests' description "Check that the API Mediation Layer is up and running" + systemProperties System.properties + useJUnitPlatform { + includeTags 'StartupCheck' + excludeTags 'OpenTelemetryTest' + } + outputs.upToDateWhen { false } +} + +task runStartUpCheckWithOpenTelemetry(type: Test) { + group 'integration tests' + description "Check that the 
API Mediation Layer is up and running with graceful wait for OpenTelemetry" + systemProperties System.properties useJUnitPlatform { includeTags 'StartupCheck' diff --git a/integration-tests/src/test/java/org/zowe/apiml/startup/ApiMediationLayerStartTest.java b/integration-tests/src/test/java/org/zowe/apiml/startup/ApiMediationLayerStartTest.java index 32cb4c1242..1303124d0c 100644 --- a/integration-tests/src/test/java/org/zowe/apiml/startup/ApiMediationLayerStartTest.java +++ b/integration-tests/src/test/java/org/zowe/apiml/startup/ApiMediationLayerStartTest.java @@ -10,11 +10,15 @@ package org.zowe.apiml.startup; +import lombok.SneakyThrows; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.zowe.apiml.startup.impl.ApiMediationLayerStartupChecker; +import org.zowe.apiml.util.categories.OpenTelemetryTest; import org.zowe.apiml.util.categories.StartupCheck; +import java.time.Duration; + import static org.junit.jupiter.api.Assertions.assertTrue; @StartupCheck @@ -29,4 +33,15 @@ void setUp() { void checkApiMediationLayerStart() { assertTrue(true); } + + @Test + @OpenTelemetryTest + @SneakyThrows + void giveOpenTelemetryTimeToSendMetrics() { + // Sleeps for 30 seconds: the application has to keep running for a while to collect and send telemetry data, + // so that the data can be evaluated by the OpenTelemetry Golden Tester. + Thread.sleep(Duration.ofSeconds(30).toMillis()); + assertTrue(true); + } + } diff --git a/integration-tests/src/test/java/org/zowe/apiml/util/categories/OpenTelemetryTest.java b/integration-tests/src/test/java/org/zowe/apiml/util/categories/OpenTelemetryTest.java new file mode 100644 index 0000000000..a3aa8100ef --- /dev/null +++ b/integration-tests/src/test/java/org/zowe/apiml/util/categories/OpenTelemetryTest.java @@ -0,0 +1,27 @@ +/* + * This program and the accompanying materials are made available under the terms of the + * Eclipse Public License v2.0 which accompanies this distribution, and is available at + * 
https://www.eclipse.org/legal/epl-v20.html + * + * SPDX-License-Identifier: EPL-2.0 + * + * Copyright Contributors to the Zowe Project. + */ + +package org.zowe.apiml.util.categories; + +import org.junit.jupiter.api.Tag; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.TYPE; + +@Tag("OpenTelemetryTest") +@Target({ TYPE, METHOD }) +@Retention(RetentionPolicy.RUNTIME) +public @interface OpenTelemetryTest { +} + diff --git a/integration-tests/src/test/java/org/zowe/apiml/util/service/FullApiMediationLayer.java b/integration-tests/src/test/java/org/zowe/apiml/util/service/FullApiMediationLayer.java index 318e1dd3c8..7c19106865 100644 --- a/integration-tests/src/test/java/org/zowe/apiml/util/service/FullApiMediationLayer.java +++ b/integration-tests/src/test/java/org/zowe/apiml/util/service/FullApiMediationLayer.java @@ -24,14 +24,14 @@ import java.util.Map; import java.util.Optional; +import static org.zowe.apiml.util.config.ConfigReader.IS_MODULITH_ENABLED; + //TODO this class doesn't lend itself well to switching of configurations. 
//attls is integrated in a kludgy way, and deserves a rewrite @Slf4j public class FullApiMediationLayer { - public static final boolean IS_MODULITH_ENABLED = Boolean.parseBoolean(System.getProperty("environment.modulith")); - private RunningService discoveryService; private RunningService gatewayService; private RunningService apiCatalogService; diff --git a/otel/README.md b/otel/README.md new file mode 100644 index 0000000000..f7b341baf5 --- /dev/null +++ b/otel/README.md @@ -0,0 +1,73 @@ +# OpenTelemetry containers for integration testing + +The [docker-compose.yml](docker-compose.yml) defines 2 containers: +- OpenTelemetry Collector (collector) +- OpenTelemetry Golden Tester (golden) + +The collector is the standard OpenTelemetry Collector ([docs](https://opentelemetry.io/docs/collector/), [repo](https://github.com/open-telemetry/opentelemetry-collector-contrib)). The Golden Tester comes from the [same](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/cmd/golden) repository and validates data exported from the collector. Only metrics (in [alpha](https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/component-stability.md#alpha) stability level) are supported as of January 2026. + +## Integration test flow + +The API Mediation Layer produces telemetry data that is exported to the Collector. The Collector then exports the data to the Golden Tester the same way the data is published to an observability stack in a real deployment. The Golden Tester validates the telemetry data against a definition from a YAML file. If the validation does not pass within the timeout, the container exits with exit code 1. 
+ +```mermaid +flowchart LR + apiml["APIML (modulith)"] + collector["OpenTelemetry Collector"] + collector-config{{config.yaml}} + golden["OpenTelemetry Golden Tester"] + golden-config{{expected.yaml}} + apiml -- sends telemetry data --> collector + subgraph docker + collector -- forwards telemetry data --> golden + collector-config -.-> collector + golden -. validates against .-> golden-config + end + +``` + +The Golden Tester validates all metrics received, which makes defining the expected data difficult, as the definition needs to be exhaustive. For this reason, the OpenTelemetry Collector is configured to produce at most one metric for validation; see the collector configuration file [otel-collector/config.yaml](otel-collector/config.yaml), which is mounted into the collector container. + +The Golden Tester configuration is split into 2 parts: +- Configuration of the tester, such as timeout, ports, fields to ignore, etc., is done via CLI arguments. CLI arguments for the golden binary are placed in the [docker-compose.yml](docker-compose.yml) file. The list of supported options can be found in the [golden binary sources](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/cmd/golden/internal/config.go). +- The definition of expected observability data is in [otel-golden/expected.yaml](otel-golden/expected.yaml). + +### Golden Tester configuration consideration +Ideally, we want to have a generic docker-compose file with the configuration injected via mounted configuration files or environment variables. Unfortunately, the golden binary accepts only CLI arguments (except for the definition of expected data). + +Every CLI argument that requires a value is processed as 2 distinct arguments by the golden binary. 
Given that the [official golden docker image](https://github.com/open-telemetry/opentelemetry-collector-contrib/pkgs/container/opentelemetry-collector-contrib%2Fgolden) is built from the [scratch base](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/cmd/golden/Dockerfile), there is no shell inside the golden image that preprocesses the CLI arguments, so the arguments are passed to the binary exactly as defined in the [docker-compose.yml](docker-compose.yml) file. + +For instance, if your docker compose file contains: +``` + command: + - "--ignore-resource-attribute-value process.pid" +``` +The whole string is passed to the binary as a single argument and thus never matches the argument name in the binary, resulting in the value being ignored. The argument and value must be passed as two arguments: +``` + command: [ + "--ignore-resource-attribute-value", "process.pid" + ] +``` + +When environment variables are used to pass values to the docker compose file, only simple values that fit into a single argument value can be used. Unfortunately, this is not usable for the `--ignore-resource-attribute-value` option, as it must be repeated for every single value to be ignored. + +Possible workarounds are: +- Use a Docker multi-stage build to create a custom Golden Tester image with a shell. The shell splits the argument string on whitespace and passes the parts as individual arguments to the binary. Then multiple arguments can be defined in an environment variable: + ``` + GOLDEN_IGNORE_FIELDS = "--ignore-resource-attribute-value service.instance.id --ignore-resource-attribute-value host.name --ignore-resource-attribute-value host.arch --ignore-resource-attribute-value process.pid" + ``` + and the variable used as a placeholder in the docker compose `command`. 
+ +- Add the arguments to the `docker compose run` command: + ```shell + $ docker compose run --rm --service-ports golden --ignore-resource-attribute-value service.instance.id --ignore-resource-attribute-value host.name --ignore-resource-attribute-value host.arch --ignore-resource-attribute-value process.pid + ``` +Note that `docker compose run` CLI arguments override the `command` value in the docker compose file, and the containers must be started individually, unlike with the simple `docker compose up`. + +## Local run for development +To run the docker containers locally with the same setup as used in the integration tests, just run `docker compose up` (optionally with `-d`), and then start the APIML modulith with OpenTelemetry enabled. The signals received and exported by the collector are saved to the [otel-collector](otel-collector) folder. The Golden Tester exits after the timeout, reporting the result of validation in the container console/log. The timeout can be set in the [docker-compose.yml](docker-compose.yml) file. + + + + + diff --git a/otel/docker-compose.yml b/otel/docker-compose.yml new file mode 100644 index 0000000000..d13271a22d --- /dev/null +++ b/otel/docker-compose.yml @@ -0,0 +1,59 @@ +services: + # 1. 
OpenTelemetry Golden Tester + golden: + image: ghcr.io/open-telemetry/opentelemetry-collector-contrib/golden:latest + container_name: golden + ports: + - "5318:4318" # host port 5318 is used by the pipeline readiness check (sh/start_containers.sh) + command: [ + "--otlp-http-endpoint", "0.0.0.0:4318", + "--otlp-endpoint", "0.0.0.0:4317", + "--ignore-timestamp", + "--ignore-start-timestamp", + "--timeout", "3m", + "--ignore-resource-attribute-value", "service.instance.id", + "--ignore-resource-attribute-value", "host.name", + "--ignore-resource-attribute-value", "host.arch", + "--ignore-resource-attribute-value", "process.pid", + "--ignore-resource-attribute-value", "process.command_line", + "--ignore-resource-attribute-value", "process.command_args", + "--ignore-resource-attribute-value", "process.executable.path", + "--ignore-resource-attribute-value", "process.runtime.description", + "--ignore-resource-attribute-value", "process.runtime.version", + "--ignore-resource-attribute-value", "process.runtime.name", + "--ignore-resource-attribute-value", "os.description", + "--ignore-resource-attribute-value", "os.type", + "--ignore-resource-attribute-value", "telemetry.sdk.version", + "--ignore-resource-attribute-value", "telemetry.distro.name", + "--ignore-resource-attribute-value", "telemetry.distro.version", + "--ignore-resource-attribute-value", "telemetry.sdk.language", + "--ignore-resource-attribute-value", "telemetry.sdk.name", + "--ignore-resource-attribute-value", "service.version", + "--ignore-resource-attribute-value", "service.name", + "--ignore-metric-attribute-value", "service.instance.id", + "--ignore-metric-attribute-value", "service.version", + "--ignore-metric-attribute-value", "service.name", + "--ignore-resource-metrics-order", + "--ignore-scope-metrics-order", + "--ignore-metrics-order", + "--ignore-metrics-data-points-order", + "--ignore-metric-values", + "--ignore-data-points-attributes-order", + "--ignore-scope-version", + "--expected", "/var/data/expected.yaml", +# "--write-expected" # generates 
the expected definition file from received data + ] + volumes: + - ./otel-golden:/var/data + + # 2. OpenTelemetry Collector + collector: + image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:latest + container_name: collector + ports: + - "4317:4317" # OTLP gRPC input + - "4318:4318" # OTLP HTTP input + volumes: + - ./otel-collector:/etc/otel-collector + command: + - "--config=/etc/otel-collector/config.yaml" diff --git a/otel/otel-collector/config.yaml b/otel/otel-collector/config.yaml new file mode 100755 index 0000000000..42f62821e1 --- /dev/null +++ b/otel/otel-collector/config.yaml @@ -0,0 +1,96 @@ +receivers: + # OTLP receiver to receive telemetry data from services + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +exporters: + # signals (received and exported) are saved to files + file/metrics: + path: /etc/otel-collector/metrics.json + append: false + format: json + file/metrics_filtered: + path: /etc/otel-collector/metrics_filtered.json + append: false + format: json + file/traces: + path: /etc/otel-collector/traces.json + append: false + format: json + file/logs: + path: /etc/otel-collector/logs.json + append: false + format: json + # debug output is printed to the console when this exporter is added to a pipeline's exporters + debug: + verbosity: detailed + # otlp-http exporter to forward telemetry to the Golden Tester + otlphttp/golden: + endpoint: "http://golden:4318" + tls: + insecure: true + nop: {} + +# The OpenTelemetry Golden Tester validates all received data, +# which requires all data to be described in the golden expected definition file. +# The metrics are filtered so only one metric is produced for validation. +processors: + # All metrics except jvm.cpu.count are ignored. + # The metric is expected to be always produced as we run a Java application. 
+ filter/keep_specific_metrics: + error_mode: ignore + metrics: + include: + match_type: strict + metric_names: + - jvm.cpu.count + + # Every metric is converted to test.metric: + # - The name reflects that this is a purely artificial metric for testing + # - It is converted to a gauge, as a gauge carries fewer attributes, which simplifies the validation + # - The value is always 1.0 so it is easy to define the expected value for validation + transform/all_in_one: + error_mode: ignore + metric_statements: + - context: metric + statements: + - set(name, "test.metric") + - set(description, "Synthetic metric for resource attribute validation carrying fixed dummy value") + - set(instrumentation_scope.name, "test") + - set(unit, "") + - convert_sum_to_gauge() + - context: datapoint + statements: + - set(value_double, 1.0) + +service: + telemetry: + metrics: + level: none # Disables generation of internal otelcol_ metrics + pipelines: + traces: + # received traces are exported to a file only (file/traces) + receivers: [otlp] + processors: [] + exporters: [file/traces] + # received metrics are exported unmodified to a file only (file/metrics) + metrics: + receivers: [otlp] + processors: [] + exporters: [file/metrics] + # Received metrics are filtered and transformed. For any number of received metrics, at most one will be exported + # to a file (file/metrics_filtered) and to the Golden Tester. 
+ metrics/filtered: + receivers: [ otlp ] + processors: [ filter/keep_specific_metrics, transform/all_in_one ] + exporters: [ file/metrics_filtered, otlphttp/golden ] + # received logs are exported to a file only (file/logs) + logs: + receivers: [otlp] + processors: [] + exporters: [file/logs] + diff --git a/otel/otel-golden/expected.yaml b/otel/otel-golden/expected.yaml new file mode 100644 index 0000000000..c4bf7a4321 --- /dev/null +++ b/otel/otel-golden/expected.yaml @@ -0,0 +1,15 @@ +resourceMetrics: + - resource: + attributes: + - key: deployment.environment + value: { stringValue: "dev" } + - key: mainframe.lpar.name + value: { stringValue: "LPAR01" } + - key: zos.smf.id + value: { stringValue: "SYS1" } + - key: zos.sysplex.name + value: { stringValue: "SYSPLEX1" } + schemaUrl: "https://opentelemetry.io/schemas/1.24.0" + scopeMetrics: + - scope: { "name": "test" } + metrics: [ { "name": "test.metric", "description": "Synthetic metric for resource attribute validation carrying fixed dummy value", "gauge": {"dataPoints": [{"asDouble": 1}]} } ] diff --git a/otel/sh/start_containers.sh b/otel/sh/start_containers.sh new file mode 100755 index 0000000000..b46411b40e --- /dev/null +++ b/otel/sh/start_containers.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# Starts the OpenTelemetry test containers and waits until both answer on their OTLP HTTP ports. + +# Run from the otel directory (the parent of this script's directory) regardless of the caller's working directory. +cd "$(dirname "$0")/.." || exit 1 +# Open up the otel-* directories that docker-compose.yml mounts into the containers. +chmod -R 777 otel-* +docker compose up -d + +echo "Checking OpenTelemetry Golden Tester..." +curl -s -v -w "\n" http://localhost:5318/v1/metrics -H "Content-Type: application/json" -d "{}" \ + --fail \ + --retry-all-errors \ + --retry-delay 10 \ + --retry 3 +if [ "$?" -eq 0 ]; then + echo "OpenTelemetry Golden Tester is ready!" +else + echo "OpenTelemetry Golden Tester startup failed" + docker compose stop + exit 1 +fi + +echo "" +echo "Checking OpenTelemetry Collector..." +curl -s -v -w "\n" http://localhost:4318/v1/metrics -H "Content-Type: application/json" -d "{}" \ + --fail \ + --retry-all-errors \ + --retry-delay 10 \ + --retry 3 +if [ "$?" -eq 0 ]; then + echo "OpenTelemetry Collector is ready!" 
+else + echo "OpenTelemetry Collector startup failed" + docker compose stop + exit 1 +fi diff --git a/otel/sh/validate_and_stop.sh b/otel/sh/validate_and_stop.sh new file mode 100755 index 0000000000..570e7fe808 --- /dev/null +++ b/otel/sh/validate_and_stop.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# Waits for the Golden Tester to finish, stops the collector, saves both container logs, +# and exits with 1 when the validation did not pass. + +# Run from the otel directory (the parent of this script's directory) so the log paths below resolve regardless of the caller's working directory. +cd "$(dirname "$0")/.." || exit 1 + +echo "Waiting for Golden Validator to finish..." +# This blocks until the golden container exits (success or timeout) +EXIT_CODE=$(docker wait golden) + +echo "Stopping collector container..." +docker stop collector -t 60 +echo "Collector container logs:" +docker logs collector 2>&1 | tee otel-collector/container.log + +# Display logs to see the diff if it failed +echo "Golden container logs:" +docker logs golden 2>&1 | tee otel-golden/container.log + +echo "" + +# An empty EXIT_CODE means that docker wait itself failed (for example, the golden container was never created); +# treat it as a failure instead of falling through to the success message. +if [ "${EXIT_CODE:-1}" != "0" ]; then + echo "OpenTelemetry data validation failed! See logs above for diff." + exit 1 +fi +echo "OpenTelemetry data validation passed!"