diff --git a/charmcraft.yaml b/charmcraft.yaml index 19f3306349..910e91ae8e 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -240,6 +240,12 @@ config: description: >- The log level for the runner manager application. The value can be CRITICAL, FATAL, ERROR, WARNING, INFO, or DEBUG. + otel-collector-endpoint: + type: string + default: "" + description: >- + The endpoint to send OpenTelemetry metrics to in the format "host:port". If not set, OpenTelemetry + will be disabled. actions: check-runners: diff --git a/docs/changelog.md b/docs/changelog.md index dd6187cd93..efe1af49c0 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2026-04-27 + +- Added configuration option `otel-collector-endpoint` to enable the otel-collector to export metrics. Setting this configuration option will add the environment variable ACTION_OTEL_EXPORTER_OTLP_ENDPOINT to the runner, which allows users to configure their own metrics to be exported. + ## 2026-04-22 - Removed `KillMode=process` from the runner manager systemd service, restoring the default `control-group` kill mode. This ensures all child processes in the service's cgroup are properly terminated when the service stops, preventing orphaned runner processes. diff --git a/github-runner-manager/src/github_runner_manager/configuration/base.py b/github-runner-manager/src/github_runner_manager/configuration/base.py index 29eb1f69a6..7b227aff98 100644 --- a/github-runner-manager/src/github_runner_manager/configuration/base.py +++ b/github-runner-manager/src/github_runner_manager/configuration/base.py @@ -91,6 +91,7 @@ class SupportServiceConfig(BaseModel): aproxy_redirect_ports: A list of ports to redirect to the aproxy proxy. dockerhub_mirror: The dockerhub mirror to use for runners. ssh_debug_connections: The information on the ssh debug services. + otel_collector_config: The configuration for the OpenTelemetry collector. 
custom_pre_job_script: The custom pre-job script to run before the job. """ @@ -103,6 +104,7 @@ class SupportServiceConfig(BaseModel): dockerhub_mirror: str | None ssh_debug_connections: "list[SSHDebugConnection]" custom_pre_job_script: str | None + otel_collector_config: Optional["OtelCollectorConfig"] = None @root_validator(pre=False, skip_on_failure=True) @classmethod @@ -127,6 +129,18 @@ def check_use_aproxy(cls, values: dict) -> dict: return values +class OtelCollectorConfig(BaseModel): + """Configuration for OpenTelemetry collector. + + Attributes: + host: The OpenTelemetry collector hostname. + port: The OpenTelemetry collector port. + """ + + host: str + port: int = Field(gt=0, le=65535) + + class ProxyConfig(BaseModel): """Proxy configuration. diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index 26075b053c..d1b1bc0fbf 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -170,11 +170,18 @@ def _generate_cloud_init(self, runner_context: RunnerContext) -> str: if service_config.ssh_debug_connections else None ) + otel_collector_config = service_config.otel_collector_config + otel_collector_endpoint = ( + f"{otel_collector_config.host}:{otel_collector_config.port}" + if otel_collector_config + else "" + ) env_contents = jinja.get_template("env.j2").render( pre_job_script=str(PRE_JOB_SCRIPT), dockerhub_mirror=service_config.dockerhub_mirror or "", ssh_debug_info=ssh_debug_info, tmate_server_proxy=runner_http_proxy, + otel_collector_endpoint=otel_collector_endpoint, ) pre_job_contents_dict = { "issue_metrics": True, @@ -182,6 +189,7 @@ def _generate_cloud_init(self, runner_context: RunnerContext) -> str: "do_repo_policy_check": False, "custom_pre_job_script": 
service_config.custom_pre_job_script, "allow_external_contributor": self._config.allow_external_contributor, + "otel_collector_endpoint": otel_collector_endpoint, } pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) diff --git a/github-runner-manager/src/github_runner_manager/templates/env.j2 b/github-runner-manager/src/github_runner_manager/templates/env.j2 index 6814afb09d..ac701ce6bc 100644 --- a/github-runner-manager/src/github_runner_manager/templates/env.j2 +++ b/github-runner-manager/src/github_runner_manager/templates/env.j2 @@ -15,3 +15,6 @@ TMATE_SERVER_HOST={{ssh_debug_info.local_proxy_host}} TMATE_SERVER_PORT={{ssh_debug_info.local_proxy_port}} {% endif %} {% endif %} +{% if otel_collector_endpoint %} +ACTION_OTEL_EXPORTER_OTLP_ENDPOINT={{otel_collector_endpoint}} +{% endif %} diff --git a/github-runner-manager/src/github_runner_manager/templates/pre-job.j2 b/github-runner-manager/src/github_runner_manager/templates/pre-job.j2 index 54725fa1cd..c7d5f9b9be 100644 --- a/github-runner-manager/src/github_runner_manager/templates/pre-job.j2 +++ b/github-runner-manager/src/github_runner_manager/templates/pre-job.j2 @@ -131,6 +131,57 @@ jq -n \ logger -s "Contributor check passed - proceeding to execute jobs" {% endif %} +# Setup the OpenTelemetry collector configurations. +{% if otel_collector_endpoint %} +/usr/bin/logger -s "OpenTelemetry collector is enabled." +/usr/bin/logger -s "Additional OpenTelemetery collector configuration can be added." +/usr/bin/logger -s "The exporter endpoint is at the environment variable ACTION_OTEL_EXPORTER_OTLP_ENDPOINT." 
+/usr/bin/sudo /usr/bin/mkdir -p /etc/otelcol/config.d +/usr/bin/sudo /usr/bin/touch /etc/otelcol/config.d/github.yaml +/usr/bin/sudo /usr/bin/tee /etc/otelcol/config.d/github.yaml < list[SSHDebugCon return ssh_debug_connections +def _build_otel_collector_config_from_charm(charm: CharmBase) -> OtelCollectorConfig | None: + """Initialize the OtelCollectorConfig from charm configuration. + + Args: + charm: The charm instance. + + Returns: + OtelCollectorConfig if endpoint config is set; otherwise None. + """ + endpoint = cast(str, charm.config.get(OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME, "")) + if not endpoint: + return None + + parsed_endpoint = urlsplit(f"//{endpoint}") + if not parsed_endpoint.hostname or parsed_endpoint.port is None: + raise CharmConfigInvalidError( + f"Invalid {OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME} config, expected host:port" + ) + + if ( + parsed_endpoint.username + or parsed_endpoint.password + or parsed_endpoint.path + or parsed_endpoint.query + or parsed_endpoint.fragment + ): + raise CharmConfigInvalidError( + f"Invalid {OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME} config, expected host:port" + ) + + return OtelCollectorConfig(host=parsed_endpoint.hostname, port=parsed_endpoint.port) + + def _build_planner_config_from_charm(charm: CharmBase) -> PlannerConfig | None: """Initialize planner endpoint and token from relation data. @@ -896,6 +931,7 @@ class CharmState: # pylint: disable=too-many-instance-attributes runner_proxy_config: Proxy-related configuration for the runner. runner_config: The charm configuration related to runner VM configuration. ssh_debug_connections: SSH debug connections configuration information. + otel_collector_config: OpenTelemetry collector configuration information. planner_config: Planner endpoint and token from relation data. 
""" @@ -905,6 +941,7 @@ class CharmState: # pylint: disable=too-many-instance-attributes charm_config: CharmConfig runner_config: OpenstackRunnerConfig ssh_debug_connections: list[SSHDebugConnection] + otel_collector_config: OtelCollectorConfig | None planner_config: PlannerConfig | None @classmethod @@ -923,6 +960,11 @@ def _store_state(cls, state: "CharmState") -> None: state_dict["ssh_debug_connections"] = [ debug_info.json() for debug_info in state_dict["ssh_debug_connections"] ] + state_dict["otel_collector_config"] = ( + json.loads(state_dict["otel_collector_config"].json()) + if state_dict["otel_collector_config"] + else None + ) json_data = json.dumps(state_dict, ensure_ascii=False) CHARM_STATE_PATH.write_text(json_data, encoding="utf-8") @@ -975,6 +1017,12 @@ def from_charm(cls, charm: CharmBase) -> "CharmState": # noqa: C901 logger.error("Invalid SSH debug info: %s.", exc) raise CharmConfigInvalidError("Invalid SSH Debug info") from exc + try: + otel_collector_config = _build_otel_collector_config_from_charm(charm) + except (ValidationError, ValueError) as exc: + logger.error("Invalid OpenTelemetry collector config: %s.", exc) + raise CharmConfigInvalidError("Invalid OpenTelemetry collector config") from exc + planner_config = _build_planner_config_from_charm(charm) state = cls( @@ -984,6 +1032,7 @@ def from_charm(cls, charm: CharmBase) -> "CharmState": # noqa: C901 charm_config=charm_config, runner_config=runner_config, ssh_debug_connections=ssh_debug_connections, + otel_collector_config=otel_collector_config, planner_config=planner_config, ) diff --git a/src/factories.py b/src/factories.py index ae34da9686..acb767a985 100644 --- a/src/factories.py +++ b/src/factories.py @@ -53,6 +53,7 @@ def create_application_configuration( runner_proxy_config=state.runner_proxy_config, dockerhub_mirror=state.charm_config.dockerhub_mirror, ssh_debug_connections=state.ssh_debug_connections, + otel_collector_config=state.otel_collector_config, 
use_aproxy=state.charm_config.use_aproxy, aproxy_exclude_addresses=state.charm_config.aproxy_exclude_addresses, aproxy_redirect_ports=state.charm_config.aproxy_redirect_ports, diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index cf6f4d19b7..0c06d690a3 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -125,10 +125,11 @@ def run_in_instance( exit_code, _, _ = run_in_unit(self.juju, unit_name, f"ls {key_path}") assert exit_code == 0, f"Unable to find key file {key_path}" ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' - ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" - logging.warning("ssh_cmd: %s", ssh_cmd_as_ubuntu_user) + # The SSH command needs to be run as the manager user to have access to the SSH keys. + ssh_cmd_as_manager_user = f"su - {constants.RUNNER_MANAGER_USER} -c '{ssh_cmd}'" + logging.warning("ssh_cmd: %s", ssh_cmd_as_manager_user) exit_code, stdout, stderr = run_in_unit( - self.juju, unit_name, ssh_cmd_as_ubuntu_user, timeout + self.juju, unit_name, ssh_cmd_as_manager_user, timeout ) logger.info( "Run command '%s' in runner with result %s: '%s' '%s'", diff --git a/tests/integration/test_charm_runner.py b/tests/integration/test_charm_runner.py index 7efb8b4e80..d2ed7b5459 100644 --- a/tests/integration/test_charm_runner.py +++ b/tests/integration/test_charm_runner.py @@ -10,7 +10,11 @@ from github.Branch import Branch from github.Repository import Repository -from charm_state import BASE_VIRTUAL_MACHINES_CONFIG_NAME, CUSTOM_PRE_JOB_SCRIPT_CONFIG_NAME +from charm_state import ( + BASE_VIRTUAL_MACHINES_CONFIG_NAME, + CUSTOM_PRE_JOB_SCRIPT_CONFIG_NAME, + OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME, +) from tests.integration.helpers.common import ( DISPATCH_TEST_WORKFLOW_FILENAME, DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, @@ -186,3 +190,47 @@ def test_custom_pre_job_script( logs = get_job_logs(workflow_run.jobs("latest")[0]) assert 
"SSH config" in logs assert "proxycommand socat - PROXY:squid.internal:%h:%p,proxyport=3128" in logs + + +@pytest.mark.openstack +@pytest.mark.abort_on_fail +def test_otel_collector_endpoint_pre_job_installs_config( + juju: jubilant.Juju, + app: str, + github_repository: Repository, + test_github_branch: Branch, + instance_helper: OpenStackInstanceHelper, +) -> None: + """ + arrange: A working application with one runner and otel collector endpoint configured. + act: Dispatch a workflow to run pre-job script. + assert: The workflow writes otel collector config to /etc/otelcol/config.d/github.yaml. + """ + endpoint = "10.10.0.12:4317" + juju.config( + app, + values={ + BASE_VIRTUAL_MACHINES_CONFIG_NAME: "1", + OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME: endpoint, + }, + ) + wait_for_runner_ready(juju, app) + + dispatch_workflow( + app_name=app, + branch=test_github_branch, + github_repository=github_repository, + conclusion="success", + workflow_id_or_name=DISPATCH_TEST_WORKFLOW_FILENAME, + dispatch_input={"runner": app}, + ) + + exit_code, stdout, stderr = instance_helper.run_in_instance( + unit_name=f"{app}/0", + command="sudo cat /etc/otelcol/config.d/github.yaml", + ) + + assert exit_code == 0, stderr + assert stdout is not None + assert "exporters:" in stdout + assert f"endpoint: {endpoint}" in stdout diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3aaf3f1c8b..8ef6020cac 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -188,5 +188,6 @@ def complete_charm_state_fixture(): ed25519_fingerprint="SHA256:ed25519", ), ], + otel_collector_config=None, planner_config=None, ) diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py index a009c8bb53..3ebbf05a13 100644 --- a/tests/unit/test_charm_state.py +++ b/tests/unit/test_charm_state.py @@ -33,6 +33,7 @@ OPENSTACK_CLOUDS_YAML_CONFIG_NAME, OPENSTACK_CLOUDS_YAML_SECRET_ID_CONFIG_NAME, OPENSTACK_FLAVOR_CONFIG_NAME, + OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME, PATH_CONFIG_NAME, 
PLANNER_INTEGRATION_NAME, RECONCILE_INTERVAL_CONFIG_NAME, @@ -53,6 +54,7 @@ PlannerConfig, ProxyConfig, SSHDebugConnection, + _build_otel_collector_config_from_charm, _build_planner_config_from_charm, ) from tests.unit.factories import MockGithubRunnerCharmFactory @@ -1306,3 +1308,57 @@ def test_invalid_aproxy_config_in_charm_state( with pytest.raises(CharmConfigInvalidError): CharmState.from_charm(mock_charm) + + +def test_build_otel_collector_config_from_charm_not_set() -> None: + """ + arrange: Mock CharmBase without otel collector endpoint. + act: Build otel collector config. + assert: None is returned. + """ + mock_charm = MockGithubRunnerCharmFactory() + + otel_collector_config = _build_otel_collector_config_from_charm(mock_charm) + + assert otel_collector_config is None + + +@pytest.mark.parametrize("hostname, port", [("10.10.0.12", 42), ("mock_hostname", 823)]) +def test_build_otel_collector_config_from_charm_valid_endpoint(hostname: str, port: int) -> None: + """ + arrange: Mock CharmBase with a valid otel collector endpoint. + act: Build otel collector config. + assert: Parsed host and port are returned. + """ + mock_charm = MockGithubRunnerCharmFactory() + mock_charm.config[OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME] = f"{hostname}:{port}" + + otel_collector_config = _build_otel_collector_config_from_charm(mock_charm) + + assert otel_collector_config is not None + assert str(otel_collector_config.host) == hostname + assert otel_collector_config.port == port + + +@pytest.mark.parametrize( + "endpoint", + [ + "10.10.0.12", + "10.10.0.12:", + "http://10.10.0.12:4317", + "fake_hostname", + "10.10.0.12:4317?x=y", + "10.10.0.12:4317#x", + ], +) +def test_build_otel_collector_config_from_charm_invalid_endpoint(endpoint: str) -> None: + """ + arrange: Mock CharmBase with malformed endpoint formats. + act: Build otel collector config. + assert: CharmConfigInvalidError is raised. 
+ """ + mock_charm = MockGithubRunnerCharmFactory() + mock_charm.config[OTEL_COLLECTOR_ENDPOINT_CONFIG_NAME] = endpoint + + with pytest.raises(CharmConfigInvalidError): + _build_otel_collector_config_from_charm(mock_charm) diff --git a/tests/unit/test_factories.py b/tests/unit/test_factories.py index 3b3cbc1548..ec88294a52 100644 --- a/tests/unit/test_factories.py +++ b/tests/unit/test_factories.py @@ -7,6 +7,7 @@ ApplicationConfiguration, Flavor, Image, + OtelCollectorConfig, ProxyConfig, RunnerCombination, RunnerConfiguration, @@ -149,6 +150,26 @@ def test_create_application_configuration_with_planner( assert app_configuration.planner_token == "planner-token-value" +def test_create_application_configuration_with_otel_collector_config( + complete_charm_state: charm_state.CharmState, +): + """ + arrange: Prepare CharmState with otel collector config. + act: Call create_application_configuration. + assert: The service config contains the collector endpoint. + """ + state = dataclasses.replace( + complete_charm_state, + otel_collector_config=OtelCollectorConfig(host="10.10.0.12", port=4317), + ) + + app_configuration = factories.create_application_configuration(state, "app_name", "unit_name") + + assert app_configuration.service_config.otel_collector_config is not None + assert str(app_configuration.service_config.otel_collector_config.host) == "10.10.0.12" + assert app_configuration.service_config.otel_collector_config.port == 4317 + + @pytest.mark.parametrize( "charm_config_updates, expected_auth", [