From 6d6ed021e40235f81b3cb6e644d9f4b7c78700ea Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 26 Nov 2025 11:07:09 -0600 Subject: [PATCH 01/51] fixed config path --- src/rootfs/files/scripts/install_pccs.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/rootfs/files/scripts/install_pccs.sh b/src/rootfs/files/scripts/install_pccs.sh index e14642d3..ac241425 100755 --- a/src/rootfs/files/scripts/install_pccs.sh +++ b/src/rootfs/files/scripts/install_pccs.sh @@ -67,10 +67,10 @@ function move_pccs_to_custom_location() { function create_pccs_config() { log_info "creating PCCS configuration directory"; - mkdir -p "${OUTPUTDIR}${PCCS_INSTALL_DIR}/config/"; + mkdir -p "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/config/"; log_info "creating PCCS configuration file"; - cat > "${OUTPUTDIR}${PCCS_INSTALL_DIR}/config/default.json" << EOL + cat > "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/config/default.json" << EOL { "HTTPS_PORT" : ${PCCS_PORT}, "hosts" : "127.0.0.1", @@ -107,6 +107,16 @@ function create_pccs_config() { EOL } +function generate_ssl_keys() { + log_info "generating SSL keys for PCCS"; + mkdir -p "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/ssl_key"; + + chroot "${OUTPUTDIR}" /bin/bash -c "cd ${PCCS_INSTALL_DIR}/${PCCS_DIRNAME} && \ + openssl genrsa -out ssl_key/private.pem 2048 && \ + openssl req -new -key ssl_key/private.pem -out ssl_key/csr.pem -subj '/CN=localhost' && \ + openssl x509 -req -days 365 -in ssl_key/csr.pem -signkey ssl_key/private.pem -out ssl_key/file.crt"; +} + function set_pccs_permissions() { log_info "setting PCCS permissions"; chroot "${OUTPUTDIR}" /bin/bash -c "chown -R pccs:pccs ${PCCS_INSTALL_DIR}/${PCCS_DIRNAME} && chmod -R 750 ${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}"; @@ -143,6 +153,7 @@ add_intel_sgx_repository; install_pccs_package; move_pccs_to_custom_location; create_pccs_config; +generate_ssl_keys; update_pccs_service; enable_pccs_service; set_pccs_permissions; From 7363e0ccab0184e634c8281b472fc91fa262bc2a Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 26 Nov 2025 12:52:28 -0600 Subject: [PATCH 02/51] added iptables rules to access to pccs from lxc-container --- .../pki-service/create-and-configure-pki.sh | 236 ++++++++++++------ .../configs/pki-service/pki-authority.service | 4 +- 2 files changed, 160 insertions(+), 80 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh index 7d2df1ed..4fc3c678 100755 --- a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh +++ b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh @@ -2,15 +2,6 @@ set -euo pipefail CONTAINER_NAME="pki-authority" - -if lxc-info -n "${CONTAINER_NAME}" &>/dev/null; then - echo "Container '${CONTAINER_NAME}' already exists." -else - echo "Container '${CONTAINER_NAME}' not found. Creating..." - lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive://root/containers/pki-authority.tar - echo "Container '${CONTAINER_NAME}' created." 
-fi - CPU_TYPE="untrusted" if [[ -c "/dev/tdx_guest" ]] ; then CPU_TYPE="tdx"; @@ -20,81 +11,170 @@ fi export CPU_TYPE="${CPU_TYPE}" -SRC_YAML="/root/containers/lxc-template.yaml" -DST_YAML="/var/lib/lxc/pki-authority/rootfs/app/conf/lxc.yaml" - -if [[ -f "${SRC_YAML}" ]]; then - if command -v yq-go >/dev/null 2>&1; then - yq-go e '.pki.ownChallenge.type = strenv(CPU_TYPE)' "${SRC_YAML}" > "${DST_YAML}" - echo "Patched ${DST_YAML} with type: ${CPU_TYPE} using yq." +# Create LXC container if it doesn't exist +create_container() { + if lxc-info -n "${CONTAINER_NAME}" &>/dev/null; then + echo "Container '${CONTAINER_NAME}' already exists." else - echo "Error: yq-go is not installed. Please install yq-go for YAML editing." - exit 1 + echo "Container '${CONTAINER_NAME}' not found. Creating..." + lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive://root/containers/pki-authority.tar + echo "Container '${CONTAINER_NAME}' created." fi -else - echo "Error: ${SRC_YAML} not found." - exit 1 -fi - -# --- Trusted subroot env handling --- -TRUSTED_VARS=( - AS__pki__baseDomain - AS__pki__ownDomain - AS__pki__certParams__ocspUrl - AS__pki__mode__attestationServiceSource__baseUrl - AS__pki__mode__attestationServiceSource__caBundle -) - -SRC_SUBROOT_ENV="/sp/subroot.env" -DST_SUBROOT_ENV="/var/lib/lxc/pki-authority/rootfs/app/subroot.env" - -# If source exists, (re)create destination with only trusted variables -if [[ -f "${SRC_SUBROOT_ENV}" ]]; then - # Remove destination first to ensure a clean recreate - rm -f "${DST_SUBROOT_ENV}" - - # Header explaining autogenerated file - echo "# Autogenerated from ${SRC_SUBROOT_ENV}. Contains only trusted variables." > "${DST_SUBROOT_ENV}" - - for var in "${TRUSTED_VARS[@]}"; do - # capture first matching line in form VAR="value" - line="$(grep -m1 -E "^${var}=\".*\"" "${SRC_SUBROOT_ENV}" 2>/dev/null || true)" - if [[ -n "${line}" ]]; then - echo "${line}" >> "${DST_SUBROOT_ENV}" +} + +# Set own challenge type in LXC container configuration +set_own_challenge() { + local src_yaml="/root/containers/lxc-template.yaml" + local dst_yaml="/var/lib/lxc/${CONTAINER_NAME}/rootfs/app/conf/lxc.yaml" + + if [[ -f "${src_yaml}" ]]; then + if command -v yq-go >/dev/null 2>&1; then + yq-go e '.pki.ownChallenge.type = strenv(CPU_TYPE)' "${src_yaml}" > "${dst_yaml}" + echo "Patched ${dst_yaml} with type: ${CPU_TYPE} using yq." + else + echo "Error: yq-go is not installed. Please install yq-go for YAML editing." + exit 1 fi - done + else + echo "Error: ${src_yaml} not found." + exit 1 + fi +} + +# Copy trusted environment variables to container +set_subbroot_env() { + # --- Trusted subroot env handling --- + local trusted_vars=( + AS__pki__baseDomain + AS__pki__ownDomain + AS__pki__certParams__ocspUrl + AS__pki__mode__attestationServiceSource__baseUrl + AS__pki__mode__attestationServiceSource__caBundle + ) + + local src_subroot_env="/sp/subroot.env" + local dst_subroot_env="/var/lib/lxc/${CONTAINER_NAME}/rootfs/app/subroot.env" + + # If source exists, (re)create destination with only trusted variables + if [[ -f "${src_subroot_env}" ]]; then + # Remove destination first to ensure a clean recreate + rm -f "${dst_subroot_env}" + + # Header explaining autogenerated file + echo "# Autogenerated from ${src_subroot_env}. Contains only trusted variables." 
> "${dst_subroot_env}" + + for var in "${trusted_vars[@]}"; do + # capture first matching line in form VAR="value" + local line + line="$(grep -m1 -E "^${var}=\".*\"" "${src_subroot_env}" 2>/dev/null || true)" + if [[ -n "${line}" ]]; then + echo "${line}" >> "${dst_subroot_env}" + fi + done + + chmod 0644 "${dst_subroot_env}" || true + echo "Created ${dst_subroot_env} with trusted variables." + else + echo "Info: ${src_subroot_env} not found; skipping creation of ${dst_subroot_env}" + fi +} - chmod 0644 "${DST_SUBROOT_ENV}" || true - echo "Created ${DST_SUBROOT_ENV} with trusted variables." -else - echo "Info: ${SRC_SUBROOT_ENV} not found; skipping creation of ${DST_SUBROOT_ENV}" -fi +# Patch LXC container configuration +patch_lxc_config() { + local config_file="/var/lib/lxc/${CONTAINER_NAME}/config" + local config_bak="${config_file}.bak" -CONFIG_FILE="/var/lib/lxc/pki-authority/config" -CONFIG_BAK="${CONFIG_FILE}.bak" + # Always restore config from backup if backup exists + if [[ -f "${config_bak}" ]]; then + cp "${config_bak}" "${config_file}" + else + # Create backup before first patch + if [[ -f "${config_file}" ]]; then + cp "${config_file}" "${config_bak}" + fi + fi -# Always restore config from backup if backup exists -if [[ -f "${CONFIG_BAK}" ]]; then - cp "${CONFIG_BAK}" "${CONFIG_FILE}" -else - # Create backup before first patch - if [[ -f "${CONFIG_FILE}" ]]; then - cp "${CONFIG_FILE}" "${CONFIG_BAK}" + # This MAC address is used to get a static IP address from DHCP, see /etc/lxc/dnsmasq.conf + echo "lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff" >> "${config_file}" + + if [[ "${CPU_TYPE}" = "sev-snp" ]]; then + local dev_id + dev_id="$(stat -c '%t:%T' /dev/sev-guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" + echo "lxc.cgroup2.devices.allow = c ${dev_id} rwm" >> "${config_file}" + echo "lxc.mount.entry = /dev/sev-guest dev/sev-guest none bind,optional,create=file" >> "${config_file}" + elif [[ "${CPU_TYPE}" = "tdx" ]]; then + local dev_id + dev_id="$(stat -c '%t:%T' /dev/tdx_guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" + echo "lxc.cgroup2.devices.allow = c ${dev_id} rwm" >> "${config_file}" + echo "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest none bind,optional,create=file" >> "${config_file}" + if [[ -f "/etc/tdx-attest.conf" ]]; then + echo "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf none bind,ro,create=file" >> "${config_file}" + fi fi -fi +} + +# Update PCCS URL and setup iptables NAT rules for LXC container access to host service +update_pccs_url_and_setup_iptables() { + local bridge_name="lxcbr0" + local service_port="8081" + local qcnl_conf="/var/lib/lxc/${CONTAINER_NAME}/rootfs/etc/sgx_default_qcnl.conf" + local qcnl_conf_bak="${qcnl_conf}.bak" + + # Get host IP address on the LXC bridge + local host_ip + host_ip=$(ip -4 addr show "${bridge_name}" 2>/dev/null | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n1) + + if [[ -z "${host_ip}" ]]; then + echo "Error: Could not determine IP address for bridge ${bridge_name}" + exit 1 + fi + + #local network=$(echo "${host_ip}" | awk -F. 
'{print $1"."$2"."$3".0/24"}') -# This MAC address is used to get a static IP address from DHCP, see /etc/lxc/dnsmasq.conf -echo "lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff" >> "${CONFIG_FILE}" - -if [[ "${CPU_TYPE}" = "sev-snp" ]]; then - DEV_ID="$(stat -c '%t:%T' /dev/sev-guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" - echo "lxc.cgroup2.devices.allow = c ${DEV_ID} rwm" >> "${CONFIG_FILE}" - echo "lxc.mount.entry = /dev/sev-guest dev/sev-guest none bind,optional,create=file" >> "${CONFIG_FILE}" -elif [[ "${CPU_TYPE}" = "tdx" ]]; then - DEV_ID="$(stat -c '%t:%T' /dev/tdx_guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" - echo "lxc.cgroup2.devices.allow = c ${DEV_ID} rwm" >> "${CONFIG_FILE}" - echo "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest none bind,optional,create=file" >> "${CONFIG_FILE}" - if [[ -f "/etc/tdx-attest.conf" ]]; then - echo "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf none bind,ro,create=file" >> "${CONFIG_FILE}" + # Enable route_localnet for the bridge to allow routing to localhost + if [[ $(sysctl -n net.ipv4.conf.${bridge_name}.route_localnet 2>/dev/null) != "1" ]]; then + sysctl -w net.ipv4.conf.${bridge_name}.route_localnet=1 + echo "Enabled route_localnet for ${bridge_name}" + else + echo "route_localnet already enabled for ${bridge_name}" fi -fi + + # Add DNAT rule to redirect traffic from LXC bridge to localhost (if not already present) + if ! iptables -t nat -C PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" 2>/dev/null; then + iptables -t nat -A PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" + echo "iptables DNAT rule added: ${host_ip}:${service_port} -> 127.0.0.1:${service_port}" + else + echo "iptables DNAT rule already exists for ${host_ip}:${service_port}" + fi + + # Add MASQUERADE rule for the network (if not already present) + #if ! iptables -t nat -C POSTROUTING -s "${network}" -j MASQUERADE 2>/dev/null; then + # iptables -t nat -A POSTROUTING -s "${network}" -j MASQUERADE + # echo "iptables MASQUERADE rule added for network ${network}" + #else + # echo "iptables MASQUERADE rule already exists for network ${network}" + #fi + + # Update PCCS URL in QCNL configuration + local pccs_url="https://${host_ip}:${service_port}/sgx/certification/v4/" + if [[ -f "${qcnl_conf}" ]]; then + # Create backup if it doesn't exist + if [[ ! 
-f "${qcnl_conf_bak}" ]]; then + cp "${qcnl_conf}" "${qcnl_conf_bak}" + # Update pccs_url in the JSON configuration file + sed -i "s|\"pccs_url\": \".*\"|\"pccs_url\": \"${pccs_url}\"|g" "${qcnl_conf}" + echo "Updated PCCS URL in ${qcnl_conf} to ${pccs_url}" + else + echo "Backup ${qcnl_conf_bak} already exists, skipping PCCS URL update" + fi + else + echo "Error: ${qcnl_conf} not found" + exit 1 + fi +} + +create_container +set_own_challenge +set_subbroot_env +patch_lxc_config +update_pccs_url_and_setup_iptables diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service b/src/rootfs/files/configs/pki-service/pki-authority.service index 612289f5..b5db7142 100644 --- a/src/rootfs/files/configs/pki-service/pki-authority.service +++ b/src/rootfs/files/configs/pki-service/pki-authority.service @@ -1,7 +1,7 @@ [Unit] Description=PKI authority lxc-container -After=lxc.service lxc-net.service lxc-monitord.service nvidia-persistenced.service -Requires=lxc.service lxc-net.service lxc-monitord.service +After=lxc.service lxc-net.service lxc-monitord.service nvidia-persistenced.service pccs.service +Requires=lxc.service lxc-net.service lxc-monitord.service pccs.service Wants=nvidia-persistenced.service [Service] From c9865c571f58989d69451b09f80c36009308808b Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 27 Nov 2025 04:46:11 -0600 Subject: [PATCH 03/51] move lxc container into /etc/super --- src/Dockerfile | 4 ++-- .../files/configs/pki-service/create-and-configure-pki.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index b58035c1..72cd9195 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -226,8 +226,8 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ ADD rootfs/files/configs/pki-service/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" ADD rootfs/files/configs/pki-service/create-and-configure-pki.sh "${OUTPUTDIR}/usr/local/bin" -RUN mkdir -p "${OUTPUTDIR}/root/containers" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/root/containers/pki-authority.tar" +RUN mkdir -p "${OUTPUTDIR}/etc/super/containers" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority.tar" ADD rootfs/files/configs/pki-service/lxc-template.yaml "${OUTPUTDIR}/root/containers/lxc-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" diff --git a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh index 4fc3c678..d64526eb 100755 --- a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh +++ b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh @@ -17,7 +17,7 @@ create_container() { echo "Container '${CONTAINER_NAME}' already exists." else echo "Container '${CONTAINER_NAME}' not found. Creating..." - lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive://root/containers/pki-authority.tar + lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive:/etc/super/containers/pki-authority.tar echo "Container '${CONTAINER_NAME}' created." 
fi } From 60c312a53180281932c183c1e0bc47e1118aa1b9 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 27 Nov 2025 12:21:58 -0600 Subject: [PATCH 04/51] fixed pccs cachemode to lazy --- src/Dockerfile | 7 ++--- .../pki-service/create-and-configure-pki.sh | 14 ++++++++-- ...plate.yaml => lxc-legacy-vm-template.yaml} | 0 .../pki-service/lxc-swarm-template.yaml | 27 +++++++++++++++++++ src/rootfs/files/scripts/install_pccs.sh | 12 ++++----- 5 files changed, 49 insertions(+), 11 deletions(-) rename src/rootfs/files/configs/pki-service/{lxc-template.yaml => lxc-legacy-vm-template.yaml} (100%) create mode 100644 src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml diff --git a/src/Dockerfile b/src/Dockerfile index 86a8709d..3df5eaf3 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -227,9 +227,10 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ ADD rootfs/files/configs/pki-service/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" ADD rootfs/files/configs/pki-service/create-and-configure-pki.sh "${OUTPUTDIR}/usr/local/bin" -RUN mkdir -p "${OUTPUTDIR}/etc/super/containers" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority.tar" -ADD rootfs/files/configs/pki-service/lxc-template.yaml "${OUTPUTDIR}/root/containers/lxc-template.yaml" +RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" +ADD rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-legacy-vm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh diff --git a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh index d64526eb..62262f6e 100755 --- a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh +++ b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh @@ -17,14 +17,24 @@ create_container() { echo "Container '${CONTAINER_NAME}' already exists." else echo "Container '${CONTAINER_NAME}' not found. Creating..." - lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive:/etc/super/containers/pki-authority.tar + lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive:/etc/super/containers/pki-authority/pki-authority.tar echo "Container '${CONTAINER_NAME}' created." 
fi } # Set own challenge type in LXC container configuration set_own_challenge() { - local src_yaml="/root/containers/lxc-template.yaml" + local template_name="lxc-swarm-template.yaml" + + # Check if vm_mode=legacy is set in kernel command line + if grep -q "vm_mode=legacy" /proc/cmdline 2>/dev/null; then + template_name="lxc-legacy-vm-template.yaml" + echo "Detected vm_mode=legacy in kernel cmdline, using legacy template" + else + echo "Using swarm template" + fi + + local src_yaml="/etc/super/containers/pki-authority/${template_name}" local dst_yaml="/var/lib/lxc/${CONTAINER_NAME}/rootfs/app/conf/lxc.yaml" if [[ -f "${src_yaml}" ]]; then diff --git a/src/rootfs/files/configs/pki-service/lxc-template.yaml b/src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml similarity index 100% rename from src/rootfs/files/configs/pki-service/lxc-template.yaml rename to src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml new file mode 100644 index 00000000..6d53f72e --- /dev/null +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -0,0 +1,27 @@ +api: + port: 443 +pki: + allowedChallenges: + - token + - tdx + allowedSubRootChallenges: + - type: tdx + signatureVerification: github + - type: sev-snp + signatureVerification: github + tokenChallengeFilePath: /app/access-token + ownDomain: ca-subroot.super-protocol.svc.cluster.local + ownChallenge: + type: tdx + certParams: + ocspUrl: https://ocsp.certs.superprotocol.com/v1/ocsp + keyStorage: + type: trusted + storage: + type: super + keysPath: /app/keys + mode: + role: subroot + attestationServiceSource: + type: swarm + mode: init \ No newline at end of file diff --git a/src/rootfs/files/scripts/install_pccs.sh b/src/rootfs/files/scripts/install_pccs.sh index ac241425..6400a9c3 100755 --- a/src/rootfs/files/scripts/install_pccs.sh +++ b/src/rootfs/files/scripts/install_pccs.sh @@ -67,10 +67,10 @@ function move_pccs_to_custom_location() { function create_pccs_config() { log_info "creating PCCS configuration directory"; - mkdir -p "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/config/"; + mkdir -p "${OUTPUTDIR}${PCCS_ORIGINAL_LOCATION}/${PCCS_DIRNAME}/config/"; log_info "creating PCCS configuration file"; - cat > "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/config/default.json" << EOL + cat > "${OUTPUTDIR}${PCCS_ORIGINAL_LOCATION}/${PCCS_DIRNAME}/config/default.json" << EOL { "HTTPS_PORT" : ${PCCS_PORT}, "hosts" : "127.0.0.1", @@ -80,7 +80,7 @@ function create_pccs_config() { "RefreshSchedule": "0 0 1 * *", "UserTokenHash" : "${USER_TOKEN}", "AdminTokenHash" : "${USER_TOKEN}", - "CachingFillMode" : "REQ", + "CachingFillMode" : "LAZY", "LogLevel" : "debug", "DB_CONFIG" : "sqlite", "sqlite" : { @@ -109,9 +109,9 @@ EOL function generate_ssl_keys() { log_info "generating SSL keys for PCCS"; - mkdir -p "${OUTPUTDIR}${PCCS_INSTALL_DIR}/${PCCS_DIRNAME}/ssl_key"; + mkdir -p "${OUTPUTDIR}${PCCS_ORIGINAL_LOCATION}/${PCCS_DIRNAME}/ssl_key"; - chroot "${OUTPUTDIR}" /bin/bash -c "cd ${PCCS_INSTALL_DIR}/${PCCS_DIRNAME} && \ + chroot "${OUTPUTDIR}" /bin/bash -c "cd ${PCCS_ORIGINAL_LOCATION}/${PCCS_DIRNAME} && \ openssl genrsa -out ssl_key/private.pem 2048 && \ openssl req -new -key ssl_key/private.pem -out ssl_key/csr.pem -subj '/CN=localhost' && \ openssl x509 -req -days 365 -in ssl_key/csr.pem -signkey ssl_key/private.pem -out ssl_key/file.crt"; @@ -151,11 +151,11 @@ function 
enable_pccs_service() { chroot_init; add_intel_sgx_repository; install_pccs_package; -move_pccs_to_custom_location; create_pccs_config; generate_ssl_keys; update_pccs_service; enable_pccs_service; +move_pccs_to_custom_location; set_pccs_permissions; chroot_deinit; From ab9abe17bc6b995d627394e08cd4e117929d8ae0 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Mon, 1 Dec 2025 07:43:34 -0600 Subject: [PATCH 05/51] use mongodb as certificate storage --- .../pki-service/create-and-configure-pki.sh | 19 +++++++++---------- .../pki-service/lxc-swarm-template.yaml | 5 ++++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh index 62262f6e..332c91ba 100755 --- a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh +++ b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh @@ -139,8 +139,6 @@ update_pccs_url_and_setup_iptables() { exit 1 fi - #local network=$(echo "${host_ip}" | awk -F. '{print $1"."$2"."$3".0/24"}') - # Enable route_localnet for the bridge to allow routing to localhost if [[ $(sysctl -n net.ipv4.conf.${bridge_name}.route_localnet 2>/dev/null) != "1" ]]; then sysctl -w net.ipv4.conf.${bridge_name}.route_localnet=1 @@ -149,7 +147,7 @@ update_pccs_url_and_setup_iptables() { echo "route_localnet already enabled for ${bridge_name}" fi - # Add DNAT rule to redirect traffic from LXC bridge to localhost (if not already present) + # PCCS service on port 8081 if ! iptables -t nat -C PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" 2>/dev/null; then iptables -t nat -A PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" echo "iptables DNAT rule added: ${host_ip}:${service_port} -> 127.0.0.1:${service_port}" @@ -157,13 +155,14 @@ update_pccs_url_and_setup_iptables() { echo "iptables DNAT rule already exists for ${host_ip}:${service_port}" fi - # Add MASQUERADE rule for the network (if not already present) - #if ! iptables -t nat -C POSTROUTING -s "${network}" -j MASQUERADE 2>/dev/null; then - # iptables -t nat -A POSTROUTING -s "${network}" -j MASQUERADE - # echo "iptables MASQUERADE rule added for network ${network}" - #else - # echo "iptables MASQUERADE rule already exists for network ${network}" - #fi + # MongoDB service on port 27017 + local mongodb_port="27017" + if ! 
iptables -t nat -C PREROUTING -p tcp -d "${host_ip}" --dport "${mongodb_port}" -j DNAT --to-destination 127.0.0.1:"${mongodb_port}" 2>/dev/null; then + iptables -t nat -A PREROUTING -p tcp -d "${host_ip}" --dport "${mongodb_port}" -j DNAT --to-destination 127.0.0.1:"${mongodb_port}" + echo "iptables DNAT rule added: ${host_ip}:${mongodb_port} -> 127.0.0.1:${mongodb_port}" + else + echo "iptables DNAT rule already exists for ${host_ip}:${mongodb_port}" + fi # Update PCCS URL in QCNL configuration local pccs_url="https://${host_ip}:${service_port}/sgx/certification/v4/" diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index 6d53f72e..6eb3eb2a 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -24,4 +24,7 @@ pki: role: subroot attestationServiceSource: type: swarm - mode: init \ No newline at end of file + mode: init + storage: + storageType: mongodb + connectionString: mongodb://10.0.3.1:27017/pki From c33eabdc89909eb62fcca097f8374fa6b19adf2f Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 2 Dec 2025 08:19:25 -0600 Subject: [PATCH 06/51] pki-authority as swarm-service draft --- src/Dockerfile | 5 +- .../configs/pki-service/pki-authority.service | 22 -- src/services/apps/pki-authority/main.py | 272 ++++++++++++++++++ src/services/apps/pki-authority/manifest.yaml | 57 ++++ swarm-scripts/80.setup-pki-authority.sh | 44 +++ 5 files changed, 375 insertions(+), 25 deletions(-) delete mode 100644 src/rootfs/files/configs/pki-service/pki-authority.service create mode 100755 src/services/apps/pki-authority/main.py create mode 100644 src/services/apps/pki-authority/manifest.yaml create mode 100644 swarm-scripts/80.setup-pki-authority.sh diff --git a/src/Dockerfile b/src/Dockerfile index 649071d5..bb34aa90 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -224,8 +224,8 @@ RUN mkdir -p "${OUTPUTDIR}/usr/local/bin"; # copy pki-authority service files ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ -ADD rootfs/files/configs/pki-service/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" -RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" +RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh + ADD rootfs/files/configs/pki-service/create-and-configure-pki.sh "${OUTPUTDIR}/usr/local/bin" RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" @@ -233,7 +233,6 @@ ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/s ADD rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-legacy-vm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" -RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh ADD rootfs/files/configs/etc/multipath.conf.append /buildroot/files/configs/etc/multipath.conf.append ADD rootfs/files/configs/etc/sysctl.conf.append /buildroot/files/configs/etc/sysctl.conf.append diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service 
b/src/rootfs/files/configs/pki-service/pki-authority.service deleted file mode 100644 index b5db7142..00000000 --- a/src/rootfs/files/configs/pki-service/pki-authority.service +++ /dev/null @@ -1,22 +0,0 @@ -[Unit] -Description=PKI authority lxc-container -After=lxc.service lxc-net.service lxc-monitord.service nvidia-persistenced.service pccs.service -Requires=lxc.service lxc-net.service lxc-monitord.service pccs.service -Wants=nvidia-persistenced.service - -[Service] -Type=simple -Restart=always -RestartSec=5 -TimeoutStartSec=3min - -ExecStartPre=/usr/local/bin/create-and-configure-pki.sh - -ExecStart=/usr/bin/lxc-start -n pki-authority -F - -ExecStop=/usr/bin/lxc-stop -n pki-authority - -KillMode=control-group - -[Install] -WantedBy=multi-user.target diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py new file mode 100755 index 00000000..b110667c --- /dev/null +++ b/src/services/apps/pki-authority/main.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 + +import sys +import subprocess +import time +import urllib.request +import ssl +from typing import List, Optional + +from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput + +# Configuration +PKI_SERVICE_NAME = "pki-authority" + +plugin = ProvisionPlugin() + + +# Helpers +def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: + for prop in wg_props: + if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": + return prop.get("value") + return None + +def lxc_start_container(container_name: str, timeout: int = 30) -> int: + """Start LXC container. Returns exit code.""" + print(f"[*] Starting LXC container {container_name}") + result = subprocess.run( + ["lxc-start", "-n", container_name], + capture_output=True, + text=True, + timeout=timeout + ) + + return result.returncode + +def lxc_stop_container(container_name: str, graceful_timeout: int = 30, command_timeout: int = 60) -> int: + """Stop LXC container gracefully. Returns exit code.""" + print(f"[*] Stopping LXC container {container_name} gracefully") + result = subprocess.run( + ["lxc-stop", "-n", container_name, "-t", str(graceful_timeout)], + capture_output=True, + text=True, + timeout=command_timeout + ) + + return result.returncode + +def is_pki_running() -> bool: + """Check if PKI Authority service is running.""" + try: + # 1. Check if LXC container is running + result = subprocess.run( + ["lxc-ls", "--running"], + capture_output=True, + text=True + ) + if PKI_SERVICE_NAME not in result.stdout: + print(f"[*] LXC container {PKI_SERVICE_NAME} is not running") + return False + + # 2. Check tee-pki service status inside container + result = subprocess.run( + ["lxc-attach", "-n", PKI_SERVICE_NAME, "--", "systemctl", "is-active", "tee-pki"], + capture_output=True, + text=True + ) + status = result.stdout.strip() + + if status not in ["active", "activating"]: + print(f"[*] Service tee-pki status: {status}") + return False + + # 3. 
If service is active, check how long it's been running + if status == "active": + result = subprocess.run( + ["lxc-attach", "-n", PKI_SERVICE_NAME, "--", "systemctl", "show", "tee-pki", "--property=ActiveEnterTimestamp"], + capture_output=True, + text=True + ) + + # Parse ActiveEnterTimestamp + for line in result.stdout.split('\n'): + if line.startswith('ActiveEnterTimestamp='): + timestamp_str = line.split('=', 1)[1].strip() + if timestamp_str and timestamp_str != '0': + # Parse timestamp (format: "Day YYYY-MM-DD HH:MM:SS TZ") + try: + # Get timestamp in seconds since epoch + ts_result = subprocess.run( + ["date", "+%s", "-d", timestamp_str], + capture_output=True, + text=True + ) + start_time = int(ts_result.stdout.strip()) + current_time = int(time.time()) + uptime_seconds = current_time - start_time + + # If running more than 2 minutes (120 seconds), check healthcheck + if uptime_seconds > 120: + # Get container IP + ip_result = subprocess.run( + ["lxc-info", "-n", PKI_SERVICE_NAME, "-iH"], + capture_output=True, + text=True + ) + container_ip = ip_result.stdout.strip() if ip_result.stdout.strip() else None + + if container_ip: + # Perform HTTPS healthcheck without certificate verification + try: + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + req = urllib.request.Request(f"https://{container_ip}/healthcheck") + with urllib.request.urlopen(req, context=ctx, timeout=5) as response: + if response.status == 200: + return True + else: + print(f"[*] Healthcheck returned status: {response.status}") + return False + except Exception as e: + print(f"[*] Healthcheck failed: {e}") + return False + except Exception as e: + print(f"[*] Failed to parse service uptime: {e}") + + # Service is active or activating (but not ready for healthcheck yet) + return True + + except Exception as e: + print(f"[!] Failed to check PKI status: {e}", file=sys.stderr) + return False + +# Plugin commands +@plugin.command("init") +def handle_init(input_data: PluginInput) -> PluginOutput: + """Initialize PKI Authority service.""" + try: + # Run PKI initialization script + print("[*] Running PKI initialization script") + result = subprocess.run( + ["/usr/local/bin/create-and-configure-pki.sh"], + capture_output=True, + text=True, + timeout=180 + ) + + if result.returncode != 0: + error_msg = f"PKI initialization script failed with exit code {result.returncode}: {result.stderr}" + print(f"[!] {error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) + + print("[*] PKI initialization completed") + return PluginOutput(status="completed", local_state=input_data.local_state) + except subprocess.CalledProcessError as e: + error_msg = f"Failed to initialize PKI: {e.stderr if e.stderr else str(e)}" + print(f"[!] {error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) + except Exception as e: + error_msg = f"Unexpected error: {str(e)}" + print(f"[!] 
{error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) + + +@plugin.command("apply") +def handle_apply(input_data: PluginInput) -> PluginOutput: + """Apply PKI Authority configuration and start service.""" + local_node_id = input_data.local_node_id + state_json = input_data.state or {} + local_state = input_data.local_state or {} + + if not isinstance(state_json, dict): + return PluginOutput(status="error", error_message="Invalid state format", local_state=local_state) + + wg_props = state_json.get("wgNodeProperties", []) + + local_tunnel_ip = get_node_tunnel_ip(local_node_id, wg_props) + if not local_tunnel_ip: + return PluginOutput(status="error", error_message="Local node has no WireGuard tunnel IP", local_state=local_state) + + try: + # Start or restart LXC container + if is_pki_running(): + print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") + + # Stop container gracefully + exit_code = lxc_stop_container(PKI_SERVICE_NAME, graceful_timeout=30, command_timeout=60) + if exit_code != 0: + error_msg = f"Failed to stop container with exit code {exit_code}" + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + # Start container + exit_code = lxc_start_container(PKI_SERVICE_NAME, timeout=30) + if exit_code != 0: + error_msg = f"Failed to start container with exit code {exit_code}" + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + else: + # Start container + exit_code = lxc_start_container(PKI_SERVICE_NAME, timeout=30) + if exit_code != 0: + error_msg = f"Failed to start container with exit code {exit_code}" + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + print(f"[*] LXC container {PKI_SERVICE_NAME} is running") + return PluginOutput(status="completed", local_state=local_state) + + except subprocess.CalledProcessError as e: + error_msg = f"Failed to start service: {e.stderr if e.stderr else str(e)}" + print(f"[!] {error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + except Exception as e: + error_msg = f"Unexpected error: {str(e)}" + print(f"[!] 
{error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + +@plugin.command("health") +def handle_health(input_data: PluginInput) -> PluginOutput: + """Check health of PKI Authority service.""" + local_state = input_data.local_state or {} + + if is_pki_running(): + return PluginOutput(status="healthy", local_state=local_state) + else: + return PluginOutput( + status="unhealthy", + error_message="Service is not running", + local_state=local_state + ) + + +@plugin.command("finalize") +def handle_finalize(input_data: PluginInput) -> PluginOutput: + """Finalize PKI Authority service setup.""" + print("[*] PKI Authority finalized") + return PluginOutput(status="completed", local_state=input_data.local_state) + + +@plugin.command("destroy") +def handle_destroy(input_data: PluginInput) -> PluginOutput: + """Destroy PKI Authority service and clean up.""" + local_state = input_data.local_state or {} + + try: + print(f"[*] Stopping {PKI_SERVICE_NAME}") + subprocess.run( + ["systemctl", "stop", PKI_SERVICE_NAME], + capture_output=True, + text=True, + timeout=30 + ) + + print(f"[*] Disabling {PKI_SERVICE_NAME}") + subprocess.run( + ["systemctl", "disable", PKI_SERVICE_NAME], + capture_output=True, + text=True + ) + + print("[*] PKI Authority destroyed") + return PluginOutput(status="completed", local_state=local_state) + + except Exception as e: + error_msg = f"Destroy failed: {str(e)}" + print(f"[!] {error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + +if __name__ == "__main__": + plugin.run() diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml new file mode 100644 index 00000000..e504db4d --- /dev/null +++ b/src/services/apps/pki-authority/manifest.yaml @@ -0,0 +1,57 @@ +name: pki-authority +version: 1.0.0 +commands: + - init + - apply + - health + - finalize + - destroy +healthcheckIntervalSecs: 60 +entrypoint: main.py +stateExpr: + engine: jq + query: | + ($swarmdb.clusters[] | select(.id == "{{ clusterId }}" and .deleted_ts == null)) as $cluster | + + ([$swarmdb.clusternodes[] | select(.cluster == "{{ clusterId }}" and .deleted_ts == null)]) as $pkiClusterNodes | + + ($pkiClusterNodes | map(.node)) as $pkiNodeIds | + + ( + $swarmdb.clusters[] | + select(.cluster_policy == "wireguard" and .deleted_ts == null) | + select( + ( + [$swarmdb.clusternodes[] | select(.deleted_ts == null and (.node | IN($pkiNodeIds[])))] | + length > 0 + ) + ) + ) as $wgCluster | + + { + cluster: { + id: $cluster.id, + cluster_policy: $cluster.cluster_policy, + leader_node: $cluster.leader_node + }, + + clusterNodes: [ + $pkiClusterNodes[] | + {id, node_id: .node, cluster} + ] | sort_by(.id, .node_id, .cluster), + + wgCluster: { + id: $wgCluster.id + }, + + wgNodeProperties: [ + $swarmdb.clusternodeproperties[] | + select( + (.cluster_node | startswith($wgCluster.id)) and + .deleted_ts == null and + .name == "tunnel_ip" + ) | + {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} + ] | sort_by(.cluster_node, .name, .value, .node_id) + } + diff --git a/swarm-scripts/80.setup-pki-authority.sh b/swarm-scripts/80.setup-pki-authority.sh new file mode 100644 index 00000000..f9bb770c --- /dev/null +++ b/swarm-scripts/80.setup-pki-authority.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -euo pipefail + +# This script bootstraps the pki-authority service into SwarmDB via mysql client. 
+# Run it INSIDE the container. Assumes mysql client is available. +# +# Note: +# - The pki-authority manifest and main.py should be available inside the container at: +# /etc/swarm-services/pki-authority/manifest.yaml and /etc/swarm-services/pki-authority/main.py +# (mount or copy them similarly to the wireguard service) +# +# - pki-authority depends on a WireGuard cluster existing and sharing nodes with it. +# When bootstrapping WireGuard, prefer ClusterPolicy id 'wireguard' to match pki-authority's stateExpr. +DB_HOST=${DB_HOST:-127.0.0.1} +DB_PORT=${DB_PORT:-3306} +DB_USER=${DB_USER:-root} +DB_NAME=${DB_NAME:-swarmdb} + +# Service descriptors +SERVICE_NAME=${SERVICE_NAME:-pki-authority} +SERVICE_VERSION=${SERVICE_VERSION:-1.0.0} +CLUSTER_POLICY=${CLUSTER_POLICY:-pki-authority} +CLUSTER_ID=${CLUSTER_ID:-pki-authority} + +# Path to manifest file INSIDE the container (configs are mounted to /configs) +MANIFEST_PATH=${MANIFEST_PATH:-/etc/swarm-services/${SERVICE_NAME}/manifest.yaml} +LOCATION_PATH=${LOCATION_PATH:-/etc/swarm-services/${SERVICE_NAME}} +SERVICE_PK="${CLUSTER_POLICY}:${SERVICE_NAME}" + +if [ ! -f "$MANIFEST_PATH" ]; then + echo "Manifest not found at: $MANIFEST_PATH" >&2 + exit 1 +fi + +CLI="$(dirname "$0")/swarm-cli.sh" +echo "Creating/Updating ClusterPolicies '$CLUSTER_POLICY'..." +DB_HOST="$DB_HOST" DB_PORT="$DB_PORT" DB_USER="$DB_USER" DB_NAME="$DB_NAME" \ + python3 "$(dirname "$0")/swarm-cli.py" create ClusterPolicies "$CLUSTER_POLICY" + +echo "Creating/Updating ClusterServices '$SERVICE_PK'..." +DB_HOST="$DB_HOST" DB_PORT="$DB_PORT" DB_USER="$DB_USER" DB_NAME="$DB_NAME" \ + python3 "$(dirname "$0")/swarm-cli.py" create ClusterServices "$SERVICE_PK" --name="$SERVICE_NAME" --cluster_policy="$CLUSTER_POLICY" --version="$SERVICE_VERSION" --location="$LOCATION_PATH" + +echo "Done. The provision worker will reconcile '$SERVICE_NAME' shortly." 
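Note on the plugin contract introduced in this patch: the provision worker drives the service through the commands listed in manifest.yaml (init, apply, health, finalize, destroy). Judging by handle_apply above, the evaluated jq stateExpr arrives as input_data.state, and the local_state each handler returns is presumably handed back on subsequent calls. A minimal, hedged sketch of that contract — it uses only the SDK names visible in main.py (ProvisionPlugin, PluginInput, PluginOutput), not documented SDK behaviour — looks like this:

#!/usr/bin/env python3
# Minimal provision plugin skeleton, reduced from the pki-authority main.py above.
from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput

plugin = ProvisionPlugin()

@plugin.command("init")
def handle_init(input_data: PluginInput) -> PluginOutput:
    # One-time setup; pass local_state through so later commands can reuse it.
    return PluginOutput(status="completed", local_state=input_data.local_state)

@plugin.command("health")
def handle_health(input_data: PluginInput) -> PluginOutput:
    # Polled every healthcheckIntervalSecs (60s in the manifest above).
    return PluginOutput(status="healthy", local_state=input_data.local_state or {})

if __name__ == "__main__":
    plugin.run()

In the real plugin, handle_apply additionally reads wgNodeProperties from input_data.state to resolve the local node's WireGuard tunnel_ip before starting the LXC container.
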
From 7f69dcbd75bad99bab72e18bf921e4afdf59f3df Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 2 Dec 2025 13:31:03 -0600 Subject: [PATCH 07/51] create and configure lxc container from python script --- src/Dockerfile | 1 - .../pki-service/create-and-configure-pki.sh | 189 ------- src/services/apps/pki-authority/helpers.py | 473 ++++++++++++++++++ src/services/apps/pki-authority/main.py | 240 +++------ 4 files changed, 537 insertions(+), 366 deletions(-) delete mode 100755 src/rootfs/files/configs/pki-service/create-and-configure-pki.sh create mode 100644 src/services/apps/pki-authority/helpers.py diff --git a/src/Dockerfile b/src/Dockerfile index f745aab3..de0af2e5 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -226,7 +226,6 @@ RUN mkdir -p "${OUTPUTDIR}/usr/local/bin"; ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh -ADD rootfs/files/configs/pki-service/create-and-configure-pki.sh "${OUTPUTDIR}/usr/local/bin" RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" diff --git a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh b/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh deleted file mode 100755 index 332c91ba..00000000 --- a/src/rootfs/files/configs/pki-service/create-and-configure-pki.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash -set -euo pipefail - -CONTAINER_NAME="pki-authority" -CPU_TYPE="untrusted" -if [[ -c "/dev/tdx_guest" ]] ; then - CPU_TYPE="tdx"; -elif [[ -c "/dev/sev-guest" ]]; then - CPU_TYPE="sev-snp"; -fi - -export CPU_TYPE="${CPU_TYPE}" - -# Create LXC container if it doesn't exist -create_container() { - if lxc-info -n "${CONTAINER_NAME}" &>/dev/null; then - echo "Container '${CONTAINER_NAME}' already exists." - else - echo "Container '${CONTAINER_NAME}' not found. Creating..." - lxc-create -n "${CONTAINER_NAME}" -t oci -- --url docker-archive:/etc/super/containers/pki-authority/pki-authority.tar - echo "Container '${CONTAINER_NAME}' created." - fi -} - -# Set own challenge type in LXC container configuration -set_own_challenge() { - local template_name="lxc-swarm-template.yaml" - - # Check if vm_mode=legacy is set in kernel command line - if grep -q "vm_mode=legacy" /proc/cmdline 2>/dev/null; then - template_name="lxc-legacy-vm-template.yaml" - echo "Detected vm_mode=legacy in kernel cmdline, using legacy template" - else - echo "Using swarm template" - fi - - local src_yaml="/etc/super/containers/pki-authority/${template_name}" - local dst_yaml="/var/lib/lxc/${CONTAINER_NAME}/rootfs/app/conf/lxc.yaml" - - if [[ -f "${src_yaml}" ]]; then - if command -v yq-go >/dev/null 2>&1; then - yq-go e '.pki.ownChallenge.type = strenv(CPU_TYPE)' "${src_yaml}" > "${dst_yaml}" - echo "Patched ${dst_yaml} with type: ${CPU_TYPE} using yq." - else - echo "Error: yq-go is not installed. Please install yq-go for YAML editing." - exit 1 - fi - else - echo "Error: ${src_yaml} not found." 
- exit 1 - fi -} - -# Copy trusted environment variables to container -set_subbroot_env() { - # --- Trusted subroot env handling --- - local trusted_vars=( - AS__pki__baseDomain - AS__pki__ownDomain - AS__pki__certParams__ocspUrl - AS__pki__mode__attestationServiceSource__baseUrl - AS__pki__mode__attestationServiceSource__caBundle - ) - - local src_subroot_env="/sp/subroot.env" - local dst_subroot_env="/var/lib/lxc/${CONTAINER_NAME}/rootfs/app/subroot.env" - - # If source exists, (re)create destination with only trusted variables - if [[ -f "${src_subroot_env}" ]]; then - # Remove destination first to ensure a clean recreate - rm -f "${dst_subroot_env}" - - # Header explaining autogenerated file - echo "# Autogenerated from ${src_subroot_env}. Contains only trusted variables." > "${dst_subroot_env}" - - for var in "${trusted_vars[@]}"; do - # capture first matching line in form VAR="value" - local line - line="$(grep -m1 -E "^${var}=\".*\"" "${src_subroot_env}" 2>/dev/null || true)" - if [[ -n "${line}" ]]; then - echo "${line}" >> "${dst_subroot_env}" - fi - done - - chmod 0644 "${dst_subroot_env}" || true - echo "Created ${dst_subroot_env} with trusted variables." - else - echo "Info: ${src_subroot_env} not found; skipping creation of ${dst_subroot_env}" - fi -} - -# Patch LXC container configuration -patch_lxc_config() { - local config_file="/var/lib/lxc/${CONTAINER_NAME}/config" - local config_bak="${config_file}.bak" - - # Always restore config from backup if backup exists - if [[ -f "${config_bak}" ]]; then - cp "${config_bak}" "${config_file}" - else - # Create backup before first patch - if [[ -f "${config_file}" ]]; then - cp "${config_file}" "${config_bak}" - fi - fi - - # This MAC address is used to get a static IP address from DHCP, see /etc/lxc/dnsmasq.conf - echo "lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff" >> "${config_file}" - - if [[ "${CPU_TYPE}" = "sev-snp" ]]; then - local dev_id - dev_id="$(stat -c '%t:%T' /dev/sev-guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" - echo "lxc.cgroup2.devices.allow = c ${dev_id} rwm" >> "${config_file}" - echo "lxc.mount.entry = /dev/sev-guest dev/sev-guest none bind,optional,create=file" >> "${config_file}" - elif [[ "${CPU_TYPE}" = "tdx" ]]; then - local dev_id - dev_id="$(stat -c '%t:%T' /dev/tdx_guest | awk -F: '{printf "%d:%d\n", "0x"$1, "0x"$2}')" - echo "lxc.cgroup2.devices.allow = c ${dev_id} rwm" >> "${config_file}" - echo "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest none bind,optional,create=file" >> "${config_file}" - if [[ -f "/etc/tdx-attest.conf" ]]; then - echo "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf none bind,ro,create=file" >> "${config_file}" - fi - fi -} - -# Update PCCS URL and setup iptables NAT rules for LXC container access to host service -update_pccs_url_and_setup_iptables() { - local bridge_name="lxcbr0" - local service_port="8081" - local qcnl_conf="/var/lib/lxc/${CONTAINER_NAME}/rootfs/etc/sgx_default_qcnl.conf" - local qcnl_conf_bak="${qcnl_conf}.bak" - - # Get host IP address on the LXC bridge - local host_ip - host_ip=$(ip -4 addr show "${bridge_name}" 2>/dev/null | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n1) - - if [[ -z "${host_ip}" ]]; then - echo "Error: Could not determine IP address for bridge ${bridge_name}" - exit 1 - fi - - # Enable route_localnet for the bridge to allow routing to localhost - if [[ $(sysctl -n net.ipv4.conf.${bridge_name}.route_localnet 2>/dev/null) != "1" ]]; then - sysctl -w net.ipv4.conf.${bridge_name}.route_localnet=1 - echo "Enabled route_localnet 
for ${bridge_name}" - else - echo "route_localnet already enabled for ${bridge_name}" - fi - - # PCCS service on port 8081 - if ! iptables -t nat -C PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" 2>/dev/null; then - iptables -t nat -A PREROUTING -p tcp -d "${host_ip}" --dport "${service_port}" -j DNAT --to-destination 127.0.0.1:"${service_port}" - echo "iptables DNAT rule added: ${host_ip}:${service_port} -> 127.0.0.1:${service_port}" - else - echo "iptables DNAT rule already exists for ${host_ip}:${service_port}" - fi - - # MongoDB service on port 27017 - local mongodb_port="27017" - if ! iptables -t nat -C PREROUTING -p tcp -d "${host_ip}" --dport "${mongodb_port}" -j DNAT --to-destination 127.0.0.1:"${mongodb_port}" 2>/dev/null; then - iptables -t nat -A PREROUTING -p tcp -d "${host_ip}" --dport "${mongodb_port}" -j DNAT --to-destination 127.0.0.1:"${mongodb_port}" - echo "iptables DNAT rule added: ${host_ip}:${mongodb_port} -> 127.0.0.1:${mongodb_port}" - else - echo "iptables DNAT rule already exists for ${host_ip}:${mongodb_port}" - fi - - # Update PCCS URL in QCNL configuration - local pccs_url="https://${host_ip}:${service_port}/sgx/certification/v4/" - if [[ -f "${qcnl_conf}" ]]; then - # Create backup if it doesn't exist - if [[ ! -f "${qcnl_conf_bak}" ]]; then - cp "${qcnl_conf}" "${qcnl_conf_bak}" - # Update pccs_url in the JSON configuration file - sed -i "s|\"pccs_url\": \".*\"|\"pccs_url\": \"${pccs_url}\"|g" "${qcnl_conf}" - echo "Updated PCCS URL in ${qcnl_conf} to ${pccs_url}" - else - echo "Backup ${qcnl_conf_bak} already exists, skipping PCCS URL update" - fi - else - echo "Error: ${qcnl_conf} not found" - exit 1 - fi -} - -create_container -set_own_challenge -set_subbroot_env -patch_lxc_config -update_pccs_url_and_setup_iptables diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py new file mode 100644 index 00000000..a753ce00 --- /dev/null +++ b/src/services/apps/pki-authority/helpers.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +""" +PKI Authority LXC container management helpers. +""" + +import os +import sys +import subprocess +import shutil +import re +import yaml +import time +import urllib.request +import ssl +from pathlib import Path +from typing import List, Optional + +PKI_SERVICE_NAME = "pki-authority" +SERVICE_INSIDE_CONTAINER = "tee-pki" +BRIDGE_NAME = "lxcbr0" + +class LXCContainer: + """Manager for LXC container operations.""" + + def __init__(self, container_name: str = PKI_SERVICE_NAME): + self.container_name = container_name + + def start(self, timeout: int = 30) -> int: + """Start LXC container. Returns exit code.""" + print(f"[*] Starting LXC container {self.container_name}") + result = subprocess.run( + ["lxc-start", "-n", self.container_name], + capture_output=True, + text=True, + timeout=timeout + ) + return result.returncode + + def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: + """Stop LXC container gracefully. Returns exit code.""" + print(f"[*] Stopping LXC container {self.container_name} gracefully") + result = subprocess.run( + ["lxc-stop", "-n", self.container_name, "-t", str(graceful_timeout)], + capture_output=True, + text=True, + timeout=command_timeout + ) + return result.returncode + + def destroy(self) -> int: + """Destroy LXC container. 
Returns exit code.""" + print(f"[*] Destroying LXC container {self.container_name}") + result = subprocess.run( + ["lxc-destroy", "-n", self.container_name, "-f"], + capture_output=True, + text=True, + timeout=60 + ) + + if result.returncode != 0: + print(f"[!] Failed to destroy container: {result.stderr}", file=sys.stderr) + + return result.returncode + + def is_running(self) -> bool: + """Check if LXC container is running.""" + try: + result = subprocess.run( + ["lxc-ls", "--running"], + capture_output=True, + text=True + ) + if self.container_name not in result.stdout: + print(f"[*] LXC container {self.container_name} is not running") + return False + return True + except Exception as e: + print(f"[!] Failed to check LXC container status: {e}", file=sys.stderr) + return False + + def get_ip(self) -> Optional[str]: + """Get container IP address.""" + try: + result = subprocess.run( + ["lxc-info", "-n", self.container_name, "-iH"], + capture_output=True, + text=True + ) + container_ip = result.stdout.strip() if result.stdout.strip() else None + return container_ip + except Exception as e: + print(f"[!] Failed to get container IP: {e}", file=sys.stderr) + return None + + def create(self, archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar") -> bool: + """Create LXC container if it doesn't exist. Returns True if created or already exists.""" + # Check if container already exists + result = subprocess.run( + ["lxc-info", "-n", self.container_name], + capture_output=True, + text=True + ) + + if result.returncode == 0: + print(f"Container '{self.container_name}' already exists.") + return True + else: + print(f"Container '{self.container_name}' not found. Creating...") + try: + subprocess.run( + [ + "lxc-create", + "-n", self.container_name, + "-t", "oci", + "--", + "--url", f"docker-archive:{archive_path}" + ], + check=True + ) + print(f"Container '{self.container_name}' created.") + return True + except subprocess.CalledProcessError as e: + print(f"[!] Failed to create container: {e}", file=sys.stderr) + return False + + def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck") -> bool: + """Check if service inside container is running and healthy.""" + try: + # 1. Check service status inside container + result = subprocess.run( + ["lxc-attach", "-n", self.container_name, "--", "systemctl", "is-active", SERVICE_INSIDE_CONTAINER], + capture_output=True, + text=True + ) + status = result.stdout.strip() + + if status not in ["active", "activating"]: + print(f"[*] Service {SERVICE_INSIDE_CONTAINER} status: {status}") + return False + + # 2. 
If service is active, check how long it's been running + if status == "active": + result = subprocess.run( + ["lxc-attach", "-n", self.container_name, "--", "systemctl", "show", + SERVICE_INSIDE_CONTAINER, "--property=ActiveEnterTimestamp"], + capture_output=True, + text=True + ) + + # Parse ActiveEnterTimestamp + for line in result.stdout.split('\n'): + if line.startswith('ActiveEnterTimestamp='): + timestamp_str = line.split('=', 1)[1].strip() + if timestamp_str and timestamp_str != '0': + try: + # Get timestamp in seconds since epoch + ts_result = subprocess.run( + ["date", "+%s", "-d", timestamp_str], + capture_output=True, + text=True + ) + start_time = int(ts_result.stdout.strip()) + current_time = int(time.time()) + uptime_seconds = current_time - start_time + + # If running more than min_uptime, check healthcheck endpoint + if uptime_seconds > min_uptime: + container_ip = self.get_ip() + + if container_ip: + # Perform HTTPS healthcheck without certificate verification + try: + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + req = urllib.request.Request(f"https://{container_ip}{healthcheck_url}") + with urllib.request.urlopen(req, context=ctx, timeout=5) as response: + if response.status == 200: + return True + else: + print(f"[*] Healthcheck returned status: {response.status}") + return False + except Exception as e: + print(f"[*] Healthcheck failed: {e}") + return False + except Exception as e: + print(f"[*] Failed to parse service uptime: {e}") + + # Service is active or activating (but not ready for healthcheck yet) + return True + + except Exception as e: + print(f"[!] Failed to check service health: {e}", file=sys.stderr) + return False + + +def detect_cpu_type() -> str: + """Detect CPU type based on available devices.""" + if Path("/dev/tdx_guest").is_char_device(): + return "tdx" + elif Path("/dev/sev-guest").is_char_device(): + return "sev-snp" + else: + return "untrusted" + +def set_own_challenge(cpu_type: str): + """Set own challenge type in LXC container configuration.""" + # Check if vm_mode=legacy is set in kernel command line + try: + with open("/proc/cmdline", "r") as f: + cmdline = f.read() + + if "vm_mode=legacy" in cmdline: + template_name = "lxc-legacy-vm-template.yaml" + print("Detected vm_mode=legacy in kernel cmdline, using legacy template") + else: + template_name = "lxc-swarm-template.yaml" + print("Using swarm template") + except FileNotFoundError: + template_name = "lxc-swarm-template.yaml" + print("Using swarm template") + + src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") + dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + + if not src_yaml.exists(): + print(f"Error: {src_yaml} not found.") + sys.exit(1) + + # Load YAML, modify, and save + with open(src_yaml, "r") as f: + config = yaml.safe_load(f) + + # Set the CPU type in the configuration + if "pki" not in config: + config["pki"] = {} + if "ownChallenge" not in config["pki"]: + config["pki"]["ownChallenge"] = {} + config["pki"]["ownChallenge"]["type"] = cpu_type + + # Ensure destination directory exists + dst_yaml.parent.mkdir(parents=True, exist_ok=True) + + # Write modified YAML + with open(dst_yaml, "w") as f: + yaml.dump(config, f, default_flow_style=False) + + print(f"Patched {dst_yaml} with type: {cpu_type}") + + +def set_subroot_env(): + """Copy trusted environment variables to container.""" + trusted_vars = [ + "AS__pki__baseDomain", + "AS__pki__ownDomain", + 
"AS__pki__certParams__ocspUrl", + "AS__pki__mode__attestationServiceSource__baseUrl", + "AS__pki__mode__attestationServiceSource__caBundle", + ] + + src_subroot_env = Path("/sp/subroot.env") + dst_subroot_env = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/subroot.env") + + if not src_subroot_env.exists(): + print(f"Info: {src_subroot_env} not found; skipping creation of {dst_subroot_env}") + return + + # Remove destination first to ensure a clean recreate + dst_subroot_env.unlink(missing_ok=True) + + # Read source file + with open(src_subroot_env, "r") as f: + lines = f.readlines() + + # Write destination with header + with open(dst_subroot_env, "w") as f: + f.write(f"# Autogenerated from {src_subroot_env}. Contains only trusted variables.\n") + + for var in trusted_vars: + # Find first matching line + for line in lines: + if line.strip().startswith(f'{var}="'): + f.write(line) + break + + # Set permissions + dst_subroot_env.chmod(0o644) + print(f"Created {dst_subroot_env} with trusted variables.") + + +def patch_lxc_config(cpu_type: str): + """Patch LXC container configuration.""" + config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") + config_bak = Path(f"{config_file}.bak") + + # Always restore config from backup if backup exists + if config_bak.exists(): + shutil.copy(config_bak, config_file) + else: + # Create backup before first patch + if config_file.exists(): + shutil.copy(config_file, config_bak) + + # Append MAC address configuration + with open(config_file, "a") as f: + f.write("lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff\n") + + # Add device-specific configuration + if cpu_type == "sev-snp": + dev_path = Path("/dev/sev-guest") + stat_info = dev_path.stat() + dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" + + with open(config_file, "a") as f: + f.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + f.write("lxc.mount.entry = /dev/sev-guest dev/sev-guest none bind,optional,create=file\n") + + elif cpu_type == "tdx": + dev_path = Path("/dev/tdx_guest") + stat_info = dev_path.stat() + dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" + + with open(config_file, "a") as f: + f.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + f.write("lxc.mount.entry = /dev/tdx_guest dev/tdx_guest none bind,optional,create=file\n") + + if Path("/etc/tdx-attest.conf").exists(): + f.write("lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf none bind,ro,create=file\n") + + +def get_bridge_ip(bridge_name: str) -> str: + """Get host IP address on the LXC bridge.""" + result = subprocess.run( + ["ip", "-4", "addr", "show", bridge_name], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print(f"Error: Could not determine IP address for bridge {bridge_name}") + sys.exit(1) + + # Parse IP address from output + match = re.search(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout) + if not match: + print(f"Error: Could not determine IP address for bridge {bridge_name}") + sys.exit(1) + + return match.group(1) + + +def enable_route_localnet(bridge_name: str): + """Enable route_localnet for the bridge.""" + sysctl_key = f"net.ipv4.conf.{bridge_name}.route_localnet" + + result = subprocess.run( + ["sysctl", "-n", sysctl_key], + capture_output=True, + text=True + ) + + if result.returncode == 0 and result.stdout.strip() == "1": + print(f"route_localnet already enabled for {bridge_name}") + else: + subprocess.run( + ["sysctl", "-w", f"{sysctl_key}=1"], + check=True + ) + print(f"Enabled route_localnet for {bridge_name}") + + 
+def add_iptables_rule(host_ip: str, port: str): + """Add iptables DNAT rule if it doesn't exist.""" + # Check if rule exists + check_result = subprocess.run( + [ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", port, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{port}" + ], + capture_output=True + ) + + if check_result.returncode == 0: + print(f"iptables DNAT rule already exists for {host_ip}:{port}") + else: + subprocess.run( + [ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", port, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{port}" + ], + check=True + ) + print(f"iptables DNAT rule added: {host_ip}:{port} -> 127.0.0.1:{port}") + +def delete_iptables_rules(): + """Delete all iptables DNAT rules in PREROUTING that contain host_ip.""" + host_ip = get_bridge_ip(BRIDGE_NAME) + result = subprocess.run( + ["iptables", "-t", "nat", "-S", "PREROUTING"], + capture_output=True, text=True, check=True + ) + + rules = result.stdout.splitlines() + + for rule in rules: + if host_ip in rule: + delete_rule = rule.replace("-A", "-D", 1) + subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) + print(f"Deleted iptables rule: {delete_rule}") + +def update_pccs_url_and_setup_iptables(): + """Update PCCS URL and setup iptables NAT rules for LXC container access to host service.""" + service_port = "8081" + qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") + qcnl_conf_bak = Path(f"{qcnl_conf}.bak") + + # Get host IP address on the LXC bridge + host_ip = get_bridge_ip(BRIDGE_NAME) + + # Enable route_localnet for the bridge + enable_route_localnet(BRIDGE_NAME) + + # Add iptables rules for PCCS and MongoDB + add_iptables_rule(host_ip, service_port) + add_iptables_rule(host_ip, "27017") + + # Update PCCS URL in QCNL configuration + pccs_url = f"https://{host_ip}:{service_port}/sgx/certification/v4/" + + if not qcnl_conf.exists(): + print(f"Error: {qcnl_conf} not found") + sys.exit(1) + + # Create backup if it doesn't exist + if not qcnl_conf_bak.exists(): + shutil.copy(qcnl_conf, qcnl_conf_bak) + + # Update pccs_url in the JSON configuration file + with open(qcnl_conf, "r") as f: + content = f.read() + + content = re.sub( + r'"pccs_url":\s*"[^"]*"', + f'"pccs_url": "{pccs_url}"', + content + ) + + with open(qcnl_conf, "w") as f: + f.write(content) + + print(f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") + else: + print(f"Backup {qcnl_conf_bak} already exists, skipping PCCS URL update") + + +def init_container(): + LXCContainer(PKI_SERVICE_NAME).create() + + +def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: + for prop in wg_props: + if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": + return prop.get("value") + return None \ No newline at end of file diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index b110667c..bf255f32 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -1,165 +1,39 @@ #!/usr/bin/env python3 import sys -import subprocess -import time -import urllib.request -import ssl -from typing import List, Optional +from pathlib import Path from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput -# Configuration -PKI_SERVICE_NAME = "pki-authority" +# Import helpers +sys.path.insert(0, str(Path(__file__).parent)) +from helpers import ( + delete_iptables_rules, + detect_cpu_type, + 
set_own_challenge, + set_subroot_env, + patch_lxc_config, + update_pccs_url_and_setup_iptables, + LXCContainer, + PKI_SERVICE_NAME, + get_node_tunnel_ip, + init_container, +) +# Configuration plugin = ProvisionPlugin() - -# Helpers -def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: - for prop in wg_props: - if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": - return prop.get("value") - return None - -def lxc_start_container(container_name: str, timeout: int = 30) -> int: - """Start LXC container. Returns exit code.""" - print(f"[*] Starting LXC container {container_name}") - result = subprocess.run( - ["lxc-start", "-n", container_name], - capture_output=True, - text=True, - timeout=timeout - ) - - return result.returncode - -def lxc_stop_container(container_name: str, graceful_timeout: int = 30, command_timeout: int = 60) -> int: - """Stop LXC container gracefully. Returns exit code.""" - print(f"[*] Stopping LXC container {container_name} gracefully") - result = subprocess.run( - ["lxc-stop", "-n", container_name, "-t", str(graceful_timeout)], - capture_output=True, - text=True, - timeout=command_timeout - ) - - return result.returncode - -def is_pki_running() -> bool: - """Check if PKI Authority service is running.""" - try: - # 1. Check if LXC container is running - result = subprocess.run( - ["lxc-ls", "--running"], - capture_output=True, - text=True - ) - if PKI_SERVICE_NAME not in result.stdout: - print(f"[*] LXC container {PKI_SERVICE_NAME} is not running") - return False - - # 2. Check tee-pki service status inside container - result = subprocess.run( - ["lxc-attach", "-n", PKI_SERVICE_NAME, "--", "systemctl", "is-active", "tee-pki"], - capture_output=True, - text=True - ) - status = result.stdout.strip() - - if status not in ["active", "activating"]: - print(f"[*] Service tee-pki status: {status}") - return False - - # 3. 
If service is active, check how long it's been running - if status == "active": - result = subprocess.run( - ["lxc-attach", "-n", PKI_SERVICE_NAME, "--", "systemctl", "show", "tee-pki", "--property=ActiveEnterTimestamp"], - capture_output=True, - text=True - ) - - # Parse ActiveEnterTimestamp - for line in result.stdout.split('\n'): - if line.startswith('ActiveEnterTimestamp='): - timestamp_str = line.split('=', 1)[1].strip() - if timestamp_str and timestamp_str != '0': - # Parse timestamp (format: "Day YYYY-MM-DD HH:MM:SS TZ") - try: - # Get timestamp in seconds since epoch - ts_result = subprocess.run( - ["date", "+%s", "-d", timestamp_str], - capture_output=True, - text=True - ) - start_time = int(ts_result.stdout.strip()) - current_time = int(time.time()) - uptime_seconds = current_time - start_time - - # If running more than 2 minutes (120 seconds), check healthcheck - if uptime_seconds > 120: - # Get container IP - ip_result = subprocess.run( - ["lxc-info", "-n", PKI_SERVICE_NAME, "-iH"], - capture_output=True, - text=True - ) - container_ip = ip_result.stdout.strip() if ip_result.stdout.strip() else None - - if container_ip: - # Perform HTTPS healthcheck without certificate verification - try: - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - req = urllib.request.Request(f"https://{container_ip}/healthcheck") - with urllib.request.urlopen(req, context=ctx, timeout=5) as response: - if response.status == 200: - return True - else: - print(f"[*] Healthcheck returned status: {response.status}") - return False - except Exception as e: - print(f"[*] Healthcheck failed: {e}") - return False - except Exception as e: - print(f"[*] Failed to parse service uptime: {e}") - - # Service is active or activating (but not ready for healthcheck yet) - return True - - except Exception as e: - print(f"[!] Failed to check PKI status: {e}", file=sys.stderr) - return False - # Plugin commands @plugin.command("init") def handle_init(input_data: PluginInput) -> PluginOutput: """Initialize PKI Authority service.""" try: - # Run PKI initialization script - print("[*] Running PKI initialization script") - result = subprocess.run( - ["/usr/local/bin/create-and-configure-pki.sh"], - capture_output=True, - text=True, - timeout=180 - ) - - if result.returncode != 0: - error_msg = f"PKI initialization script failed with exit code {result.returncode}: {result.stderr}" - print(f"[!] {error_msg}", file=sys.stderr) - return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) - + print("[*] Running PKI initialization") + init_container() print("[*] PKI initialization completed") return PluginOutput(status="completed", local_state=input_data.local_state) - except subprocess.CalledProcessError as e: - error_msg = f"Failed to initialize PKI: {e.stderr if e.stderr else str(e)}" - print(f"[!] {error_msg}", file=sys.stderr) - return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) except Exception as e: - error_msg = f"Unexpected error: {str(e)}" + error_msg = f"Failed to initialize PKI: {str(e)}" print(f"[!] 
{error_msg}", file=sys.stderr) return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) @@ -167,6 +41,14 @@ def handle_init(input_data: PluginInput) -> PluginOutput: @plugin.command("apply") def handle_apply(input_data: PluginInput) -> PluginOutput: """Apply PKI Authority configuration and start service.""" + + cpu_type = detect_cpu_type() + set_own_challenge(cpu_type) + set_subroot_env() + patch_lxc_config(cpu_type) + update_pccs_url_and_setup_iptables() + + local_node_id = input_data.local_node_id state_json = input_data.state or {} local_state = input_data.local_state or {} @@ -181,24 +63,26 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: return PluginOutput(status="error", error_message="Local node has no WireGuard tunnel IP", local_state=local_state) try: + container = LXCContainer(PKI_SERVICE_NAME) + # Start or restart LXC container - if is_pki_running(): + if container.is_running(): print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") # Stop container gracefully - exit_code = lxc_stop_container(PKI_SERVICE_NAME, graceful_timeout=30, command_timeout=60) + exit_code = container.stop(graceful_timeout=30, command_timeout=60) if exit_code != 0: error_msg = f"Failed to stop container with exit code {exit_code}" return PluginOutput(status="error", error_message=error_msg, local_state=local_state) # Start container - exit_code = lxc_start_container(PKI_SERVICE_NAME, timeout=30) + exit_code = container.start(timeout=30) if exit_code != 0: error_msg = f"Failed to start container with exit code {exit_code}" return PluginOutput(status="error", error_message=error_msg, local_state=local_state) else: # Start container - exit_code = lxc_start_container(PKI_SERVICE_NAME, timeout=30) + exit_code = container.start(timeout=30) if exit_code != 0: error_msg = f"Failed to start container with exit code {exit_code}" return PluginOutput(status="error", error_message=error_msg, local_state=local_state) @@ -206,12 +90,8 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: print(f"[*] LXC container {PKI_SERVICE_NAME} is running") return PluginOutput(status="completed", local_state=local_state) - except subprocess.CalledProcessError as e: - error_msg = f"Failed to start service: {e.stderr if e.stderr else str(e)}" - print(f"[!] {error_msg}", file=sys.stderr) - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) except Exception as e: - error_msg = f"Unexpected error: {str(e)}" + error_msg = f"Failed to start service: {str(e)}" print(f"[!] {error_msg}", file=sys.stderr) return PluginOutput(status="error", error_message=error_msg, local_state=local_state) @@ -221,14 +101,21 @@ def handle_health(input_data: PluginInput) -> PluginOutput: """Check health of PKI Authority service.""" local_state = input_data.local_state or {} - if is_pki_running(): - return PluginOutput(status="healthy", local_state=local_state) - else: - return PluginOutput( - status="unhealthy", - error_message="Service is not running", - local_state=local_state - ) + try: + container = LXCContainer(PKI_SERVICE_NAME) + + if container.is_running() and container.is_service_healthy(): + return PluginOutput(status="completed", local_state=local_state) + else: + return PluginOutput( + status="error", + error_message="PKI service is not healthy or container is not running", + local_state=local_state + ) + except Exception as e: + error_msg = f"Health check failed: {str(e)}" + print(f"[!] 
{error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) @plugin.command("finalize") @@ -244,20 +131,21 @@ def handle_destroy(input_data: PluginInput) -> PluginOutput: local_state = input_data.local_state or {} try: - print(f"[*] Stopping {PKI_SERVICE_NAME}") - subprocess.run( - ["systemctl", "stop", PKI_SERVICE_NAME], - capture_output=True, - text=True, - timeout=30 - ) - - print(f"[*] Disabling {PKI_SERVICE_NAME}") - subprocess.run( - ["systemctl", "disable", PKI_SERVICE_NAME], - capture_output=True, - text=True - ) + container = LXCContainer(PKI_SERVICE_NAME) + + # Stop container if running + if container.is_running(): + exit_code = container.stop(graceful_timeout=30, command_timeout=60) + if exit_code != 0: + print(f"[!] Warning: Failed to stop container gracefully", file=sys.stderr) + + # Destroy container + exit_code = container.destroy() + if exit_code != 0: + error_msg = f"Failed to destroy container with exit code {exit_code}" + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + delete_iptables_rules() print("[*] PKI Authority destroyed") return PluginOutput(status="completed", local_state=local_state) From 929241f80912f2eb95bc0aa20f1b76cf5c458ab0 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 3 Dec 2025 06:35:59 -0600 Subject: [PATCH 08/51] some improvement --- src/services/apps/pki-authority/helpers.py | 73 ++++++++++++++++------ src/services/apps/pki-authority/main.py | 12 ++-- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index a753ce00..c4f28d64 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -14,10 +14,20 @@ import ssl from pathlib import Path from typing import List, Optional +from enum import Enum PKI_SERVICE_NAME = "pki-authority" SERVICE_INSIDE_CONTAINER = "tee-pki" BRIDGE_NAME = "lxcbr0" +PCCS_PORT = "8081" +MONGODB_PORT = "27017" + + +class VMMode(Enum): + """VM mode types.""" + LEGACY = "legacy" + SWARM_INIT = "swarm-init" + SWARM_NORMAL = "swarm-normal" class LXCContainer: """Manager for LXC container operations.""" @@ -204,22 +214,33 @@ def detect_cpu_type() -> str: else: return "untrusted" -def set_own_challenge(cpu_type: str): - """Set own challenge type in LXC container configuration.""" - # Check if vm_mode=legacy is set in kernel command line + +def detect_vm_mode() -> VMMode: + """Detect VM mode from kernel command line.""" try: with open("/proc/cmdline", "r") as f: cmdline = f.read() if "vm_mode=legacy" in cmdline: - template_name = "lxc-legacy-vm-template.yaml" - print("Detected vm_mode=legacy in kernel cmdline, using legacy template") + return VMMode.LEGACY + elif "vm_mode=swarm-init" in cmdline: + return VMMode.SWARM_INIT else: - template_name = "lxc-swarm-template.yaml" - print("Using swarm template") + return VMMode.SWARM_NORMAL except FileNotFoundError: + return VMMode.SWARM_NORMAL + + +def patch_yaml_config(cpu_type: str): + """Set own challenge type in LXC container configuration.""" + vm_mode = detect_vm_mode() + + if vm_mode == VMMode.LEGACY: + template_name = "lxc-legacy-vm-template.yaml" + print(f"Detected {vm_mode.value} mode, using legacy template") + else: template_name = "lxc-swarm-template.yaml" - print("Using swarm template") + print(f"Detected {vm_mode.value} mode, using swarm template") src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") dst_yaml = 
Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") @@ -239,6 +260,17 @@ def set_own_challenge(cpu_type: str): config["pki"]["ownChallenge"] = {} config["pki"]["ownChallenge"]["type"] = cpu_type + # Set mode.attestationServiceSource.mode for swarm modes + if vm_mode in (VMMode.SWARM_INIT, VMMode.SWARM_NORMAL): + if "mode" not in config["pki"]: + config["pki"]["mode"] = {} + if "attestationServiceSource" not in config["pki"]["mode"]: + config["pki"]["mode"]["attestationServiceSource"] = {} + + mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" + config["pki"]["mode"]["attestationServiceSource"]["mode"] = mode_value + print(f"Set attestationServiceSource mode to: {mode_value}") + # Ensure destination directory exists dst_yaml.parent.mkdir(parents=True, exist_ok=True) @@ -417,24 +449,29 @@ def delete_iptables_rules(): subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) print(f"Deleted iptables rule: {delete_rule}") -def update_pccs_url_and_setup_iptables(): - """Update PCCS URL and setup iptables NAT rules for LXC container access to host service.""" - service_port = "8081" - qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") - qcnl_conf_bak = Path(f"{qcnl_conf}.bak") - - # Get host IP address on the LXC bridge + +def setup_iptables(): + """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) # Enable route_localnet for the bridge enable_route_localnet(BRIDGE_NAME) # Add iptables rules for PCCS and MongoDB - add_iptables_rule(host_ip, service_port) - add_iptables_rule(host_ip, "27017") + add_iptables_rule(host_ip, PCCS_PORT) + add_iptables_rule(host_ip, MONGODB_PORT) + + +def update_pccs_url(): + """Update PCCS URL in QCNL configuration.""" + qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") + qcnl_conf_bak = Path(f"{qcnl_conf}.bak") + + # Get host IP address on the LXC bridge + host_ip = get_bridge_ip(BRIDGE_NAME) # Update PCCS URL in QCNL configuration - pccs_url = f"https://{host_ip}:{service_port}/sgx/certification/v4/" + pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" if not qcnl_conf.exists(): print(f"Error: {qcnl_conf} not found") diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index bf255f32..418303d6 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -10,12 +10,15 @@ from helpers import ( delete_iptables_rules, detect_cpu_type, - set_own_challenge, + detect_vm_mode, + patch_yaml_config, set_subroot_env, patch_lxc_config, - update_pccs_url_and_setup_iptables, + setup_iptables, + update_pccs_url, LXCContainer, PKI_SERVICE_NAME, + VMMode, get_node_tunnel_ip, init_container, ) @@ -43,10 +46,11 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: """Apply PKI Authority configuration and start service.""" cpu_type = detect_cpu_type() - set_own_challenge(cpu_type) + patch_yaml_config(cpu_type) set_subroot_env() patch_lxc_config(cpu_type) - update_pccs_url_and_setup_iptables() + setup_iptables() + update_pccs_url() local_node_id = input_data.local_node_id From 11a940e0937a0f85944405b3fd6c3254f2753de9 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 3 Dec 2025 10:13:43 -0600 Subject: [PATCH 09/51] access to lxc container from wg network --- src/services/apps/pki-authority/helpers.py | 74 ++++++++++++++++++---- src/services/apps/pki-authority/main.py | 16 ++--- 2 files changed, 68 
insertions(+), 22 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index c4f28d64..3b7a25bf 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -21,6 +21,9 @@ BRIDGE_NAME = "lxcbr0" PCCS_PORT = "8081" MONGODB_PORT = "27017" +PKI_SERVICE_EXTERNAL_PORT = "8443" +CONTAINER_IP = "10.0.3.100" +WIREGUARD_INTERFACE = "wg0" class VMMode(Enum): @@ -434,23 +437,27 @@ def add_iptables_rule(host_ip: str, port: str): print(f"iptables DNAT rule added: {host_ip}:{port} -> 127.0.0.1:{port}") def delete_iptables_rules(): - """Delete all iptables DNAT rules in PREROUTING that contain host_ip.""" + """Delete all iptables NAT rules for PKI container.""" host_ip = get_bridge_ip(BRIDGE_NAME) - result = subprocess.run( - ["iptables", "-t", "nat", "-S", "PREROUTING"], - capture_output=True, text=True, check=True - ) - - rules = result.stdout.splitlines() - for rule in rules: - if host_ip in rule: - delete_rule = rule.replace("-A", "-D", 1) - subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) - print(f"Deleted iptables rule: {delete_rule}") + # Delete rules from all chains: PREROUTING, OUTPUT, POSTROUTING + for chain in ["PREROUTING", "OUTPUT", "POSTROUTING"]: + result = subprocess.run( + ["iptables", "-t", "nat", "-S", chain], + capture_output=True, text=True, check=True + ) + + rules = result.stdout.splitlines() + + for rule in rules: + # Delete rules that contain host_ip or CONTAINER_IP + if host_ip in rule or CONTAINER_IP in rule: + delete_rule = rule.replace("-A", "-D", 1) + subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) + print(f"Deleted iptables rule: {delete_rule}") -def setup_iptables(): +def setup_iptables(wg_ip): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) @@ -460,6 +467,47 @@ def setup_iptables(): # Add iptables rules for PCCS and MongoDB add_iptables_rule(host_ip, PCCS_PORT) add_iptables_rule(host_ip, MONGODB_PORT) + + # Add WireGuard interface routing rules + # Route external traffic from WireGuard to container + subprocess.run( + [ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + check=True + ) + print(f"Added iptables rule: PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") + + # Route local traffic to container + subprocess.run( + [ + "iptables", "-t", "nat", "-A", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + check=True + ) + print(f"Added iptables rule: OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") + + # Add masquerading for container traffic + subprocess.run( + [ + "iptables", "-t", "nat", "-A", "POSTROUTING", + "-s", f"{CONTAINER_IP}/32", + "-j", "MASQUERADE" + ], + check=True + ) + print(f"Added iptables rule: POSTROUTING MASQUERADE for {CONTAINER_IP}/32") + def update_pccs_url(): diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 418303d6..6ce43f77 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -18,7 +18,6 @@ update_pccs_url, LXCContainer, PKI_SERVICE_NAME, - VMMode, get_node_tunnel_ip, init_container, ) @@ -45,14 +44,6 @@ def handle_init(input_data: 
PluginInput) -> PluginOutput: def handle_apply(input_data: PluginInput) -> PluginOutput: """Apply PKI Authority configuration and start service.""" - cpu_type = detect_cpu_type() - patch_yaml_config(cpu_type) - set_subroot_env() - patch_lxc_config(cpu_type) - setup_iptables() - update_pccs_url() - - local_node_id = input_data.local_node_id state_json = input_data.state or {} local_state = input_data.local_state or {} @@ -67,6 +58,13 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: return PluginOutput(status="error", error_message="Local node has no WireGuard tunnel IP", local_state=local_state) try: + cpu_type = detect_cpu_type() + delete_iptables_rules() + patch_yaml_config(cpu_type) + set_subroot_env() + patch_lxc_config(cpu_type) + update_pccs_url() + setup_iptables(local_tunnel_ip) container = LXCContainer(PKI_SERVICE_NAME) # Start or restart LXC container From 18d3bec89f7287d7936ae1031b33729a8c0e2ee7 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 5 Dec 2025 00:54:37 -0600 Subject: [PATCH 10/51] allowed challenges sev --- src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index 6eb3eb2a..979a88f3 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -4,6 +4,7 @@ pki: allowedChallenges: - token - tdx + - sev-snp allowedSubRootChallenges: - type: tdx signatureVerification: github From e61e1bdaed28253786d24bff21d7b5a755aa40d1 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 12 Dec 2025 12:40:29 -0600 Subject: [PATCH 11/51] store token in mongodb --- .../pki-service/lxc-swarm-template.yaml | 4 +- src/services/apps/pki-authority/helpers.py | 80 ++++++++++++------- src/services/apps/pki-authority/main.py | 7 ++ 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index 979a88f3..fc0bfe96 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -10,7 +10,9 @@ pki: signatureVerification: github - type: sev-snp signatureVerification: github - tokenChallengeFilePath: /app/access-token + tokenStorage: + storageType: mongodb + connectionString: mongodb://10.0.3.1:27017/pki ownDomain: ca-subroot.super-protocol.svc.cluster.local ownChallenge: type: tdx diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 3b7a25bf..ddfe8ec0 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -461,15 +461,10 @@ def setup_iptables(wg_ip): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) - # Enable route_localnet for the bridge enable_route_localnet(BRIDGE_NAME) - # Add iptables rules for PCCS and MongoDB add_iptables_rule(host_ip, PCCS_PORT) add_iptables_rule(host_ip, MONGODB_PORT) - - # Add WireGuard interface routing rules - # Route external traffic from WireGuard to container subprocess.run( [ "iptables", "-t", "nat", "-A", "PREROUTING", @@ -483,7 +478,6 @@ def setup_iptables(wg_ip): ) print(f"Added iptables rule: PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") - # Route local traffic to container 
subprocess.run( [ "iptables", "-t", "nat", "-A", "OUTPUT", @@ -497,7 +491,6 @@ def setup_iptables(wg_ip): ) print(f"Added iptables rule: OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") - # Add masquerading for container traffic subprocess.run( [ "iptables", "-t", "nat", "-A", "POSTROUTING", @@ -515,36 +508,69 @@ def update_pccs_url(): qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") qcnl_conf_bak = Path(f"{qcnl_conf}.bak") - # Get host IP address on the LXC bridge host_ip = get_bridge_ip(BRIDGE_NAME) - # Update PCCS URL in QCNL configuration pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" if not qcnl_conf.exists(): print(f"Error: {qcnl_conf} not found") sys.exit(1) - # Create backup if it doesn't exist if not qcnl_conf_bak.exists(): shutil.copy(qcnl_conf, qcnl_conf_bak) - - # Update pccs_url in the JSON configuration file - with open(qcnl_conf, "r") as f: - content = f.read() - - content = re.sub( - r'"pccs_url":\s*"[^"]*"', - f'"pccs_url": "{pccs_url}"', - content - ) - - with open(qcnl_conf, "w") as f: - f.write(content) - - print(f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") - else: - print(f"Backup {qcnl_conf_bak} already exists, skipping PCCS URL update") + + shutil.copy(qcnl_conf_bak, qcnl_conf) + + with open(qcnl_conf, "r") as f: + content = f.read() + + content = re.sub( + r'"pccs_url":\s*"[^"]*"', + f'"pccs_url": "{pccs_url}"', + content + ) + + with open(qcnl_conf, "w") as f: + f.write(content) + + print(f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") + + +def update_mongodb_connection(nodes: List[str]): + """Update MongoDB connection string in LXC YAML configuration. + + Args: + nodes: List of MongoDB nodes in format "ip:port" + """ + lxc_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + + if not lxc_yaml.exists(): + print(f"Warning: {lxc_yaml} not found, skipping MongoDB connection update") + return + + with open(lxc_yaml, "r") as f: + config = yaml.safe_load(f) + + hosts = ",".join(nodes) + new_connection_string = f"mongodb://{hosts}/pki" + + if "pki" in config and "tokenStorage" in config["pki"]: + if "connectionString" in config["pki"]["tokenStorage"]: + old_conn = config["pki"]["tokenStorage"]["connectionString"] + config["pki"]["tokenStorage"]["connectionString"] = new_connection_string + print(f"Updated tokenStorage connectionString: {old_conn} -> {new_connection_string}") + + if "pki" in config and "mode" in config["pki"]: + if "attestationServiceSource" in config["pki"]["mode"]: + if "storage" in config["pki"]["mode"]["attestationServiceSource"]: + if "connectionString" in config["pki"]["mode"]["attestationServiceSource"]["storage"]: + old_conn = config["pki"]["mode"]["attestationServiceSource"]["storage"]["connectionString"] + config["pki"]["mode"]["attestationServiceSource"]["storage"]["connectionString"] = new_connection_string + print(f"Updated attestationServiceSource storage connectionString: {old_conn} -> {new_connection_string}") + + with open(lxc_yaml, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"MongoDB connection updated in {lxc_yaml}") def init_container(): diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 6ce43f77..0e8c1bf4 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -8,14 +8,18 @@ # Import helpers sys.path.insert(0, str(Path(__file__).parent)) from helpers import ( + BRIDGE_NAME, + MONGODB_PORT, delete_iptables_rules, 
detect_cpu_type, detect_vm_mode, + get_bridge_ip, patch_yaml_config, set_subroot_env, patch_lxc_config, setup_iptables, update_pccs_url, + update_mongodb_connection, LXCContainer, PKI_SERVICE_NAME, get_node_tunnel_ip, @@ -64,6 +68,9 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: set_subroot_env() patch_lxc_config(cpu_type) update_pccs_url() + host_ip = get_bridge_ip(BRIDGE_NAME) + mongodb_nodes = [f"{host_ip}:{MONGODB_PORT}"] + update_mongodb_connection(mongodb_nodes) setup_iptables(local_tunnel_ip) container = LXCContainer(PKI_SERVICE_NAME) From e52b92c64bcbb7258aad10c503b6d6d8d4cfa29e Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 16 Dec 2025 05:34:21 -0600 Subject: [PATCH 12/51] store authority service properties in swarmdb --- .../pki-service/lxc-swarm-template.yaml | 8 +- src/services/apps/pki-authority/helpers.py | 58 +++------ src/services/apps/pki-authority/main.py | 123 ++++++++++++++---- src/services/apps/pki-authority/manifest.yaml | 12 +- 4 files changed, 131 insertions(+), 70 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index fc0bfe96..60d9ce90 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -11,8 +11,8 @@ pki: - type: sev-snp signatureVerification: github tokenStorage: - storageType: mongodb - connectionString: mongodb://10.0.3.1:27017/pki + storageType: file + storageFolder: /app/swarm-storage ownDomain: ca-subroot.super-protocol.svc.cluster.local ownChallenge: type: tdx @@ -29,5 +29,5 @@ pki: type: swarm mode: init storage: - storageType: mongodb - connectionString: mongodb://10.0.3.1:27017/pki + storageType: file + storageFolder: /app/swarm-storage diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index ddfe8ec0..b4fc605f 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -20,10 +20,10 @@ SERVICE_INSIDE_CONTAINER = "tee-pki" BRIDGE_NAME = "lxcbr0" PCCS_PORT = "8081" -MONGODB_PORT = "27017" PKI_SERVICE_EXTERNAL_PORT = "8443" CONTAINER_IP = "10.0.3.100" WIREGUARD_INTERFACE = "wg0" +STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") class VMMode(Enum): @@ -464,7 +464,7 @@ def setup_iptables(wg_ip): enable_route_localnet(BRIDGE_NAME) add_iptables_rule(host_ip, PCCS_PORT) - add_iptables_rule(host_ip, MONGODB_PORT) + subprocess.run( [ "iptables", "-t", "nat", "-A", "PREROUTING", @@ -536,43 +536,6 @@ def update_pccs_url(): print(f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") -def update_mongodb_connection(nodes: List[str]): - """Update MongoDB connection string in LXC YAML configuration. 
- - Args: - nodes: List of MongoDB nodes in format "ip:port" - """ - lxc_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") - - if not lxc_yaml.exists(): - print(f"Warning: {lxc_yaml} not found, skipping MongoDB connection update") - return - - with open(lxc_yaml, "r") as f: - config = yaml.safe_load(f) - - hosts = ",".join(nodes) - new_connection_string = f"mongodb://{hosts}/pki" - - if "pki" in config and "tokenStorage" in config["pki"]: - if "connectionString" in config["pki"]["tokenStorage"]: - old_conn = config["pki"]["tokenStorage"]["connectionString"] - config["pki"]["tokenStorage"]["connectionString"] = new_connection_string - print(f"Updated tokenStorage connectionString: {old_conn} -> {new_connection_string}") - - if "pki" in config and "mode" in config["pki"]: - if "attestationServiceSource" in config["pki"]["mode"]: - if "storage" in config["pki"]["mode"]["attestationServiceSource"]: - if "connectionString" in config["pki"]["mode"]["attestationServiceSource"]["storage"]: - old_conn = config["pki"]["mode"]["attestationServiceSource"]["storage"]["connectionString"] - config["pki"]["mode"]["attestationServiceSource"]["storage"]["connectionString"] = new_connection_string - print(f"Updated attestationServiceSource storage connectionString: {old_conn} -> {new_connection_string}") - - with open(lxc_yaml, "w") as f: - yaml.dump(config, f, default_flow_style=False) - print(f"MongoDB connection updated in {lxc_yaml}") - - def init_container(): LXCContainer(PKI_SERVICE_NAME).create() @@ -581,4 +544,19 @@ def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: for prop in wg_props: if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": return prop.get("value") - return None \ No newline at end of file + return None + + +def save_property_into_fs(file_name: str, content: bytes): + STORAGE_PATH.mkdir(parents=True, exist_ok=True) + file_path = STORAGE_PATH / file_name + file_path.write_bytes(content) + + +def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: + file_path = STORAGE_PATH / file_name + if file_path.exists(): + content = file_path.read_bytes() + if content: + return (True, content) + return (False, b"") diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 0e8c1bf4..8c138e21 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -1,34 +1,40 @@ #!/usr/bin/env python3 import sys +import time from pathlib import Path from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput +import base64 # Import helpers sys.path.insert(0, str(Path(__file__).parent)) from helpers import ( - BRIDGE_NAME, - MONGODB_PORT, delete_iptables_rules, detect_cpu_type, detect_vm_mode, - get_bridge_ip, patch_yaml_config, set_subroot_env, patch_lxc_config, setup_iptables, update_pccs_url, - update_mongodb_connection, LXCContainer, PKI_SERVICE_NAME, get_node_tunnel_ip, init_container, + VMMode, + save_property_into_fs, + read_property_from_fs, ) # Configuration plugin = ProvisionPlugin() +# Authority service property prefix and names +AUTHORITY_SERVICE_PREFIX = "pki_authority_" +AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] +PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" + # Plugin commands @plugin.command("init") def handle_init(input_data: PluginInput) -> PluginOutput: @@ -56,21 +62,48 @@ def handle_apply(input_data: PluginInput) -> 
PluginOutput: return PluginOutput(status="error", error_message="Invalid state format", local_state=local_state) wg_props = state_json.get("wgNodeProperties", []) + authority_props = state_json.get("authorityServiceProperties", []) + + # Convert authority service properties to dict for easier access + authority_config = {prop["name"]: prop["value"] for prop in authority_props} local_tunnel_ip = get_node_tunnel_ip(local_node_id, wg_props) if not local_tunnel_ip: return PluginOutput(status="error", error_message="Local node has no WireGuard tunnel IP", local_state=local_state) - + try: + vm_mode = detect_vm_mode() + initialized = authority_config.get(PROP_INITIALIZED) + # If initialized is true, verify all required properties are present + if initialized == "true": + missing = [] + + for prop in AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = authority_config.get(prop_name, "") + + if not prop_value: + missing.append(prop_name) + + + if missing: + error_msg = f"Service marked as initialized but missing properties: {', '.join(missing)}" + print(f"[!] {error_msg}", file=sys.stderr) + return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": + return PluginOutput( + status="postponed", + error_message="Waiting for authority service properties to be initialized", + local_state=local_state + ) + cpu_type = detect_cpu_type() delete_iptables_rules() patch_yaml_config(cpu_type) set_subroot_env() patch_lxc_config(cpu_type) update_pccs_url() - host_ip = get_bridge_ip(BRIDGE_NAME) - mongodb_nodes = [f"{host_ip}:{MONGODB_PORT}"] - update_mongodb_connection(mongodb_nodes) setup_iptables(local_tunnel_ip) container = LXCContainer(PKI_SERVICE_NAME) @@ -78,29 +111,69 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: if container.is_running(): print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") - # Stop container gracefully exit_code = container.stop(graceful_timeout=30, command_timeout=60) if exit_code != 0: - error_msg = f"Failed to stop container with exit code {exit_code}" - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) - - # Start container - exit_code = container.start(timeout=30) - if exit_code != 0: - error_msg = f"Failed to start container with exit code {exit_code}" - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) - else: - # Start container - exit_code = container.start(timeout=30) - if exit_code != 0: - error_msg = f"Failed to start container with exit code {exit_code}" - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + raise Exception(f"Failed to stop container with exit code {exit_code}") + + if initialized == "true": + for prop in AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = authority_config.get(prop_name, "") + save_property_into_fs(prop, base64.b64decode(prop_value)) + + exit_code = container.start(timeout=30) + if exit_code != 0: + raise Exception(f"Failed to start container with exit code {exit_code}") print(f"[*] LXC container {PKI_SERVICE_NAME} is running") - return PluginOutput(status="completed", local_state=local_state) + # If not initialized, wait for tee-pki service to generate property files + if initialized != "true": + missing_properties = AUTHORITY_SERVICE_PROPERTIES.copy() + timeout = 30 + interval = 5 + elapsed = 0 + collected_properties = {} + + while 
elapsed < timeout:
+                # Try to read each missing property
+                for prop in missing_properties[:]:
+                    success, value = read_property_from_fs(prop)
+
+                    if success:
+                        collected_properties[f"{AUTHORITY_SERVICE_PREFIX}{prop}"] = base64.b64encode(value).decode()
+                        missing_properties.remove(prop)
+
+                # Check if all properties collected
+                if not missing_properties:
+                    print("[*] All property files have been generated by tee-pki service")
+
+                    # Add initialized flag
+                    collected_properties[PROP_INITIALIZED] = "true"
+
+                    return PluginOutput(
+                        status="completed",
+                        cluster_properties=collected_properties,
+                        local_state=local_state
+                    )
+
+                # Show what's still missing
+                print(f"[*] Waiting for property files. Missing: {', '.join(missing_properties)} (elapsed: {elapsed}s)")
+
+                time.sleep(interval)
+                elapsed += interval
+
+            # Timeout reached
+            return PluginOutput(
+                status="postponed",
+                error_message=f"Timeout waiting for tee-pki to generate property files: {', '.join(missing_properties)}",
+                local_state=local_state
+            )
+
+        return PluginOutput(status="completed", local_state=local_state)
+
     except Exception as e:
-        error_msg = f"Failed to start service: {str(e)}"
+        error_msg = f"Apply failed: {str(e)}"
         print(f"[!] {error_msg}", file=sys.stderr)
         return PluginOutput(status="error", error_message=error_msg, local_state=local_state)
 
diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml
index e504db4d..f329c866 100644
--- a/src/services/apps/pki-authority/manifest.yaml
+++ b/src/services/apps/pki-authority/manifest.yaml
@@ -52,6 +52,16 @@ stateExpr:
           .name == "tunnel_ip"
         ) |
         {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node}
-      ] | sort_by(.cluster_node, .name, .value, .node_id)
+      ] | sort_by(.cluster_node, .name, .value, .node_id),
+
+      authorityServiceProperties: [
+        $swarmdb.clusterproperties[] |
+        select(
+          .cluster == $cluster.id and
+          .deleted_ts == null and
+          (.name | startswith("pki_authority_"))
+        ) |
+        {name, value}
+      ] | sort_by(.name)
     }
 

From 9c1d259e524441b65b66bfa3bac07f57d984e987 Mon Sep 17 00:00:00 2001
From: Petr Evstifeev
Date: Wed, 17 Dec 2025 11:43:35 -0600
Subject: [PATCH 13/51] improved restart reasons

---
 src/services/apps/pki-authority/helpers.py | 129 ++++++++++++---------
 src/services/apps/pki-authority/main.py    |  60 ++++++++--
 2 files changed, 125 insertions(+), 64 deletions(-)

diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py
index b4fc605f..112e3a84 100644
--- a/src/services/apps/pki-authority/helpers.py
+++ b/src/services/apps/pki-authority/helpers.py
@@ -24,6 +24,7 @@
 CONTAINER_IP = "10.0.3.100"
 WIREGUARD_INTERFACE = "wg0"
 STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage")
+IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule"
 
 
 class VMMode(Enum):
@@ -234,10 +235,8 @@ def detect_vm_mode() -> VMMode:
     return VMMode.SWARM_NORMAL
 
 
-def patch_yaml_config(cpu_type: str):
+def patch_yaml_config(cpu_type: str, vm_mode: VMMode):
     """Set own challenge type in LXC container configuration."""
-    vm_mode = detect_vm_mode()
-
     if vm_mode == VMMode.LEGACY:
         template_name = "lxc-legacy-vm-template.yaml"
         print(f"Detected {vm_mode.value} mode, using legacy template")
@@ -405,41 +404,8 @@ def enable_route_localnet(bridge_name: str):
     print(f"Enabled route_localnet for {bridge_name}")
 
 
-def add_iptables_rule(host_ip: str, port: str):
-    """Add iptables DNAT rule if it doesn't exist."""
-    # Check if rule exists
-    check_result 
= subprocess.run( - [ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-p", "tcp", - "-d", host_ip, - "--dport", port, - "-j", "DNAT", - "--to-destination", f"127.0.0.1:{port}" - ], - capture_output=True - ) - - if check_result.returncode == 0: - print(f"iptables DNAT rule already exists for {host_ip}:{port}") - else: - subprocess.run( - [ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-p", "tcp", - "-d", host_ip, - "--dport", port, - "-j", "DNAT", - "--to-destination", f"127.0.0.1:{port}" - ], - check=True - ) - print(f"iptables DNAT rule added: {host_ip}:{port} -> 127.0.0.1:{port}") - def delete_iptables_rules(): """Delete all iptables NAT rules for PKI container.""" - host_ip = get_bridge_ip(BRIDGE_NAME) - # Delete rules from all chains: PREROUTING, OUTPUT, POSTROUTING for chain in ["PREROUTING", "OUTPUT", "POSTROUTING"]: result = subprocess.run( @@ -450,57 +416,116 @@ def delete_iptables_rules(): rules = result.stdout.splitlines() for rule in rules: - # Delete rules that contain host_ip or CONTAINER_IP - if host_ip in rule or CONTAINER_IP in rule: + # Delete rules that contain our comment + if IPTABLES_RULE_COMMENT in rule: delete_rule = rule.replace("-A", "-D", 1) subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) print(f"Deleted iptables rule: {delete_rule}") +def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): + print(f"[*] Checking iptables rule: {description}") + + check_result = subprocess.run(check_args, capture_output=True) + + if check_result.returncode == 0: + print(f"[*] Rule already exists") + else: + subprocess.run(add_args, check=True) + print(f"[*] Rule added") + + def setup_iptables(wg_ip): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) enable_route_localnet(BRIDGE_NAME) - add_iptables_rule(host_ip, PCCS_PORT) + # Rule 1: PCCS DNAT + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", PCCS_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{PCCS_PORT}" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", PCCS_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{PCCS_PORT}" + ], + description=f"PCCS DNAT {host_ip}:{PCCS_PORT} -> 127.0.0.1:{PCCS_PORT}" + ) - subprocess.run( - [ + # Rule 2: WireGuard PREROUTING + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + add_args=[ "iptables", "-t", "nat", "-A", "PREROUTING", "-i", WIREGUARD_INTERFACE, "-p", "tcp", "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, "-j", "DNAT", "--to-destination", f"{CONTAINER_IP}:443" ], - check=True + description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) - print(f"Added iptables rule: PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") - subprocess.run( - [ + # Rule 3: OUTPUT + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", 
"DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + add_args=[ "iptables", "-t", "nat", "-A", "OUTPUT", "-d", wg_ip, "-p", "tcp", "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, "-j", "DNAT", "--to-destination", f"{CONTAINER_IP}:443" ], - check=True + description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) - print(f"Added iptables rule: OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443") - subprocess.run( - [ + # Rule 4: MASQUERADE + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "POSTROUTING", + "-s", f"{CONTAINER_IP}/32", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "MASQUERADE" + ], + add_args=[ "iptables", "-t", "nat", "-A", "POSTROUTING", "-s", f"{CONTAINER_IP}/32", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, "-j", "MASQUERADE" ], - check=True + description=f"POSTROUTING MASQUERADE for {CONTAINER_IP}/32" ) - print(f"Added iptables rule: POSTROUTING MASQUERADE for {CONTAINER_IP}/32") - def update_pccs_url(): diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 8c138e21..001ba364 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -35,6 +35,38 @@ AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" +def is_restart_required(authority_config: dict) -> bool: + # Compare authority config properties with values stored in filesystem + for prop in AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" + config_value = authority_config.get(prop_name, "") + + if not config_value: + continue + + # Read current value from filesystem + success, fs_value = read_property_from_fs(prop) + + if not success: + # File doesn't exist in FS, restart required + print(f"[*] Property {prop} not found in filesystem, restart required") + return True + + # Decode config value from base64 and compare with filesystem value + try: + decoded_config_value = base64.b64decode(config_value) + if decoded_config_value != fs_value: + print(f"[*] Property {prop} has changed, restart required") + return True + except Exception as e: + print(f"[!] 
Failed to decode property {prop}: {e}", file=sys.stderr) + return True + + # No changes detected + print("[*] No configuration changes detected") + return False + + # Plugin commands @plugin.command("init") def handle_init(input_data: PluginInput) -> PluginOutput: @@ -98,23 +130,27 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: local_state=local_state ) - cpu_type = detect_cpu_type() - delete_iptables_rules() - patch_yaml_config(cpu_type) - set_subroot_env() - patch_lxc_config(cpu_type) - update_pccs_url() - setup_iptables(local_tunnel_ip) container = LXCContainer(PKI_SERVICE_NAME) # Start or restart LXC container if container.is_running(): - print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") - - exit_code = container.stop(graceful_timeout=30, command_timeout=60) - if exit_code != 0: - raise Exception(f"Failed to stop container with exit code {exit_code}") + if initialized != "true" or is_restart_required(authority_config): + print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") + + exit_code = container.stop(graceful_timeout=30, command_timeout=60) + if exit_code != 0: + raise Exception(f"Failed to stop container with exit code {exit_code}") + else: + print(f"[*] Container {PKI_SERVICE_NAME} is already running, no restart required") + return PluginOutput(status="completed", local_state=local_state) + cpu_type = detect_cpu_type() + patch_yaml_config(cpu_type, vm_mode) + set_subroot_env() + patch_lxc_config(cpu_type) + update_pccs_url() + setup_iptables(local_tunnel_ip) + if initialized == "true": for prop in AUTHORITY_SERVICE_PROPERTIES: prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" From b0ad5772c50fafabe57618420cf7c73f8fe9268e Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 18 Dec 2025 11:33:49 -0600 Subject: [PATCH 14/51] registry external endpoints --- src/services/apps/pki-authority/helpers.py | 82 ++-- src/services/apps/pki-authority/main.py | 448 +++++++++++------- src/services/apps/pki-authority/manifest.yaml | 26 + 3 files changed, 360 insertions(+), 196 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 112e3a84..bb704884 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import List, Optional from enum import Enum +from datetime import datetime PKI_SERVICE_NAME = "pki-authority" SERVICE_INSIDE_CONTAINER = "tee-pki" @@ -27,6 +28,20 @@ IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" +class LogLevel(Enum): + """Log levels for structured logging.""" + INFO = "INFO" + WARN = "WARN" + ERROR = "ERROR" + DEBUG = "DEBUG" + + +def log(level: LogLevel, message: str): + """Log message with timestamp, service name and level.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + print(f"[{timestamp}] [{PKI_SERVICE_NAME}] [{level.value}] {message}", file=sys.stderr) + + class VMMode(Enum): """VM mode types.""" LEGACY = "legacy" @@ -41,7 +56,7 @@ def __init__(self, container_name: str = PKI_SERVICE_NAME): def start(self, timeout: int = 30) -> int: """Start LXC container. 
Returns exit code.""" - print(f"[*] Starting LXC container {self.container_name}") + log(LogLevel.INFO, f"Starting LXC container {self.container_name}") result = subprocess.run( ["lxc-start", "-n", self.container_name], capture_output=True, @@ -52,7 +67,7 @@ def start(self, timeout: int = 30) -> int: def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: """Stop LXC container gracefully. Returns exit code.""" - print(f"[*] Stopping LXC container {self.container_name} gracefully") + log(LogLevel.INFO, f"Stopping LXC container {self.container_name} gracefully") result = subprocess.run( ["lxc-stop", "-n", self.container_name, "-t", str(graceful_timeout)], capture_output=True, @@ -63,7 +78,7 @@ def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: def destroy(self) -> int: """Destroy LXC container. Returns exit code.""" - print(f"[*] Destroying LXC container {self.container_name}") + log(LogLevel.INFO, f"Destroying LXC container {self.container_name}") result = subprocess.run( ["lxc-destroy", "-n", self.container_name, "-f"], capture_output=True, @@ -72,7 +87,7 @@ def destroy(self) -> int: ) if result.returncode != 0: - print(f"[!] Failed to destroy container: {result.stderr}", file=sys.stderr) + log(LogLevel.ERROR, f"Failed to destroy container: {result.stderr}") return result.returncode @@ -85,11 +100,11 @@ def is_running(self) -> bool: text=True ) if self.container_name not in result.stdout: - print(f"[*] LXC container {self.container_name} is not running") + log(LogLevel.INFO, f"LXC container {self.container_name} is not running") return False return True except Exception as e: - print(f"[!] Failed to check LXC container status: {e}", file=sys.stderr) + log(LogLevel.ERROR, f"Failed to check LXC container status: {e}") return False def get_ip(self) -> Optional[str]: @@ -103,7 +118,7 @@ def get_ip(self) -> Optional[str]: container_ip = result.stdout.strip() if result.stdout.strip() else None return container_ip except Exception as e: - print(f"[!] Failed to get container IP: {e}", file=sys.stderr) + log(LogLevel.ERROR, f"Failed to get container IP: {e}") return None def create(self, archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar") -> bool: @@ -116,10 +131,10 @@ def create(self, archive_path: str = "/etc/super/containers/pki-authority/pki-au ) if result.returncode == 0: - print(f"Container '{self.container_name}' already exists.") + log(LogLevel.INFO, f"Container '{self.container_name}' already exists.") return True else: - print(f"Container '{self.container_name}' not found. Creating...") + log(LogLevel.INFO, f"Container '{self.container_name}' not found. Creating...") try: subprocess.run( [ @@ -131,10 +146,10 @@ def create(self, archive_path: str = "/etc/super/containers/pki-authority/pki-au ], check=True ) - print(f"Container '{self.container_name}' created.") + log(LogLevel.INFO, f"Container '{self.container_name}' created.") return True except subprocess.CalledProcessError as e: - print(f"[!] 
Failed to create container: {e}", file=sys.stderr) + log(LogLevel.ERROR, f"Failed to create container: {e}") return False def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck") -> bool: @@ -149,7 +164,7 @@ def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/hea status = result.stdout.strip() if status not in ["active", "activating"]: - print(f"[*] Service {SERVICE_INSIDE_CONTAINER} status: {status}") + log(LogLevel.INFO, f"Service {SERVICE_INSIDE_CONTAINER} status: {status}") return False # 2. If service is active, check how long it's been running @@ -193,19 +208,19 @@ def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/hea if response.status == 200: return True else: - print(f"[*] Healthcheck returned status: {response.status}") + log(LogLevel.INFO, f"Healthcheck returned status: {response.status}") return False except Exception as e: - print(f"[*] Healthcheck failed: {e}") + log(LogLevel.INFO, f"Healthcheck failed: {e}") return False except Exception as e: - print(f"[*] Failed to parse service uptime: {e}") + log(LogLevel.INFO, f"Failed to parse service uptime: {e}") # Service is active or activating (but not ready for healthcheck yet) return True except Exception as e: - print(f"[!] Failed to check service health: {e}", file=sys.stderr) + log(LogLevel.ERROR, f"Failed to check service health: {e}") return False @@ -239,16 +254,16 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode): """Set own challenge type in LXC container configuration.""" if vm_mode == VMMode.LEGACY: template_name = "lxc-legacy-vm-template.yaml" - print(f"Detected {vm_mode.value} mode, using legacy template") + log(LogLevel.INFO, f"Detected {vm_mode.value} mode, using legacy template") else: template_name = "lxc-swarm-template.yaml" - print(f"Detected {vm_mode.value} mode, using swarm template") + log(LogLevel.INFO, f"Detected {vm_mode.value} mode, using swarm template") src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") if not src_yaml.exists(): - print(f"Error: {src_yaml} not found.") + log(LogLevel.ERROR, f"Error: {src_yaml} not found.") sys.exit(1) # Load YAML, modify, and save @@ -271,7 +286,7 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode): mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" config["pki"]["mode"]["attestationServiceSource"]["mode"] = mode_value - print(f"Set attestationServiceSource mode to: {mode_value}") + log(LogLevel.INFO, f"Set attestationServiceSource mode to: {mode_value}") # Ensure destination directory exists dst_yaml.parent.mkdir(parents=True, exist_ok=True) @@ -280,7 +295,7 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode): with open(dst_yaml, "w") as f: yaml.dump(config, f, default_flow_style=False) - print(f"Patched {dst_yaml} with type: {cpu_type}") + log(LogLevel.INFO, f"Patched {dst_yaml} with type: {cpu_type}") def set_subroot_env(): @@ -297,7 +312,7 @@ def set_subroot_env(): dst_subroot_env = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/subroot.env") if not src_subroot_env.exists(): - print(f"Info: {src_subroot_env} not found; skipping creation of {dst_subroot_env}") + log(LogLevel.INFO, f"Info: {src_subroot_env} not found; skipping creation of {dst_subroot_env}") return # Remove destination first to ensure a clean recreate @@ -320,7 +335,7 @@ def set_subroot_env(): # Set permissions dst_subroot_env.chmod(0o644) - print(f"Created {dst_subroot_env} with 
trusted variables.") + log(LogLevel.INFO, f"Created {dst_subroot_env} with trusted variables.") def patch_lxc_config(cpu_type: str): @@ -372,13 +387,13 @@ def get_bridge_ip(bridge_name: str) -> str: ) if result.returncode != 0: - print(f"Error: Could not determine IP address for bridge {bridge_name}") + log(LogLevel.ERROR, f"Error: Could not determine IP address for bridge {bridge_name}") sys.exit(1) # Parse IP address from output match = re.search(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout) if not match: - print(f"Error: Could not determine IP address for bridge {bridge_name}") + log(LogLevel.ERROR, f"Error: Could not determine IP address for bridge {bridge_name}") sys.exit(1) return match.group(1) @@ -395,13 +410,13 @@ def enable_route_localnet(bridge_name: str): ) if result.returncode == 0 and result.stdout.strip() == "1": - print(f"route_localnet already enabled for {bridge_name}") + log(LogLevel.INFO, f"route_localnet already enabled for {bridge_name}") else: subprocess.run( ["sysctl", "-w", f"{sysctl_key}=1"], check=True ) - print(f"Enabled route_localnet for {bridge_name}") + log(LogLevel.INFO, f"Enabled route_localnet for {bridge_name}") def delete_iptables_rules(): @@ -420,19 +435,19 @@ def delete_iptables_rules(): if IPTABLES_RULE_COMMENT in rule: delete_rule = rule.replace("-A", "-D", 1) subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) - print(f"Deleted iptables rule: {delete_rule}") + log(LogLevel.INFO, f"Deleted iptables rule: {delete_rule}") def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): - print(f"[*] Checking iptables rule: {description}") + log(LogLevel.INFO, f"Checking iptables rule: {description}") check_result = subprocess.run(check_args, capture_output=True) if check_result.returncode == 0: - print(f"[*] Rule already exists") + log(LogLevel.INFO, f"Rule already exists") else: subprocess.run(add_args, check=True) - print(f"[*] Rule added") + log(LogLevel.INFO, f"Rule added") def setup_iptables(wg_ip): @@ -538,7 +553,7 @@ def update_pccs_url(): pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" if not qcnl_conf.exists(): - print(f"Error: {qcnl_conf} not found") + log(LogLevel.ERROR, f"Error: {qcnl_conf} not found") sys.exit(1) if not qcnl_conf_bak.exists(): @@ -558,7 +573,7 @@ def update_pccs_url(): with open(qcnl_conf, "w") as f: f.write(content) - print(f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") + log(LogLevel.INFO, f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") def init_container(): @@ -585,3 +600,4 @@ def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: if content: return (True, content) return (False, b"") + diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 001ba364..b57e2c7e 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -2,7 +2,11 @@ import sys import time +import json +import subprocess from pathlib import Path +from datetime import datetime +from enum import Enum from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput import base64 @@ -25,46 +29,289 @@ VMMode, save_property_into_fs, read_property_from_fs, + LogLevel, + log, ) # Configuration plugin = ProvisionPlugin() -# Authority service property prefix and names -AUTHORITY_SERVICE_PREFIX = "pki_authority_" -AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] -PROP_INITIALIZED = 
f"{AUTHORITY_SERVICE_PREFIX}initialized" -def is_restart_required(authority_config: dict) -> bool: - # Compare authority config properties with values stored in filesystem - for prop in AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" - config_value = authority_config.get(prop_name, "") +class ApplyHandler: + """Handler for apply command logic with unified exit point.""" + + # Authority service property prefix and names + AUTHORITY_SERVICE_PREFIX = "pki_authority_" + AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] + PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" + PROP_REGISTERED_ENDPOINTS=f"{AUTHORITY_SERVICE_PREFIX}registered_endpoints" + + def __init__(self, input_data: PluginInput): + self.input_data = input_data + self.local_node_id = input_data.local_node_id + self.state_json = input_data.state or {} + self.local_state = input_data.local_state or {} + self.cluster_info = self.state_json.get("cluster", {}) + leader_node_id = self.cluster_info.get("leader_node") + self.is_leader = (self.local_node_id == leader_node_id) + self.pki_cluster_nodes = self.state_json.get("clusterNodes", []) + self.wg_props = self.state_json.get("wgNodeProperties", []) + self.authority_props = self.state_json.get("authorityServiceProperties", []) + self.authority_config = {prop["name"]: prop["value"] for prop in self.authority_props} - if not config_value: - continue - - # Read current value from filesystem - success, fs_value = read_property_from_fs(prop) + # Output parameters + self.status = None + self.error_message = None + self.cluster_properties = {} + + def get_redis_tunnel_ips(self) -> list[str]: + redis_node_props = self.state_json.get("redisNodeProperties", []) + wg_props = self.state_json.get("wgNodeProperties", []) + + redis_hosts = [] + for prop in redis_node_props: + if prop.get("name") == "redis_node_ready" and prop.get("value") == "true": + node_id = prop.get("node_id") + tunnel_ip = get_node_tunnel_ip(node_id, wg_props) + if tunnel_ip: + redis_hosts.append(tunnel_ip) + + return sorted(set(redis_hosts)) + + + def create_gateway_endpoints(self): + if not self.is_leader: + return + + registered_endpoints = self.authority_config.get(self.PROP_REGISTERED_ENDPOINTS, "").split(";") + + current_endpoints = [] + for node in self.pki_cluster_nodes: + node_id = node.get("node_id") + tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) + if tunnel_ip: + current_endpoints.append(tunnel_ip) + + # Compare endpoints regardless of order + if set(registered_endpoints) == set(current_endpoints): + log(LogLevel.INFO, f"Gateway endpoints are up to date: registered={registered_endpoints}, current={current_endpoints}") + return + + log(LogLevel.INFO, f"Gateway endpoints changed: registered={registered_endpoints}, current={current_endpoints}") + + # Get list of all Redis nodes with IP addresses + redis_tunnel_ips = self.get_redis_tunnel_ips() - if not success: - # File doesn't exist in FS, restart required - print(f"[*] Property {prop} not found in filesystem, restart required") - return True + if not redis_tunnel_ips and current_endpoints: + self.status = "postponed" + self.error_message = "No Redis nodes available to configure gateway routes" + return + + # Build targets list from current endpoints + targets = [{"url": f"https://{endpoint}:8443", "weight": 1} for endpoint in current_endpoints] + route_config = { + "targets": targets, + "policy": "rr", + "preserve_host": False + } + route_json 
= json.dumps(route_config) + route_key = "routes:super-develop-tdx.superprotocol.dev" + + last_redis_error = None + for redis_ip in redis_tunnel_ips: + try: + log(LogLevel.INFO, f"Attempting to set gateway route in Redis at {redis_ip}:6379") + result = subprocess.run( + ["redis-cli", "-h", redis_ip, "-p", "6379", "SET", route_key, route_json], + capture_output=True, + text=True, + timeout=15 + ) + + if result.returncode == 0 and result.stdout.strip() == "OK": + log(LogLevel.INFO, f"Successfully set gateway route in Redis at {redis_ip}:6379") + if self.cluster_properties is None: + self.cluster_properties = {} + self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = ";".join(current_endpoints) + return + else: + last_redis_error = f"Failed to set route in Redis at {redis_ip}: {result.stderr}" + except subprocess.TimeoutExpired: + last_redis_error = f"Timeout connecting to Redis at {redis_ip}" + except Exception as e: + last_redis_error = f"Exception setting route in Redis at {redis_ip}: {e}" + + self.status = "error" + self.error_message = last_redis_error + log(LogLevel.ERROR, last_redis_error) + + def create_output(self) -> PluginOutput: + if self.status == "completed": + self.create_gateway_endpoints() + elif self.status =="postponed": + log(LogLevel.INFO, f"Apply postponed: {self.error_message}") + elif self.status == "error": + log(LogLevel.ERROR, f"Apply error: {self.error_message}") + else: + log(LogLevel.ERROR, f"Apply ended with unknown status {self.status}") + + return PluginOutput( + status=self.status, + local_state=self.local_state if self.status == "completed" else None, + error_message=self.error_message, + cluster_properties=self.cluster_properties if self.status == "completed" else None + ) + + def apply(self) -> PluginOutput: + if not isinstance(self.state_json, dict): + self.status = "error" + self.error_message = "Invalid state format" + return self.create_output() + + local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) + if not local_tunnel_ip: + self.status = "error" + self.error_message = "Local node has no WireGuard tunnel IP" + return self.create_output() - # Decode config value from base64 and compare with filesystem value try: - decoded_config_value = base64.b64decode(config_value) - if decoded_config_value != fs_value: - print(f"[*] Property {prop} has changed, restart required") - return True + vm_mode = detect_vm_mode() + initialized = self.authority_config.get(self.PROP_INITIALIZED) + + # If initialized is true, verify all required properties are present + if initialized == "true": + missing = [] + + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = self.authority_config.get(prop_name, "") + + if not prop_value: + missing.append(prop_name) + + if missing: + error_msg = f"Service marked as initialized but missing properties: {', '.join(missing)}" + log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self.create_output() + + if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": + self.status = "postponed" + self.error_message = "Waiting for authority service properties to be initialized" + return self.create_output() + + container = LXCContainer(PKI_SERVICE_NAME) + + # Start or restart LXC container + if container.is_running(): + if initialized != "true" or self.is_restart_required(): + exit_code = container.stop(graceful_timeout=30, command_timeout=60) + if exit_code != 0: + raise Exception(f"Failed to stop container with 
exit code {exit_code}") + else: + log(LogLevel.INFO, f"Container {PKI_SERVICE_NAME} is already running, no restart required") + self.status = "completed" + return self.create_output() + + cpu_type = detect_cpu_type() + patch_yaml_config(cpu_type, vm_mode) + set_subroot_env() + patch_lxc_config(cpu_type) + update_pccs_url() + setup_iptables(local_tunnel_ip) + + if initialized == "true": + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = self.authority_config.get(prop_name, "") + save_property_into_fs(prop, base64.b64decode(prop_value)) + + exit_code = container.start(timeout=30) + if exit_code != 0: + raise Exception(f"Failed to start container with exit code {exit_code}") + + log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} is running") + + # If not initialized, wait for tee-pki service to generate property files + if initialized != "true": + missing_properties = self.AUTHORITY_SERVICE_PROPERTIES.copy() + timeout = 30 + interval = 5 + elapsed = 0 + collected_properties = {} + + while elapsed < timeout: + # Try to read each missing property + for prop in missing_properties[:]: + success, value = read_property_from_fs(prop) + + if success: + collected_properties[f"{self.AUTHORITY_SERVICE_PREFIX}{prop}"] = base64.b64encode(value).decode() + missing_properties.remove(prop) + + # Check if all properties collected + if not missing_properties: + log(LogLevel.INFO, "All property files have been generated by tee-pki service") + + # Add initialized flag + collected_properties[self.PROP_INITIALIZED] = "true" + + self.status = "completed" + self.cluster_properties = collected_properties + return self.create_output() + + # Show what's still missing + log(LogLevel.INFO, f"Waiting for property files. Missing: {', '.join(missing_properties)} (elapsed: {elapsed}s)") + + time.sleep(interval) + elapsed += interval + + # Timeout reached + self.status = "postponed" + self.error_message = f"Timeout waiting for tee-pki to generate property files: {', '.join(missing_properties)}" + return self.create_output() + + self.status = "completed" + return self.create_output() + except Exception as e: - print(f"[!] 
Failed to decode property {prop}: {e}", file=sys.stderr) - return True + error_msg = f"Apply failed: {str(e)}" + log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self.create_output() - # No changes detected - print("[*] No configuration changes detected") - return False + def is_restart_required(self) -> bool: + """Check if container restart is required based on configuration changes.""" + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + config_value = self.authority_config.get(prop_name, "") + + if not config_value: + continue + + # Read current value from filesystem + success, fs_value = read_property_from_fs(prop) + + if not success: + # File doesn't exist in FS, restart required + log(LogLevel.INFO, f"Property {prop} not found in filesystem, restart required") + return True + + # Decode config value from base64 and compare with filesystem value + try: + decoded_config_value = base64.b64decode(config_value) + if decoded_config_value != fs_value: + log(LogLevel.INFO, f"Property {prop} has changed, restart required") + return True + except Exception as e: + log(LogLevel.ERROR, f"Failed to decode property {prop}: {e}") + return True + + # No changes detected + log(LogLevel.INFO, "No configuration changes detected") + return False # Plugin commands @@ -72,146 +319,21 @@ def is_restart_required(authority_config: dict) -> bool: def handle_init(input_data: PluginInput) -> PluginOutput: """Initialize PKI Authority service.""" try: - print("[*] Running PKI initialization") + log(LogLevel.INFO, "Running PKI initialization") init_container() - print("[*] PKI initialization completed") + log(LogLevel.INFO, "PKI initialization completed") return PluginOutput(status="completed", local_state=input_data.local_state) except Exception as e: error_msg = f"Failed to initialize PKI: {str(e)}" - print(f"[!] {error_msg}", file=sys.stderr) + log(LogLevel.ERROR, error_msg) return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) @plugin.command("apply") def handle_apply(input_data: PluginInput) -> PluginOutput: """Apply PKI Authority configuration and start service.""" - - local_node_id = input_data.local_node_id - state_json = input_data.state or {} - local_state = input_data.local_state or {} - - if not isinstance(state_json, dict): - return PluginOutput(status="error", error_message="Invalid state format", local_state=local_state) - - wg_props = state_json.get("wgNodeProperties", []) - authority_props = state_json.get("authorityServiceProperties", []) - - # Convert authority service properties to dict for easier access - authority_config = {prop["name"]: prop["value"] for prop in authority_props} - - local_tunnel_ip = get_node_tunnel_ip(local_node_id, wg_props) - if not local_tunnel_ip: - return PluginOutput(status="error", error_message="Local node has no WireGuard tunnel IP", local_state=local_state) - - try: - vm_mode = detect_vm_mode() - initialized = authority_config.get(PROP_INITIALIZED) - # If initialized is true, verify all required properties are present - if initialized == "true": - missing = [] - - for prop in AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = authority_config.get(prop_name, "") - - if not prop_value: - missing.append(prop_name) - - - if missing: - error_msg = f"Service marked as initialized but missing properties: {', '.join(missing)}" - print(f"[!] 
{error_msg}", file=sys.stderr) - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) - - if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": - return PluginOutput( - status="postponed", - error_message="Waiting for authority service properties to be initialized", - local_state=local_state - ) - - container = LXCContainer(PKI_SERVICE_NAME) - - # Start or restart LXC container - if container.is_running(): - if initialized != "true" or is_restart_required(authority_config): - print(f"[*] Restarting LXC container {PKI_SERVICE_NAME}") - - exit_code = container.stop(graceful_timeout=30, command_timeout=60) - if exit_code != 0: - raise Exception(f"Failed to stop container with exit code {exit_code}") - else: - print(f"[*] Container {PKI_SERVICE_NAME} is already running, no restart required") - return PluginOutput(status="completed", local_state=local_state) - - cpu_type = detect_cpu_type() - patch_yaml_config(cpu_type, vm_mode) - set_subroot_env() - patch_lxc_config(cpu_type) - update_pccs_url() - setup_iptables(local_tunnel_ip) - - if initialized == "true": - for prop in AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = authority_config.get(prop_name, "") - save_property_into_fs(prop, base64.b64decode(prop_value)) - - exit_code = container.start(timeout=30) - if exit_code != 0: - raise Exception(f"Failed to start container with exit code {exit_code}") - - print(f"[*] LXC container {PKI_SERVICE_NAME} is running") - - # If not initialized, wait for tee-pki service to generate property files - if initialized != "true": - missing_properties = AUTHORITY_SERVICE_PROPERTIES.copy() - timeout = 30 - interval = 5 - elapsed = 0 - collected_properties = {} - - while elapsed < timeout: - # Try to read each missing property - for prop in missing_properties[:]: - success, value = read_property_from_fs(prop) - - if success: - collected_properties[f"{AUTHORITY_SERVICE_PREFIX}{prop}"] = base64.b64encode(value).decode() - missing_properties.remove(prop) - - # Check if all properties collected - if not missing_properties: - print("[*] All property files have been generated by tee-pki service") - - # Add initialized flag - collected_properties[PROP_INITIALIZED] = "true" - - return PluginOutput( - status="completed", - cluster_properties=collected_properties, - local_state=local_state - ) - - # Show what's still missing - print(f"[*] Waiting for property files. Missing: {', '.join(missing_properties)} (elapsed: {elapsed}s)") - - time.sleep(interval) - elapsed += interval - - # Timeout reached - return PluginOutput( - status="postponed", - error_message=f"Timeout waiting for tee-pki to generate property files: {', '.join(missing_properties)}", - local_state=local_state - ) - - return PluginOutput(status="completed", local_state=local_state) - - except Exception as e: - error_msg = f"Apply failed: {str(e)}" - print(f"[!] {error_msg}", file=sys.stderr) - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + handler = ApplyHandler(input_data) + return handler.apply() @plugin.command("health") @@ -232,14 +354,14 @@ def handle_health(input_data: PluginInput) -> PluginOutput: ) except Exception as e: error_msg = f"Health check failed: {str(e)}" - print(f"[!] 
{error_msg}", file=sys.stderr) + log(LogLevel.ERROR, error_msg) return PluginOutput(status="error", error_message=error_msg, local_state=local_state) @plugin.command("finalize") def handle_finalize(input_data: PluginInput) -> PluginOutput: """Finalize PKI Authority service setup.""" - print("[*] PKI Authority finalized") + log(LogLevel.INFO, "PKI Authority finalized") return PluginOutput(status="completed", local_state=input_data.local_state) @@ -255,7 +377,7 @@ def handle_destroy(input_data: PluginInput) -> PluginOutput: if container.is_running(): exit_code = container.stop(graceful_timeout=30, command_timeout=60) if exit_code != 0: - print(f"[!] Warning: Failed to stop container gracefully", file=sys.stderr) + log(LogLevel.WARN, "Failed to stop container gracefully") # Destroy container exit_code = container.destroy() @@ -265,12 +387,12 @@ def handle_destroy(input_data: PluginInput) -> PluginOutput: delete_iptables_rules() - print("[*] PKI Authority destroyed") + log(LogLevel.INFO, "PKI Authority destroyed") return PluginOutput(status="completed", local_state=local_state) except Exception as e: error_msg = f"Destroy failed: {str(e)}" - print(f"[!] {error_msg}", file=sys.stderr) + log(LogLevel.ERROR, error_msg) return PluginOutput(status="error", error_message=error_msg, local_state=local_state) diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml index f329c866..2595ed2c 100644 --- a/src/services/apps/pki-authority/manifest.yaml +++ b/src/services/apps/pki-authority/manifest.yaml @@ -17,6 +17,18 @@ stateExpr: ($pkiClusterNodes | map(.node)) as $pkiNodeIds | + # Find Redis cluster + ( + $swarmdb.clusters[] | + select(.cluster_policy == "redis" and .deleted_ts == null) + ) as $redisCluster | + + # Get Redis cluster nodes + ([$swarmdb.clusternodes[] | select(.cluster == $redisCluster.id and .deleted_ts == null)]) as $redisClusterNodes | + + ($redisClusterNodes | map(.node)) as $redisNodeIds | + + # Find WireGuard cluster that contains PKI nodes ( $swarmdb.clusters[] | select(.cluster_policy == "wireguard" and .deleted_ts == null) | @@ -40,6 +52,20 @@ stateExpr: {id, node_id: .node, cluster} ] | sort_by(.id, .node_id, .cluster), + redisCluster: { + id: $redisCluster.id + }, + + redisNodeProperties: [ + $swarmdb.clusternodeproperties[] | + select( + (.cluster_node | startswith($redisCluster.id)) and + .deleted_ts == null and + .name == "redis_node_ready" + ) | + {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} + ] | sort_by(.cluster_node, .name, .value, .node_id), + wgCluster: { id: $wgCluster.id }, From 7fba30dfd85c237e2615033339d89313d782f95d Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 19 Dec 2025 12:32:08 -0600 Subject: [PATCH 15/51] ssl passthrough support for openresty --- src/services/apps/openresty/main.py | 144 +++++++++++++++++++++++- src/services/apps/pki-authority/main.py | 3 +- 2 files changed, 143 insertions(+), 4 deletions(-) diff --git a/src/services/apps/openresty/main.py b/src/services/apps/openresty/main.py index 1b4c0db5..2903e27d 100755 --- a/src/services/apps/openresty/main.py +++ b/src/services/apps/openresty/main.py @@ -373,7 +373,7 @@ def generate_nginx_config(redis_hosts: list[str]) -> str: """Generate Nginx configuration content.""" redis_host_list = ", ".join([f'"{host}:6379"' for host in redis_hosts]) - config = f"""# Nginx/OpenResty configuration for Gateway with AutoSSL and Redis routing + config = f"""# Nginx/OpenResty 
configuration for Gateway with AutoSSL, SSL Passthrough and Redis routing user www-data; worker_processes auto; @@ -383,6 +383,144 @@ def generate_nginx_config(redis_hosts: list[str]) -> str: worker_connections 1024; }} +# Stream block for SSL passthrough routing (port 443) +stream {{ + # Logging + log_format basic '$remote_addr [$time_local] ' + '$protocol $status $bytes_sent $bytes_received ' + '$session_time'; + + access_log /var/log/openresty/stream-access.log basic; + error_log /var/log/openresty/stream-error.log info; + + # Shared memory zones + lua_shared_dict stream_route_cache 10m; + lua_shared_dict stream_rr_counters 10m; + + # Redis hosts + init_by_lua_block {{ + stream_redis_hosts = {{{redis_host_list}}} + }} + + # SSL preread to get SNI hostname + ssl_preread on; + + server {{ + listen 443; + + # Variable for dynamic upstream + set $upstream_target ""; + + # Lua code to route based on passthrough flag + preread_by_lua_block {{ + local redis = require "resty.redis" + local cjson = require "cjson" + local domain = ngx.var.ssl_preread_server_name + + if not domain or domain == "" then + -- No SNI, close connection + return ngx.exit(421) + end + + -- Try to get route from cache + local cache = ngx.shared.stream_route_cache + local cached_route = cache:get(domain) + + if not cached_route then + -- Query Redis for route + local red = redis:new() + red:set_timeout(1000) + + local route_data = nil + for _, host in ipairs(stream_redis_hosts) do + local ok, err = red:connect(host:match("([^:]+)"), tonumber(host:match(":(%d+)")) or 6379) + if ok then + local route, err = red:get("routes:" .. domain) + if route and route ~= ngx.null then + route_data = route + red:close() + break + end + red:close() + end + end + + if not route_data then + -- No route found, close connection + return ngx.exit(421) + end + + -- Cache for 30 seconds + cache:set(domain, route_data, 30) + cached_route = route_data + end + + -- Parse route configuration + local route = cjson.decode(cached_route) + local passthrough = route.passthrough or false + + if not passthrough then + -- SSL termination mode: proxy to local HTTP server + ngx.var.upstream_target = "127.0.0.1:8443" + return + end + + -- Passthrough mode: select backend target + local targets = route.targets or {{}} + + if #targets == 0 then + return ngx.exit(421) + end + + -- Select target based on policy + local policy = route.policy or "rr" + local target_url + + if policy == "rr" then + -- Round-robin + local counters = ngx.shared.stream_rr_counters + local counter = counters:get(domain) or 0 + local idx = (counter % #targets) + 1 + target_url = targets[idx].url + counters:incr(domain, 1, 0) + + elseif policy == "ip_hash" then + -- IP hash + local ip = ngx.var.remote_addr + local hash = ngx.crc32_long(ip) + local idx = (hash % #targets) + 1 + target_url = targets[idx].url + + else + -- Default to first target + target_url = targets[1].url + end + + -- Extract host and port from URL + -- Format: https://host:port or http://host:port + local backend_host, backend_port = target_url:match("^https?://([^:/]+):?(%d*)/?") + + if not backend_host then + return ngx.exit(421) + end + + -- Default port to 443 if not specified and using https + if not backend_port or backend_port == "" then + if target_url:match("^https://") then + backend_port = "443" + else + backend_port = "80" + end + end + + ngx.var.upstream_target = backend_host .. ":" .. 
backend_port + }} + + proxy_pass $upstream_target; + proxy_connect_timeout 5s; + }} +}} + http {{ include /usr/local/openresty/nginx/conf/mime.types; default_type application/octet-stream; @@ -465,9 +603,9 @@ def generate_nginx_config(redis_hosts: list[str]) -> str: }} }} - # HTTPS server (port 443) + # HTTPS server (port 8443 - for SSL termination via stream proxy) server {{ - listen 443 ssl; + listen 127.0.0.1:8443 ssl; server_name _; # Declare variable for use in proxy_pass diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index b57e2c7e..7c9a5203 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -112,7 +112,8 @@ def create_gateway_endpoints(self): route_config = { "targets": targets, "policy": "rr", - "preserve_host": False + "preserve_host": False, + "passthrough": True } route_json = json.dumps(route_config) route_key = "routes:super-develop-tdx.superprotocol.dev" From d3fd941fc4187b303bc133a70ee4743d03e3997e Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Mon, 22 Dec 2025 10:00:06 -0600 Subject: [PATCH 16/51] config pki-domain --- src/services/apps/pki-authority/helpers.py | 92 +++++++++++---------- src/services/apps/pki-authority/main.py | 95 ++++++++++++---------- 2 files changed, 103 insertions(+), 84 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index bb704884..74276756 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -26,6 +26,7 @@ WIREGUARD_INTERFACE = "wg0" STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" +SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" class LogLevel(Enum): @@ -250,7 +251,51 @@ def detect_vm_mode() -> VMMode: return VMMode.SWARM_NORMAL -def patch_yaml_config(cpu_type: str, vm_mode: VMMode): +def get_pki_domain() -> str: + """Read PKI authority domain from swarm-env.yaml. + + Returns: + Domain string. + + Raises: + FileNotFoundError: If swarm-env.yaml does not exist. + ValueError: If configuration is empty or domain is not found. + Exception: For other errors during reading. 
+ """ + swarm_env_path = Path(SWARM_ENV_YAML) + + if not swarm_env_path.exists(): + error_msg = f"Swarm environment config not found: {SWARM_ENV_YAML}" + log(LogLevel.ERROR, error_msg) + raise FileNotFoundError(error_msg) + + try: + with open(swarm_env_path, "r") as f: + config = yaml.safe_load(f) + + if not config: + error_msg = f"Empty configuration in {SWARM_ENV_YAML}" + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + domain = config.get("pki-authority", {}).get("domain") + if not domain: + error_msg = f"No domain found in {SWARM_ENV_YAML} under pki-authority.domain" + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + log(LogLevel.INFO, f"Read PKI domain from config: {domain}") + return domain + + except (FileNotFoundError, ValueError): + raise + except Exception as e: + error_msg = f"Failed to read domain from {SWARM_ENV_YAML}: {e}" + log(LogLevel.ERROR, error_msg) + raise Exception(error_msg) from e + + +def patch_yaml_config(cpu_type: str, vm_mode: VMMode, pki_domain: str): """Set own challenge type in LXC container configuration.""" if vm_mode == VMMode.LEGACY: template_name = "lxc-legacy-vm-template.yaml" @@ -277,6 +322,11 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode): config["pki"]["ownChallenge"] = {} config["pki"]["ownChallenge"]["type"] = cpu_type + # Set ownDomain from parameter + if pki_domain: + config["pki"]["ownDomain"] = pki_domain + log(LogLevel.INFO, f"Set ownDomain to: {pki_domain}") + # Set mode.attestationServiceSource.mode for swarm modes if vm_mode in (VMMode.SWARM_INIT, VMMode.SWARM_NORMAL): if "mode" not in config["pki"]: @@ -298,46 +348,6 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode): log(LogLevel.INFO, f"Patched {dst_yaml} with type: {cpu_type}") -def set_subroot_env(): - """Copy trusted environment variables to container.""" - trusted_vars = [ - "AS__pki__baseDomain", - "AS__pki__ownDomain", - "AS__pki__certParams__ocspUrl", - "AS__pki__mode__attestationServiceSource__baseUrl", - "AS__pki__mode__attestationServiceSource__caBundle", - ] - - src_subroot_env = Path("/sp/subroot.env") - dst_subroot_env = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/subroot.env") - - if not src_subroot_env.exists(): - log(LogLevel.INFO, f"Info: {src_subroot_env} not found; skipping creation of {dst_subroot_env}") - return - - # Remove destination first to ensure a clean recreate - dst_subroot_env.unlink(missing_ok=True) - - # Read source file - with open(src_subroot_env, "r") as f: - lines = f.readlines() - - # Write destination with header - with open(dst_subroot_env, "w") as f: - f.write(f"# Autogenerated from {src_subroot_env}. 
Contains only trusted variables.\n") - - for var in trusted_vars: - # Find first matching line - for line in lines: - if line.strip().startswith(f'{var}="'): - f.write(line) - break - - # Set permissions - dst_subroot_env.chmod(0o644) - log(LogLevel.INFO, f"Created {dst_subroot_env} with trusted variables.") - - def patch_lxc_config(cpu_type: str): """Patch LXC container configuration.""" config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 7c9a5203..c395cf87 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -3,13 +3,12 @@ import sys import time import json -import subprocess from pathlib import Path -from datetime import datetime -from enum import Enum from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput import base64 +from redis import RedisCluster +from redis.cluster import ClusterNode # Import helpers sys.path.insert(0, str(Path(__file__).parent)) @@ -18,7 +17,6 @@ detect_cpu_type, detect_vm_mode, patch_yaml_config, - set_subroot_env, patch_lxc_config, setup_iptables, update_pccs_url, @@ -31,6 +29,7 @@ read_property_from_fs, LogLevel, log, + get_pki_domain, ) # Configuration @@ -44,7 +43,8 @@ class ApplyHandler: AUTHORITY_SERVICE_PREFIX = "pki_authority_" AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" - PROP_REGISTERED_ENDPOINTS=f"{AUTHORITY_SERVICE_PREFIX}registered_endpoints" + PROP_REGISTERED_ENDPOINTS = f"{AUTHORITY_SERVICE_PREFIX}registered_endpoints" + PROP_PKI_DOMAIN = f"{AUTHORITY_SERVICE_PREFIX}pki_domain" def __init__(self, input_data: PluginInput): self.input_data = input_data @@ -59,12 +59,15 @@ def __init__(self, input_data: PluginInput): self.authority_props = self.state_json.get("authorityServiceProperties", []) self.authority_config = {prop["name"]: prop["value"] for prop in self.authority_props} + self.pki_domain = self.authority_config.get(self.PROP_PKI_DOMAIN, "") + # Output parameters self.status = None self.error_message = None self.cluster_properties = {} def get_redis_tunnel_ips(self) -> list[str]: + """Get list of Redis node tunnel IPs.""" redis_node_props = self.state_json.get("redisNodeProperties", []) wg_props = self.state_json.get("wgNodeProperties", []) @@ -78,6 +81,14 @@ def get_redis_tunnel_ips(self) -> list[str]: return sorted(set(redis_hosts)) + def get_redis_connection_info(self) -> list[tuple[str, int]]: + """Get Redis cluster connection endpoints. + + Returns list of (host, port) tuples for Redis nodes. 
+ """ + redis_tunnel_ips = self.get_redis_tunnel_ips() + return [(ip, 6379) for ip in redis_tunnel_ips] + def create_gateway_endpoints(self): if not self.is_leader: @@ -99,10 +110,10 @@ def create_gateway_endpoints(self): log(LogLevel.INFO, f"Gateway endpoints changed: registered={registered_endpoints}, current={current_endpoints}") - # Get list of all Redis nodes with IP addresses - redis_tunnel_ips = self.get_redis_tunnel_ips() + # Get Redis connection info + redis_endpoints = self.get_redis_connection_info() - if not redis_tunnel_ips and current_endpoints: + if not redis_endpoints and current_endpoints: self.status = "postponed" self.error_message = "No Redis nodes available to configure gateway routes" return @@ -116,35 +127,31 @@ def create_gateway_endpoints(self): "passthrough": True } route_json = json.dumps(route_config) - route_key = "routes:super-develop-tdx.superprotocol.dev" + route_key = f"routes:{self.pki_domain}" - last_redis_error = None - for redis_ip in redis_tunnel_ips: - try: - log(LogLevel.INFO, f"Attempting to set gateway route in Redis at {redis_ip}:6379") - result = subprocess.run( - ["redis-cli", "-h", redis_ip, "-p", "6379", "SET", route_key, route_json], - capture_output=True, - text=True, - timeout=15 - ) - - if result.returncode == 0 and result.stdout.strip() == "OK": - log(LogLevel.INFO, f"Successfully set gateway route in Redis at {redis_ip}:6379") - if self.cluster_properties is None: - self.cluster_properties = {} - self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = ";".join(current_endpoints) - return - else: - last_redis_error = f"Failed to set route in Redis at {redis_ip}: {result.stderr}" - except subprocess.TimeoutExpired: - last_redis_error = f"Timeout connecting to Redis at {redis_ip}" - except Exception as e: - last_redis_error = f"Exception setting route in Redis at {redis_ip}: {e}" + startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] - self.status = "error" - self.error_message = last_redis_error - log(LogLevel.ERROR, last_redis_error) + try: + redis_client = RedisCluster( + startup_nodes=startup_nodes, + decode_responses=True, + skip_full_coverage_check=True, + socket_connect_timeout=5, + ) + redis_client.ping() + + redis_client.set(route_key, route_json) + log(LogLevel.INFO, f"Successfully set gateway route {route_key} in Redis Cluster") + + if self.cluster_properties is None: + self.cluster_properties = {} + self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = ";".join(current_endpoints) + + except Exception as e: + error_msg = f"Failed to set route in Redis Cluster: {str(e)}" + self.status = "error" + self.error_message = error_msg + log(LogLevel.ERROR, error_msg) def create_output(self) -> PluginOutput: if self.status == "completed": @@ -168,7 +175,7 @@ def apply(self) -> PluginOutput: self.status = "error" self.error_message = "Invalid state format" return self.create_output() - + local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) if not local_tunnel_ip: self.status = "error" @@ -190,12 +197,14 @@ def apply(self) -> PluginOutput: if not prop_value: missing.append(prop_name) + if not self.pki_domain: + self.pki_domain = get_pki_domain() + missing.append(self.PROP_PKI_DOMAIN) + if missing: error_msg = f"Service marked as initialized but missing properties: {', '.join(missing)}" log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return self.create_output() + initialized = "false" if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": 
self.status = "postponed" @@ -216,8 +225,9 @@ def apply(self) -> PluginOutput: return self.create_output() cpu_type = detect_cpu_type() - patch_yaml_config(cpu_type, vm_mode) - set_subroot_env() + if not self.pki_domain: + self.pki_domain = get_pki_domain() + patch_yaml_config(cpu_type, vm_mode, self.pki_domain) patch_lxc_config(cpu_type) update_pccs_url() setup_iptables(local_tunnel_ip) @@ -254,8 +264,7 @@ def apply(self) -> PluginOutput: # Check if all properties collected if not missing_properties: log(LogLevel.INFO, "All property files have been generated by tee-pki service") - - # Add initialized flag + collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain collected_properties[self.PROP_INITIALIZED] = "true" self.status = "completed" From 8890f085f88a4c16f04fa7fc3291a9a9a21743cd Mon Sep 17 00:00:00 2001 From: Lex Kheben Date: Thu, 18 Dec 2025 17:04:30 +0300 Subject: [PATCH 17/51] Enhance hardening-vm.sh: Add iptables rule to accept traffic from the 10.13.0.0/16 subnet, improving network security configurations for the VM environment. --- src/rootfs/files/configs/usr/local/bin/hardening-vm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh index 6f71be58..01d48c74 100755 --- a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh +++ b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh @@ -25,6 +25,7 @@ if ! grep -q 'sp-debug=true' /proc/cmdline; then # @TODO this will ignore NetworkPolicies in k8s, refactor in future iptables -I INPUT -s 10.43.0.0/16 -j ACCEPT iptables -I INPUT -s 10.42.0.0/16 -j ACCEPT + iptables -I INPUT -s 10.13.0.0/16 -j ACCEPT # Allow DHCP for LXC containers (client:68 -> server:67) iptables -A INPUT -i lxcbr0 -p udp --sport 68 --dport 67 -j ACCEPT From 5d24dfc35d2dd2a5aa80b7edcddd00ffed983d2e Mon Sep 17 00:00:00 2001 From: Lex Kheben Date: Thu, 18 Dec 2025 18:00:40 +0300 Subject: [PATCH 18/51] Update hardening-vm.sh: Modify iptables rule to allow incoming traffic on TCP port 443 from all sources, enhancing accessibility for the API server while maintaining security configurations. --- src/rootfs/files/configs/usr/local/bin/hardening-vm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh index 01d48c74..05ded5aa 100755 --- a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh +++ b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh @@ -19,7 +19,7 @@ if ! grep -q 'sp-debug=true' /proc/cmdline; then iptables -A INPUT -p udp --sport 53 -j ACCEPT # Allow API server (TCP 443 for HTTPS) - iptables -A INPUT -p tcp --dport 443 -s 10.43.0.1 -j ACCEPT + iptables -A INPUT -p tcp --dport 443 -j ACCEPT # Allow incoming traffic in the cluster network # @TODO this will ignore NetworkPolicies in k8s, refactor in future From d7d8886c72b2b94646843181bc7164bc5ce15627 Mon Sep 17 00:00:00 2001 From: Lex Kheben Date: Thu, 18 Dec 2025 18:45:41 +0300 Subject: [PATCH 19/51] Comment out iptables rule for TCP port 443 in hardening-vm.sh to enhance security configurations while maintaining existing network access rules. 
--- src/rootfs/files/configs/usr/local/bin/hardening-vm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh index 05ded5aa..7806341c 100755 --- a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh +++ b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh @@ -19,7 +19,7 @@ if ! grep -q 'sp-debug=true' /proc/cmdline; then iptables -A INPUT -p udp --sport 53 -j ACCEPT # Allow API server (TCP 443 for HTTPS) - iptables -A INPUT -p tcp --dport 443 -j ACCEPT + # iptables -A INPUT -p tcp --dport 443 -j ACCEPT # Allow incoming traffic in the cluster network # @TODO this will ignore NetworkPolicies in k8s, refactor in future From 6ee33c85e753e17c6c6f501829ef4bb6eed092cf Mon Sep 17 00:00:00 2001 From: Lex Kheben Date: Fri, 19 Dec 2025 18:30:59 +0300 Subject: [PATCH 20/51] Re-enable iptables rule for TCP port 443 in hardening-vm.sh to allow incoming traffic for the API server, improving accessibility while maintaining security configurations. --- src/rootfs/files/configs/usr/local/bin/hardening-vm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh index 7806341c..05ded5aa 100755 --- a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh +++ b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh @@ -19,7 +19,7 @@ if ! grep -q 'sp-debug=true' /proc/cmdline; then iptables -A INPUT -p udp --sport 53 -j ACCEPT # Allow API server (TCP 443 for HTTPS) - # iptables -A INPUT -p tcp --dport 443 -j ACCEPT + iptables -A INPUT -p tcp --dport 443 -j ACCEPT # Allow incoming traffic in the cluster network # @TODO this will ignore NetworkPolicies in k8s, refactor in future From 35f567f9bd0e52db294ec1d6942ae50d3993b9a3 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 23 Dec 2025 09:00:50 -0600 Subject: [PATCH 21/51] delete redis endpoint on destroy --- src/services/apps/pki-authority/main.py | 94 +++++++++++++++++-------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index c395cf87..654645d6 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -36,8 +36,8 @@ plugin = ProvisionPlugin() -class ApplyHandler: - """Handler for apply command logic with unified exit point.""" +class EventHandler: + """Handler for plugin events with unified exit point."""" # Authority service property prefix and names AUTHORITY_SERVICE_PREFIX = "pki_authority_" @@ -322,6 +322,66 @@ def is_restart_required(self) -> bool: # No changes detected log(LogLevel.INFO, "No configuration changes detected") return False + + def delete_route_from_redis(self) -> None: + """Delete the PKI Authority route from Redis Cluster. 
+ + Raises: + Exception: If deletion fails + """ + redis_endpoints = self.get_redis_connection_info() + + if not redis_endpoints: + log(LogLevel.WARN, "No Redis endpoints available, skipping route deletion") + return + + route_key = f"routes:{self.pki_domain}" + startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] + + redis_client = RedisCluster( + startup_nodes=startup_nodes, + decode_responses=True, + skip_full_coverage_check=True, + socket_connect_timeout=5, + ) + redis_client.delete(route_key) + log(LogLevel.INFO, f"Deleted route {route_key} from Redis Cluster") + + def destroy(self) -> PluginOutput: + """Destroy PKI Authority service and clean up.""" + try: + container = LXCContainer(PKI_SERVICE_NAME) + + # Stop container if running + if container.is_running(): + exit_code = container.stop(graceful_timeout=30, command_timeout=60) + if exit_code != 0: + log(LogLevel.WARN, "Failed to stop container gracefully") + + # Destroy container + exit_code = container.destroy() + if exit_code != 0: + error_msg = f"Failed to destroy container with exit code {exit_code}" + return PluginOutput(status="error", error_message=error_msg, local_state=self.local_state) + + delete_iptables_rules() + + # If this is the last node and domain is configured, delete route from Redis + if len(self.pki_cluster_nodes) <= 1 and self.pki_domain: + try: + log(LogLevel.INFO, "This is the last PKI Authority node, deleting route from Redis") + self.delete_route_from_redis() + except Exception as e: + log(LogLevel.WARN, f"Failed to delete route from Redis: {e}") + # Don't fail the destroy operation if route deletion fails + + log(LogLevel.INFO, "PKI Authority destroyed") + return PluginOutput(status="completed", local_state=self.local_state) + + except Exception as e: + error_msg = f"Destroy failed: {str(e)}" + log(LogLevel.ERROR, error_msg) + return PluginOutput(status="error", error_message=error_msg, local_state=self.local_state) # Plugin commands @@ -342,7 +402,7 @@ def handle_init(input_data: PluginInput) -> PluginOutput: @plugin.command("apply") def handle_apply(input_data: PluginInput) -> PluginOutput: """Apply PKI Authority configuration and start service.""" - handler = ApplyHandler(input_data) + handler = EventHandler(input_data) return handler.apply() @@ -378,32 +438,8 @@ def handle_finalize(input_data: PluginInput) -> PluginOutput: @plugin.command("destroy") def handle_destroy(input_data: PluginInput) -> PluginOutput: """Destroy PKI Authority service and clean up.""" - local_state = input_data.local_state or {} - - try: - container = LXCContainer(PKI_SERVICE_NAME) - - # Stop container if running - if container.is_running(): - exit_code = container.stop(graceful_timeout=30, command_timeout=60) - if exit_code != 0: - log(LogLevel.WARN, "Failed to stop container gracefully") - - # Destroy container - exit_code = container.destroy() - if exit_code != 0: - error_msg = f"Failed to destroy container with exit code {exit_code}" - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) - - delete_iptables_rules() - - log(LogLevel.INFO, "PKI Authority destroyed") - return PluginOutput(status="completed", local_state=local_state) - - except Exception as e: - error_msg = f"Destroy failed: {str(e)}" - log(LogLevel.ERROR, error_msg) - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + handler = EventHandler(input_data) + return handler.destroy() if __name__ == "__main__": From e61506a023eb339034a4780bd64f61c58741ad3c Mon Sep 17 
00:00:00 2001 From: Petr Evstifeev Date: Wed, 24 Dec 2025 08:01:55 -0600 Subject: [PATCH 22/51] some fixes --- src/services/apps/pki-authority/main.py | 9 ++------- src/services/apps/pki-authority/manifest.yaml | 3 ++- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 654645d6..ed307a21 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -37,7 +37,6 @@ class EventHandler: - """Handler for plugin events with unified exit point."""" # Authority service property prefix and names AUTHORITY_SERVICE_PREFIX = "pki_authority_" @@ -368,12 +367,8 @@ def destroy(self) -> PluginOutput: # If this is the last node and domain is configured, delete route from Redis if len(self.pki_cluster_nodes) <= 1 and self.pki_domain: - try: - log(LogLevel.INFO, "This is the last PKI Authority node, deleting route from Redis") - self.delete_route_from_redis() - except Exception as e: - log(LogLevel.WARN, f"Failed to delete route from Redis: {e}") - # Don't fail the destroy operation if route deletion fails + log(LogLevel.INFO, "This is the last PKI Authority node, deleting route from Redis") + self.delete_route_from_redis() log(LogLevel.INFO, "PKI Authority destroyed") return PluginOutput(status="completed", local_state=self.local_state) diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml index 2595ed2c..e3501f36 100644 --- a/src/services/apps/pki-authority/manifest.yaml +++ b/src/services/apps/pki-authority/manifest.yaml @@ -32,9 +32,10 @@ stateExpr: ( $swarmdb.clusters[] | select(.cluster_policy == "wireguard" and .deleted_ts == null) | + . as $currentCluster | select( ( - [$swarmdb.clusternodes[] | select(.deleted_ts == null and (.node | IN($pkiNodeIds[])))] | + [$swarmdb.clusternodes[] | select(.cluster == $currentCluster.id and .deleted_ts == null and (.node | IN($pkiNodeIds[])))] | length > 0 ) ) From 17d1b7e1b76a5a8332496561103515665104378c Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 25 Dec 2025 06:16:46 -0600 Subject: [PATCH 23/51] linter --- src/services/apps/pki-authority/helpers.py | 388 ++++++++++++--------- src/services/apps/pki-authority/main.py | 276 +++++++++------ 2 files changed, 394 insertions(+), 270 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 74276756..3b4bdcc4 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -4,18 +4,19 @@ """ import os -import sys -import subprocess -import shutil import re -import yaml +import shutil +import ssl +import subprocess +import sys import time import urllib.request -import ssl +from datetime import datetime +from enum import Enum from pathlib import Path from typing import List, Optional -from enum import Enum -from datetime import datetime + +import yaml PKI_SERVICE_NAME = "pki-authority" SERVICE_INSIDE_CONTAINER = "tee-pki" @@ -51,10 +52,10 @@ class VMMode(Enum): class LXCContainer: """Manager for LXC container operations.""" - + def __init__(self, container_name: str = PKI_SERVICE_NAME): self.container_name = container_name - + def start(self, timeout: int = 30) -> int: """Start LXC container. 
Returns exit code.""" log(LogLevel.INFO, f"Starting LXC container {self.container_name}") @@ -62,10 +63,11 @@ def start(self, timeout: int = 30) -> int: ["lxc-start", "-n", self.container_name], capture_output=True, text=True, - timeout=timeout + timeout=timeout, + check=False ) return result.returncode - + def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: """Stop LXC container gracefully. Returns exit code.""" log(LogLevel.INFO, f"Stopping LXC container {self.container_name} gracefully") @@ -73,10 +75,11 @@ def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: ["lxc-stop", "-n", self.container_name, "-t", str(graceful_timeout)], capture_output=True, text=True, - timeout=command_timeout + timeout=command_timeout, + check=False ) return result.returncode - + def destroy(self) -> int: """Destroy LXC container. Returns exit code.""" log(LogLevel.INFO, f"Destroying LXC container {self.container_name}") @@ -84,99 +87,118 @@ def destroy(self) -> int: ["lxc-destroy", "-n", self.container_name, "-f"], capture_output=True, text=True, - timeout=60 + timeout=60, + check=False ) - + if result.returncode != 0: log(LogLevel.ERROR, f"Failed to destroy container: {result.stderr}") - + return result.returncode - + def is_running(self) -> bool: """Check if LXC container is running.""" try: result = subprocess.run( ["lxc-ls", "--running"], capture_output=True, - text=True + text=True, + check=False ) if self.container_name not in result.stdout: log(LogLevel.INFO, f"LXC container {self.container_name} is not running") return False return True - except Exception as e: - log(LogLevel.ERROR, f"Failed to check LXC container status: {e}") + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to check LXC container status: {error}") return False - + def get_ip(self) -> Optional[str]: """Get container IP address.""" try: result = subprocess.run( ["lxc-info", "-n", self.container_name, "-iH"], capture_output=True, - text=True + text=True, + check=False ) container_ip = result.stdout.strip() if result.stdout.strip() else None return container_ip - except Exception as e: - log(LogLevel.ERROR, f"Failed to get container IP: {e}") + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to get container IP: {error}") return None - - def create(self, archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar") -> bool: - """Create LXC container if it doesn't exist. Returns True if created or already exists.""" + + def create( + self, + archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar" + ) -> bool: + """Create LXC container if it doesn't exist. + + Returns True if created or already exists. + """ # Check if container already exists result = subprocess.run( ["lxc-info", "-n", self.container_name], capture_output=True, - text=True + text=True, + check=False ) - + if result.returncode == 0: log(LogLevel.INFO, f"Container '{self.container_name}' already exists.") return True - else: - log(LogLevel.INFO, f"Container '{self.container_name}' not found. 
Creating...") - try: - subprocess.run( - [ - "lxc-create", - "-n", self.container_name, - "-t", "oci", - "--", - "--url", f"docker-archive:{archive_path}" - ], - check=True - ) - log(LogLevel.INFO, f"Container '{self.container_name}' created.") - return True - except subprocess.CalledProcessError as e: - log(LogLevel.ERROR, f"Failed to create container: {e}") - return False - + + log(LogLevel.INFO, f"Container '{self.container_name}' not found. Creating...") + try: + subprocess.run( + [ + "lxc-create", + "-n", self.container_name, + "-t", "oci", + "--", + "--url", f"docker-archive:{archive_path}" + ], + check=True + ) + log(LogLevel.INFO, f"Container '{self.container_name}' created.") + return True + except subprocess.CalledProcessError as error: + log(LogLevel.ERROR, f"Failed to create container: {error}") + return False + def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck") -> bool: """Check if service inside container is running and healthy.""" try: # 1. Check service status inside container result = subprocess.run( - ["lxc-attach", "-n", self.container_name, "--", "systemctl", "is-active", SERVICE_INSIDE_CONTAINER], + [ + "lxc-attach", "-n", self.container_name, "--", + "systemctl", "is-active", SERVICE_INSIDE_CONTAINER + ], capture_output=True, - text=True + text=True, + check=False ) status = result.stdout.strip() - + if status not in ["active", "activating"]: log(LogLevel.INFO, f"Service {SERVICE_INSIDE_CONTAINER} status: {status}") return False - - # 2. If service is active, check how long it's been running + + # If service is active, check how long it's been running if status == "active": result = subprocess.run( - ["lxc-attach", "-n", self.container_name, "--", "systemctl", "show", - SERVICE_INSIDE_CONTAINER, "--property=ActiveEnterTimestamp"], + [ + "lxc-attach", "-n", self.container_name, "--", + "systemctl", "show", + SERVICE_INSIDE_CONTAINER, + "--property=ActiveEnterTimestamp" + ], capture_output=True, - text=True + text=True, + check=False ) - + # Parse ActiveEnterTimestamp for line in result.stdout.split('\n'): if line.startswith('ActiveEnterTimestamp='): @@ -187,41 +209,56 @@ def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/hea ts_result = subprocess.run( ["date", "+%s", "-d", timestamp_str], capture_output=True, - text=True + text=True, + check=False ) start_time = int(ts_result.stdout.strip()) current_time = int(time.time()) uptime_seconds = current_time - start_time - + # If running more than min_uptime, check healthcheck endpoint if uptime_seconds > min_uptime: container_ip = self.get_ip() - + if container_ip: # Perform HTTPS healthcheck without certificate verification try: ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE - - req = urllib.request.Request(f"https://{container_ip}{healthcheck_url}") - with urllib.request.urlopen(req, context=ctx, timeout=5) as response: + + req = urllib.request.Request( + f"https://{container_ip}{healthcheck_url}" + ) + with urllib.request.urlopen( + req, context=ctx, timeout=5 + ) as response: if response.status == 200: return True - else: - log(LogLevel.INFO, f"Healthcheck returned status: {response.status}") - return False - except Exception as e: - log(LogLevel.INFO, f"Healthcheck failed: {e}") + + log( + LogLevel.INFO, + f"Healthcheck returned status: " + f"{response.status}" + ) + return False + except Exception as error: # pylint: disable=broad-exception-caught + log( + LogLevel.INFO, + f"Healthcheck failed: {error}" + 
) return False - except Exception as e: - log(LogLevel.INFO, f"Failed to parse service uptime: {e}") - + except Exception as error: # pylint: disable=broad-exception-caught + log( + LogLevel.INFO, + f"Failed to parse service uptime: {error}" + ) + # Service is active or activating (but not ready for healthcheck yet) return True - - except Exception as e: - log(LogLevel.ERROR, f"Failed to check service health: {e}") + + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to check service health: {error}") return False @@ -229,130 +266,132 @@ def detect_cpu_type() -> str: """Detect CPU type based on available devices.""" if Path("/dev/tdx_guest").is_char_device(): return "tdx" - elif Path("/dev/sev-guest").is_char_device(): + if Path("/dev/sev-guest").is_char_device(): return "sev-snp" - else: - return "untrusted" + return "untrusted" def detect_vm_mode() -> VMMode: """Detect VM mode from kernel command line.""" try: - with open("/proc/cmdline", "r") as f: - cmdline = f.read() - + with open("/proc/cmdline", "r", encoding="utf-8") as file: + cmdline = file.read() + if "vm_mode=legacy" in cmdline: return VMMode.LEGACY - elif "vm_mode=swarm-init" in cmdline: + if "vm_mode=swarm-init" in cmdline: return VMMode.SWARM_INIT - else: - return VMMode.SWARM_NORMAL + return VMMode.SWARM_NORMAL except FileNotFoundError: return VMMode.SWARM_NORMAL def get_pki_domain() -> str: """Read PKI authority domain from swarm-env.yaml. - + Returns: Domain string. - + Raises: FileNotFoundError: If swarm-env.yaml does not exist. ValueError: If configuration is empty or domain is not found. Exception: For other errors during reading. """ swarm_env_path = Path(SWARM_ENV_YAML) - + if not swarm_env_path.exists(): error_msg = f"Swarm environment config not found: {SWARM_ENV_YAML}" log(LogLevel.ERROR, error_msg) raise FileNotFoundError(error_msg) - + try: - with open(swarm_env_path, "r") as f: - config = yaml.safe_load(f) - + with open(swarm_env_path, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + if not config: error_msg = f"Empty configuration in {SWARM_ENV_YAML}" log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) - + domain = config.get("pki-authority", {}).get("domain") if not domain: error_msg = f"No domain found in {SWARM_ENV_YAML} under pki-authority.domain" log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) - + log(LogLevel.INFO, f"Read PKI domain from config: {domain}") return domain - + except (FileNotFoundError, ValueError): raise - except Exception as e: - error_msg = f"Failed to read domain from {SWARM_ENV_YAML}: {e}" + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to read domain from {SWARM_ENV_YAML}: {error}" log(LogLevel.ERROR, error_msg) - raise Exception(error_msg) from e + raise Exception(error_msg) from error def patch_yaml_config(cpu_type: str, vm_mode: VMMode, pki_domain: str): """Set own challenge type in LXC container configuration.""" if vm_mode == VMMode.LEGACY: template_name = "lxc-legacy-vm-template.yaml" - log(LogLevel.INFO, f"Detected {vm_mode.value} mode, using legacy template") + log( + LogLevel.INFO, + f"Detected {vm_mode.value} mode, using legacy template" + ) else: template_name = "lxc-swarm-template.yaml" - log(LogLevel.INFO, f"Detected {vm_mode.value} mode, using swarm template") - + log( + LogLevel.INFO, + f"Detected {vm_mode.value} mode, using swarm template" + ) + src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") dst_yaml = 
Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") - + if not src_yaml.exists(): log(LogLevel.ERROR, f"Error: {src_yaml} not found.") sys.exit(1) - + # Load YAML, modify, and save - with open(src_yaml, "r") as f: - config = yaml.safe_load(f) - + with open(src_yaml, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + # Set the CPU type in the configuration if "pki" not in config: config["pki"] = {} if "ownChallenge" not in config["pki"]: config["pki"]["ownChallenge"] = {} config["pki"]["ownChallenge"]["type"] = cpu_type - + # Set ownDomain from parameter if pki_domain: config["pki"]["ownDomain"] = pki_domain log(LogLevel.INFO, f"Set ownDomain to: {pki_domain}") - + # Set mode.attestationServiceSource.mode for swarm modes if vm_mode in (VMMode.SWARM_INIT, VMMode.SWARM_NORMAL): if "mode" not in config["pki"]: config["pki"]["mode"] = {} if "attestationServiceSource" not in config["pki"]["mode"]: config["pki"]["mode"]["attestationServiceSource"] = {} - + mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" config["pki"]["mode"]["attestationServiceSource"]["mode"] = mode_value log(LogLevel.INFO, f"Set attestationServiceSource mode to: {mode_value}") - + # Ensure destination directory exists dst_yaml.parent.mkdir(parents=True, exist_ok=True) - + # Write modified YAML - with open(dst_yaml, "w") as f: - yaml.dump(config, f, default_flow_style=False) - - log(LogLevel.INFO, f"Patched {dst_yaml} with type: {cpu_type}") + with open(dst_yaml, "w", encoding="utf-8") as file: + yaml.dump(config, file, default_flow_style=False) def patch_lxc_config(cpu_type: str): """Patch LXC container configuration.""" config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") config_bak = Path(f"{config_file}.bak") - + # Always restore config from backup if backup exists if config_bak.exists(): shutil.copy(config_bak, config_file) @@ -360,32 +399,41 @@ def patch_lxc_config(cpu_type: str): # Create backup before first patch if config_file.exists(): shutil.copy(config_file, config_bak) - + # Append MAC address configuration - with open(config_file, "a") as f: - f.write("lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff\n") - + with open(config_file, "a", encoding="utf-8") as file: + file.write("lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff\n") + # Add device-specific configuration if cpu_type == "sev-snp": dev_path = Path("/dev/sev-guest") stat_info = dev_path.stat() dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" - - with open(config_file, "a") as f: - f.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") - f.write("lxc.mount.entry = /dev/sev-guest dev/sev-guest none bind,optional,create=file\n") - + + with open(config_file, "a", encoding="utf-8") as file: + file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + file.write( + "lxc.mount.entry = /dev/sev-guest dev/sev-guest " + "none bind,optional,create=file\n" + ) + elif cpu_type == "tdx": dev_path = Path("/dev/tdx_guest") stat_info = dev_path.stat() dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" - - with open(config_file, "a") as f: - f.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") - f.write("lxc.mount.entry = /dev/tdx_guest dev/tdx_guest none bind,optional,create=file\n") - + + with open(config_file, "a", encoding="utf-8") as file: + file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + file.write( + "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest " + "none bind,optional,create=file\n" + ) + if Path("/etc/tdx-attest.conf").exists(): - f.write("lxc.mount.entry = 
/etc/tdx-attest.conf etc/tdx-attest.conf none bind,ro,create=file\n") + file.write( + "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf " + "none bind,ro,create=file\n" + ) def get_bridge_ip(bridge_name: str) -> str: @@ -393,32 +441,40 @@ def get_bridge_ip(bridge_name: str) -> str: result = subprocess.run( ["ip", "-4", "addr", "show", bridge_name], capture_output=True, - text=True + text=True, + check=False ) - + if result.returncode != 0: - log(LogLevel.ERROR, f"Error: Could not determine IP address for bridge {bridge_name}") + log( + LogLevel.ERROR, + f"Error: Could not determine IP address for bridge {bridge_name}" + ) sys.exit(1) - + # Parse IP address from output match = re.search(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout) if not match: - log(LogLevel.ERROR, f"Error: Could not determine IP address for bridge {bridge_name}") + log( + LogLevel.ERROR, + f"Error: Could not determine IP address for bridge {bridge_name}" + ) sys.exit(1) - + return match.group(1) def enable_route_localnet(bridge_name: str): """Enable route_localnet for the bridge.""" sysctl_key = f"net.ipv4.conf.{bridge_name}.route_localnet" - + result = subprocess.run( ["sysctl", "-n", sysctl_key], capture_output=True, - text=True + text=True, + check=False ) - + if result.returncode == 0 and result.stdout.strip() == "1": log(LogLevel.INFO, f"route_localnet already enabled for {bridge_name}") else: @@ -437,9 +493,9 @@ def delete_iptables_rules(): ["iptables", "-t", "nat", "-S", chain], capture_output=True, text=True, check=True ) - + rules = result.stdout.splitlines() - + for rule in rules: # Delete rules that contain our comment if IPTABLES_RULE_COMMENT in rule: @@ -449,23 +505,23 @@ def delete_iptables_rules(): def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): + """Ensure iptables rule exists, add if missing.""" log(LogLevel.INFO, f"Checking iptables rule: {description}") - - check_result = subprocess.run(check_args, capture_output=True) - + + check_result = subprocess.run(check_args, capture_output=True, check=False) + if check_result.returncode == 0: - log(LogLevel.INFO, f"Rule already exists") + log(LogLevel.INFO, "Rule already exists") else: subprocess.run(add_args, check=True) - log(LogLevel.INFO, f"Rule added") - + log(LogLevel.INFO, "Rule added") def setup_iptables(wg_ip): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) - + enable_route_localnet(BRIDGE_NAME) - + # Rule 1: PCCS DNAT ensure_iptables_rule( check_args=[ @@ -511,7 +567,7 @@ def setup_iptables(wg_ip): ], description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) - + # Rule 3: OUTPUT ensure_iptables_rule( check_args=[ @@ -534,7 +590,7 @@ def setup_iptables(wg_ip): ], description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) - + # Rule 4: MASQUERADE ensure_iptables_rule( check_args=[ @@ -557,40 +613,41 @@ def update_pccs_url(): """Update PCCS URL in QCNL configuration.""" qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") qcnl_conf_bak = Path(f"{qcnl_conf}.bak") - + host_ip = get_bridge_ip(BRIDGE_NAME) - + pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" - + if not qcnl_conf.exists(): log(LogLevel.ERROR, f"Error: {qcnl_conf} not found") sys.exit(1) - + if not qcnl_conf_bak.exists(): shutil.copy(qcnl_conf, qcnl_conf_bak) - + shutil.copy(qcnl_conf_bak, qcnl_conf) - - with open(qcnl_conf, "r") as f: - content = f.read() - + + with 
open(qcnl_conf, "r", encoding="utf-8") as file: + content = file.read() + content = re.sub( r'"pccs_url":\s*"[^"]*"', f'"pccs_url": "{pccs_url}"', content ) - - with open(qcnl_conf, "w") as f: - f.write(content) - - log(LogLevel.INFO, f"Updated PCCS URL in {qcnl_conf} to {pccs_url}") + + with open(qcnl_conf, "w", encoding="utf-8") as file: + file.write(content) + def init_container(): + """Initialize LXC container for PKI Authority.""" LXCContainer(PKI_SERVICE_NAME).create() def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: + """Get tunnel IP for a node from WireGuard properties.""" for prop in wg_props: if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": return prop.get("value") @@ -598,12 +655,14 @@ def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: def save_property_into_fs(file_name: str, content: bytes): + """Save property content to filesystem.""" STORAGE_PATH.mkdir(parents=True, exist_ok=True) file_path = STORAGE_PATH / file_name file_path.write_bytes(content) def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: + """Read property content from filesystem.""" file_path = STORAGE_PATH / file_name if file_path.exists(): content = file_path.read_bytes() @@ -611,3 +670,4 @@ def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: return (True, content) return (False, b"") + diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index ed307a21..b8beab2c 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 +"""PKI Authority service provisioning plugin.""" +import base64 +import json import sys import time -import json from pathlib import Path from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput -import base64 from redis import RedisCluster from redis.cluster import ClusterNode @@ -37,14 +38,18 @@ class EventHandler: - + """Handler for PKI Authority provisioning events.""" + # Authority service property prefix and names AUTHORITY_SERVICE_PREFIX = "pki_authority_" - AUTHORITY_SERVICE_PROPERTIES = ["auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", "lite_privateKey"] + AUTHORITY_SERVICE_PROPERTIES = [ + "auth_token", "basic_certificate", "basic_privateKey", + "lite_certificate", "lite_privateKey" + ] PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" PROP_REGISTERED_ENDPOINTS = f"{AUTHORITY_SERVICE_PREFIX}registered_endpoints" PROP_PKI_DOMAIN = f"{AUTHORITY_SERVICE_PREFIX}pki_domain" - + def __init__(self, input_data: PluginInput): self.input_data = input_data self.local_node_id = input_data.local_node_id @@ -52,19 +57,19 @@ def __init__(self, input_data: PluginInput): self.local_state = input_data.local_state or {} self.cluster_info = self.state_json.get("cluster", {}) leader_node_id = self.cluster_info.get("leader_node") - self.is_leader = (self.local_node_id == leader_node_id) + self.is_leader = self.local_node_id == leader_node_id self.pki_cluster_nodes = self.state_json.get("clusterNodes", []) self.wg_props = self.state_json.get("wgNodeProperties", []) self.authority_props = self.state_json.get("authorityServiceProperties", []) self.authority_config = {prop["name"]: prop["value"] for prop in self.authority_props} - + self.pki_domain = self.authority_config.get(self.PROP_PKI_DOMAIN, "") - + # Output parameters self.status = None self.error_message = None self.cluster_properties = {} - + def get_redis_tunnel_ips(self) 
-> list[str]: """Get list of Redis node tunnel IPs.""" redis_node_props = self.state_json.get("redisNodeProperties", []) @@ -79,46 +84,59 @@ def get_redis_tunnel_ips(self) -> list[str]: redis_hosts.append(tunnel_ip) return sorted(set(redis_hosts)) - + def get_redis_connection_info(self) -> list[tuple[str, int]]: """Get Redis cluster connection endpoints. - + Returns list of (host, port) tuples for Redis nodes. """ redis_tunnel_ips = self.get_redis_tunnel_ips() return [(ip, 6379) for ip in redis_tunnel_ips] - - + def create_gateway_endpoints(self): + """Create and update gateway endpoints in Redis.""" if not self.is_leader: return - - registered_endpoints = self.authority_config.get(self.PROP_REGISTERED_ENDPOINTS, "").split(";") - + + registered_endpoints = self.authority_config.get( + self.PROP_REGISTERED_ENDPOINTS, "" + ).split(";") + current_endpoints = [] for node in self.pki_cluster_nodes: node_id = node.get("node_id") tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) if tunnel_ip: current_endpoints.append(tunnel_ip) - + # Compare endpoints regardless of order if set(registered_endpoints) == set(current_endpoints): - log(LogLevel.INFO, f"Gateway endpoints are up to date: registered={registered_endpoints}, current={current_endpoints}") + log( + LogLevel.INFO, + f"Gateway endpoints are up to date: " + f"registered={registered_endpoints}, current={current_endpoints}" + ) return - - log(LogLevel.INFO, f"Gateway endpoints changed: registered={registered_endpoints}, current={current_endpoints}") - + + log( + LogLevel.INFO, + f"Gateway endpoints changed: " + f"registered={registered_endpoints}, current={current_endpoints}" + ) + # Get Redis connection info redis_endpoints = self.get_redis_connection_info() - + if not redis_endpoints and current_endpoints: self.status = "postponed" self.error_message = "No Redis nodes available to configure gateway routes" return - + # Build targets list from current endpoints - targets = [{"url": f"https://{endpoint}:8443", "weight": 1} for endpoint in current_endpoints] + targets = [ + {"url": f"https://{endpoint}:8443", "weight": 1} + for endpoint in current_endpoints + ] route_config = { "targets": targets, "policy": "rr", @@ -127,9 +145,9 @@ def create_gateway_endpoints(self): } route_json = json.dumps(route_config) route_key = f"routes:{self.pki_domain}" - + startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] - + try: redis_client = RedisCluster( startup_nodes=startup_nodes, @@ -138,24 +156,29 @@ def create_gateway_endpoints(self): socket_connect_timeout=5, ) redis_client.ping() - + redis_client.set(route_key, route_json) - log(LogLevel.INFO, f"Successfully set gateway route {route_key} in Redis Cluster") - + log( + LogLevel.INFO, + f"Successfully set gateway route {route_key} in Redis Cluster" + ) + if self.cluster_properties is None: self.cluster_properties = {} - self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = ";".join(current_endpoints) - - except Exception as e: - error_msg = f"Failed to set route in Redis Cluster: {str(e)}" + self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = \ + ";".join(current_endpoints) + + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to set route in Redis Cluster: {str(error)}" self.status = "error" self.error_message = error_msg log(LogLevel.ERROR, error_msg) def create_output(self) -> PluginOutput: + """Create plugin output based on current status.""" if self.status == "completed": self.create_gateway_endpoints() - elif self.status 
=="postponed": + elif self.status == "postponed": log(LogLevel.INFO, f"Apply postponed: {self.error_message}") elif self.status == "error": log(LogLevel.ERROR, f"Apply error: {self.error_message}") @@ -166,63 +189,82 @@ def create_output(self) -> PluginOutput: status=self.status, local_state=self.local_state if self.status == "completed" else None, error_message=self.error_message, - cluster_properties=self.cluster_properties if self.status == "completed" else None + cluster_properties=( + self.cluster_properties if self.status == "completed" else None + ) ) - + + # pylint: disable=too-many-locals,too-many-return-statements + # pylint: disable=too-many-branches,too-many-statements def apply(self) -> PluginOutput: + """Apply PKI Authority configuration.""" if not isinstance(self.state_json, dict): self.status = "error" self.error_message = "Invalid state format" return self.create_output() - + local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) if not local_tunnel_ip: self.status = "error" self.error_message = "Local node has no WireGuard tunnel IP" return self.create_output() - + try: vm_mode = detect_vm_mode() initialized = self.authority_config.get(self.PROP_INITIALIZED) - + # If initialized is true, verify all required properties are present if initialized == "true": missing = [] - + for prop in self.AUTHORITY_SERVICE_PROPERTIES: prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" prop_value = self.authority_config.get(prop_name, "") - + if not prop_value: missing.append(prop_name) - + if not self.pki_domain: self.pki_domain = get_pki_domain() missing.append(self.PROP_PKI_DOMAIN) if missing: - error_msg = f"Service marked as initialized but missing properties: {', '.join(missing)}" + error_msg = ( + f"Service marked as initialized but missing properties: " + f"{', '.join(missing)}" + ) log(LogLevel.ERROR, error_msg) initialized = "false" - + if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": self.status = "postponed" - self.error_message = "Waiting for authority service properties to be initialized" + self.error_message = ( + "Waiting for authority service properties to be initialized" + ) return self.create_output() container = LXCContainer(PKI_SERVICE_NAME) - + # Start or restart LXC container if container.is_running(): if initialized != "true" or self.is_restart_required(): - exit_code = container.stop(graceful_timeout=30, command_timeout=60) + exit_code = container.stop( + graceful_timeout=30, command_timeout=60 + ) if exit_code != 0: - raise Exception(f"Failed to stop container with exit code {exit_code}") - else: - log(LogLevel.INFO, f"Container {PKI_SERVICE_NAME} is already running, no restart required") - self.status = "completed" - return self.create_output() - + raise Exception( + f"Failed to stop container with exit code {exit_code}" + ) + + if container.is_running(): + log( + LogLevel.INFO, + f"Container {PKI_SERVICE_NAME} is already running, " + f"no restart required" + ) + self.status = "completed" + return self.create_output() + cpu_type = detect_cpu_type() if not self.pki_domain: self.pki_domain = get_pki_domain() @@ -239,7 +281,9 @@ def apply(self) -> PluginOutput: exit_code = container.start(timeout=30) if exit_code != 0: - raise Exception(f"Failed to start container with exit code {exit_code}") + raise Exception( + f"Failed to start container with exit code {exit_code}" + ) log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} is running") @@ -250,93 +294,106 @@ def apply(self) -> PluginOutput: interval = 5 elapsed = 0 
collected_properties = {} - + while elapsed < timeout: # Try to read each missing property for prop in missing_properties[:]: - success, value = read_property_from_fs(prop) - + success, value = read_property_from_fs(prop) + if success: - collected_properties[f"{self.AUTHORITY_SERVICE_PREFIX}{prop}"] = base64.b64encode(value).decode() + prop_key = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + collected_properties[prop_key] = \ + base64.b64encode(value).decode() missing_properties.remove(prop) - + # Check if all properties collected if not missing_properties: - log(LogLevel.INFO, "All property files have been generated by tee-pki service") - collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain + log( + LogLevel.INFO, + "All property files have been generated " + "by tee-pki service" + ) + collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain collected_properties[self.PROP_INITIALIZED] = "true" - + self.status = "completed" self.cluster_properties = collected_properties return self.create_output() - + # Show what's still missing - log(LogLevel.INFO, f"Waiting for property files. Missing: {', '.join(missing_properties)} (elapsed: {elapsed}s)") - + log( + LogLevel.INFO, + f"Waiting for property files. Missing: " + f"{', '.join(missing_properties)} (elapsed: {elapsed}s)" + ) + time.sleep(interval) elapsed += interval - + # Timeout reached self.status = "postponed" - self.error_message = f"Timeout waiting for tee-pki to generate property files: {', '.join(missing_properties)}" + self.error_message = ( + f"Timeout waiting for tee-pki to generate property files: " + f"{', '.join(missing_properties)}" + ) return self.create_output() - + self.status = "completed" return self.create_output() - - except Exception as e: - error_msg = f"Apply failed: {str(e)}" + + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Apply failed: {str(error)}" log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg return self.create_output() - + def is_restart_required(self) -> bool: - """Check if container restart is required based on configuration changes.""" + """Check if container restart is required based on config changes.""" for prop in self.AUTHORITY_SERVICE_PROPERTIES: prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" config_value = self.authority_config.get(prop_name, "") - + if not config_value: continue - + # Read current value from filesystem success, fs_value = read_property_from_fs(prop) - + if not success: # File doesn't exist in FS, restart required log(LogLevel.INFO, f"Property {prop} not found in filesystem, restart required") return True - + # Decode config value from base64 and compare with filesystem value try: decoded_config_value = base64.b64decode(config_value) if decoded_config_value != fs_value: log(LogLevel.INFO, f"Property {prop} has changed, restart required") return True - except Exception as e: - log(LogLevel.ERROR, f"Failed to decode property {prop}: {e}") + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to decode property {prop}: {error}") return True - + # No changes detected log(LogLevel.INFO, "No configuration changes detected") return False - + def delete_route_from_redis(self) -> None: """Delete the PKI Authority route from Redis Cluster. 
- + Raises: Exception: If deletion fails """ redis_endpoints = self.get_redis_connection_info() - + if not redis_endpoints: log(LogLevel.WARN, "No Redis endpoints available, skipping route deletion") return - + route_key = f"routes:{self.pki_domain}" startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] - + redis_client = RedisCluster( startup_nodes=startup_nodes, decode_responses=True, @@ -345,38 +402,43 @@ def delete_route_from_redis(self) -> None: ) redis_client.delete(route_key) log(LogLevel.INFO, f"Deleted route {route_key} from Redis Cluster") - + def destroy(self) -> PluginOutput: """Destroy PKI Authority service and clean up.""" try: container = LXCContainer(PKI_SERVICE_NAME) - + # Stop container if running if container.is_running(): exit_code = container.stop(graceful_timeout=30, command_timeout=60) if exit_code != 0: log(LogLevel.WARN, "Failed to stop container gracefully") - + # Destroy container exit_code = container.destroy() if exit_code != 0: error_msg = f"Failed to destroy container with exit code {exit_code}" return PluginOutput(status="error", error_message=error_msg, local_state=self.local_state) - + delete_iptables_rules() - + # If this is the last node and domain is configured, delete route from Redis if len(self.pki_cluster_nodes) <= 1 and self.pki_domain: - log(LogLevel.INFO, "This is the last PKI Authority node, deleting route from Redis") + log( + LogLevel.INFO, + "This is the last PKI Authority node, deleting route from Redis" + ) self.delete_route_from_redis() log(LogLevel.INFO, "PKI Authority destroyed") return PluginOutput(status="completed", local_state=self.local_state) - except Exception as e: - error_msg = f"Destroy failed: {str(e)}" + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Destroy failed: {str(error)}" log(LogLevel.ERROR, error_msg) - return PluginOutput(status="error", error_message=error_msg, local_state=self.local_state) + return PluginOutput( + status="error", error_message=error_msg, local_state=self.local_state + ) # Plugin commands @@ -388,10 +450,12 @@ def handle_init(input_data: PluginInput) -> PluginOutput: init_container() log(LogLevel.INFO, "PKI initialization completed") return PluginOutput(status="completed", local_state=input_data.local_state) - except Exception as e: - error_msg = f"Failed to initialize PKI: {str(e)}" + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to initialize PKI: {str(error)}" log(LogLevel.ERROR, error_msg) - return PluginOutput(status="error", error_message=error_msg, local_state=input_data.local_state) + return PluginOutput( + status="error", error_message=error_msg, local_state=input_data.local_state + ) @plugin.command("apply") @@ -408,17 +472,17 @@ def handle_health(input_data: PluginInput) -> PluginOutput: try: container = LXCContainer(PKI_SERVICE_NAME) - + if container.is_running() and container.is_service_healthy(): return PluginOutput(status="completed", local_state=local_state) - else: - return PluginOutput( - status="error", - error_message="PKI service is not healthy or container is not running", - local_state=local_state - ) - except Exception as e: - error_msg = f"Health check failed: {str(e)}" + + return PluginOutput( + status="error", + error_message="PKI service is not healthy or container is not running", + local_state=local_state + ) + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Health check failed: {str(error)}" log(LogLevel.ERROR, error_msg) return 
PluginOutput(status="error", error_message=error_msg, local_state=local_state) From 59e4142a15a33a4880e2bc52dd04545989bf1f63 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Mon, 5 Jan 2026 08:58:21 -0600 Subject: [PATCH 24/51] update pki-authority container --- src/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dockerfile b/src/Dockerfile index 9708722d..ec2650d8 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -247,7 +247,7 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-18725490828 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-20717465559 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" ADD rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-legacy-vm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" From f152ab98faf3867a7566edd86b4c3b2095d970e8 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 9 Jan 2026 10:03:11 -0600 Subject: [PATCH 25/51] additional rules for production mode --- src/services/apps/pki-authority/helpers.py | 42 ++++++++++++++++++++-- src/services/apps/pki-authority/main.py | 2 +- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 3b4bdcc4..0305de0d 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -486,8 +486,8 @@ def enable_route_localnet(bridge_name: str): def delete_iptables_rules(): - """Delete all iptables NAT rules for PKI container.""" - # Delete rules from all chains: PREROUTING, OUTPUT, POSTROUTING + """Delete all iptables rules for PKI container (NAT and filter tables).""" + # Delete rules from NAT table chains: PREROUTING, OUTPUT, POSTROUTING for chain in ["PREROUTING", "OUTPUT", "POSTROUTING"]: result = subprocess.run( ["iptables", "-t", "nat", "-S", chain], @@ -501,7 +501,22 @@ def delete_iptables_rules(): if IPTABLES_RULE_COMMENT in rule: delete_rule = rule.replace("-A", "-D", 1) subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) - log(LogLevel.INFO, f"Deleted iptables rule: {delete_rule}") + log(LogLevel.INFO, f"Deleted iptables NAT rule: {delete_rule}") + + # Delete rules from filter table (INPUT chain) + result = subprocess.run( + ["iptables", "-S", "INPUT"], + capture_output=True, text=True, check=True + ) + + rules = result.stdout.splitlines() + + for rule in rules: + # Delete rules that contain our comment + if IPTABLES_RULE_COMMENT in rule: + delete_rule = rule.replace("-A", "-D", 1) + subprocess.run(["iptables"] + delete_rule.split()[1:], check=True) + log(LogLevel.INFO, f"Deleted iptables INPUT rule: {delete_rule}") def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): @@ -608,6 +623,27 @@ def setup_iptables(wg_ip): description=f"POSTROUTING MASQUERADE for {CONTAINER_IP}/32" ) + # Rule 5: Allow port 8081 on lxcbr0 + ensure_iptables_rule( + check_args=[ + 
"iptables", "-C", "INPUT", + "-i", "lxcbr0", + "-p", "tcp", + "--dport", "8081", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "ACCEPT" + ], + add_args=[ + "iptables", "-A", "INPUT", + "-i", "lxcbr0", + "-p", "tcp", + "--dport", "8081", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "ACCEPT" + ], + description="Allow TCP port 8081 on lxcbr0" + ) + def update_pccs_url(): """Update PCCS URL in QCNL configuration.""" diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index b8beab2c..8528552d 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -170,7 +170,7 @@ def create_gateway_endpoints(self): except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Failed to set route in Redis Cluster: {str(error)}" - self.status = "error" + self.status = "postponed" self.error_message = error_msg log(LogLevel.ERROR, error_msg) From 6921ab2fd530cec3fe1bc07bf153ed0b96e5dd5a Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 15 Jan 2026 04:45:44 -0600 Subject: [PATCH 26/51] http support for pki-auth --- .../pki-service/lxc-swarm-template.yaml | 3 +- src/services/apps/pki-authority/helpers.py | 50 ++++++++++++++++++- src/services/apps/pki-authority/main.py | 12 ++++- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index 60d9ce90..be28e865 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -1,5 +1,6 @@ api: - port: 443 + httpsPort: 443 + httpPort: 80 pki: allowedChallenges: - token diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 0305de0d..7fae3974 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -560,7 +560,7 @@ def setup_iptables(wg_ip): description=f"PCCS DNAT {host_ip}:{PCCS_PORT} -> 127.0.0.1:{PCCS_PORT}" ) - # Rule 2: WireGuard PREROUTING + # Rule 2: WireGuard PREROUTING (HTTPS) ensure_iptables_rule( check_args=[ "iptables", "-t", "nat", "-C", "PREROUTING", @@ -583,7 +583,30 @@ def setup_iptables(wg_ip): description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) - # Rule 3: OUTPUT + # Rule 2a: WireGuard PREROUTING (HTTP) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + description=f"PREROUTING WireGuard 8080 -> {CONTAINER_IP}:80" + ) + + # Rule 3: OUTPUT (HTTPS) ensure_iptables_rule( check_args=[ "iptables", "-t", "nat", "-C", "OUTPUT", @@ -606,6 +629,29 @@ def setup_iptables(wg_ip): description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) + # Rule 3a: OUTPUT (HTTP) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", 
f"{CONTAINER_IP}:80" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + description=f"OUTPUT {wg_ip}:8080 -> {CONTAINER_IP}:80" + ) + # Rule 4: MASQUERADE ensure_iptables_rule( check_args=[ diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 8528552d..9e58003a 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -403,6 +403,12 @@ def delete_route_from_redis(self) -> None: redis_client.delete(route_key) log(LogLevel.INFO, f"Deleted route {route_key} from Redis Cluster") + # Clear registered endpoints to ensure route is recreated on next PKI node start + if self.cluster_properties is None: + self.cluster_properties = {} + self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = "" + log(LogLevel.INFO, "Cleared registered endpoints in cluster properties") + def destroy(self) -> PluginOutput: """Destroy PKI Authority service and clean up.""" try: @@ -431,7 +437,11 @@ def destroy(self) -> PluginOutput: self.delete_route_from_redis() log(LogLevel.INFO, "PKI Authority destroyed") - return PluginOutput(status="completed", local_state=self.local_state) + return PluginOutput( + status="completed", + local_state=self.local_state, + cluster_properties=self.cluster_properties if self.cluster_properties else None + ) except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Destroy failed: {str(error)}" From be21132da5916eac669c5102a0fa5de470e3ca84 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 15 Jan 2026 06:30:41 -0600 Subject: [PATCH 27/51] get registered endpoints direct from redis --- src/services/apps/pki-authority/main.py | 118 +++++++++++++++--------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 9e58003a..c6b0f436 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -47,7 +47,6 @@ class EventHandler: "lite_certificate", "lite_privateKey" ] PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" - PROP_REGISTERED_ENDPOINTS = f"{AUTHORITY_SERVICE_PREFIX}registered_endpoints" PROP_PKI_DOMAIN = f"{AUTHORITY_SERVICE_PREFIX}pki_domain" def __init__(self, input_data: PluginInput): @@ -98,10 +97,7 @@ def create_gateway_endpoints(self): if not self.is_leader: return - registered_endpoints = self.authority_config.get( - self.PROP_REGISTERED_ENDPOINTS, "" - ).split(";") - + # Get current endpoints from cluster nodes current_endpoints = [] for node in self.pki_cluster_nodes: node_id = node.get("node_id") @@ -109,21 +105,6 @@ def create_gateway_endpoints(self): if tunnel_ip: current_endpoints.append(tunnel_ip) - # Compare endpoints regardless of order - if set(registered_endpoints) == set(current_endpoints): - log( - LogLevel.INFO, - f"Gateway endpoints are up to date: " - f"registered={registered_endpoints}, current={current_endpoints}" - ) - return - - log( - LogLevel.INFO, - f"Gateway endpoints changed: " - f"registered={registered_endpoints}, current={current_endpoints}" - ) - # Get Redis connection info redis_endpoints = self.get_redis_connection_info() @@ -132,20 +113,7 @@ def create_gateway_endpoints(self): self.error_message = "No Redis nodes available to configure gateway routes" return - # Build targets list from current endpoints - 
targets = [ - {"url": f"https://{endpoint}:8443", "weight": 1} - for endpoint in current_endpoints - ] - route_config = { - "targets": targets, - "policy": "rr", - "preserve_host": False, - "passthrough": True - } - route_json = json.dumps(route_config) route_key = f"routes:{self.pki_domain}" - startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] try: @@ -155,18 +123,82 @@ def create_gateway_endpoints(self): skip_full_coverage_check=True, socket_connect_timeout=5, ) - redis_client.ping() - redis_client.set(route_key, route_json) + # Read current route from Redis + registered_endpoints = [] + try: + existing_route = redis_client.get(route_key) + if existing_route: + route_data = json.loads(existing_route) + # Extract IPs from targets URLs + for target in route_data.get("targets", []): + url = target.get("url", "") + # Parse https://IP:PORT format + if "://" in url: + ip_port = url.split("://")[1] + ip = ip_port.split(":")[0] + registered_endpoints.append(ip) + except Exception as error: # pylint: disable=broad-exception-caught + log( + LogLevel.WARN, + f"Failed to read existing route from Redis, treating as empty: {error}" + ) + registered_endpoints = [] + + # Compare endpoints regardless of order + if set(registered_endpoints) == set(current_endpoints): + log( + LogLevel.INFO, + f"Gateway endpoints are up to date: " + f"registered={registered_endpoints}, current={current_endpoints}" + ) + return + log( LogLevel.INFO, - f"Successfully set gateway route {route_key} in Redis Cluster" + f"Gateway endpoints changed: " + f"registered={registered_endpoints}, current={current_endpoints}" ) - if self.cluster_properties is None: - self.cluster_properties = {} - self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = \ - ";".join(current_endpoints) + # Build targets list from current endpoints + targets = [ + {"url": f"https://{endpoint}:8443", "weight": 1} + for endpoint in current_endpoints + ] + route_config = { + "targets": targets, + "policy": "rr", + "preserve_host": False, + "passthrough": True + } + route_json = json.dumps(route_config) + + # Retry logic for setting route in Redis + max_retries = 3 + retry_delay = 5 + + for attempt in range(1, max_retries + 1): + try: + redis_client.set(route_key, route_json) + log( + LogLevel.INFO, + f"Successfully set gateway route {route_key} in Redis Cluster" + ) + break # Success, exit retry loop + except Exception as set_error: # pylint: disable=broad-exception-caught + if attempt < max_retries: + log( + LogLevel.WARN, + f"Failed to set route (attempt {attempt}/{max_retries}): {set_error}. " + f"Retrying in {retry_delay}s..." 
+ ) + time.sleep(retry_delay) + else: + log( + LogLevel.ERROR, + f"Failed to set route after {max_retries} attempts: {set_error}" + ) + raise except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Failed to set route in Redis Cluster: {str(error)}" @@ -403,12 +435,6 @@ def delete_route_from_redis(self) -> None: redis_client.delete(route_key) log(LogLevel.INFO, f"Deleted route {route_key} from Redis Cluster") - # Clear registered endpoints to ensure route is recreated on next PKI node start - if self.cluster_properties is None: - self.cluster_properties = {} - self.cluster_properties[self.PROP_REGISTERED_ENDPOINTS] = "" - log(LogLevel.INFO, "Cleared registered endpoints in cluster properties") - def destroy(self) -> PluginOutput: """Destroy PKI Authority service and clean up.""" try: From 190a301560b7482710a5755d64465fe825911a51 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 16 Jan 2026 09:19:10 -0600 Subject: [PATCH 28/51] new route format, new lxc container --- src/Dockerfile | 2 +- src/services/apps/pki-authority/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index 5c9dde54..47ce7628 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -247,7 +247,7 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-20717465559 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21069314544 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" ADD rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-legacy-vm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index c6b0f436..ba5ecc17 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -113,7 +113,7 @@ def create_gateway_endpoints(self): self.error_message = "No Redis nodes available to configure gateway routes" return - route_key = f"routes:{self.pki_domain}" + route_key = f"manual-routes:{self.pki_domain}" startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] try: From b39bbe454c1bc27752d65dee2ee90d4215c3d82e Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Sun, 18 Jan 2026 12:47:26 -0600 Subject: [PATCH 29/51] fixed untrusted support --- src/services/apps/pki-authority/helpers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 7fae3974..5216ea56 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -5,6 +5,7 @@ import os import re +import secrets import shutil import ssl import subprocess @@ -363,6 +364,19 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode, pki_domain: str): config["pki"]["ownChallenge"] = {} config["pki"]["ownChallenge"]["type"] = cpu_type + # For untrusted, generate random deviceIdHex (32 bytes) + if 
cpu_type == "untrusted": + device_id_hex = secrets.token_hex(32) + config["pki"]["ownChallenge"]["deviceIdHex"] = device_id_hex + log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") + + # Add 'untrusted' to allowedChallenges if not present + if "allowedChallenges" not in config["pki"]: + config["pki"]["allowedChallenges"] = [] + if "untrusted" not in config["pki"]["allowedChallenges"]: + config["pki"]["allowedChallenges"].append("untrusted") + log(LogLevel.INFO, "Added 'untrusted' to allowedChallenges") + # Set ownDomain from parameter if pki_domain: config["pki"]["ownDomain"] = pki_domain From 33d3e385a2e39fd28bcb2b27c8280e0897b876b9 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 20 Jan 2026 06:42:41 -0600 Subject: [PATCH 30/51] draft --- .../pki-service/lxc-legacy-vm-template.yaml | 144 ------- .../pki-service/lxc-swarm-template.yaml | 11 +- src/services/apps/pki-authority/helpers.py | 104 +++-- src/services/apps/pki-authority/main.py | 385 ++++++++++++------ 4 files changed, 330 insertions(+), 314 deletions(-) delete mode 100644 src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml diff --git a/src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml deleted file mode 100644 index c336ee59..00000000 --- a/src/rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml +++ /dev/null @@ -1,144 +0,0 @@ -api: - port: 443 -pki: - allowedChallenges: - - token - tokenChallengeFilePath: /app/access-token - ownDomain: ca-subroot.super-protocol.svc.cluster.local - ownChallenge: - type: untrusted - certParams: - ocspUrl: https://ocsp.certs.superprotocol.com/v1/ocsp - keyStorage: - type: trusted - storage: - type: super - keysPath: /app/keys - mode: - attestationServiceSource: - baseUrl: https://ca-subroot.certs.superprotocol.com/api/v1/pki - caBundle: | - -----BEGIN CERTIFICATE----- - MIIWzDCCFbSgAwIBAgIQRvHafieXotvccTAPGbFOlTANBgkqhkiG9w0BAQsFADB2 - MSIwIAYDVQQDExlTdXBlclByb3RvY29sIFRFRSBSb290IENBMQswCQYDVQQGEwJV - UzELMAkGA1UECBMCTlkxETAPBgNVBAcTCE5ldyBZb3JrMRYwFAYDVQQKEw1TdXBl - clByb3RvY29sMQswCQYDVQQLEwJJVDAeFw0yNTA3MjExNDU2MjNaFw0zNTA4MDEw - MDAwMDBaMHYxIjAgBgNVBAMTGVN1cGVyUHJvdG9jb2wgVEVFIFJvb3QgQ0ExCzAJ - BgNVBAYTAlVTMQswCQYDVQQIEwJOWTERMA8GA1UEBxMITmV3IFlvcmsxFjAUBgNV - BAoTDVN1cGVyUHJvdG9jb2wxCzAJBgNVBAsTAklUMIIBIjANBgkqhkiG9w0BAQEF - AAOCAQ8AMIIBCgKCAQEAuscjQuMdTyM+COTzmj1SFcCwBQtMxkK8uqk2dy7okwU0 - U2beMso47+AZ7hROOpRSCT2Z9lf5sJEI+Jw3ptdjxAK0ALvvokNF5/9Wg6IipLsO - sRaCgpsPwTNFWN374vnDdWQsi4hOlcKLBoUifSQrl7/dSLjz6qxUOCLifqYtdd4/ - Sln8G+9GHzmBtfq0CWB3xtjYFxXPCmuy5lUTJPg+QFL5EYY/e3/UrefmLxEXjskn - QbmpA9pMXexwdEzCXDVocPQJOh+SxR7K4ZGXpWxpMcP71jBJfPLKb46ijxyK2YZW - xzfWXYgkctKhPLx5v3M7vByqNLcn8Lm/VkwZy3hXhwIDAQABo4ITVDCCE1AwDwYD - VR0TAQH/BAUwAwEB/zAlBgNVHREEHjAcghpjYS5jZXJ0cy5zdXBlcnByb3RvY29s - LmNvbTAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwDgYDVR0PAQH/BAQD - AgKkMB8GA1UdIwQYMBaAFCVxTWT14hJHoY4lE0vbxcpala/DMB0GA1UdDgQWBBQl - cU1k9eISR6GOJRNL28XKWpWvwzAUBggrBgEDxTgBAQQIc2d4LWRjYXAwghKPBgsG - CSqGSIb4TYo5BgSCEn4DAAIAAAAAAAsAEACTmnIz95xMqZQKDbOVfwYHwuAeEM7b - IyMk8VaVVsIMaAAAAAALEA8O//8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAFAAAAAAAAAOcAAAAAAAAAg090avZ3FKIIEzhH7K4PGXPW - 5VTSomOCmLCnduwrcNUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADTq - fKiANP6A8gnpfND4RC97piHrKl9gOTqwWHGua5edAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALq+ - EkZ4saPu4J+/2S4ClapSnxhp06LLcUkuKbsv9ZIhAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAADKEAAAaM6Hlshrc9tC24YIJocfnTlZYBKHTDTj0+mv/7Ov - lJ2x9T5sIaTNGfStH46LE1+5hVH3bPwJvDI0Ws2E/BBGDrdAjZv2T83/USoIQTTL - TZ+kcZqLMigsqNeJHh5uBLsF5uqzPNwBDeJF1nIi/MaV+qcGve37Eu8XoSCGSOCj - b2gLEA8O//8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAVAAAAAAAAAOcAAAAAAAAAeP6M/QEJWg8Qiv9cQGJLk2EtbCi3PhqNKBecnd8O - BoYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIxPV3XXllA+lhN/d8aK - gpoAVqyN7XAUCwgbCUSQxXv/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAQALAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANWq0gbaZ0iNLGHsRl9v - WJK1s51EREhaa5246jLZcL3NAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAArkN01FJ5vTX8xsD4zCyuu9jkUA276SlrfE9PttOJ2vf2ljC3d1oyCUyIPv+8d - fR9Jbk3FLW/LY+BFJaQKnnGIIAAAAQIDBAUGBwgJCgsMDQ4PEBESExQVFhcYGRob - HB0eHwUAYg4AAC0tLS0tQkVHSU4gQ0VSVElGSUNBVEUtLS0tLQpNSUlFOHpDQ0JK - aWdBd0lCQWdJVVFaT1NWSy8vQlZkVkJXeUIwaHM4dURoTkc2Z3dDZ1lJS29aSXpq - MEVBd0l3CmNERWlNQ0FHQTFVRUF3d1pTVzUwWld3Z1UwZFlJRkJEU3lCUWJHRjBa - bTl5YlNCRFFURWFNQmdHQTFVRUNnd1IKU1c1MFpXd2dRMjl5Y0c5eVlYUnBiMjR4 - RkRBU0JnTlZCQWNNQzFOaGJuUmhJRU5zWVhKaE1Rc3dDUVlEVlFRSQpEQUpEUVRF - TE1Ba0dBMVVFQmhNQ1ZWTXdIaGNOTWpVd056QTVNVFV3T1RBd1doY05Nekl3TnpB - NU1UVXdPVEF3CldqQndNU0l3SUFZRFZRUUREQmxKYm5SbGJDQlRSMWdnVUVOTElF - TmxjblJwWm1sallYUmxNUm93R0FZRFZRUUsKREJGSmJuUmxiQ0JEYjNKd2IzSmhk - R2x2YmpFVU1CSUdBMVVFQnd3TFUyRnVkR0VnUTJ4aGNtRXhDekFKQmdOVgpCQWdN - QWtOQk1Rc3dDUVlEVlFRR0V3SlZVekJaTUJNR0J5cUdTTTQ5QWdFR0NDcUdTTTQ5 - QXdFSEEwSUFCRFFBCkZNa2lVR1Z4S2luMnpIODB2MkMvbXpGQ0N1MnV0VHBOdE1V - TzhjMUJ6U1F1ZlZ0YjYyT2JFMUhOUVNvZmE4Q3IKdXVLeWJ0VHBRT3BaakZXQkFh - S2pnZ01PTUlJRENqQWZCZ05WSFNNRUdEQVdnQlNWYjEzTnZSdmg2VUJKeWRUMApN - ODRCVnd2ZVZEQnJCZ05WSFI4RVpEQmlNR0NnWHFCY2hscG9kSFJ3Y3pvdkwyRndh - UzUwY25WemRHVmtjMlZ5CmRtbGpaWE11YVc1MFpXd3VZMjl0TDNObmVDOWpaWEow - YVdacFkyRjBhVzl1TDNZMEwzQmphMk55YkQ5allUMXcKYkdGMFptOXliU1psYm1O - dlpHbHVaejFrWlhJd0hRWURWUjBPQkJZRUZPUHdhb25yYnpPZks0My8vZ3Y5M0Fm - QgpiMGNZTUE0R0ExVWREd0VCL3dRRUF3SUd3REFNQmdOVkhSTUJBZjhFQWpBQU1J - SUNPd1lKS29aSWh2aE5BUTBCCkJJSUNMRENDQWlnd0hnWUtLb1pJaHZoTkFRMEJB - UVFRM3cvN014Q2Vwbkt2V2Q2WXhZb0VJVENDQVdVR0NpcUcKU0liNFRRRU5BUUl3 - Z2dGVk1CQUdDeXFHU0liNFRRRU5BUUlCQWdFTE1CQUdDeXFHU0liNFRRRU5BUUlD - QWdFTApNQkFHQ3lxR1NJYjRUUUVOQVFJREFnRURNQkFHQ3lxR1NJYjRUUUVOQVFJ - RUFnRURNQkVHQ3lxR1NJYjRUUUVOCkFRSUZBZ0lBL3pBUkJnc3Foa2lHK0UwQkRR - RUNCZ0lDQVA4d0VBWUxLb1pJaHZoTkFRMEJBZ2NDQVFBd0VBWUwKS29aSWh2aE5B - UTBCQWdnQ0FRQXdFQVlMS29aSWh2aE5BUTBCQWdrQ0FRQXdFQVlMS29aSWh2aE5B - UTBCQWdvQwpBUUF3RUFZTEtvWklodmhOQVEwQkFnc0NBUUF3RUFZTEtvWklodmhO - QVEwQkFnd0NBUUF3RUFZTEtvWklodmhOCkFRMEJBZzBDQVFBd0VBWUxLb1pJaHZo - TkFRMEJBZzRDQVFBd0VBWUxLb1pJaHZoTkFRMEJBZzhDQVFBd0VBWUwKS29aSWh2 - aE5BUTBCQWhBQ0FRQXdFQVlMS29aSWh2aE5BUTBCQWhFQ0FRMHdId1lMS29aSWh2 - aE5BUTBCQWhJRQpFQXNMQXdQLy93QUFBQUFBQUFBQUFBQXdFQVlLS29aSWh2aE5B - UTBCQXdRQ0FBQXdGQVlLS29aSWh2aE5BUTBCCkJBUUdNR0JxQUFBQU1BOEdDaXFH - U0liNFRRRU5BUVVLQVFFd0hnWUtLb1pJaHZoTkFRMEJCZ1FRcDZtY096M0EKdDRB - SVRmdk84R1Y3cHpCRUJnb3Foa2lHK0UwQkRRRUhNRFl3RUFZTEtvWklodmhOQVEw - QkJ3RUJBZjh3RUFZTApLb1pJaHZoTkFRMEJCd0lCQWY4d0VBWUxLb1pJaHZoTkFR - MEJCd01CQWY4d0NnWUlLb1pJemowRUF3SURTUUF3ClJnSWhBTEVtR1U2VldkdG5n - YTZiTG5yaksrWEdWczlMVERSRkZURTRpcy9qU05tQkFpRUExY21KNUpkV0VIYlEK - L2dyWVJ2L3c4MytTQmpidTJKQWcxR0dJNTJmSjFUUT0KLS0tLS1FTkQgQ0VSVElG - SUNBVEUtLS0tLQotLS0tLUJFR0lOIENFUlRJRklDQVRFLS0tLS0KTUlJQ2xqQ0NB - 
ajJnQXdJQkFnSVZBSlZ2WGMyOUcrSHBRRW5KMVBRenpnRlhDOTVVTUFvR0NDcUdT - TTQ5QkFNQwpNR2d4R2pBWUJnTlZCQU1NRVVsdWRHVnNJRk5IV0NCU2IyOTBJRU5C - TVJvd0dBWURWUVFLREJGSmJuUmxiQ0JECmIzSndiM0poZEdsdmJqRVVNQklHQTFV - RUJ3d0xVMkZ1ZEdFZ1EyeGhjbUV4Q3pBSkJnTlZCQWdNQWtOQk1Rc3cKQ1FZRFZR - UUdFd0pWVXpBZUZ3MHhPREExTWpFeE1EVXdNVEJhRncwek16QTFNakV4TURVd01U - QmFNSEF4SWpBZwpCZ05WQkFNTUdVbHVkR1ZzSUZOSFdDQlFRMHNnVUd4aGRHWnZj - bTBnUTBFeEdqQVlCZ05WQkFvTUVVbHVkR1ZzCklFTnZjbkJ2Y21GMGFXOXVNUlF3 - RWdZRFZRUUhEQXRUWVc1MFlTQkRiR0Z5WVRFTE1Ba0dBMVVFQ0F3Q1EwRXgKQ3pB - SkJnTlZCQVlUQWxWVE1Ga3dFd1lIS29aSXpqMENBUVlJS29aSXpqMERBUWNEUWdB - RU5TQi83dDIxbFhTTwoyQ3V6cHh3NzRlSkI3MkV5REdnVzVyWEN0eDJ0VlRMcTZo - S2s2eitVaVJaQ25xUjdwc092Z3FGZVN4bG1UbEpsCmVUbWkyV1l6M3FPQnV6Q0J1 - REFmQmdOVkhTTUVHREFXZ0JRaVpReldXcDAwaWZPRHRKVlN2MUFiT1NjR3JEQlMK - QmdOVkhSOEVTekJKTUVlZ1JhQkRoa0ZvZEhSd2N6b3ZMMk5sY25ScFptbGpZWFJs - Y3k1MGNuVnpkR1ZrYzJWeQpkbWxqWlhNdWFXNTBaV3d1WTI5dEwwbHVkR1ZzVTBk - WVVtOXZkRU5CTG1SbGNqQWRCZ05WSFE0RUZnUVVsVzlkCnpiMGI0ZWxBU2NuVTlE - UE9BVmNMM2xRd0RnWURWUjBQQVFIL0JBUURBZ0VHTUJJR0ExVWRFd0VCL3dRSU1B - WUIKQWY4Q0FRQXdDZ1lJS29aSXpqMEVBd0lEUndBd1JBSWdYc1ZraTB3K2k2VllH - VzNVRi8yMnVhWGUwWUpEajFVZQpuQStUakQxYWk1Y0NJQ1liMVNBbUQ1eGtmVFZw - dm80VW95aVNZeHJEV0xtVVI0Q0k5Tkt5ZlBOKwotLS0tLUVORCBDRVJUSUZJQ0FU - RS0tLS0tCi0tLS0tQkVHSU4gQ0VSVElGSUNBVEUtLS0tLQpNSUlDanpDQ0FqU2dB - d0lCQWdJVUltVU0xbHFkTkluemc3U1ZVcjlRR3prbkJxd3dDZ1lJS29aSXpqMEVB - d0l3CmFERWFNQmdHQTFVRUF3d1JTVzUwWld3Z1UwZFlJRkp2YjNRZ1EwRXhHakFZ - QmdOVkJBb01FVWx1ZEdWc0lFTnYKY25CdmNtRjBhVzl1TVJRd0VnWURWUVFIREF0 - VFlXNTBZU0JEYkdGeVlURUxNQWtHQTFVRUNBd0NRMEV4Q3pBSgpCZ05WQkFZVEFs - VlRNQjRYRFRFNE1EVXlNVEV3TkRVeE1Gb1hEVFE1TVRJek1USXpOVGsxT1Zvd2FE - RWFNQmdHCkExVUVBd3dSU1c1MFpXd2dVMGRZSUZKdmIzUWdRMEV4R2pBWUJnTlZC - QW9NRVVsdWRHVnNJRU52Y25CdmNtRjAKYVc5dU1SUXdFZ1lEVlFRSERBdFRZVzUw - WVNCRGJHRnlZVEVMTUFrR0ExVUVDQXdDUTBFeEN6QUpCZ05WQkFZVApBbFZUTUZr - d0V3WUhLb1pJemowQ0FRWUlLb1pJemowREFRY0RRZ0FFQzZuRXdNRElZWk9qL2lQ - V3NDemFFS2k3CjFPaU9TTFJGaFdHamJuQlZKZlZua1k0dTNJamtEWVlMME14TzRt - cXN5WWpsQmFsVFZZeEZQMnNKQks1emxLT0IKdXpDQnVEQWZCZ05WSFNNRUdEQVdn - QlFpWlF6V1dwMDBpZk9EdEpWU3YxQWJPU2NHckRCU0JnTlZIUjhFU3pCSgpNRWVn - UmFCRGhrRm9kSFJ3Y3pvdkwyTmxjblJwWm1sallYUmxjeTUwY25WemRHVmtjMlZ5 - ZG1salpYTXVhVzUwClpXd3VZMjl0TDBsdWRHVnNVMGRZVW05dmRFTkJMbVJsY2pB - ZEJnTlZIUTRFRmdRVUltVU0xbHFkTkluemc3U1YKVXI5UUd6a25CcXd3RGdZRFZS - MFBBUUgvQkFRREFnRUdNQklHQTFVZEV3RUIvd1FJTUFZQkFmOENBUUV3Q2dZSQpL - b1pJemowRUF3SURTUUF3UmdJaEFPVy81UWtSK1M5Q2lTRGNOb293THVQUkxzV0dm - L1lpN0dTWDk0Qmd3VHdnCkFpRUE0SjBsckhvTXMrWG81by9zWDZPOVFXeEhSQXZa - VUdPZFJRN2N2cVJYYXFJPQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCgAwDQYJ - KoZIhvcNAQELBQADggEBADk76T9z80TGctAh/LvBX8LMCLZVb/F2nw7mQy9Q7N1w - 6fo7TRV0XV6/yvctv3eB0oly0ybcuBZmARkokKAy47Sl6EhO16t2/mpp3CYx7+5A - +Rx5Mupwtziwa6IXfPa3Ml+e99HMXv8CBSbEP6NiMvxNJPXbncjdWI6T+EWYnB9O - n7HcKweXpk3sPBPvm7Tyq0n3Q7+3rM5JaPr6O4+ksVG/TepqHoYF+KG9AdJyzv6v - 69HeBsvyBfQ1BhX8vqAKJDxbML0eYzDWZ6tDqMuJ/CdqTGJ97d0YISTJ2eRTmXiP - jm1g0H4p0FUGuDItBX7q+mkfWkTBgl1fR0ovP0YaWvk= - -----END CERTIFICATE----- diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index be28e865..995c1db9 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -25,10 +25,11 @@ pki: type: super keysPath: /app/keys mode: - role: subroot - attestationServiceSource: - type: swarm - mode: init - storage: + role: swarm + swarmMode: init + storage: storageType: file storageFolder: 
/app/swarm-storage + networkSettings: + networkType: trusted + diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 5216ea56..4ef54142 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -47,7 +47,6 @@ def log(level: LogLevel, message: str): class VMMode(Enum): """VM mode types.""" - LEGACY = "legacy" SWARM_INIT = "swarm-init" SWARM_NORMAL = "swarm-normal" @@ -167,7 +166,9 @@ def create( log(LogLevel.ERROR, f"Failed to create container: {error}") return False - def is_service_healthy(self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck") -> bool: + def is_service_healthy( + self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck" + ) -> bool: """Check if service inside container is running and healthy.""" try: # 1. Check service status inside container @@ -278,8 +279,6 @@ def detect_vm_mode() -> VMMode: with open("/proc/cmdline", "r", encoding="utf-8") as file: cmdline = file.read() - if "vm_mode=legacy" in cmdline: - return VMMode.LEGACY if "vm_mode=swarm-init" in cmdline: return VMMode.SWARM_INIT return VMMode.SWARM_NORMAL @@ -287,15 +286,35 @@ def detect_vm_mode() -> VMMode: return VMMode.SWARM_NORMAL -def get_pki_domain() -> str: - """Read PKI authority domain from swarm-env.yaml. +def detect_network_type() -> str: + """Detect network type from kernel command line. + + Returns: + 'untrusted' if allow_untrusted=true is present in cmdline, otherwise 'trusted'. + """ + try: + with open("/proc/cmdline", "r", encoding="utf-8") as file: + cmdline = file.read() + + if "allow_untrusted=true" in cmdline: + return "untrusted" + return "trusted" + except FileNotFoundError: + return "trusted" + + +def get_pki_authority_param(param_name: str) -> str: + """Read PKI authority parameter from swarm-env.yaml. + + Args: + param_name: Name of the parameter under pki-authority section. Returns: - Domain string. + Parameter value as string. Raises: FileNotFoundError: If swarm-env.yaml does not exist. - ValueError: If configuration is empty or domain is not found. + ValueError: If configuration is empty or parameter is not found. Exception: For other errors during reading. 
""" swarm_env_path = Path(SWARM_ENV_YAML) @@ -314,37 +333,39 @@ def get_pki_domain() -> str: log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) - domain = config.get("pki-authority", {}).get("domain") - if not domain: - error_msg = f"No domain found in {SWARM_ENV_YAML} under pki-authority.domain" + param_value = config.get("pki-authority", {}).get(param_name) + if not param_value: + error_msg = ( + f"No {param_name} found in {SWARM_ENV_YAML} " + f"under pki-authority.{param_name}" + ) log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) - log(LogLevel.INFO, f"Read PKI domain from config: {domain}") - return domain + log(LogLevel.INFO, f"Read {param_name} from config: {param_value}") + return param_value except (FileNotFoundError, ValueError): raise except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to read domain from {SWARM_ENV_YAML}: {error}" + error_msg = f"Failed to read {param_name} from {SWARM_ENV_YAML}: {error}" log(LogLevel.ERROR, error_msg) raise Exception(error_msg) from error -def patch_yaml_config(cpu_type: str, vm_mode: VMMode, pki_domain: str): +def patch_yaml_config( + cpu_type: str, + vm_mode: VMMode, + pki_domain: str, + network_type: str, + network_key_hash: str +): """Set own challenge type in LXC container configuration.""" - if vm_mode == VMMode.LEGACY: - template_name = "lxc-legacy-vm-template.yaml" - log( - LogLevel.INFO, - f"Detected {vm_mode.value} mode, using legacy template" - ) - else: - template_name = "lxc-swarm-template.yaml" - log( - LogLevel.INFO, - f"Detected {vm_mode.value} mode, using swarm template" - ) + template_name = "lxc-swarm-template.yaml" + log( + LogLevel.INFO, + f"Detected {vm_mode.value} mode, using swarm template" + ) src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") @@ -382,16 +403,25 @@ def patch_yaml_config(cpu_type: str, vm_mode: VMMode, pki_domain: str): config["pki"]["ownDomain"] = pki_domain log(LogLevel.INFO, f"Set ownDomain to: {pki_domain}") - # Set mode.attestationServiceSource.mode for swarm modes - if vm_mode in (VMMode.SWARM_INIT, VMMode.SWARM_NORMAL): - if "mode" not in config["pki"]: - config["pki"]["mode"] = {} - if "attestationServiceSource" not in config["pki"]["mode"]: - config["pki"]["mode"]["attestationServiceSource"] = {} + # Set mode.swarmMode + if "mode" not in config["pki"]: + config["pki"]["mode"] = {} - mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" - config["pki"]["mode"]["attestationServiceSource"]["mode"] = mode_value - log(LogLevel.INFO, f"Set attestationServiceSource mode to: {mode_value}") + mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" + config["pki"]["mode"]["swarmMode"] = mode_value + log(LogLevel.INFO, f"Set swarmMode to: {mode_value}") + + # Set networkSettings + if network_type and network_key_hash: + config["pki"]["mode"]["networkSettings"] = { + "networkType": network_type, + "networkKeyHashHex": network_key_hash + } + log( + LogLevel.INFO, + f"Set networkSettings: networkType={network_type}, " + f"networkKeyHashHex={network_key_hash}" + ) # Ensure destination directory exists dst_yaml.parent.mkdir(parents=True, exist_ok=True) @@ -765,5 +795,3 @@ def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: if content: return (True, content) return (False, b"") - - diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index ba5ecc17..3b3aeb46 100755 --- 
a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -17,6 +17,7 @@ delete_iptables_rules, detect_cpu_type, detect_vm_mode, + detect_network_type, patch_yaml_config, patch_lxc_config, setup_iptables, @@ -30,7 +31,7 @@ read_property_from_fs, LogLevel, log, - get_pki_domain, + get_pki_authority_param, ) # Configuration @@ -48,6 +49,8 @@ class EventHandler: ] PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" PROP_PKI_DOMAIN = f"{AUTHORITY_SERVICE_PREFIX}pki_domain" + PROP_NETWORK_KEY_HASH = f"{AUTHORITY_SERVICE_PREFIX}network_key_hash" + PROP_NETWORK_TYPE = f"{AUTHORITY_SERVICE_PREFIX}network_type" def __init__(self, input_data: PluginInput): self.input_data = input_data @@ -63,13 +66,15 @@ def __init__(self, input_data: PluginInput): self.authority_config = {prop["name"]: prop["value"] for prop in self.authority_props} self.pki_domain = self.authority_config.get(self.PROP_PKI_DOMAIN, "") + self.network_key_hash = self.authority_config.get(self.PROP_NETWORK_KEY_HASH, "") + self.network_type = self.authority_config.get(self.PROP_NETWORK_TYPE, "") # Output parameters self.status = None self.error_message = None self.cluster_properties = {} - def get_redis_tunnel_ips(self) -> list[str]: + def _get_redis_tunnel_ips(self) -> list[str]: """Get list of Redis node tunnel IPs.""" redis_node_props = self.state_json.get("redisNodeProperties", []) wg_props = self.state_json.get("wgNodeProperties", []) @@ -84,15 +89,15 @@ def get_redis_tunnel_ips(self) -> list[str]: return sorted(set(redis_hosts)) - def get_redis_connection_info(self) -> list[tuple[str, int]]: + def _get_redis_connection_info(self) -> list[tuple[str, int]]: """Get Redis cluster connection endpoints. Returns list of (host, port) tuples for Redis nodes. 
""" - redis_tunnel_ips = self.get_redis_tunnel_ips() + redis_tunnel_ips = self._get_redis_tunnel_ips() return [(ip, 6379) for ip in redis_tunnel_ips] - def create_gateway_endpoints(self): + def _create_gateway_endpoints(self): """Create and update gateway endpoints in Redis.""" if not self.is_leader: return @@ -106,7 +111,7 @@ def create_gateway_endpoints(self): current_endpoints.append(tunnel_ip) # Get Redis connection info - redis_endpoints = self.get_redis_connection_info() + redis_endpoints = self._get_redis_connection_info() if not redis_endpoints and current_endpoints: self.status = "postponed" @@ -206,10 +211,10 @@ def create_gateway_endpoints(self): self.error_message = error_msg log(LogLevel.ERROR, error_msg) - def create_output(self) -> PluginOutput: + def _create_output(self) -> PluginOutput: """Create plugin output based on current status.""" if self.status == "completed": - self.create_gateway_endpoints() + self._create_gateway_endpoints() elif self.status == "postponed": log(LogLevel.INFO, f"Apply postponed: {self.error_message}") elif self.status == "error": @@ -226,161 +231,287 @@ def create_output(self) -> PluginOutput: ) ) - # pylint: disable=too-many-locals,too-many-return-statements - # pylint: disable=too-many-branches,too-many-statements def apply(self) -> PluginOutput: """Apply PKI Authority configuration.""" + # Basic validation if not isinstance(self.state_json, dict): self.status = "error" self.error_message = "Invalid state format" - return self.create_output() + return self._create_output() local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) if not local_tunnel_ip: self.status = "error" self.error_message = "Local node has no WireGuard tunnel IP" - return self.create_output() + return self._create_output() try: vm_mode = detect_vm_mode() - initialized = self.authority_config.get(self.PROP_INITIALIZED) - # If initialized is true, verify all required properties are present - if initialized == "true": - missing = [] + # Route to appropriate handler based on VM mode + if vm_mode == VMMode.SWARM_INIT: + return self._handle_swarm_init(local_tunnel_ip) - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = self.authority_config.get(prop_name, "") + # SWARM_NORMAL + return self._handle_swarm_normal(local_tunnel_ip) - if not prop_value: - missing.append(prop_name) + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Apply failed: {str(error)}" + log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self._create_output() - if not self.pki_domain: - self.pki_domain = get_pki_domain() - missing.append(self.PROP_PKI_DOMAIN) + def _stop_container_if_running(self, container: LXCContainer) -> None: + """Stop container if it's running.""" + if container.is_running(): + log(LogLevel.INFO, "Stopping existing container") + exit_code = container.stop(graceful_timeout=30, command_timeout=60) + if exit_code != 0: + raise Exception(f"Failed to stop container with exit code {exit_code}") + + def _configure_and_start_container( + self, container: LXCContainer, local_tunnel_ip: str, vm_mode: VMMode + ) -> None: + """Configure and start container.""" + cpu_type = detect_cpu_type() + patch_yaml_config( + cpu_type, + vm_mode, + self.pki_domain, + self.network_type, + self.network_key_hash + ) + patch_lxc_config(cpu_type) + update_pccs_url() + setup_iptables(local_tunnel_ip) - if missing: - error_msg = ( - f"Service marked as 
initialized but missing properties: " - f"{', '.join(missing)}" - ) - log(LogLevel.ERROR, error_msg) - initialized = "false" + exit_code = container.start(timeout=30) + if exit_code != 0: + raise Exception(f"Failed to start container with exit code {exit_code}") - if vm_mode == VMMode.SWARM_NORMAL and initialized != "true": - self.status = "postponed" - self.error_message = ( - "Waiting for authority service properties to be initialized" - ) - return self.create_output() + log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} started") - container = LXCContainer(PKI_SERVICE_NAME) + def _check_for_missing_properties(self) -> list[str]: + """Check for missing required properties. - # Start or restart LXC container - if container.is_running(): - if initialized != "true" or self.is_restart_required(): - exit_code = container.stop( - graceful_timeout=30, command_timeout=60 - ) - if exit_code != 0: - raise Exception( - f"Failed to stop container with exit code {exit_code}" - ) + Returns: + List of missing property names (empty if all present) + """ + missing = [] - if container.is_running(): + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + if not self.authority_config.get(prop_name, ""): + missing.append(prop_name) + + if not self.pki_domain: + missing.append(self.PROP_PKI_DOMAIN) + + if not self.network_key_hash: + missing.append(self.PROP_NETWORK_KEY_HASH) + + if not self.network_type: + missing.append(self.PROP_NETWORK_TYPE) + + return missing + + def _wait_for_properties_generation(self) -> PluginOutput: + """Wait for tee-pki service to generate ALL property files.""" + missing_properties = self.AUTHORITY_SERVICE_PROPERTIES.copy() + timeout = 30 + interval = 5 + elapsed = 0 + collected_properties = {} + + while elapsed < timeout: + # Try to read each missing property + for prop in missing_properties[:]: + success, value = read_property_from_fs(prop) + + if success: + prop_key = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + collected_properties[prop_key] = base64.b64encode(value).decode() + missing_properties.remove(prop) + + # Check if ALL properties collected + if not missing_properties: log( LogLevel.INFO, - f"Container {PKI_SERVICE_NAME} is already running, " - f"no restart required" + "All property files have been generated by tee-pki service" ) + # Set initialized flag ONLY when all properties are ready + collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain + collected_properties[self.PROP_NETWORK_KEY_HASH] = self.network_key_hash + collected_properties[self.PROP_NETWORK_TYPE] = self.network_type + collected_properties[self.PROP_INITIALIZED] = "true" + self.status = "completed" - return self.create_output() - - cpu_type = detect_cpu_type() - if not self.pki_domain: - self.pki_domain = get_pki_domain() - patch_yaml_config(cpu_type, vm_mode, self.pki_domain) - patch_lxc_config(cpu_type) - update_pccs_url() - setup_iptables(local_tunnel_ip) - - if initialized == "true": - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = self.authority_config.get(prop_name, "") - save_property_into_fs(prop, base64.b64decode(prop_value)) - - exit_code = container.start(timeout=30) - if exit_code != 0: - raise Exception( - f"Failed to start container with exit code {exit_code}" - ) + self.cluster_properties = collected_properties + return self._create_output() - log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} is running") - - # If not initialized, wait for tee-pki service to 
generate property files - if initialized != "true": - missing_properties = self.AUTHORITY_SERVICE_PROPERTIES.copy() - timeout = 30 - interval = 5 - elapsed = 0 - collected_properties = {} - - while elapsed < timeout: - # Try to read each missing property - for prop in missing_properties[:]: - success, value = read_property_from_fs(prop) - - if success: - prop_key = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - collected_properties[prop_key] = \ - base64.b64encode(value).decode() - missing_properties.remove(prop) - - # Check if all properties collected - if not missing_properties: - log( - LogLevel.INFO, - "All property files have been generated " - "by tee-pki service" - ) - collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain - collected_properties[self.PROP_INITIALIZED] = "true" + log( + LogLevel.INFO, + f"Waiting for property files. Missing: " + f"{', '.join(missing_properties)} (elapsed: {elapsed}s)" + ) - self.status = "completed" - self.cluster_properties = collected_properties - return self.create_output() + time.sleep(interval) + elapsed += interval - # Show what's still missing - log( - LogLevel.INFO, - f"Waiting for property files. Missing: " - f"{', '.join(missing_properties)} (elapsed: {elapsed}s)" - ) + # Timeout - NOT all properties collected, do NOT set initialized flag + self.status = "postponed" + self.error_message = ( + f"Timeout waiting for tee-pki to generate property files: " + f"{', '.join(missing_properties)}" + ) + return self._create_output() - time.sleep(interval) - elapsed += interval + def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: + """Handle swarm-init mode: read external sources and initialize properties.""" + # Step 1: Get pki_domain from external source (file) + if not self.pki_domain: + try: + self.pki_domain = get_pki_authority_param("domain") + log(LogLevel.INFO, f"Read PKI domain from external source: {self.pki_domain}") + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to get PKI domain from external source: {error}" + log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self._create_output() + + # Get network_key_hash from external source (file) + if not self.network_key_hash: + try: + self.network_key_hash = get_pki_authority_param("networkKeyHashHex") + log( + LogLevel.INFO, + f"Read network key hash from external source: {self.network_key_hash}" + ) + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to get network key hash from external source: {error}" + log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self._create_output() + + # Get network_type from kernel cmdline + if not self.network_type: + self.network_type = detect_network_type() + log(LogLevel.INFO, f"Detected network type: {self.network_type}") - # Timeout reached - self.status = "postponed" - self.error_message = ( - f"Timeout waiting for tee-pki to generate property files: " - f"{', '.join(missing_properties)}" + container = LXCContainer(PKI_SERVICE_NAME) + initialized = self.authority_config.get(self.PROP_INITIALIZED) + + # Step 2: Check initialized flag + if initialized == "true": + # Step 3: Verify ALL required properties are present + missing = self._check_for_missing_properties() + + # Step 4: If ANY property is missing - ERROR + if missing: + error_msg = ( + f"Service marked as initialized but missing required properties: " + f"{', '.join(missing)}" ) - return self.create_output() 
+ log(LogLevel.ERROR, error_msg) + self.status = "error" + self.error_message = error_msg + return self._create_output() + + # Step 5: Compare DB properties with FS (is_restart_required) + # Step 6: If mismatch - restart container and restore properties + if container.is_running() and not self._is_restart_required(): + # Everything matches, container running, nothing to do + log(LogLevel.INFO, "Container running, no changes detected") + self.status = "completed" + return self._create_output() + # Need to restart or start container + if container.is_running(): + log(LogLevel.INFO, "Configuration changed, restarting container") + self._stop_container_if_running(container) + + # Restore properties from DB to filesystem + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = self.authority_config.get(prop_name, "") + save_property_into_fs(prop, base64.b64decode(prop_value)) + + # Start container + self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) self.status = "completed" - return self.create_output() + return self._create_output() - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Apply failed: {str(error)}" + # Step 7: Not initialized - restart container and wait for properties generation + log(LogLevel.INFO, "Service not initialized, starting initialization process") + + # Restart container if running + if container.is_running(): + log(LogLevel.INFO, "Stopping container for initialization") + self._stop_container_if_running(container) + + # Start container + self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) + + # Wait for properties generation + return self._wait_for_properties_generation() + + def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: + """Handle swarm-normal mode: read ONLY from properties (DB), no external sources.""" + initialized = self.authority_config.get(self.PROP_INITIALIZED) + + # If not initialized - wait for swarm-init to complete + if initialized != "true": + self.status = "postponed" + self.error_message = "Waiting for authority service properties to be initialized" + return self._create_output() + + # Initialized - verify ALL required properties are present + missing = self._check_for_missing_properties() + + # If ANY property is missing - ERROR (should never happen if initialized=true) + if missing: + error_msg = ( + f"Service marked as initialized but missing required properties: " + f"{', '.join(missing)}" + ) log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self.create_output() + return self._create_output() + + # All properties present - manage container + container = LXCContainer(PKI_SERVICE_NAME) + + # Check if restart is needed + if container.is_running(): + if self._is_restart_required(): + log(LogLevel.INFO, "Configuration changed, restarting container") + self._stop_container_if_running(container) + else: + log( + LogLevel.INFO, + f"Container {PKI_SERVICE_NAME} is already running, " + f"no restart required" + ) + self.status = "completed" + return self._create_output() + + # Restore properties to filesystem before starting container + for prop in self.AUTHORITY_SERVICE_PROPERTIES: + prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" + prop_value = self.authority_config.get(prop_name, "") + save_property_into_fs(prop, base64.b64decode(prop_value)) + + # Configure and start container + self._configure_and_start_container(container, 
local_tunnel_ip, VMMode.SWARM_NORMAL) + + self.status = "completed" + return self._create_output() - def is_restart_required(self) -> bool: + def _is_restart_required(self) -> bool: """Check if container restart is required based on config changes.""" for prop in self.AUTHORITY_SERVICE_PROPERTIES: prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" @@ -411,13 +542,13 @@ def is_restart_required(self) -> bool: log(LogLevel.INFO, "No configuration changes detected") return False - def delete_route_from_redis(self) -> None: + def _delete_route_from_redis(self) -> None: """Delete the PKI Authority route from Redis Cluster. Raises: Exception: If deletion fails """ - redis_endpoints = self.get_redis_connection_info() + redis_endpoints = self._get_redis_connection_info() if not redis_endpoints: log(LogLevel.WARN, "No Redis endpoints available, skipping route deletion") @@ -460,7 +591,7 @@ def destroy(self) -> PluginOutput: LogLevel.INFO, "This is the last PKI Authority node, deleting route from Redis" ) - self.delete_route_from_redis() + self._delete_route_from_redis() log(LogLevel.INFO, "PKI Authority destroyed") return PluginOutput( From 3c6b1a276f9b4a97cb653f36e3533f3f967704db Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 21 Jan 2026 13:42:06 -0600 Subject: [PATCH 31/51] pki_node_ready draft --- .../pki-service/lxc-swarm-template.yaml | 2 +- src/services/apps/pki-authority/helpers.py | 112 ++++------- src/services/apps/pki-authority/main.py | 177 ++++++++++++------ src/services/apps/pki-authority/manifest.yaml | 10 + 4 files changed, 161 insertions(+), 140 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml index 995c1db9..37068303 100644 --- a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml @@ -6,7 +6,7 @@ pki: - token - tdx - sev-snp - allowedSubRootChallenges: + validateParamRules: - type: tdx signatureVerification: github - type: sev-snp diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index 4ef54142..e1a91d80 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -7,10 +7,8 @@ import re import secrets import shutil -import ssl import subprocess import sys -import time import urllib.request from datetime import datetime from enum import Enum @@ -166,12 +164,10 @@ def create( log(LogLevel.ERROR, f"Failed to create container: {error}") return False - def is_service_healthy( - self, min_uptime: int = 120, healthcheck_url: str = "/healthcheck" - ) -> bool: + def is_service_healthy(self, healthcheck_url: str = "/healthcheck") -> bool: """Check if service inside container is running and healthy.""" try: - # 1. 
Check service status inside container + # Check service status inside container result = subprocess.run( [ "lxc-attach", "-n", self.container_name, "--", @@ -183,81 +179,31 @@ def is_service_healthy( ) status = result.stdout.strip() - if status not in ["active", "activating"]: + if status != "active": log(LogLevel.INFO, f"Service {SERVICE_INSIDE_CONTAINER} status: {status}") return False - # If service is active, check how long it's been running - if status == "active": - result = subprocess.run( - [ - "lxc-attach", "-n", self.container_name, "--", - "systemctl", "show", - SERVICE_INSIDE_CONTAINER, - "--property=ActiveEnterTimestamp" - ], - capture_output=True, - text=True, - check=False - ) + # Service is active, check healthcheck endpoint + container_ip = self.get_ip() + if not container_ip: + log(LogLevel.INFO, "Could not get container IP") + return False - # Parse ActiveEnterTimestamp - for line in result.stdout.split('\n'): - if line.startswith('ActiveEnterTimestamp='): - timestamp_str = line.split('=', 1)[1].strip() - if timestamp_str and timestamp_str != '0': - try: - # Get timestamp in seconds since epoch - ts_result = subprocess.run( - ["date", "+%s", "-d", timestamp_str], - capture_output=True, - text=True, - check=False - ) - start_time = int(ts_result.stdout.strip()) - current_time = int(time.time()) - uptime_seconds = current_time - start_time - - # If running more than min_uptime, check healthcheck endpoint - if uptime_seconds > min_uptime: - container_ip = self.get_ip() - - if container_ip: - # Perform HTTPS healthcheck without certificate verification - try: - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - req = urllib.request.Request( - f"https://{container_ip}{healthcheck_url}" - ) - with urllib.request.urlopen( - req, context=ctx, timeout=5 - ) as response: - if response.status == 200: - return True - - log( - LogLevel.INFO, - f"Healthcheck returned status: " - f"{response.status}" - ) - return False - except Exception as error: # pylint: disable=broad-exception-caught - log( - LogLevel.INFO, - f"Healthcheck failed: {error}" - ) - return False - except Exception as error: # pylint: disable=broad-exception-caught - log( - LogLevel.INFO, - f"Failed to parse service uptime: {error}" - ) - - # Service is active or activating (but not ready for healthcheck yet) - return True + # Perform HTTP healthcheck + try: + req = urllib.request.Request(f"http://{container_ip}{healthcheck_url}") + with urllib.request.urlopen(req, timeout=5) as response: + if response.status == 200: + return True + + log( + LogLevel.INFO, + f"Healthcheck returned status: {response.status}" + ) + return False + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.INFO, f"Healthcheck failed: {error}") + return False except Exception as error: # pylint: disable=broad-exception-caught log(LogLevel.ERROR, f"Failed to check service health: {error}") @@ -387,11 +333,21 @@ def patch_yaml_config( # For untrusted, generate random deviceIdHex (32 bytes) if cpu_type == "untrusted": + # Check if untrusted CPU type is running in trusted network + if network_type != "untrusted": + error_msg = ( + "Cannot run untrusted machine in trusted network. 
" + f"CPU type: {cpu_type}, Network type: {network_type}" + ) + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + device_id_hex = secrets.token_hex(32) config["pki"]["ownChallenge"]["deviceIdHex"] = device_id_hex log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") - # Add 'untrusted' to allowedChallenges if not present + # Add 'untrusted' to allowedChallenges if network type is untrusted + if network_type == "untrusted": if "allowedChallenges" not in config["pki"]: config["pki"]["allowedChallenges"] = [] if "untrusted" not in config["pki"]["allowedChallenges"]: diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 3b3aeb46..377973c6 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -69,10 +69,19 @@ def __init__(self, input_data: PluginInput): self.network_key_hash = self.authority_config.get(self.PROP_NETWORK_KEY_HASH, "") self.network_type = self.authority_config.get(self.PROP_NETWORK_TYPE, "") + # Read current pki_node_ready value + pki_node_props = self.state_json.get("pkiNodeProperties", []) + self.current_pki_node_ready = None + for prop in pki_node_props: + if prop.get("node_id") == self.local_node_id and prop.get("name") == "pki_node_ready": + self.current_pki_node_ready = prop.get("value") + break + # Output parameters self.status = None self.error_message = None self.cluster_properties = {} + self.node_properties = {} def _get_redis_tunnel_ips(self) -> list[str]: """Get list of Redis node tunnel IPs.""" @@ -99,16 +108,16 @@ def _get_redis_connection_info(self) -> list[tuple[str, int]]: def _create_gateway_endpoints(self): """Create and update gateway endpoints in Redis.""" - if not self.is_leader: - return - - # Get current endpoints from cluster nodes + # Get current endpoints from nodes with pki_node_ready=true + pki_node_props = self.state_json.get("pkiNodeProperties", []) + current_endpoints = [] - for node in self.pki_cluster_nodes: - node_id = node.get("node_id") - tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) - if tunnel_ip: - current_endpoints.append(tunnel_ip) + for prop in pki_node_props: + if prop.get("name") == "pki_node_ready" and prop.get("value") == "true": + node_id = prop.get("node_id") + tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) + if tunnel_ip: + current_endpoints.append(tunnel_ip) # Get Redis connection info redis_endpoints = self._get_redis_connection_info() @@ -214,7 +223,8 @@ def _create_gateway_endpoints(self): def _create_output(self) -> PluginOutput: """Create plugin output based on current status.""" if self.status == "completed": - self._create_gateway_endpoints() + if self.is_leader: + self._create_gateway_endpoints() elif self.status == "postponed": log(LogLevel.INFO, f"Apply postponed: {self.error_message}") elif self.status == "error": @@ -228,39 +238,39 @@ def _create_output(self) -> PluginOutput: error_message=self.error_message, cluster_properties=( self.cluster_properties if self.status == "completed" else None - ) + ), + node_properties=self.node_properties if self.node_properties else None ) def apply(self) -> PluginOutput: """Apply PKI Authority configuration.""" - # Basic validation - if not isinstance(self.state_json, dict): - self.status = "error" - self.error_message = "Invalid state format" - return self._create_output() + try: + # Basic validation + if not isinstance(self.state_json, dict): + self.status = "error" + self.error_message = "Invalid state format" + return 
self._create_output() - local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) - if not local_tunnel_ip: - self.status = "error" - self.error_message = "Local node has no WireGuard tunnel IP" - return self._create_output() + local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) + if not local_tunnel_ip: + self.status = "postponed" + self.error_message = "Waiting for WireGuard tunnel IP to be configured" + return self._create_output() - try: vm_mode = detect_vm_mode() - # Route to appropriate handler based on VM mode if vm_mode == VMMode.SWARM_INIT: - return self._handle_swarm_init(local_tunnel_ip) - - # SWARM_NORMAL - return self._handle_swarm_normal(local_tunnel_ip) + self._handle_swarm_init(local_tunnel_ip) + else: + self._handle_swarm_normal(local_tunnel_ip) except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Apply failed: {str(error)}" log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self._create_output() + + return self._create_output() def _stop_container_if_running(self, container: LXCContainer) -> None: """Stop container if it's running.""" @@ -290,7 +300,11 @@ def _configure_and_start_container( if exit_code != 0: raise Exception(f"Failed to start container with exit code {exit_code}") - log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} started") + is_healthy, err_msg = self.health(timeout=30, interval=5) + if is_healthy: + log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} started and health check passed") + else: + log(LogLevel.WARN, f"LXC container {PKI_SERVICE_NAME} started but health check failed: {err_msg}") def _check_for_missing_properties(self) -> list[str]: """Check for missing required properties. @@ -316,11 +330,9 @@ def _check_for_missing_properties(self) -> list[str]: return missing - def _wait_for_properties_generation(self) -> PluginOutput: + def _wait_for_properties_generation(self, timeout: int = 30, interval: int = 5) -> None: """Wait for tee-pki service to generate ALL property files.""" missing_properties = self.AUTHORITY_SERVICE_PROPERTIES.copy() - timeout = 30 - interval = 5 elapsed = 0 collected_properties = {} @@ -348,7 +360,7 @@ def _wait_for_properties_generation(self) -> PluginOutput: self.status = "completed" self.cluster_properties = collected_properties - return self._create_output() + return log( LogLevel.INFO, @@ -365,9 +377,8 @@ def _wait_for_properties_generation(self) -> PluginOutput: f"Timeout waiting for tee-pki to generate property files: " f"{', '.join(missing_properties)}" ) - return self._create_output() - def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: + def _handle_swarm_init(self, local_tunnel_ip: str) -> None: """Handle swarm-init mode: read external sources and initialize properties.""" # Step 1: Get pki_domain from external source (file) if not self.pki_domain: @@ -379,7 +390,7 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self._create_output() + return # Get network_key_hash from external source (file) if not self.network_key_hash: @@ -394,7 +405,7 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self._create_output() + return # Get network_type from kernel cmdline if not self.network_type: @@ -418,7 +429,7 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> 
PluginOutput: log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self._create_output() + return # Step 5: Compare DB properties with FS (is_restart_required) # Step 6: If mismatch - restart container and restore properties @@ -426,7 +437,7 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: # Everything matches, container running, nothing to do log(LogLevel.INFO, "Container running, no changes detected") self.status = "completed" - return self._create_output() + return # Need to restart or start container if container.is_running(): @@ -442,7 +453,7 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: # Start container self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) self.status = "completed" - return self._create_output() + return # Step 7: Not initialized - restart container and wait for properties generation log(LogLevel.INFO, "Service not initialized, starting initialization process") @@ -454,11 +465,11 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> PluginOutput: # Start container self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) - # Wait for properties generation - return self._wait_for_properties_generation() + self._wait_for_properties_generation(timeout=30, interval=5) - def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: + + def _handle_swarm_normal(self, local_tunnel_ip: str) -> None: """Handle swarm-normal mode: read ONLY from properties (DB), no external sources.""" initialized = self.authority_config.get(self.PROP_INITIALIZED) @@ -466,7 +477,7 @@ def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: if initialized != "true": self.status = "postponed" self.error_message = "Waiting for authority service properties to be initialized" - return self._create_output() + return # Initialized - verify ALL required properties are present missing = self._check_for_missing_properties() @@ -480,7 +491,7 @@ def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: log(LogLevel.ERROR, error_msg) self.status = "error" self.error_message = error_msg - return self._create_output() + return # All properties present - manage container container = LXCContainer(PKI_SERVICE_NAME) @@ -497,7 +508,7 @@ def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: f"no restart required" ) self.status = "completed" - return self._create_output() + return # Restore properties to filesystem before starting container for prop in self.AUTHORITY_SERVICE_PROPERTIES: @@ -509,7 +520,6 @@ def _handle_swarm_normal(self, local_tunnel_ip: str) -> PluginOutput: self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_NORMAL) self.status = "completed" - return self._create_output() def _is_restart_required(self) -> bool: """Check if container restart is required based on config changes.""" @@ -607,6 +617,52 @@ def destroy(self) -> PluginOutput: status="error", error_message=error_msg, local_state=self.local_state ) + def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: + """Check health of PKI Authority service. + + Args: + timeout: Maximum time to wait for service to become healthy (0 = single check) + interval: Time between health check attempts + + Returns: + Tuple of (is_healthy, error_message). If healthy, error_message is empty string. 
+ """ + is_healthy = False + error_msg = "" + + try: + container = LXCContainer(PKI_SERVICE_NAME) + elapsed = 0 + + while True: + if container.is_running() and container.is_service_healthy(): + is_healthy = True + break + + # If timeout is 0, only check once + if timeout == 0 or elapsed >= timeout: + error_msg = "PKI service is not healthy or container is not running" + break + + # Wait before next attempt + time.sleep(interval) + elapsed += interval + + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Health check failed: {str(error)}" + log(LogLevel.ERROR, error_msg) + + # Compare current pki_node_ready with new health status + current_healthy_status = "true" if is_healthy else "false" + if self.current_pki_node_ready != current_healthy_status: + log( + LogLevel.INFO, + f"PKI node ready status changed: {self.current_pki_node_ready} -> {current_healthy_status}" + ) + self.node_properties["pki_node_ready"] = current_healthy_status + + return (is_healthy, error_msg) + # Plugin commands @plugin.command("init") @@ -635,23 +691,22 @@ def handle_apply(input_data: PluginInput) -> PluginOutput: @plugin.command("health") def handle_health(input_data: PluginInput) -> PluginOutput: """Check health of PKI Authority service.""" - local_state = input_data.local_state or {} - - try: - container = LXCContainer(PKI_SERVICE_NAME) - - if container.is_running() and container.is_service_healthy(): - return PluginOutput(status="completed", local_state=local_state) + handler = EventHandler(input_data) + is_healthy, error_msg = handler.health() + if is_healthy: return PluginOutput( - status="error", - error_message="PKI service is not healthy or container is not running", - local_state=local_state + status="completed", + local_state=input_data.local_state, + node_properties=handler.node_properties if handler.node_properties else None ) - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Health check failed: {str(error)}" - log(LogLevel.ERROR, error_msg) - return PluginOutput(status="error", error_message=error_msg, local_state=local_state) + + return PluginOutput( + status="error", + error_message=error_msg, + local_state=input_data.local_state, + node_properties=handler.node_properties if handler.node_properties else None + ) @plugin.command("finalize") diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml index e3501f36..44eb121b 100644 --- a/src/services/apps/pki-authority/manifest.yaml +++ b/src/services/apps/pki-authority/manifest.yaml @@ -53,6 +53,16 @@ stateExpr: {id, node_id: .node, cluster} ] | sort_by(.id, .node_id, .cluster), + pkiNodeProperties: [ + $swarmdb.clusternodeproperties[] | + select( + (.cluster_node | startswith($cluster.id)) and + .deleted_ts == null and + .name == "pki_node_ready" + ) | + {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} + ] | sort_by(.cluster_node, .name, .value, .node_id), + redisCluster: { id: $redisCluster.id }, From 867b02eb5cd42d67fc10d89f197faa81bdfa3c3d Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 22 Jan 2026 05:32:34 -0600 Subject: [PATCH 32/51] one more improvements --- src/services/apps/pki-authority/helpers.py | 40 ++++ src/services/apps/pki-authority/main.py | 209 ++++++++++----------- 2 files changed, 138 insertions(+), 111 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index e1a91d80..fa351a83 
100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -249,6 +249,46 @@ def detect_network_type() -> str: return "trusted" +def read_yaml_config_param(param_path: str) -> Optional[str]: + """Read parameter from container's yaml configuration. + + Args: + param_path: Dot-separated path to parameter (e.g., 'pki.ownDomain'). + + Returns: + Parameter value as string, or None if not found or error. + """ + yaml_config_path = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + + if not yaml_config_path.exists(): + log(LogLevel.DEBUG, f"YAML config not found: {yaml_config_path}") + return None + + try: + with open(yaml_config_path, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + if not config: + log(LogLevel.DEBUG, f"Empty YAML config: {yaml_config_path}") + return None + + # Navigate through nested dictionary using dot-separated path + value = config + for key in param_path.split('.'): + if isinstance(value, dict): + value = value.get(key) + if value is None: + return None + else: + return None + + return str(value) if value is not None else None + + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.DEBUG, f"Failed to read {param_path} from YAML config: {error}") + return None + + def get_pki_authority_param(param_name: str) -> str: """Read PKI authority parameter from swarm-env.yaml. diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 377973c6..7f86197a 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -32,6 +32,7 @@ LogLevel, log, get_pki_authority_param, + read_yaml_config_param, ) # Configuration @@ -68,6 +69,8 @@ def __init__(self, input_data: PluginInput): self.pki_domain = self.authority_config.get(self.PROP_PKI_DOMAIN, "") self.network_key_hash = self.authority_config.get(self.PROP_NETWORK_KEY_HASH, "") self.network_type = self.authority_config.get(self.PROP_NETWORK_TYPE, "") + self.initialized = self.authority_config.get(self.PROP_INITIALIZED, "") + self.vm_mode = detect_vm_mode() # Read current pki_node_ready value pki_node_props = self.state_json.get("pkiNodeProperties", []) @@ -140,24 +143,17 @@ def _create_gateway_endpoints(self): # Read current route from Redis registered_endpoints = [] - try: - existing_route = redis_client.get(route_key) - if existing_route: - route_data = json.loads(existing_route) - # Extract IPs from targets URLs - for target in route_data.get("targets", []): - url = target.get("url", "") - # Parse https://IP:PORT format - if "://" in url: - ip_port = url.split("://")[1] - ip = ip_port.split(":")[0] - registered_endpoints.append(ip) - except Exception as error: # pylint: disable=broad-exception-caught - log( - LogLevel.WARN, - f"Failed to read existing route from Redis, treating as empty: {error}" - ) - registered_endpoints = [] + existing_route = redis_client.get(route_key) + if existing_route: + route_data = json.loads(existing_route) + # Extract IPs from targets URLs + for target in route_data.get("targets", []): + url = target.get("url", "") + # Parse https://IP:PORT format + if "://" in url: + ip_port = url.split("://")[1] + ip = ip_port.split(":")[0] + registered_endpoints.append(ip) # Compare endpoints regardless of order if set(registered_endpoints) == set(current_endpoints): @@ -232,13 +228,16 @@ def _create_output(self) -> PluginOutput: else: log(LogLevel.ERROR, f"Apply ended with unknown status {self.status}") + # Determine if 
cluster_properties should be included
+        should_update_cluster = (
+            self.vm_mode == VMMode.SWARM_INIT or self.is_leader
+        ) and self.status in ("completed", "postponed") and self.cluster_properties
+
         return PluginOutput(
             status=self.status,
             local_state=self.local_state if self.status == "completed" else None,
             error_message=self.error_message,
-            cluster_properties=(
-                self.cluster_properties if self.status == "completed" else None
-            ),
+            cluster_properties=self.cluster_properties if should_update_cluster else None,
             node_properties=self.node_properties if self.node_properties else None
         )
@@ -257,9 +256,7 @@ def apply(self) -> PluginOutput:
             self.error_message = "Waiting for WireGuard tunnel IP to be configured"
             return self._create_output()
 
-            vm_mode = detect_vm_mode()
-
-            if vm_mode == VMMode.SWARM_INIT:
+            if self.vm_mode == VMMode.SWARM_INIT:
                 self._handle_swarm_init(local_tunnel_ip)
             else:
                 self._handle_swarm_normal(local_tunnel_ip)
@@ -280,14 +277,54 @@ def _stop_container_if_running(self, container: LXCContainer) -> None:
         if exit_code != 0:
             raise Exception(f"Failed to stop container with exit code {exit_code}")
 
+    def _ensure_container_running(self, container: LXCContainer, local_tunnel_ip: str) -> None:
+        """Ensure container is running with correct configuration.
+
+        Checks for missing properties, restart requirements, restores properties and starts container.
+        Sets self.status to 'completed' or 'error' and self.error_message on error.
+        """
+        # Verify ALL required properties are present
+        missing = self._check_for_missing_properties()
+
+        if missing:
+            error_msg = (
+                f"Service marked as initialized but missing required properties: "
+                f"{', '.join(missing)}"
+            )
+            log(LogLevel.ERROR, error_msg)
+            self.status = "error"
+            self.error_message = error_msg
+            return
+
+        # Check if restart is needed
+        if container.is_running():
+            if self._is_restart_required():
+                log(LogLevel.INFO, "Configuration changed, restarting container")
+                self._stop_container_if_running(container)
+            else:
+                # Everything matches, container running, nothing to do
+                log(LogLevel.INFO, "Container running, no changes detected")
+                self.status = "completed"
+                return
+
+        # Restore properties from DB to filesystem
+        for prop in self.AUTHORITY_SERVICE_PROPERTIES:
+            prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}"
+            prop_value = self.authority_config.get(prop_name, "")
+            save_property_into_fs(prop, base64.b64decode(prop_value))
+
+        # Start container
+        self._configure_and_start_container(container, local_tunnel_ip)
+        self.status = "completed"
+
     def _configure_and_start_container(
-        self, container: LXCContainer, local_tunnel_ip: str, vm_mode: VMMode
+        self, container: LXCContainer, local_tunnel_ip: str
     ) -> None:
         """Configure and start container."""
         cpu_type = detect_cpu_type()
         patch_yaml_config(
             cpu_type,
-            vm_mode,
+            self.vm_mode,
             self.pki_domain,
             self.network_type,
             self.network_key_hash
@@ -357,6 +394,7 @@ def _wait_for_properties_generation(self, timeout: int = 30, interval: int = 5)
                 collected_properties[self.PROP_NETWORK_KEY_HASH] = self.network_key_hash
                 collected_properties[self.PROP_NETWORK_TYPE] = self.network_type
                 collected_properties[self.PROP_INITIALIZED] = "true"
+                self.initialized = "true"
 
                 self.status = "completed"
                 self.cluster_properties = collected_properties
@@ -380,7 +418,7 @@ def _wait_for_properties_generation(self, timeout: int = 30, interval: int = 5)
 
     def _handle_swarm_init(self, local_tunnel_ip: str) -> None:
         """Handle swarm-init mode: read external sources and initialize properties."""
-        # Step 1: Get
pki_domain from external source (file) + # Get pki_domain from external source (file) if not self.pki_domain: try: self.pki_domain = get_pki_authority_param("domain") @@ -413,49 +451,13 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> None: log(LogLevel.INFO, f"Detected network type: {self.network_type}") container = LXCContainer(PKI_SERVICE_NAME) - initialized = self.authority_config.get(self.PROP_INITIALIZED) - - # Step 2: Check initialized flag - if initialized == "true": - # Step 3: Verify ALL required properties are present - missing = self._check_for_missing_properties() - - # Step 4: If ANY property is missing - ERROR - if missing: - error_msg = ( - f"Service marked as initialized but missing required properties: " - f"{', '.join(missing)}" - ) - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return - - # Step 5: Compare DB properties with FS (is_restart_required) - # Step 6: If mismatch - restart container and restore properties - if container.is_running() and not self._is_restart_required(): - # Everything matches, container running, nothing to do - log(LogLevel.INFO, "Container running, no changes detected") - self.status = "completed" - return - - # Need to restart or start container - if container.is_running(): - log(LogLevel.INFO, "Configuration changed, restarting container") - self._stop_container_if_running(container) - - # Restore properties from DB to filesystem - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = self.authority_config.get(prop_name, "") - save_property_into_fs(prop, base64.b64decode(prop_value)) - # Start container - self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) - self.status = "completed" + if self.initialized == "true": + # Use common logic for ensuring container is running + self._ensure_container_running(container, local_tunnel_ip) return - # Step 7: Not initialized - restart container and wait for properties generation + # Not initialized - restart container and wait for properties generation log(LogLevel.INFO, "Service not initialized, starting initialization process") # Restart container if running @@ -464,65 +466,28 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> None: self._stop_container_if_running(container) # Start container - self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_INIT) + self._configure_and_start_container(container, local_tunnel_ip) # Wait for properties generation self._wait_for_properties_generation(timeout=30, interval=5) def _handle_swarm_normal(self, local_tunnel_ip: str) -> None: """Handle swarm-normal mode: read ONLY from properties (DB), no external sources.""" - initialized = self.authority_config.get(self.PROP_INITIALIZED) - # If not initialized - wait for swarm-init to complete - if initialized != "true": + if self.initialized != "true": self.status = "postponed" self.error_message = "Waiting for authority service properties to be initialized" return - # Initialized - verify ALL required properties are present - missing = self._check_for_missing_properties() - - # If ANY property is missing - ERROR (should never happen if initialized=true) - if missing: - error_msg = ( - f"Service marked as initialized but missing required properties: " - f"{', '.join(missing)}" - ) - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return - # All properties present - manage container container = 
LXCContainer(PKI_SERVICE_NAME) - # Check if restart is needed - if container.is_running(): - if self._is_restart_required(): - log(LogLevel.INFO, "Configuration changed, restarting container") - self._stop_container_if_running(container) - else: - log( - LogLevel.INFO, - f"Container {PKI_SERVICE_NAME} is already running, " - f"no restart required" - ) - self.status = "completed" - return - - # Restore properties to filesystem before starting container - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = self.authority_config.get(prop_name, "") - save_property_into_fs(prop, base64.b64decode(prop_value)) - - # Configure and start container - self._configure_and_start_container(container, local_tunnel_ip, VMMode.SWARM_NORMAL) - - self.status = "completed" + # Use common logic for ensuring container is running + self._ensure_container_running(container, local_tunnel_ip) def _is_restart_required(self) -> bool: """Check if container restart is required based on config changes.""" + # Check file-based properties for prop in self.AUTHORITY_SERVICE_PROPERTIES: prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" config_value = self.authority_config.get(prop_name, "") @@ -548,6 +513,26 @@ def _is_restart_required(self) -> bool: log(LogLevel.ERROR, f"Failed to decode property {prop}: {error}") return True + # Check yaml config parameters + yaml_params = [ + ("pki.ownDomain", self.pki_domain, "domain"), + ("pki.mode.networkSettings.networkType", self.network_type, "network_type"), + ("pki.mode.networkSettings.networkKeyHashHex", self.network_key_hash, "network_key_hash"), + ] + + for yaml_path, expected_value, param_name in yaml_params: + if not expected_value: + continue + + yaml_value = read_yaml_config_param(yaml_path) + + if yaml_value != expected_value: + log( + LogLevel.INFO, + f"Parameter {param_name} changed (yaml: {yaml_value}, expected: {expected_value}), restart required" + ) + return True + # No changes detected log(LogLevel.INFO, "No configuration changes detected") return False @@ -633,15 +618,17 @@ def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: try: container = LXCContainer(PKI_SERVICE_NAME) elapsed = 0 + attempt = 0 while True: + attempt += 1 if container.is_running() and container.is_service_healthy(): is_healthy = True break # If timeout is 0, only check once if timeout == 0 or elapsed >= timeout: - error_msg = "PKI service is not healthy or container is not running" + error_msg = f"PKI service is not healthy or container is not running (attempts: {attempt})" break # Wait before next attempt @@ -649,7 +636,7 @@ def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: elapsed += interval except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Health check failed: {str(error)}" + error_msg = f"Health check failed on attempt {attempt}: {str(error)}" log(LogLevel.ERROR, error_msg) # Compare current pki_node_ready with new health status From 692e9ff0f5d7229cec24ae59537460357f0eed5b Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 22 Jan 2026 08:32:37 -0600 Subject: [PATCH 33/51] use raw networkKey instead of hash --- src/services/apps/pki-authority/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index 7f86197a..cbc24bb7 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -2,6 +2,7 
@@ """PKI Authority service provisioning plugin.""" import base64 +import hashlib import json import sys import time @@ -433,10 +434,11 @@ def _handle_swarm_init(self, local_tunnel_ip: str) -> None: # Get network_key_hash from external source (file) if not self.network_key_hash: try: - self.network_key_hash = get_pki_authority_param("networkKeyHashHex") + network_key = get_pki_authority_param("networkKey") + self.network_key_hash = hashlib.sha256(network_key.encode()).hexdigest() log( LogLevel.INFO, - f"Read network key hash from external source: {self.network_key_hash}" + f"Calculated network key hash from external source: {self.network_key_hash}" ) except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Failed to get network key hash from external source: {error}" From ddf59a564fbca59034a7f8cf6855ae4fc7432968 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 22 Jan 2026 09:34:38 -0600 Subject: [PATCH 34/51] pylint fixes --- src/services/apps/pki-authority/helpers.py | 4 +- src/services/apps/pki-authority/main.py | 67 +++++++++++++++------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py index fa351a83..b8b408c3 100644 --- a/src/services/apps/pki-authority/helpers.py +++ b/src/services/apps/pki-authority/helpers.py @@ -234,7 +234,7 @@ def detect_vm_mode() -> VMMode: def detect_network_type() -> str: """Detect network type from kernel command line. - + Returns: 'untrusted' if allow_untrusted=true is present in cmdline, otherwise 'trusted'. """ @@ -381,7 +381,7 @@ def patch_yaml_config( ) log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) - + device_id_hex = secrets.token_hex(32) config["pki"]["ownChallenge"]["deviceIdHex"] = device_id_hex log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index cbc24bb7..e8f30cf0 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -114,7 +114,7 @@ def _create_gateway_endpoints(self): """Create and update gateway endpoints in Redis.""" # Get current endpoints from nodes with pki_node_ready=true pki_node_props = self.state_json.get("pkiNodeProperties", []) - + current_endpoints = [] for prop in pki_node_props: if prop.get("name") == "pki_node_ready" and prop.get("value") == "true": @@ -280,8 +280,9 @@ def _stop_container_if_running(self, container: LXCContainer) -> None: def _ensure_container_running(self, container: LXCContainer, local_tunnel_ip: str) -> None: """Ensure container is running with correct configuration. - - Checks for missing properties, restart requirements, restores properties and starts container. + + Checks for missing properties, restart requirements, + restores properties and starts container. Sets self.status to 'completed' or 'error' and self.error_message on error. 
""" # Verify ALL required properties are present @@ -340,9 +341,15 @@ def _configure_and_start_container( is_healthy, err_msg = self.health(timeout=30, interval=5) if is_healthy: - log(LogLevel.INFO, f"LXC container {PKI_SERVICE_NAME} started and health check passed") + log( + LogLevel.INFO, + f"LXC container {PKI_SERVICE_NAME} started and health check passed" + ) else: - log(LogLevel.WARN, f"LXC container {PKI_SERVICE_NAME} started but health check failed: {err_msg}") + log( + LogLevel.WARN, + f"LXC container {PKI_SERVICE_NAME} started but health check failed: {err_msg}" + ) def _check_for_missing_properties(self) -> list[str]: """Check for missing required properties. @@ -518,8 +525,16 @@ def _is_restart_required(self) -> bool: # Check yaml config parameters yaml_params = [ ("pki.ownDomain", self.pki_domain, "domain"), - ("pki.mode.networkSettings.networkType", self.network_type, "network_type"), - ("pki.mode.networkSettings.networkKeyHashHex", self.network_key_hash, "network_key_hash"), + ( + "pki.mode.networkSettings.networkType", + self.network_type, + "network_type" + ), + ( + "pki.mode.networkSettings.networkKeyHashHex", + self.network_key_hash, + "network_key_hash" + ), ] for yaml_path, expected_value, param_name in yaml_params: @@ -531,7 +546,8 @@ def _is_restart_required(self) -> bool: if yaml_value != expected_value: log( LogLevel.INFO, - f"Parameter {param_name} changed (yaml: {yaml_value}, expected: {expected_value}), restart required" + f"Parameter {param_name} changed " + f"(yaml: {yaml_value}, expected: {expected_value}), restart required" ) return True @@ -578,7 +594,11 @@ def destroy(self) -> PluginOutput: exit_code = container.destroy() if exit_code != 0: error_msg = f"Failed to destroy container with exit code {exit_code}" - return PluginOutput(status="error", error_message=error_msg, local_state=self.local_state) + return PluginOutput( + status="error", + error_message=error_msg, + local_state=self.local_state + ) delete_iptables_rules() @@ -594,7 +614,9 @@ def destroy(self) -> PluginOutput: return PluginOutput( status="completed", local_state=self.local_state, - cluster_properties=self.cluster_properties if self.cluster_properties else None + cluster_properties=( + self.cluster_properties if self.cluster_properties else None + ) ) except Exception as error: # pylint: disable=broad-exception-caught @@ -606,41 +628,44 @@ def destroy(self) -> PluginOutput: def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: """Check health of PKI Authority service. - + Args: timeout: Maximum time to wait for service to become healthy (0 = single check) interval: Time between health check attempts - + Returns: Tuple of (is_healthy, error_message). If healthy, error_message is empty string. 
""" is_healthy = False error_msg = "" - + try: container = LXCContainer(PKI_SERVICE_NAME) elapsed = 0 attempt = 0 - + while True: attempt += 1 if container.is_running() and container.is_service_healthy(): is_healthy = True break - + # If timeout is 0, only check once if timeout == 0 or elapsed >= timeout: - error_msg = f"PKI service is not healthy or container is not running (attempts: {attempt})" + error_msg = ( + f"PKI service is not healthy or container is not running " + f"(attempts: {attempt})" + ) break - + # Wait before next attempt time.sleep(interval) elapsed += interval - + except Exception as error: # pylint: disable=broad-exception-caught error_msg = f"Health check failed on attempt {attempt}: {str(error)}" log(LogLevel.ERROR, error_msg) - + # Compare current pki_node_ready with new health status current_healthy_status = "true" if is_healthy else "false" if self.current_pki_node_ready != current_healthy_status: @@ -649,7 +674,7 @@ def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: f"PKI node ready status changed: {self.current_pki_node_ready} -> {current_healthy_status}" ) self.node_properties["pki_node_ready"] = current_healthy_status - + return (is_healthy, error_msg) @@ -689,7 +714,7 @@ def handle_health(input_data: PluginInput) -> PluginOutput: local_state=input_data.local_state, node_properties=handler.node_properties if handler.node_properties else None ) - + return PluginOutput( status="error", error_message=error_msg, From 2b5bbcf11ac2e9651ead2e28d0f7c20d343251fe Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 22 Jan 2026 12:30:02 -0600 Subject: [PATCH 35/51] more strict query for pki nodes --- src/services/apps/pki-authority/main.py | 42 ++++++++++++++----- src/services/apps/pki-authority/manifest.yaml | 4 +- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py index e8f30cf0..e8a6299f 100755 --- a/src/services/apps/pki-authority/main.py +++ b/src/services/apps/pki-authority/main.py @@ -90,13 +90,12 @@ def __init__(self, input_data: PluginInput): def _get_redis_tunnel_ips(self) -> list[str]: """Get list of Redis node tunnel IPs.""" redis_node_props = self.state_json.get("redisNodeProperties", []) - wg_props = self.state_json.get("wgNodeProperties", []) redis_hosts = [] for prop in redis_node_props: if prop.get("name") == "redis_node_ready" and prop.get("value") == "true": node_id = prop.get("node_id") - tunnel_ip = get_node_tunnel_ip(node_id, wg_props) + tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) if tunnel_ip: redis_hosts.append(tunnel_ip) @@ -110,23 +109,43 @@ def _get_redis_connection_info(self) -> list[tuple[str, int]]: redis_tunnel_ips = self._get_redis_tunnel_ips() return [(ip, 6379) for ip in redis_tunnel_ips] - def _create_gateway_endpoints(self): - """Create and update gateway endpoints in Redis.""" - # Get current endpoints from nodes with pki_node_ready=true - pki_node_props = self.state_json.get("pkiNodeProperties", []) + def _get_current_endpoints(self) -> list[str]: + """Get list of tunnel IPs for PKI nodes that are ready. + Returns list of tunnel IPs for nodes with pki_node_ready=true. 
+ """ + pki_node_props = self.state_json.get("pkiNodeProperties", []) current_endpoints = [] - for prop in pki_node_props: - if prop.get("name") == "pki_node_ready" and prop.get("value") == "true": - node_id = prop.get("node_id") + + for cluster_node in self.pki_cluster_nodes: + node_id = cluster_node.get("node_id") + if not node_id: + continue + + # Find pki_node_ready property for this node + node_ready = False + for prop in pki_node_props: + if (prop.get("node_id") == node_id and + prop.get("name") == "pki_node_ready" and + prop.get("value") == "true"): + node_ready = True + break + + if node_ready: tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) if tunnel_ip: current_endpoints.append(tunnel_ip) + return current_endpoints + + def _create_gateway_endpoints(self): + """Create and update gateway endpoints in Redis.""" + current_endpoints = self._get_current_endpoints() + # Get Redis connection info redis_endpoints = self._get_redis_connection_info() - if not redis_endpoints and current_endpoints: + if not redis_endpoints: self.status = "postponed" self.error_message = "No Redis nodes available to configure gateway routes" return @@ -671,7 +690,8 @@ def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: if self.current_pki_node_ready != current_healthy_status: log( LogLevel.INFO, - f"PKI node ready status changed: {self.current_pki_node_ready} -> {current_healthy_status}" + f"PKI node ready status changed: " + f"{self.current_pki_node_ready} -> {current_healthy_status}" ) self.node_properties["pki_node_ready"] = current_healthy_status diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml index 44eb121b..4f7a914e 100644 --- a/src/services/apps/pki-authority/manifest.yaml +++ b/src/services/apps/pki-authority/manifest.yaml @@ -60,7 +60,7 @@ stateExpr: .deleted_ts == null and .name == "pki_node_ready" ) | - {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} + {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn and .deleted_ts == null)) | .node} ] | sort_by(.cluster_node, .name, .value, .node_id), redisCluster: { @@ -74,7 +74,7 @@ stateExpr: .deleted_ts == null and .name == "redis_node_ready" ) | - {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} + {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn and .deleted_ts == null)) | .node} ] | sort_by(.cluster_node, .name, .value, .node_id), wgCluster: { From f3ec4f748b951116f1a76db2d5e5979f1d17f360 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Fri, 23 Jan 2026 10:39:40 -0600 Subject: [PATCH 36/51] forgot to commit dockerfile --- src/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index d7e51d76..ed6282de 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -247,9 +247,8 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21069314544 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21253804163 /pki-authority.tar 
"${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" -ADD rootfs/files/configs/pki-service/lxc-legacy-vm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-legacy-vm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" From 469cf6fcc3545bc496367a0847e2b8a7998ae8cb Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 28 Jan 2026 08:16:40 -0600 Subject: [PATCH 37/51] draft --- src/Dockerfile | 12 + .../pki-service/pki-authority-init.service | 12 + .../configs/pki-service/pki-authority.path | 13 + .../configs/pki-service/pki-authority.service | 19 + .../pki-authority.service.d/init.conf | 6 + .../configs/pki-service/pki_configure.py | 71 ++ .../files/configs/pki-service/pki_helpers.py | 793 ++++++++++++++++++ .../files/configs/pki-service/pki_init.py | 29 + src/swarm-scripts/80.setup-pki-authority.sh | 44 - 9 files changed, 955 insertions(+), 44 deletions(-) create mode 100644 src/rootfs/files/configs/pki-service/pki-authority-init.service create mode 100644 src/rootfs/files/configs/pki-service/pki-authority.path create mode 100644 src/rootfs/files/configs/pki-service/pki-authority.service create mode 100644 src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf create mode 100644 src/rootfs/files/configs/pki-service/pki_configure.py create mode 100644 src/rootfs/files/configs/pki-service/pki_helpers.py create mode 100644 src/rootfs/files/configs/pki-service/pki_init.py delete mode 100644 src/swarm-scripts/80.setup-pki-authority.sh diff --git a/src/Dockerfile b/src/Dockerfile index ed6282de..c583f2e1 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -251,6 +251,18 @@ COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-212538041 ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" +RUN mkdir -p "${OUTPUTDIR}/usr/local/bin/pki-authority" +ADD rootfs/files/configs/pki-service/pki_helpers.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_helpers.py" +ADD rootfs/files/configs/pki-service/pki_init.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_init.py" +ADD rootfs/files/configs/pki-service/pki_configure.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_configure.py" +ADD rootfs/files/configs/pki-service/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/pki-authority.service.d/init.conf "${OUTPUTDIR}/etc/systemd/system/pki-authority.service.d/init.conf" +ADD rootfs/files/configs/pki-service/pki-authority.path "${OUTPUTDIR}/etc/systemd/system" +RUN chmod +x "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_init.py" "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_configure.py" +RUN ln -s /etc/systemd/system/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-init.service" +RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" +RUN ln -s 
/etc/systemd/system/pki-authority.path "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.path" ADD rootfs/files/configs/etc/multipath.conf.append /buildroot/files/configs/etc/multipath.conf.append ADD rootfs/files/configs/etc/sysctl.conf.append /buildroot/files/configs/etc/sysctl.conf.append diff --git a/src/rootfs/files/configs/pki-service/pki-authority-init.service b/src/rootfs/files/configs/pki-service/pki-authority-init.service new file mode 100644 index 00000000..b1848a41 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki-authority-init.service @@ -0,0 +1,12 @@ +[Unit] +Description=PKI authority initialization +After=lxc.service lxc-net.service lxc-monitord.service +Requires=lxc.service lxc-net.service lxc-monitord.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/bin/python3 /usr/local/bin/pki-authority/pki_init.py + +[Install] +WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/pki-authority.path b/src/rootfs/files/configs/pki-service/pki-authority.path new file mode 100644 index 00000000..a142b4c1 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki-authority.path @@ -0,0 +1,13 @@ +[Unit] +Description=Wait for Swarm configuration files for PKI authority +ConditionKernelCommandLine=!vm_mode=swarm-init + +[Path] +# Wait for these files to appear before starting pki-authority.service +PathExists=/run/swarm/join.token +PathExists=/run/swarm/manager.addr +# Add more paths here as needed +Unit=pki-authority.service + +[Install] +WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service b/src/rootfs/files/configs/pki-service/pki-authority.service new file mode 100644 index 00000000..a51bb1b1 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki-authority.service @@ -0,0 +1,19 @@ +[Unit] +Description=PKI authority lxc-container +After=pki-authority-init.service nvidia-persistenced.service pccs.service +Requires=pki-authority-init.service pccs.service +Wants=nvidia-persistenced.service + +[Service] +Type=simple +Restart=always +RestartSec=5 +TimeoutStartSec=3min + +ExecStartPre=/usr/bin/python3 /usr/local/bin/pki-authority/pki_configure.py + +ExecStart=/usr/bin/lxc-start -n pki-authority -F + +ExecStop=/usr/bin/lxc-stop -n pki-authority + +KillMode=control-group diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf b/src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf new file mode 100644 index 00000000..5b3ad47c --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf @@ -0,0 +1,6 @@ +[Unit] +# Auto-start only in swarm-init mode +ConditionKernelCommandLine=vm_mode=swarm-init + +[Install] +WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/pki_configure.py b/src/rootfs/files/configs/pki-service/pki_configure.py new file mode 100644 index 00000000..3fb8c4f4 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki_configure.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +PKI Authority LXC container configuration. +Configures the container with network, device access, and runtime settings. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from pki_helpers import ( + log, LogLevel, detect_cpu_type, detect_vm_mode, detect_network_type, + patch_yaml_config, patch_lxc_config, get_pki_authority_param, + PKI_SERVICE_NAME +) + + +def main(): + """Main configuration logic.""" + log(LogLevel.INFO, "Starting PKI Authority configuration") + + # Check if container exists + if not Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}").exists(): + log(LogLevel.ERROR, f"Container '{PKI_SERVICE_NAME}' does not exist") + sys.exit(1) + + # Detect environment + cpu_type = detect_cpu_type() + vm_mode = detect_vm_mode() + network_type = detect_network_type() + + log(LogLevel.INFO, f"CPU type: {cpu_type}") + log(LogLevel.INFO, f"VM mode: {vm_mode.value}") + log(LogLevel.INFO, f"Network type: {network_type}") + + # Get parameters from swarm-env.yaml + try: + pki_domain = get_pki_authority_param("domain") + network_key_hash = get_pki_authority_param("network_key_hash") + except Exception as e: + log(LogLevel.ERROR, f"Failed to read parameters: {e}") + sys.exit(1) + + # Patch YAML config + try: + patch_yaml_config( + cpu_type=cpu_type, + vm_mode=vm_mode, + network_type=network_type, + pki_domain=pki_domain, + network_key_hash=network_key_hash + ) + log(LogLevel.INFO, "YAML config patched successfully") + except Exception as e: + log(LogLevel.ERROR, f"Failed to patch YAML config: {e}") + sys.exit(1) + + # Patch LXC config + try: + patch_lxc_config(cpu_type) + log(LogLevel.INFO, "LXC config patched successfully") + except Exception as e: + log(LogLevel.ERROR, f"Failed to patch LXC config: {e}") + sys.exit(1) + + log(LogLevel.INFO, "PKI Authority configuration completed successfully") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/src/rootfs/files/configs/pki-service/pki_helpers.py b/src/rootfs/files/configs/pki-service/pki_helpers.py new file mode 100644 index 00000000..b8b408c3 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki_helpers.py @@ -0,0 +1,793 @@ +#!/usr/bin/env python3 +""" +PKI Authority LXC container management helpers. +""" + +import os +import re +import secrets +import shutil +import subprocess +import sys +import urllib.request +from datetime import datetime +from enum import Enum +from pathlib import Path +from typing import List, Optional + +import yaml + +PKI_SERVICE_NAME = "pki-authority" +SERVICE_INSIDE_CONTAINER = "tee-pki" +BRIDGE_NAME = "lxcbr0" +PCCS_PORT = "8081" +PKI_SERVICE_EXTERNAL_PORT = "8443" +CONTAINER_IP = "10.0.3.100" +WIREGUARD_INTERFACE = "wg0" +STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") +IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" +SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" + + +class LogLevel(Enum): + """Log levels for structured logging.""" + INFO = "INFO" + WARN = "WARN" + ERROR = "ERROR" + DEBUG = "DEBUG" + + +def log(level: LogLevel, message: str): + """Log message with timestamp, service name and level.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + print(f"[{timestamp}] [{PKI_SERVICE_NAME}] [{level.value}] {message}", file=sys.stderr) + + +class VMMode(Enum): + """VM mode types.""" + SWARM_INIT = "swarm-init" + SWARM_NORMAL = "swarm-normal" + +class LXCContainer: + """Manager for LXC container operations.""" + + def __init__(self, container_name: str = PKI_SERVICE_NAME): + self.container_name = container_name + + def start(self, timeout: int = 30) -> int: + """Start LXC container. 
Returns exit code.""" + log(LogLevel.INFO, f"Starting LXC container {self.container_name}") + result = subprocess.run( + ["lxc-start", "-n", self.container_name], + capture_output=True, + text=True, + timeout=timeout, + check=False + ) + return result.returncode + + def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: + """Stop LXC container gracefully. Returns exit code.""" + log(LogLevel.INFO, f"Stopping LXC container {self.container_name} gracefully") + result = subprocess.run( + ["lxc-stop", "-n", self.container_name, "-t", str(graceful_timeout)], + capture_output=True, + text=True, + timeout=command_timeout, + check=False + ) + return result.returncode + + def destroy(self) -> int: + """Destroy LXC container. Returns exit code.""" + log(LogLevel.INFO, f"Destroying LXC container {self.container_name}") + result = subprocess.run( + ["lxc-destroy", "-n", self.container_name, "-f"], + capture_output=True, + text=True, + timeout=60, + check=False + ) + + if result.returncode != 0: + log(LogLevel.ERROR, f"Failed to destroy container: {result.stderr}") + + return result.returncode + + def is_running(self) -> bool: + """Check if LXC container is running.""" + try: + result = subprocess.run( + ["lxc-ls", "--running"], + capture_output=True, + text=True, + check=False + ) + if self.container_name not in result.stdout: + log(LogLevel.INFO, f"LXC container {self.container_name} is not running") + return False + return True + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to check LXC container status: {error}") + return False + + def get_ip(self) -> Optional[str]: + """Get container IP address.""" + try: + result = subprocess.run( + ["lxc-info", "-n", self.container_name, "-iH"], + capture_output=True, + text=True, + check=False + ) + container_ip = result.stdout.strip() if result.stdout.strip() else None + return container_ip + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to get container IP: {error}") + return None + + def create( + self, + archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar" + ) -> bool: + """Create LXC container if it doesn't exist. + + Returns True if created or already exists. + """ + # Check if container already exists + result = subprocess.run( + ["lxc-info", "-n", self.container_name], + capture_output=True, + text=True, + check=False + ) + + if result.returncode == 0: + log(LogLevel.INFO, f"Container '{self.container_name}' already exists.") + return True + + log(LogLevel.INFO, f"Container '{self.container_name}' not found. 
Creating...") + try: + subprocess.run( + [ + "lxc-create", + "-n", self.container_name, + "-t", "oci", + "--", + "--url", f"docker-archive:{archive_path}" + ], + check=True + ) + log(LogLevel.INFO, f"Container '{self.container_name}' created.") + return True + except subprocess.CalledProcessError as error: + log(LogLevel.ERROR, f"Failed to create container: {error}") + return False + + def is_service_healthy(self, healthcheck_url: str = "/healthcheck") -> bool: + """Check if service inside container is running and healthy.""" + try: + # Check service status inside container + result = subprocess.run( + [ + "lxc-attach", "-n", self.container_name, "--", + "systemctl", "is-active", SERVICE_INSIDE_CONTAINER + ], + capture_output=True, + text=True, + check=False + ) + status = result.stdout.strip() + + if status != "active": + log(LogLevel.INFO, f"Service {SERVICE_INSIDE_CONTAINER} status: {status}") + return False + + # Service is active, check healthcheck endpoint + container_ip = self.get_ip() + if not container_ip: + log(LogLevel.INFO, "Could not get container IP") + return False + + # Perform HTTP healthcheck + try: + req = urllib.request.Request(f"http://{container_ip}{healthcheck_url}") + with urllib.request.urlopen(req, timeout=5) as response: + if response.status == 200: + return True + + log( + LogLevel.INFO, + f"Healthcheck returned status: {response.status}" + ) + return False + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.INFO, f"Healthcheck failed: {error}") + return False + + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.ERROR, f"Failed to check service health: {error}") + return False + + +def detect_cpu_type() -> str: + """Detect CPU type based on available devices.""" + if Path("/dev/tdx_guest").is_char_device(): + return "tdx" + if Path("/dev/sev-guest").is_char_device(): + return "sev-snp" + return "untrusted" + + +def detect_vm_mode() -> VMMode: + """Detect VM mode from kernel command line.""" + try: + with open("/proc/cmdline", "r", encoding="utf-8") as file: + cmdline = file.read() + + if "vm_mode=swarm-init" in cmdline: + return VMMode.SWARM_INIT + return VMMode.SWARM_NORMAL + except FileNotFoundError: + return VMMode.SWARM_NORMAL + + +def detect_network_type() -> str: + """Detect network type from kernel command line. + + Returns: + 'untrusted' if allow_untrusted=true is present in cmdline, otherwise 'trusted'. + """ + try: + with open("/proc/cmdline", "r", encoding="utf-8") as file: + cmdline = file.read() + + if "allow_untrusted=true" in cmdline: + return "untrusted" + return "trusted" + except FileNotFoundError: + return "trusted" + + +def read_yaml_config_param(param_path: str) -> Optional[str]: + """Read parameter from container's yaml configuration. + + Args: + param_path: Dot-separated path to parameter (e.g., 'pki.ownDomain'). + + Returns: + Parameter value as string, or None if not found or error. 
+ """ + yaml_config_path = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + + if not yaml_config_path.exists(): + log(LogLevel.DEBUG, f"YAML config not found: {yaml_config_path}") + return None + + try: + with open(yaml_config_path, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + if not config: + log(LogLevel.DEBUG, f"Empty YAML config: {yaml_config_path}") + return None + + # Navigate through nested dictionary using dot-separated path + value = config + for key in param_path.split('.'): + if isinstance(value, dict): + value = value.get(key) + if value is None: + return None + else: + return None + + return str(value) if value is not None else None + + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.DEBUG, f"Failed to read {param_path} from YAML config: {error}") + return None + + +def get_pki_authority_param(param_name: str) -> str: + """Read PKI authority parameter from swarm-env.yaml. + + Args: + param_name: Name of the parameter under pki-authority section. + + Returns: + Parameter value as string. + + Raises: + FileNotFoundError: If swarm-env.yaml does not exist. + ValueError: If configuration is empty or parameter is not found. + Exception: For other errors during reading. + """ + swarm_env_path = Path(SWARM_ENV_YAML) + + if not swarm_env_path.exists(): + error_msg = f"Swarm environment config not found: {SWARM_ENV_YAML}" + log(LogLevel.ERROR, error_msg) + raise FileNotFoundError(error_msg) + + try: + with open(swarm_env_path, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + if not config: + error_msg = f"Empty configuration in {SWARM_ENV_YAML}" + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + param_value = config.get("pki-authority", {}).get(param_name) + if not param_value: + error_msg = ( + f"No {param_name} found in {SWARM_ENV_YAML} " + f"under pki-authority.{param_name}" + ) + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + log(LogLevel.INFO, f"Read {param_name} from config: {param_value}") + return param_value + + except (FileNotFoundError, ValueError): + raise + except Exception as error: # pylint: disable=broad-exception-caught + error_msg = f"Failed to read {param_name} from {SWARM_ENV_YAML}: {error}" + log(LogLevel.ERROR, error_msg) + raise Exception(error_msg) from error + + +def patch_yaml_config( + cpu_type: str, + vm_mode: VMMode, + pki_domain: str, + network_type: str, + network_key_hash: str +): + """Set own challenge type in LXC container configuration.""" + template_name = "lxc-swarm-template.yaml" + log( + LogLevel.INFO, + f"Detected {vm_mode.value} mode, using swarm template" + ) + + src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") + dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + + if not src_yaml.exists(): + log(LogLevel.ERROR, f"Error: {src_yaml} not found.") + sys.exit(1) + + # Load YAML, modify, and save + with open(src_yaml, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + # Set the CPU type in the configuration + if "pki" not in config: + config["pki"] = {} + if "ownChallenge" not in config["pki"]: + config["pki"]["ownChallenge"] = {} + config["pki"]["ownChallenge"]["type"] = cpu_type + + # For untrusted, generate random deviceIdHex (32 bytes) + if cpu_type == "untrusted": + # Check if untrusted CPU type is running in trusted network + if network_type != "untrusted": + error_msg = ( + "Cannot run untrusted machine in trusted network. 
" + f"CPU type: {cpu_type}, Network type: {network_type}" + ) + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + device_id_hex = secrets.token_hex(32) + config["pki"]["ownChallenge"]["deviceIdHex"] = device_id_hex + log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") + + # Add 'untrusted' to allowedChallenges if network type is untrusted + if network_type == "untrusted": + if "allowedChallenges" not in config["pki"]: + config["pki"]["allowedChallenges"] = [] + if "untrusted" not in config["pki"]["allowedChallenges"]: + config["pki"]["allowedChallenges"].append("untrusted") + log(LogLevel.INFO, "Added 'untrusted' to allowedChallenges") + + # Set ownDomain from parameter + if pki_domain: + config["pki"]["ownDomain"] = pki_domain + log(LogLevel.INFO, f"Set ownDomain to: {pki_domain}") + + # Set mode.swarmMode + if "mode" not in config["pki"]: + config["pki"]["mode"] = {} + + mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" + config["pki"]["mode"]["swarmMode"] = mode_value + log(LogLevel.INFO, f"Set swarmMode to: {mode_value}") + + # Set networkSettings + if network_type and network_key_hash: + config["pki"]["mode"]["networkSettings"] = { + "networkType": network_type, + "networkKeyHashHex": network_key_hash + } + log( + LogLevel.INFO, + f"Set networkSettings: networkType={network_type}, " + f"networkKeyHashHex={network_key_hash}" + ) + + # Ensure destination directory exists + dst_yaml.parent.mkdir(parents=True, exist_ok=True) + + # Write modified YAML + with open(dst_yaml, "w", encoding="utf-8") as file: + yaml.dump(config, file, default_flow_style=False) + + +def patch_lxc_config(cpu_type: str): + """Patch LXC container configuration.""" + config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") + config_bak = Path(f"{config_file}.bak") + + # Always restore config from backup if backup exists + if config_bak.exists(): + shutil.copy(config_bak, config_file) + else: + # Create backup before first patch + if config_file.exists(): + shutil.copy(config_file, config_bak) + + # Append MAC address configuration + with open(config_file, "a", encoding="utf-8") as file: + file.write("lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff\n") + + # Add device-specific configuration + if cpu_type == "sev-snp": + dev_path = Path("/dev/sev-guest") + stat_info = dev_path.stat() + dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" + + with open(config_file, "a", encoding="utf-8") as file: + file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + file.write( + "lxc.mount.entry = /dev/sev-guest dev/sev-guest " + "none bind,optional,create=file\n" + ) + + elif cpu_type == "tdx": + dev_path = Path("/dev/tdx_guest") + stat_info = dev_path.stat() + dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" + + with open(config_file, "a", encoding="utf-8") as file: + file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") + file.write( + "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest " + "none bind,optional,create=file\n" + ) + + if Path("/etc/tdx-attest.conf").exists(): + file.write( + "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf " + "none bind,ro,create=file\n" + ) + + +def get_bridge_ip(bridge_name: str) -> str: + """Get host IP address on the LXC bridge.""" + result = subprocess.run( + ["ip", "-4", "addr", "show", bridge_name], + capture_output=True, + text=True, + check=False + ) + + if result.returncode != 0: + log( + LogLevel.ERROR, + f"Error: Could not determine IP address for bridge {bridge_name}" 
+ ) + sys.exit(1) + + # Parse IP address from output + match = re.search(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout) + if not match: + log( + LogLevel.ERROR, + f"Error: Could not determine IP address for bridge {bridge_name}" + ) + sys.exit(1) + + return match.group(1) + + +def enable_route_localnet(bridge_name: str): + """Enable route_localnet for the bridge.""" + sysctl_key = f"net.ipv4.conf.{bridge_name}.route_localnet" + + result = subprocess.run( + ["sysctl", "-n", sysctl_key], + capture_output=True, + text=True, + check=False + ) + + if result.returncode == 0 and result.stdout.strip() == "1": + log(LogLevel.INFO, f"route_localnet already enabled for {bridge_name}") + else: + subprocess.run( + ["sysctl", "-w", f"{sysctl_key}=1"], + check=True + ) + log(LogLevel.INFO, f"Enabled route_localnet for {bridge_name}") + + +def delete_iptables_rules(): + """Delete all iptables rules for PKI container (NAT and filter tables).""" + # Delete rules from NAT table chains: PREROUTING, OUTPUT, POSTROUTING + for chain in ["PREROUTING", "OUTPUT", "POSTROUTING"]: + result = subprocess.run( + ["iptables", "-t", "nat", "-S", chain], + capture_output=True, text=True, check=True + ) + + rules = result.stdout.splitlines() + + for rule in rules: + # Delete rules that contain our comment + if IPTABLES_RULE_COMMENT in rule: + delete_rule = rule.replace("-A", "-D", 1) + subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) + log(LogLevel.INFO, f"Deleted iptables NAT rule: {delete_rule}") + + # Delete rules from filter table (INPUT chain) + result = subprocess.run( + ["iptables", "-S", "INPUT"], + capture_output=True, text=True, check=True + ) + + rules = result.stdout.splitlines() + + for rule in rules: + # Delete rules that contain our comment + if IPTABLES_RULE_COMMENT in rule: + delete_rule = rule.replace("-A", "-D", 1) + subprocess.run(["iptables"] + delete_rule.split()[1:], check=True) + log(LogLevel.INFO, f"Deleted iptables INPUT rule: {delete_rule}") + + +def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): + """Ensure iptables rule exists, add if missing.""" + log(LogLevel.INFO, f"Checking iptables rule: {description}") + + check_result = subprocess.run(check_args, capture_output=True, check=False) + + if check_result.returncode == 0: + log(LogLevel.INFO, "Rule already exists") + else: + subprocess.run(add_args, check=True) + log(LogLevel.INFO, "Rule added") + +def setup_iptables(wg_ip): + """Setup iptables NAT rules for LXC container access to host services.""" + host_ip = get_bridge_ip(BRIDGE_NAME) + + enable_route_localnet(BRIDGE_NAME) + + # Rule 1: PCCS DNAT + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", PCCS_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{PCCS_PORT}" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-p", "tcp", + "-d", host_ip, + "--dport", PCCS_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"127.0.0.1:{PCCS_PORT}" + ], + description=f"PCCS DNAT {host_ip}:{PCCS_PORT} -> 127.0.0.1:{PCCS_PORT}" + ) + + # Rule 2: WireGuard PREROUTING (HTTPS) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", 
f"{CONTAINER_IP}:443" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" + ) + + # Rule 2a: WireGuard PREROUTING (HTTP) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-i", WIREGUARD_INTERFACE, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + description=f"PREROUTING WireGuard 8080 -> {CONTAINER_IP}:80" + ) + + # Rule 3: OUTPUT (HTTPS) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" + ) + + # Rule 3a: OUTPUT (HTTP) + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "OUTPUT", + "-d", wg_ip, + "-p", "tcp", + "--dport", "8080", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:80" + ], + description=f"OUTPUT {wg_ip}:8080 -> {CONTAINER_IP}:80" + ) + + # Rule 4: MASQUERADE + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "POSTROUTING", + "-s", f"{CONTAINER_IP}/32", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "MASQUERADE" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "POSTROUTING", + "-s", f"{CONTAINER_IP}/32", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "MASQUERADE" + ], + description=f"POSTROUTING MASQUERADE for {CONTAINER_IP}/32" + ) + + # Rule 5: Allow port 8081 on lxcbr0 + ensure_iptables_rule( + check_args=[ + "iptables", "-C", "INPUT", + "-i", "lxcbr0", + "-p", "tcp", + "--dport", "8081", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "ACCEPT" + ], + add_args=[ + "iptables", "-A", "INPUT", + "-i", "lxcbr0", + "-p", "tcp", + "--dport", "8081", + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "ACCEPT" + ], + description="Allow TCP port 8081 on lxcbr0" + ) + + +def update_pccs_url(): + """Update PCCS URL in QCNL configuration.""" + qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") + qcnl_conf_bak = Path(f"{qcnl_conf}.bak") + + host_ip = get_bridge_ip(BRIDGE_NAME) + + pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" + + if not qcnl_conf.exists(): + log(LogLevel.ERROR, f"Error: {qcnl_conf} not found") + sys.exit(1) + + 
if not qcnl_conf_bak.exists(): + shutil.copy(qcnl_conf, qcnl_conf_bak) + + shutil.copy(qcnl_conf_bak, qcnl_conf) + + with open(qcnl_conf, "r", encoding="utf-8") as file: + content = file.read() + + content = re.sub( + r'"pccs_url":\s*"[^"]*"', + f'"pccs_url": "{pccs_url}"', + content + ) + + with open(qcnl_conf, "w", encoding="utf-8") as file: + file.write(content) + + + +def init_container(): + """Initialize LXC container for PKI Authority.""" + LXCContainer(PKI_SERVICE_NAME).create() + + +def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: + """Get tunnel IP for a node from WireGuard properties.""" + for prop in wg_props: + if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": + return prop.get("value") + return None + + +def save_property_into_fs(file_name: str, content: bytes): + """Save property content to filesystem.""" + STORAGE_PATH.mkdir(parents=True, exist_ok=True) + file_path = STORAGE_PATH / file_name + file_path.write_bytes(content) + + +def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: + """Read property content from filesystem.""" + file_path = STORAGE_PATH / file_name + if file_path.exists(): + content = file_path.read_bytes() + if content: + return (True, content) + return (False, b"") diff --git a/src/rootfs/files/configs/pki-service/pki_init.py b/src/rootfs/files/configs/pki-service/pki_init.py new file mode 100644 index 00000000..ff90937f --- /dev/null +++ b/src/rootfs/files/configs/pki-service/pki_init.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +""" +PKI Authority LXC container initialization. +Creates the container from OCI archive if it doesn't exist. +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from pki_helpers import LXCContainer, log, LogLevel + + +def main(): + """Main initialization logic.""" + log(LogLevel.INFO, "Starting PKI Authority initialization") + + # Create container using LXCContainer class + container = LXCContainer() + if not container.create(): + log(LogLevel.ERROR, "Container creation failed") + sys.exit(1) + + log(LogLevel.INFO, "PKI Authority initialization completed successfully") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/src/swarm-scripts/80.setup-pki-authority.sh b/src/swarm-scripts/80.setup-pki-authority.sh deleted file mode 100644 index f9bb770c..00000000 --- a/src/swarm-scripts/80.setup-pki-authority.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# This script bootstraps the pki-authority service into SwarmDB via mysql client. -# Run it INSIDE the container. Assumes mysql client is available. -# -# Note: -# - The pki-authority manifest and main.py should be available inside the container at: -# /etc/swarm-services/pki-authority/manifest.yaml and /etc/swarm-services/pki-authority/main.py -# (mount or copy them similarly to the wireguard service) -# -# - pki-authority depends on a WireGuard cluster existing and sharing nodes with it. -# When bootstrapping WireGuard, prefer ClusterPolicy id 'wireguard' to match pki-authority's stateExpr. 
-DB_HOST=${DB_HOST:-127.0.0.1} -DB_PORT=${DB_PORT:-3306} -DB_USER=${DB_USER:-root} -DB_NAME=${DB_NAME:-swarmdb} - -# Service descriptors -SERVICE_NAME=${SERVICE_NAME:-pki-authority} -SERVICE_VERSION=${SERVICE_VERSION:-1.0.0} -CLUSTER_POLICY=${CLUSTER_POLICY:-pki-authority} -CLUSTER_ID=${CLUSTER_ID:-pki-authority} - -# Path to manifest file INSIDE the container (configs are mounted to /configs) -MANIFEST_PATH=${MANIFEST_PATH:-/etc/swarm-services/${SERVICE_NAME}/manifest.yaml} -LOCATION_PATH=${LOCATION_PATH:-/etc/swarm-services/${SERVICE_NAME}} -SERVICE_PK="${CLUSTER_POLICY}:${SERVICE_NAME}" - -if [ ! -f "$MANIFEST_PATH" ]; then - echo "Manifest not found at: $MANIFEST_PATH" >&2 - exit 1 -fi - -CLI="$(dirname "$0")/swarm-cli.sh" -echo "Creating/Updating ClusterPolicies '$CLUSTER_POLICY'..." -DB_HOST="$DB_HOST" DB_PORT="$DB_PORT" DB_USER="$DB_USER" DB_NAME="$DB_NAME" \ - python3 "$(dirname "$0")/swarm-cli.py" create ClusterPolicies "$CLUSTER_POLICY" - -echo "Creating/Updating ClusterServices '$SERVICE_PK'..." -DB_HOST="$DB_HOST" DB_PORT="$DB_PORT" DB_USER="$DB_USER" DB_NAME="$DB_NAME" \ - python3 "$(dirname "$0")/swarm-cli.py" create ClusterServices "$SERVICE_PK" --name="$SERVICE_NAME" --cluster_policy="$CLUSTER_POLICY" --version="$SERVICE_VERSION" --location="$LOCATION_PATH" - -echo "Done. The provision worker will reconcile '$SERVICE_NAME' shortly." From ec02209909a5e2def1e197e1a3bc8106558a6799 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 28 Jan 2026 10:39:46 -0600 Subject: [PATCH 38/51] draft 2 --- src/Dockerfile | 24 ++-- .../pki-service/{ => conf}/dnsmasq.conf | 0 .../configs/pki-service/{ => conf}/lxc-net | 0 .../{ => conf}/lxc-swarm-template.yaml | 0 .../{ => scripts}/pki_configure.py | 32 ++--- .../pki-service/scripts/pki_healthcheck.py | 36 ++++++ .../pki-service/{ => scripts}/pki_helpers.py | 113 +----------------- .../pki-service/{ => scripts}/pki_init.py | 0 .../pki-service/scripts/pki_watchdog.sh | 42 +++++++ .../{ => systemd}/pki-authority-init.service | 0 .../systemd/pki-authority-watchdog.service | 7 ++ .../systemd/pki-authority-watchdog.timer | 10 ++ .../{ => systemd}/pki-authority.path | 0 .../{ => systemd}/pki-authority.service | 0 .../pki-authority.service.d/init.conf | 0 15 files changed, 127 insertions(+), 137 deletions(-) rename src/rootfs/files/configs/pki-service/{ => conf}/dnsmasq.conf (100%) rename src/rootfs/files/configs/pki-service/{ => conf}/lxc-net (100%) rename src/rootfs/files/configs/pki-service/{ => conf}/lxc-swarm-template.yaml (100%) rename src/rootfs/files/configs/pki-service/{ => scripts}/pki_configure.py (75%) create mode 100644 src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py rename src/rootfs/files/configs/pki-service/{ => scripts}/pki_helpers.py (86%) rename src/rootfs/files/configs/pki-service/{ => scripts}/pki_init.py (100%) create mode 100644 src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh rename src/rootfs/files/configs/pki-service/{ => systemd}/pki-authority-init.service (100%) create mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service create mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer rename src/rootfs/files/configs/pki-service/{ => systemd}/pki-authority.path (100%) rename src/rootfs/files/configs/pki-service/{ => systemd}/pki-authority.service (100%) rename src/rootfs/files/configs/pki-service/{ => systemd}/pki-authority.service.d/init.conf (100%) diff --git a/src/Dockerfile b/src/Dockerfile index c583f2e1..22ac484a 100644 
--- a/src/Dockerfile +++ b/src/Dockerfile @@ -248,21 +248,23 @@ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21253804163 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" -ADD rootfs/files/configs/pki-service/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" -ADD rootfs/files/configs/pki-service/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" -ADD rootfs/files/configs/pki-service/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" +ADD rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" +ADD rootfs/files/configs/pki-service/conf/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" +ADD rootfs/files/configs/pki-service/conf/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" RUN mkdir -p "${OUTPUTDIR}/usr/local/bin/pki-authority" -ADD rootfs/files/configs/pki-service/pki_helpers.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_helpers.py" -ADD rootfs/files/configs/pki-service/pki_init.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_init.py" -ADD rootfs/files/configs/pki-service/pki_configure.py "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_configure.py" -ADD rootfs/files/configs/pki-service/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system" -ADD rootfs/files/configs/pki-service/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" -ADD rootfs/files/configs/pki-service/pki-authority.service.d/init.conf "${OUTPUTDIR}/etc/systemd/system/pki-authority.service.d/init.conf" -ADD rootfs/files/configs/pki-service/pki-authority.path "${OUTPUTDIR}/etc/systemd/system" -RUN chmod +x "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_init.py" "${OUTPUTDIR}/usr/local/bin/pki-authority/pki_configure.py" +ADD rootfs/files/configs/pki-service/scripts/*.py "${OUTPUTDIR}/usr/local/bin/pki-authority/" +ADD rootfs/files/configs/pki-service/scripts/*.sh "${OUTPUTDIR}/usr/local/bin/pki-authority/" +ADD rootfs/files/configs/pki-service/systemd/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/systemd/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf "${OUTPUTDIR}/etc/systemd/system/pki-authority.service.d/init.conf" +ADD rootfs/files/configs/pki-service/systemd/pki-authority.path "${OUTPUTDIR}/etc/systemd/system" +RUN chmod +x "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.py "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.sh RUN ln -s /etc/systemd/system/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-init.service" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" RUN ln -s /etc/systemd/system/pki-authority.path "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.path" +RUN ln -s /etc/systemd/system/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system/timers.target.wants/pki-authority-watchdog.timer" ADD rootfs/files/configs/etc/multipath.conf.append /buildroot/files/configs/etc/multipath.conf.append ADD 
rootfs/files/configs/etc/sysctl.conf.append /buildroot/files/configs/etc/sysctl.conf.append diff --git a/src/rootfs/files/configs/pki-service/dnsmasq.conf b/src/rootfs/files/configs/pki-service/conf/dnsmasq.conf similarity index 100% rename from src/rootfs/files/configs/pki-service/dnsmasq.conf rename to src/rootfs/files/configs/pki-service/conf/dnsmasq.conf diff --git a/src/rootfs/files/configs/pki-service/lxc-net b/src/rootfs/files/configs/pki-service/conf/lxc-net similarity index 100% rename from src/rootfs/files/configs/pki-service/lxc-net rename to src/rootfs/files/configs/pki-service/conf/lxc-net diff --git a/src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml similarity index 100% rename from src/rootfs/files/configs/pki-service/lxc-swarm-template.yaml rename to src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml diff --git a/src/rootfs/files/configs/pki-service/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py similarity index 75% rename from src/rootfs/files/configs/pki-service/pki_configure.py rename to src/rootfs/files/configs/pki-service/scripts/pki_configure.py index 3fb8c4f4..57c6d109 100644 --- a/src/rootfs/files/configs/pki-service/pki_configure.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -4,6 +4,7 @@ Configures the container with network, device access, and runtime settings. """ +import hashlib import sys from pathlib import Path @@ -11,6 +12,7 @@ from pki_helpers import ( log, LogLevel, detect_cpu_type, detect_vm_mode, detect_network_type, patch_yaml_config, patch_lxc_config, get_pki_authority_param, + setup_iptables, update_pccs_url, PKI_SERVICE_NAME ) @@ -33,34 +35,32 @@ def main(): log(LogLevel.INFO, f"VM mode: {vm_mode.value}") log(LogLevel.INFO, f"Network type: {network_type}") - # Get parameters from swarm-env.yaml try: pki_domain = get_pki_authority_param("domain") - network_key_hash = get_pki_authority_param("network_key_hash") - except Exception as e: - log(LogLevel.ERROR, f"Failed to read parameters: {e}") - sys.exit(1) - - # Patch YAML config - try: + network_key = get_pki_authority_param("networkKey") + patch_yaml_config( cpu_type=cpu_type, vm_mode=vm_mode, network_type=network_type, pki_domain=pki_domain, - network_key_hash=network_key_hash + network_key_hash=hashlib.sha256(network_key.encode()).hexdigest() ) log(LogLevel.INFO, "YAML config patched successfully") - except Exception as e: - log(LogLevel.ERROR, f"Failed to patch YAML config: {e}") - sys.exit(1) - - # Patch LXC config - try: + patch_lxc_config(cpu_type) log(LogLevel.INFO, "LXC config patched successfully") + + # Setup iptables rules + setup_iptables() + log(LogLevel.INFO, "iptables rules configured successfully") + + # Update PCCS URL in container + update_pccs_url() + log(LogLevel.INFO, "PCCS URL updated successfully") + except Exception as e: - log(LogLevel.ERROR, f"Failed to patch LXC config: {e}") + log(LogLevel.ERROR, f"Configuration failed: {e}") sys.exit(1) log(LogLevel.INFO, "PKI Authority configuration completed successfully") diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py b/src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py new file mode 100644 index 00000000..33c80b1d --- /dev/null +++ b/src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +PKI Authority health check. +Checks if the PKI Authority service inside the container is healthy. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from pki_helpers import LXCContainer, log, LogLevel, PKI_SERVICE_NAME + + +def main(): + """Main health check logic.""" + log(LogLevel.INFO, "Starting PKI Authority health check") + + # Create container manager + container = LXCContainer(PKI_SERVICE_NAME) + + # Check if container is running + if not container.is_running(): + log(LogLevel.ERROR, f"Container '{PKI_SERVICE_NAME}' is not running") + sys.exit(1) + + # Check if service inside container is healthy + if not container.is_service_healthy(): + log(LogLevel.ERROR, "PKI Authority service is not healthy") + sys.exit(1) + + log(LogLevel.INFO, "PKI Authority service is healthy") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/src/rootfs/files/configs/pki-service/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py similarity index 86% rename from src/rootfs/files/configs/pki-service/pki_helpers.py rename to src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index b8b408c3..0337431e 100644 --- a/src/rootfs/files/configs/pki-service/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -571,7 +571,7 @@ def ensure_iptables_rule(check_args: List[str], add_args: List[str], description subprocess.run(add_args, check=True) log(LogLevel.INFO, "Rule added") -def setup_iptables(wg_ip): +def setup_iptables(): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) @@ -600,99 +600,7 @@ def setup_iptables(wg_ip): description=f"PCCS DNAT {host_ip}:{PCCS_PORT} -> 127.0.0.1:{PCCS_PORT}" ) - # Rule 2: WireGuard PREROUTING (HTTPS) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" - ) - - # Rule 2a: WireGuard PREROUTING (HTTP) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - description=f"PREROUTING WireGuard 8080 -> {CONTAINER_IP}:80" - ) - - # Rule 3: OUTPUT (HTTPS) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - 
description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" - ) - - # Rule 3a: OUTPUT (HTTP) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - description=f"OUTPUT {wg_ip}:8080 -> {CONTAINER_IP}:80" - ) - - # Rule 4: MASQUERADE + # Rule 2: MASQUERADE ensure_iptables_rule( check_args=[ "iptables", "-t", "nat", "-C", "POSTROUTING", @@ -709,7 +617,7 @@ def setup_iptables(wg_ip): description=f"POSTROUTING MASQUERADE for {CONTAINER_IP}/32" ) - # Rule 5: Allow port 8081 on lxcbr0 + # Rule 3: Allow port 8081 on lxcbr0 ensure_iptables_rule( check_args=[ "iptables", "-C", "INPUT", @@ -761,21 +669,6 @@ def update_pccs_url(): with open(qcnl_conf, "w", encoding="utf-8") as file: file.write(content) - - -def init_container(): - """Initialize LXC container for PKI Authority.""" - LXCContainer(PKI_SERVICE_NAME).create() - - -def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: - """Get tunnel IP for a node from WireGuard properties.""" - for prop in wg_props: - if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": - return prop.get("value") - return None - - def save_property_into_fs(file_name: str, content: bytes): """Save property content to filesystem.""" STORAGE_PATH.mkdir(parents=True, exist_ok=True) diff --git a/src/rootfs/files/configs/pki-service/pki_init.py b/src/rootfs/files/configs/pki-service/scripts/pki_init.py similarity index 100% rename from src/rootfs/files/configs/pki-service/pki_init.py rename to src/rootfs/files/configs/pki-service/scripts/pki_init.py diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh b/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh new file mode 100644 index 00000000..49932da6 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +SERVICE="pki-authority.service" +MIN_UPTIME_SECONDS=180 # 3 minutes + +# Check if service is enabled +if ! systemctl is-enabled --quiet "$SERVICE"; then + exit 0 +fi + +# Check if service is active +if ! 
systemctl is-active --quiet "$SERVICE"; then + exit 0 +fi + +# Get service activation time (unix timestamp) +ACTIVE_ENTER=$(systemctl show -p ActiveEnterTimestamp --value "$SERVICE") +if [ -z "$ACTIVE_ENTER" ] || [ "$ACTIVE_ENTER" = "n/a" ]; then + exit 0 +fi + +# Convert to unix timestamp +ACTIVE_ENTER_SEC=$(date -d "$ACTIVE_ENTER" +%s 2>/dev/null) +if [ -z "$ACTIVE_ENTER_SEC" ]; then + exit 0 +fi + +# Get current time +CURRENT_SEC=$(date +%s) + +# Calculate uptime in seconds +UPTIME_SEC=$((CURRENT_SEC - ACTIVE_ENTER_SEC)) + +# If uptime is less than 3 minutes - exit +if [ "$UPTIME_SEC" -lt "$MIN_UPTIME_SECONDS" ]; then + exit 0 +fi + +# Run healthcheck +/usr/bin/python3 /usr/local/bin/pki-authority/pki_healthcheck.py || { + /usr/bin/systemctl restart "$SERVICE" +} diff --git a/src/rootfs/files/configs/pki-service/pki-authority-init.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-init.service similarity index 100% rename from src/rootfs/files/configs/pki-service/pki-authority-init.service rename to src/rootfs/files/configs/pki-service/systemd/pki-authority-init.service diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service new file mode 100644 index 00000000..5129e845 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service @@ -0,0 +1,7 @@ +[Unit] +Description=PKI Authority Watchdog + +[Service] +Type=oneshot +TimeoutStartSec=1min +ExecStart=/usr/local/bin/pki-authority/pki_watchdog.sh diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer new file mode 100644 index 00000000..bc9af0b1 --- /dev/null +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer @@ -0,0 +1,10 @@ +[Unit] +Description=PKI Authority Watchdog Timer + + +[Timer] +OnUnitActiveSec=5min +Unit=pki-authority-watchdog.service + +[Install] +WantedBy=timers.target diff --git a/src/rootfs/files/configs/pki-service/pki-authority.path b/src/rootfs/files/configs/pki-service/systemd/pki-authority.path similarity index 100% rename from src/rootfs/files/configs/pki-service/pki-authority.path rename to src/rootfs/files/configs/pki-service/systemd/pki-authority.path diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service similarity index 100% rename from src/rootfs/files/configs/pki-service/pki-authority.service rename to src/rootfs/files/configs/pki-service/systemd/pki-authority.service diff --git a/src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf similarity index 100% rename from src/rootfs/files/configs/pki-service/pki-authority.service.d/init.conf rename to src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf From d66dc9878c52397527ae2eafdfce9081d5ed68cc Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 3 Feb 2026 02:43:14 -0600 Subject: [PATCH 39/51] swarm key generation, disable watchdog --- src/Dockerfile | 6 +- .../pki-service/conf/lxc-swarm-template.yaml | 11 +++ .../pki-service/scripts/pki_configure.py | 18 +++- .../pki-service/scripts/pki_healthcheck.py | 0 .../pki-service/scripts/pki_helpers.py | 82 ++++++++++++++++++- .../configs/pki-service/scripts/pki_init.py | 0 6 files changed, 110 insertions(+), 7 
deletions(-) mode change 100644 => 100755 src/rootfs/files/configs/pki-service/scripts/pki_configure.py mode change 100644 => 100755 src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py mode change 100644 => 100755 src/rootfs/files/configs/pki-service/scripts/pki_helpers.py mode change 100644 => 100755 src/rootfs/files/configs/pki-service/scripts/pki_init.py diff --git a/src/Dockerfile b/src/Dockerfile index 22ac484a..47d8eda2 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -256,15 +256,15 @@ ADD rootfs/files/configs/pki-service/scripts/*.py "${OUTPUTDIR}/usr/local/bin/pk ADD rootfs/files/configs/pki-service/scripts/*.sh "${OUTPUTDIR}/usr/local/bin/pki-authority/" ADD rootfs/files/configs/pki-service/systemd/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system" ADD rootfs/files/configs/pki-service/systemd/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" -ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service "${OUTPUTDIR}/etc/systemd/system" -ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system" +# ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service "${OUTPUTDIR}/etc/systemd/system" +# ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system" ADD rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf "${OUTPUTDIR}/etc/systemd/system/pki-authority.service.d/init.conf" ADD rootfs/files/configs/pki-service/systemd/pki-authority.path "${OUTPUTDIR}/etc/systemd/system" RUN chmod +x "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.py "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.sh RUN ln -s /etc/systemd/system/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-init.service" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" RUN ln -s /etc/systemd/system/pki-authority.path "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.path" -RUN ln -s /etc/systemd/system/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system/timers.target.wants/pki-authority-watchdog.timer" +# RUN ln -s /etc/systemd/system/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system/timers.target.wants/pki-authority-watchdog.timer" ADD rootfs/files/configs/etc/multipath.conf.append /buildroot/files/configs/etc/multipath.conf.append ADD rootfs/files/configs/etc/sysctl.conf.append /buildroot/files/configs/etc/sysctl.conf.append diff --git a/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml index 37068303..f1551f65 100644 --- a/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml @@ -1,6 +1,9 @@ api: httpsPort: 443 httpPort: 80 + enabledApis: + - secrets + - pki pki: allowedChallenges: - token @@ -32,4 +35,12 @@ pki: storageFolder: /app/swarm-storage networkSettings: networkType: trusted +secretsStorage: + static: + swarmKey: dummy-swarm-key + storage: + storageType: file + storageFolder: /app/swarm-storage + validationCaBundle: + type: pki diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py old mode 100644 new mode 100755 index 57c6d109..bca2a801 --- a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py +++ 
b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -12,8 +12,8 @@ from pki_helpers import ( log, LogLevel, detect_cpu_type, detect_vm_mode, detect_network_type, patch_yaml_config, patch_lxc_config, get_pki_authority_param, - setup_iptables, update_pccs_url, - PKI_SERVICE_NAME + setup_iptables, update_pccs_url, generate_swarm_key, load_swarm_key, + PKI_SERVICE_NAME, VMMode ) @@ -38,13 +38,25 @@ def main(): try: pki_domain = get_pki_authority_param("domain") network_key = get_pki_authority_param("networkKey") + + # Get or generate swarm key based on VM mode + if vm_mode == VMMode.SWARM_INIT: + # In swarm-init mode: try to load existing key, generate if doesn't exist + try: + swarm_key = load_swarm_key() + except FileNotFoundError: + swarm_key = generate_swarm_key() + else: + # In swarm-normal mode: key must exist + swarm_key = load_swarm_key() patch_yaml_config( cpu_type=cpu_type, vm_mode=vm_mode, network_type=network_type, pki_domain=pki_domain, - network_key_hash=hashlib.sha256(network_key.encode()).hexdigest() + network_key_hash=hashlib.sha256(network_key.encode()).hexdigest(), + swarm_key=swarm_key ) log(LogLevel.INFO, "YAML config patched successfully") diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py b/src/rootfs/files/configs/pki-service/scripts/pki_healthcheck.py old mode 100644 new mode 100755 diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py old mode 100644 new mode 100755 index 0337431e..55015372 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -27,6 +27,7 @@ STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" +SWARM_KEY_FILE = "/etc/swarm.key" class LogLevel(Enum): @@ -339,12 +340,82 @@ def get_pki_authority_param(param_name: str) -> str: raise Exception(error_msg) from error +def generate_swarm_key() -> str: + """Generate new 32-byte swarm key and save to file. + + Returns: + Swarm key as hex string (64 characters). + + Raises: + Exception: If failed to save key to file. + """ + swarm_key_path = Path(SWARM_KEY_FILE) + + log(LogLevel.INFO, "Generating new 32-byte swarm key") + swarm_key = secrets.token_hex(32) # 32 bytes = 64 hex characters + + try: + with open(swarm_key_path, "w", encoding="utf-8") as file: + file.write(swarm_key) + + # Set restrictive permissions (600) + swarm_key_path.chmod(0o600) + + log(LogLevel.INFO, f"Swarm key generated and saved to {SWARM_KEY_FILE}") + return swarm_key + except Exception as error: + error_msg = f"Failed to save swarm key: {error}" + log(LogLevel.ERROR, error_msg) + raise Exception(error_msg) from error + + +def load_swarm_key() -> str: + """Load existing swarm key from file. + + Returns: + Swarm key as hex string (64 characters). + + Raises: + FileNotFoundError: If swarm key file doesn't exist. + ValueError: If swarm key format is invalid. + Exception: For other errors during reading. 
+ """ + swarm_key_path = Path(SWARM_KEY_FILE) + + if not swarm_key_path.exists(): + error_msg = f"Swarm key file {SWARM_KEY_FILE} not found" + log(LogLevel.ERROR, error_msg) + raise FileNotFoundError(error_msg) + + log(LogLevel.INFO, f"Reading swarm key from {SWARM_KEY_FILE}") + + try: + with open(swarm_key_path, "r", encoding="utf-8") as file: + swarm_key = file.read().strip() + + # Validate key format (should be 64 hex characters) + if not re.match(r'^[0-9a-fA-F]{64}$', swarm_key): + error_msg = f"Invalid swarm key format in {SWARM_KEY_FILE}. Expected 64 hex characters." + log(LogLevel.ERROR, error_msg) + raise ValueError(error_msg) + + log(LogLevel.INFO, "Swarm key loaded successfully") + return swarm_key + except (FileNotFoundError, ValueError): + raise + except Exception as error: + error_msg = f"Failed to read swarm key: {error}" + log(LogLevel.ERROR, error_msg) + raise Exception(error_msg) from error + + def patch_yaml_config( cpu_type: str, vm_mode: VMMode, pki_domain: str, network_type: str, - network_key_hash: str + network_key_hash: str, + swarm_key: str ): """Set own challenge type in LXC container configuration.""" template_name = "lxc-swarm-template.yaml" @@ -419,6 +490,15 @@ def patch_yaml_config( f"networkKeyHashHex={network_key_hash}" ) + # Set secretsStorage with swarmKey + if swarm_key: + if "secretsStorage" not in config: + config["secretsStorage"] = {} + if "static" not in config["secretsStorage"]: + config["secretsStorage"]["static"] = {} + config["secretsStorage"]["static"]["swarmKey"] = swarm_key + log(LogLevel.INFO, "Set swarmKey in secretsStorage.static") + # Ensure destination directory exists dst_yaml.parent.mkdir(parents=True, exist_ok=True) diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_init.py b/src/rootfs/files/configs/pki-service/scripts/pki_init.py old mode 100644 new mode 100755 From 64f48374b244f2094a6728e593286151d56595fa Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 3 Feb 2026 13:42:58 -0600 Subject: [PATCH 40/51] pki sync draft --- src/Dockerfile | 17 +++++--- .../pki-service/conf/lxc-swarm-template.yaml | 2 +- .../pki-service/conf/secrets-config.yaml | 14 +++++++ .../pki-service/scripts/pki_configure.py | 8 +++- .../pki-service/scripts/pki_helpers.py | 30 ++++++++++++- .../pki-service/scripts/pki_watchdog.sh | 42 ------------------- .../systemd/pki-authority-sync.service | 15 +++++++ .../systemd/pki-authority-watchdog.service | 7 ---- .../systemd/pki-authority-watchdog.timer | 10 ----- .../pki-service/systemd/pki-authority.path | 13 ------ .../pki-service/systemd/pki-authority.service | 7 +++- .../systemd/pki-authority.service.d/init.conf | 6 --- .../files/scripts/install_sync_client.sh | 40 ++++++++++++++++++ .../files/scripts/setup_runtime_tools.sh | 2 +- 14 files changed, 123 insertions(+), 90 deletions(-) create mode 100644 src/rootfs/files/configs/pki-service/conf/secrets-config.yaml delete mode 100644 src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh create mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service delete mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service delete mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer delete mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority.path delete mode 100644 src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf create mode 100644 src/rootfs/files/scripts/install_sync_client.sh diff --git a/src/Dockerfile 
b/src/Dockerfile index 47d8eda2..feb6409e 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -256,15 +256,13 @@ ADD rootfs/files/configs/pki-service/scripts/*.py "${OUTPUTDIR}/usr/local/bin/pk ADD rootfs/files/configs/pki-service/scripts/*.sh "${OUTPUTDIR}/usr/local/bin/pki-authority/" ADD rootfs/files/configs/pki-service/systemd/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system" ADD rootfs/files/configs/pki-service/systemd/pki-authority.service "${OUTPUTDIR}/etc/systemd/system" -# ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service "${OUTPUTDIR}/etc/systemd/system" -# ADD rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system" -ADD rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf "${OUTPUTDIR}/etc/systemd/system/pki-authority.service.d/init.conf" -ADD rootfs/files/configs/pki-service/systemd/pki-authority.path "${OUTPUTDIR}/etc/systemd/system" +ADD rootfs/files/configs/pki-service/systemd/pki-authority-sync.service "${OUTPUTDIR}/etc/systemd/system" +RUN mkdir -p "${OUTPUTDIR}/etc/super/pki-authority-sync" +ADD rootfs/files/configs/pki-service/conf/secrets-config.yaml "${OUTPUTDIR}/etc/super/pki-authority-sync/secrets-config.yaml" RUN chmod +x "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.py "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.sh RUN ln -s /etc/systemd/system/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-init.service" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" -RUN ln -s /etc/systemd/system/pki-authority.path "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.path" -# RUN ln -s /etc/systemd/system/pki-authority-watchdog.timer "${OUTPUTDIR}/etc/systemd/system/timers.target.wants/pki-authority-watchdog.timer" +RUN ln -s /etc/systemd/system/pki-authority-sync.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-sync.service" ADD rootfs/files/configs/etc/multipath.conf.append /buildroot/files/configs/etc/multipath.conf.append ADD rootfs/files/configs/etc/sysctl.conf.append /buildroot/files/configs/etc/sysctl.conf.append @@ -366,6 +364,13 @@ RUN chmod +x ${OUTPUTDIR}/etc/swarm-cloud/services/*/main.py ADD rootfs/files/scripts/setup_runtime_tools.sh /buildroot/files/scripts/ RUN chmod +x /buildroot/files/scripts/setup_runtime_tools.sh RUN --security=insecure /buildroot/files/scripts/setup_runtime_tools.sh + +# install pki-sync-client npm package globally (requires python3-venv from setup_runtime_tools) +ARG PKI_SYNC_CLIENT_VERSION=2.0.2 +ADD rootfs/files/scripts/install_sync_client.sh /buildroot/files/scripts/ +RUN chmod +x /buildroot/files/scripts/install_sync_client.sh +RUN --security=insecure /buildroot/files/scripts/install_sync_client.sh "${PKI_SYNC_CLIENT_VERSION}" + # MongoDB (install official mongodb-org 7.0 via Jammy repository inside VM rootfs) ADD rootfs/files/scripts/install_mongodb.sh /buildroot/files/scripts/ RUN --security=insecure bash /buildroot/files/scripts/install_mongodb.sh diff --git a/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml b/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml index f1551f65..5f43bf65 100644 --- a/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml +++ b/src/rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml @@ -21,7 +21,7 @@ pki: ownChallenge: type: tdx certParams: - ocspUrl: 
https://ocsp.certs.superprotocol.com/v1/ocsp + ocspUrl: '' keyStorage: type: trusted storage: diff --git a/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml new file mode 100644 index 00000000..8c92a47e --- /dev/null +++ b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml @@ -0,0 +1,14 @@ +# PKI Sync Client - Secrets Configuration +secrets: + - secretName: auth_token + saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/auth_token + - secretName: basic_certificate + saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/basic_certificate + - secretName: basic_privateKey + saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/basic_privateKey + - secretName: lite_certificate + saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/lite_certificate + - secretName: lite_privateKey + saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/lite_privateKey + - secretName: swarmKey + saveTo: /etc/swarm/swarm.key \ No newline at end of file diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py index bca2a801..771a903a 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -36,7 +36,13 @@ def main(): log(LogLevel.INFO, f"Network type: {network_type}") try: - pki_domain = get_pki_authority_param("domain") + try: + pki_domain = get_pki_authority_param("domain") + except (FileNotFoundError, ValueError) as e: + log(LogLevel.WARN, f"Failed to read domain from config: {e}") + pki_domain = "localhost" + log(LogLevel.INFO, f"Using default domain: {pki_domain}") + network_key = get_pki_authority_param("networkKey") # Get or generate swarm key based on VM mode diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index 55015372..9b0ba3b9 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -27,7 +27,7 @@ STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" -SWARM_KEY_FILE = "/etc/swarm.key" +SWARM_KEY_FILE = "/etc/swarm/swarm.key" class LogLevel(Enum): @@ -355,6 +355,11 @@ def generate_swarm_key() -> str: swarm_key = secrets.token_hex(32) # 32 bytes = 64 hex characters try: + # Ensure directory exists + if not swarm_key_path.parent.exists(): + swarm_key_path.parent.mkdir(parents=True, exist_ok=True) + log(LogLevel.INFO, f"Created directory {swarm_key_path.parent}") + with open(swarm_key_path, "w", encoding="utf-8") as file: file.write(swarm_key) @@ -718,6 +723,29 @@ def setup_iptables(): description="Allow TCP port 8081 on lxcbr0" ) + # Rule 4: DNAT external port 8443 to container port 443 + ensure_iptables_rule( + check_args=[ + "iptables", "-t", "nat", "-C", "PREROUTING", + "-i", "enp0s1", + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + add_args=[ + "iptables", "-t", "nat", "-A", "PREROUTING", + "-i", "enp0s1", + "-p", "tcp", + "--dport", PKI_SERVICE_EXTERNAL_PORT, + "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, + "-j", "DNAT", + "--to-destination", f"{CONTAINER_IP}:443" + ], + description=f"PKI 
external access: enp0s1:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" + ) + def update_pccs_url(): """Update PCCS URL in QCNL configuration.""" diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh b/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh deleted file mode 100644 index 49932da6..00000000 --- a/src/rootfs/files/configs/pki-service/scripts/pki_watchdog.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -SERVICE="pki-authority.service" -MIN_UPTIME_SECONDS=180 # 3 minutes - -# Check if service is enabled -if ! systemctl is-enabled --quiet "$SERVICE"; then - exit 0 -fi - -# Check if service is active -if ! systemctl is-active --quiet "$SERVICE"; then - exit 0 -fi - -# Get service activation time (unix timestamp) -ACTIVE_ENTER=$(systemctl show -p ActiveEnterTimestamp --value "$SERVICE") -if [ -z "$ACTIVE_ENTER" ] || [ "$ACTIVE_ENTER" = "n/a" ]; then - exit 0 -fi - -# Convert to unix timestamp -ACTIVE_ENTER_SEC=$(date -d "$ACTIVE_ENTER" +%s 2>/dev/null) -if [ -z "$ACTIVE_ENTER_SEC" ]; then - exit 0 -fi - -# Get current time -CURRENT_SEC=$(date +%s) - -# Calculate uptime in seconds -UPTIME_SEC=$((CURRENT_SEC - ACTIVE_ENTER_SEC)) - -# If uptime is less than 3 minutes - exit -if [ "$UPTIME_SEC" -lt "$MIN_UPTIME_SECONDS" ]; then - exit 0 -fi - -# Run healthcheck -/usr/bin/python3 /usr/local/bin/pki-authority/pki_healthcheck.py || { - /usr/bin/systemctl restart "$SERVICE" -} diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service new file mode 100644 index 00000000..f521827b --- /dev/null +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service @@ -0,0 +1,15 @@ +[Unit] +Description=PKI authority sync client service +ConditionKernelCommandLine=!vm_mode=swarm-init +After=network-online.target pki-authority-init.service +Wants=network-online.target + +[Service] +Type=oneshot +Environment=NODE_ENV=production +ExecStart=/usr/bin/node /usr/bin/pki-sync-client sync --config /etc/super/pki-authority-sync/secrets-config.yaml --swarm-env /sp/swarm/swarm-env.yaml -v --disable-server-identity-check +Restart=on-failure +RestartSec=10s + +[Install] +WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service deleted file mode 100644 index 5129e845..00000000 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.service +++ /dev/null @@ -1,7 +0,0 @@ -[Unit] -Description=PKI Authority Watchdog - -[Service] -Type=oneshot -TimeoutStartSec=1min -ExecStart=/usr/local/bin/pki-authority/pki_watchdog.sh diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer b/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer deleted file mode 100644 index bc9af0b1..00000000 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority-watchdog.timer +++ /dev/null @@ -1,10 +0,0 @@ -[Unit] -Description=PKI Authority Watchdog Timer - - -[Timer] -OnUnitActiveSec=5min -Unit=pki-authority-watchdog.service - -[Install] -WantedBy=timers.target diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority.path b/src/rootfs/files/configs/pki-service/systemd/pki-authority.path deleted file mode 100644 index a142b4c1..00000000 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority.path +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description=Wait for Swarm 
configuration files for PKI authority -ConditionKernelCommandLine=!vm_mode=swarm-init - -[Path] -# Wait for these files to appear before starting pki-authority.service -PathExists=/run/swarm/join.token -PathExists=/run/swarm/manager.addr -# Add more paths here as needed -Unit=pki-authority.service - -[Install] -WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service index a51bb1b1..d4fba21e 100644 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service @@ -1,8 +1,8 @@ [Unit] Description=PKI authority lxc-container -After=pki-authority-init.service nvidia-persistenced.service pccs.service +After=pki-authority-init.service pki-authority-sync.service nvidia-persistenced.service pccs.service Requires=pki-authority-init.service pccs.service -Wants=nvidia-persistenced.service +Wants=nvidia-persistenced.service pki-authority-sync.service [Service] Type=simple @@ -17,3 +17,6 @@ ExecStart=/usr/bin/lxc-start -n pki-authority -F ExecStop=/usr/bin/lxc-stop -n pki-authority KillMode=control-group + +[Install] +WantedBy=multi-user.target diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf deleted file mode 100644 index 5b3ad47c..00000000 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service.d/init.conf +++ /dev/null @@ -1,6 +0,0 @@ -[Unit] -# Auto-start only in swarm-init mode -ConditionKernelCommandLine=vm_mode=swarm-init - -[Install] -WantedBy=multi-user.target diff --git a/src/rootfs/files/scripts/install_sync_client.sh b/src/rootfs/files/scripts/install_sync_client.sh new file mode 100644 index 00000000..f30a53e5 --- /dev/null +++ b/src/rootfs/files/scripts/install_sync_client.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# bash unofficial strict mode; +set -euo pipefail; + +# public, required +# OUTPUTDIR + +# public, optional +# $1 - PKI_SYNC_CLIENT_VERSION - version to install, if not set - installs latest + +# private +BUILDROOT="/buildroot"; + +# init loggggging; +source "${BUILDROOT}/files/scripts/log.sh"; + +# chroot functions +source "${BUILDROOT}/files/scripts/chroot.sh"; + +function install_sync_client() { + local PKI_SYNC_CLIENT_VERSION="${1:-}"; + local PACKAGE_NAME="@super-protocol/pki-sync-client"; + local PACKAGE_SPEC="${PACKAGE_NAME}"; + + if [ -n "${PKI_SYNC_CLIENT_VERSION}" ]; then + PACKAGE_SPEC="${PACKAGE_NAME}@${PKI_SYNC_CLIENT_VERSION}"; + log_info "installing ${PACKAGE_SPEC} npm package globally"; + else + PACKAGE_SPEC="${PACKAGE_NAME}@latest"; + log_info "installing ${PACKAGE_SPEC} npm package globally"; + fi + + chroot "${OUTPUTDIR}" /bin/bash -c "npm install -g ${PACKAGE_SPEC}"; + log_info "${PACKAGE_SPEC} installed successfully"; +} + +chroot_init; +install_sync_client "${1:-}"; +chroot_deinit; diff --git a/src/rootfs/files/scripts/setup_runtime_tools.sh b/src/rootfs/files/scripts/setup_runtime_tools.sh index 82c4d288..fbee705b 100644 --- a/src/rootfs/files/scripts/setup_runtime_tools.sh +++ b/src/rootfs/files/scripts/setup_runtime_tools.sh @@ -20,7 +20,7 @@ function setup_runtime_tools() { log_info "installing runtime packages into rootfs (python3, redis, mysql client, openssl, netcat, dns tools)" chroot "${OUTPUTDIR}" /usr/bin/apt update chroot "${OUTPUTDIR}" /usr/bin/apt install -y --no-install-recommends \ - mysql-client python3 
python3-pip redis-server redis-tools openssl netcat-openbsd dnsutils + mysql-client python3 python3-pip python3-venv redis-server redis-tools openssl netcat-openbsd dnsutils chroot "${OUTPUTDIR}" /usr/bin/apt clean log_info "installing Python runtime dependencies" From 9768db8ae86c0ae272cb64046d6257f675bedc14 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 4 Feb 2026 05:32:05 -0600 Subject: [PATCH 41/51] sync network type --- .../pki-service/scripts/pki_configure.py | 36 ++++++++- .../pki-service/scripts/pki_helpers.py | 80 ++++++++++++++++--- 2 files changed, 102 insertions(+), 14 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py index 771a903a..eeb1faaa 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -13,7 +13,8 @@ log, LogLevel, detect_cpu_type, detect_vm_mode, detect_network_type, patch_yaml_config, patch_lxc_config, get_pki_authority_param, setup_iptables, update_pccs_url, generate_swarm_key, load_swarm_key, - PKI_SERVICE_NAME, VMMode + read_network_type_from_certificate, + PKI_SERVICE_NAME, VMMode, NetworkType, STORAGE_PATH ) @@ -29,11 +30,40 @@ def main(): # Detect environment cpu_type = detect_cpu_type() vm_mode = detect_vm_mode() - network_type = detect_network_type() log(LogLevel.INFO, f"CPU type: {cpu_type}") log(LogLevel.INFO, f"VM mode: {vm_mode.value}") - log(LogLevel.INFO, f"Network type: {network_type}") + + # Network type detection based on VM mode + if vm_mode == VMMode.SWARM_INIT: + # In swarm-init mode: read from kernel cmdline + network_type = detect_network_type() + log(LogLevel.INFO, f"Network type (from cmdline): {network_type.value}") + else: + # In swarm-normal mode: verify required files exist in swarm-storage + # These files should be synced by pki-authority-sync.service before this script runs + required_files = [ + "auth_token", + "basic_certificate", + "basic_privateKey", + "lite_certificate", + "lite_privateKey" + ] + + missing_files = [f for f in required_files if not (STORAGE_PATH / f).exists()] + if missing_files: + error_msg = ( + f"Required files missing in {STORAGE_PATH}: {', '.join(missing_files)}. " + "These files should be synced by pki-authority-sync.service before this script runs." 
+ ) + log(LogLevel.ERROR, error_msg) + sys.exit(1) + + log(LogLevel.INFO, "All required swarm-storage files are present") + + # Read network type from certificate OID + network_type = read_network_type_from_certificate() + log(LogLevel.INFO, f"Network type (from certificate): {network_type.value}") try: try: diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index 9b0ba3b9..e99a94c5 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -16,6 +16,8 @@ from typing import List, Optional import yaml +from cryptography import x509 +from cryptography.x509.oid import ObjectIdentifier PKI_SERVICE_NAME = "pki-authority" SERVICE_INSIDE_CONTAINER = "tee-pki" @@ -28,6 +30,7 @@ IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" SWARM_KEY_FILE = "/etc/swarm/swarm.key" +OID_CUSTOM_EXTENSION_NETWORK_TYPE = "1.3.6.1.3.8888.4" class LogLevel(Enum): @@ -49,6 +52,13 @@ class VMMode(Enum): SWARM_INIT = "swarm-init" SWARM_NORMAL = "swarm-normal" + +class NetworkType(Enum): + """Network type types.""" + TRUSTED = "trusted" + UNTRUSTED = "untrusted" + + class LXCContainer: """Manager for LXC container operations.""" @@ -233,21 +243,69 @@ def detect_vm_mode() -> VMMode: return VMMode.SWARM_NORMAL -def detect_network_type() -> str: +def detect_network_type() -> NetworkType: """Detect network type from kernel command line. Returns: - 'untrusted' if allow_untrusted=true is present in cmdline, otherwise 'trusted'. + NetworkType.UNTRUSTED if allow_untrusted=true is present in cmdline, + otherwise NetworkType.TRUSTED. """ try: with open("/proc/cmdline", "r", encoding="utf-8") as file: cmdline = file.read() if "allow_untrusted=true" in cmdline: - return "untrusted" - return "trusted" + return NetworkType.UNTRUSTED + return NetworkType.TRUSTED except FileNotFoundError: - return "trusted" + return NetworkType.TRUSTED + + +def read_network_type_from_certificate(cert_path: Path = STORAGE_PATH / "basic_certificate") -> NetworkType: + """Read network type from certificate's custom OID extension. + + Args: + cert_path: Path to PEM certificate file. + + Returns: + NetworkType.TRUSTED or NetworkType.UNTRUSTED based on OID 1.3.6.1.3.8888.4 value. + Defaults to NetworkType.TRUSTED if OID is not present or has other value. 
+ """ + try: + if not cert_path.exists(): + error_msg = f"Certificate not found at {cert_path}" + log(LogLevel.ERROR, error_msg) + raise FileNotFoundError(error_msg) + + with open(cert_path, "rb") as f: + cert = x509.load_pem_x509_certificate(f.read()) + + # Custom OID for network type + network_type_oid = ObjectIdentifier(OID_CUSTOM_EXTENSION_NETWORK_TYPE) + + try: + # Try to get the extension by OID + extension = cert.extensions.get_extension_for_oid(network_type_oid) + # Extension value is typically ASN.1 encoded, get raw value + value = extension.value.value.decode('utf-8').strip() + + if value == NetworkType.TRUSTED.value: + log(LogLevel.INFO, f"Network type from certificate OID: {value}") + return NetworkType.TRUSTED + elif value == NetworkType.UNTRUSTED.value: + log(LogLevel.INFO, f"Network type from certificate OID: {value}") + return NetworkType.UNTRUSTED + else: + log(LogLevel.WARN, f"Unknown network type value '{value}' in OID, defaulting to trusted") + return NetworkType.TRUSTED + + except x509.ExtensionNotFound: + log(LogLevel.INFO, f"OID {OID_CUSTOM_EXTENSION_NETWORK_TYPE} not found in certificate, defaulting to trusted") + return NetworkType.TRUSTED + + except Exception as e: + log(LogLevel.ERROR, f"Error reading certificate: {e}, defaulting to trusted") + return NetworkType.TRUSTED def read_yaml_config_param(param_path: str) -> Optional[str]: @@ -418,7 +476,7 @@ def patch_yaml_config( cpu_type: str, vm_mode: VMMode, pki_domain: str, - network_type: str, + network_type: NetworkType, network_key_hash: str, swarm_key: str ): @@ -450,10 +508,10 @@ def patch_yaml_config( # For untrusted, generate random deviceIdHex (32 bytes) if cpu_type == "untrusted": # Check if untrusted CPU type is running in trusted network - if network_type != "untrusted": + if network_type != NetworkType.UNTRUSTED: error_msg = ( "Cannot run untrusted machine in trusted network. 
" - f"CPU type: {cpu_type}, Network type: {network_type}" + f"CPU type: {cpu_type}, Network type: {network_type.value}" ) log(LogLevel.ERROR, error_msg) raise ValueError(error_msg) @@ -463,7 +521,7 @@ def patch_yaml_config( log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") # Add 'untrusted' to allowedChallenges if network type is untrusted - if network_type == "untrusted": + if network_type == NetworkType.UNTRUSTED: if "allowedChallenges" not in config["pki"]: config["pki"]["allowedChallenges"] = [] if "untrusted" not in config["pki"]["allowedChallenges"]: @@ -486,12 +544,12 @@ def patch_yaml_config( # Set networkSettings if network_type and network_key_hash: config["pki"]["mode"]["networkSettings"] = { - "networkType": network_type, + "networkType": network_type.value, "networkKeyHashHex": network_key_hash } log( LogLevel.INFO, - f"Set networkSettings: networkType={network_type}, " + f"Set networkSettings: networkType={network_type.value}, " f"networkKeyHashHex={network_key_hash}" ) From 0544f94f4a726ac3d986976e0ffe87517a5937d8 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 4 Feb 2026 10:40:10 -0600 Subject: [PATCH 42/51] gatekeer certs --- src/Dockerfile | 4 +- .../pki-service/scripts/pki_helpers.py | 39 +++++++++++++++++-- .../systemd/pki-authority-sync.service | 8 +++- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index feb6409e..f14feca8 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -259,7 +259,7 @@ ADD rootfs/files/configs/pki-service/systemd/pki-authority.service "${OUTPUTDIR} ADD rootfs/files/configs/pki-service/systemd/pki-authority-sync.service "${OUTPUTDIR}/etc/systemd/system" RUN mkdir -p "${OUTPUTDIR}/etc/super/pki-authority-sync" ADD rootfs/files/configs/pki-service/conf/secrets-config.yaml "${OUTPUTDIR}/etc/super/pki-authority-sync/secrets-config.yaml" -RUN chmod +x "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.py "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.sh +RUN chmod +x "${OUTPUTDIR}"/usr/local/bin/pki-authority/*.py RUN ln -s /etc/systemd/system/pki-authority-init.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-init.service" RUN ln -s /etc/systemd/system/pki-authority.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority.service" RUN ln -s /etc/systemd/system/pki-authority-sync.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/pki-authority-sync.service" @@ -366,7 +366,7 @@ RUN chmod +x /buildroot/files/scripts/setup_runtime_tools.sh RUN --security=insecure /buildroot/files/scripts/setup_runtime_tools.sh # install pki-sync-client npm package globally (requires python3-venv from setup_runtime_tools) -ARG PKI_SYNC_CLIENT_VERSION=2.0.2 +ARG PKI_SYNC_CLIENT_VERSION=2.0.3 ADD rootfs/files/scripts/install_sync_client.sh /buildroot/files/scripts/ RUN chmod +x /buildroot/files/scripts/install_sync_client.sh RUN --security=insecure /buildroot/files/scripts/install_sync_client.sh "${PKI_SYNC_CLIENT_VERSION}" diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index e99a94c5..a311c051 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -26,6 +26,7 @@ PKI_SERVICE_EXTERNAL_PORT = "8443" CONTAINER_IP = "10.0.3.100" WIREGUARD_INTERFACE = "wg0" +EXTERNAL_INTERFACE = "enp0s1" # Default external network interface STORAGE_PATH = 
Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" @@ -668,6 +669,37 @@ def enable_route_localnet(bridge_name: str): log(LogLevel.INFO, f"Enabled route_localnet for {bridge_name}") +def get_external_interface() -> str: + """Detect external network interface from default route. + + Returns: + Name of the external network interface used for default route. + Falls back to EXTERNAL_INTERFACE constant if detection fails. + """ + try: + # Get default route interface + result = subprocess.run( + ["ip", "route", "show", "default"], + capture_output=True, + text=True, + check=False + ) + + if result.returncode == 0 and result.stdout: + # Parse output like: "default via 192.168.1.1 dev enp0s1 proto dhcp metric 100" + match = re.search(r'dev\s+(\S+)', result.stdout) + if match: + interface = match.group(1) + log(LogLevel.INFO, f"Detected external interface from default route: {interface}") + return interface + + log(LogLevel.WARN, f"Could not detect external interface, using default: {EXTERNAL_INTERFACE}") + return EXTERNAL_INTERFACE + except Exception as error: # pylint: disable=broad-exception-caught + log(LogLevel.WARN, f"Failed to detect external interface: {error}, using default: {EXTERNAL_INTERFACE}") + return EXTERNAL_INTERFACE + + def delete_iptables_rules(): """Delete all iptables rules for PKI container (NAT and filter tables).""" # Delete rules from NAT table chains: PREROUTING, OUTPUT, POSTROUTING @@ -717,6 +749,7 @@ def ensure_iptables_rule(check_args: List[str], add_args: List[str], description def setup_iptables(): """Setup iptables NAT rules for LXC container access to host services.""" host_ip = get_bridge_ip(BRIDGE_NAME) + external_interface = get_external_interface() enable_route_localnet(BRIDGE_NAME) @@ -785,7 +818,7 @@ def setup_iptables(): ensure_iptables_rule( check_args=[ "iptables", "-t", "nat", "-C", "PREROUTING", - "-i", "enp0s1", + "-i", external_interface, "-p", "tcp", "--dport", PKI_SERVICE_EXTERNAL_PORT, "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, @@ -794,14 +827,14 @@ def setup_iptables(): ], add_args=[ "iptables", "-t", "nat", "-A", "PREROUTING", - "-i", "enp0s1", + "-i", external_interface, "-p", "tcp", "--dport", PKI_SERVICE_EXTERNAL_PORT, "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, "-j", "DNAT", "--to-destination", f"{CONTAINER_IP}:443" ], - description=f"PKI external access: enp0s1:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" + description=f"PKI external access: {external_interface}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" ) diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service index f521827b..a1f8e363 100644 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service @@ -7,7 +7,13 @@ Wants=network-online.target [Service] Type=oneshot Environment=NODE_ENV=production -ExecStart=/usr/bin/node /usr/bin/pki-sync-client sync --config /etc/super/pki-authority-sync/secrets-config.yaml --swarm-env /sp/swarm/swarm-env.yaml -v --disable-server-identity-check +ExecStart=/usr/bin/node /usr/bin/pki-sync-client sync \ + --config /etc/super/pki-authority-sync/secrets-config.yaml \ + --swarm-env /sp/swarm/swarm-env.yaml \ + --save-certs-to-dir /etc/super/certs/ \ + --save-certs-prefix gatekeeper \ + --disable-server-identity-check \ + -v 
Restart=on-failure RestartSec=10s From bd730886016d600bded9fe9985a0cdfc4d9bb995 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Wed, 4 Feb 2026 13:05:48 -0600 Subject: [PATCH 43/51] use aes encryption for swarm-db --- src/Dockerfile | 2 + src/repos/swarm-db | 2 +- .../etc/systemd/system/swarm-db.service | 10 ++- .../files/scripts/prepare_swarm_db_config.py | 83 +++++++++++++++++++ 4 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 src/rootfs/files/scripts/prepare_swarm_db_config.py diff --git a/src/Dockerfile b/src/Dockerfile index f14feca8..333a071b 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -279,6 +279,8 @@ RUN ln -sf /etc/systemd/system/hardening-vm.service "${OUTPUTDIR}/etc/systemd/sy # swarm services ADD rootfs/files/configs/etc/systemd/system/swarm-db.service ${OUTPUTDIR}/etc/systemd/system/swarm-db.service RUN ln -sf /etc/systemd/system/swarm-db.service "${OUTPUTDIR}/etc/systemd/system/multi-user.target.wants/swarm-db.service" +ADD rootfs/files/scripts/prepare_swarm_db_config.py ${OUTPUTDIR}/usr/local/bin/prepare_swarm_db_config.py +RUN chmod +x ${OUTPUTDIR}/usr/local/bin/prepare_swarm_db_config.py ADD rootfs/files/configs/usr/local/bin/swarm-cloud-api.sh ${OUTPUTDIR}/usr/local/bin/swarm-cloud-api.sh ADD rootfs/files/configs/etc/systemd/system/swarm-node.service ${OUTPUTDIR}/etc/systemd/system/swarm-node.service ADD rootfs/files/configs/usr/local/bin/swarm-node.sh ${OUTPUTDIR}/usr/local/bin/swarm-node.sh diff --git a/src/repos/swarm-db b/src/repos/swarm-db index bc8d0afb..f9899571 160000 --- a/src/repos/swarm-db +++ b/src/repos/swarm-db @@ -1 +1 @@ -Subproject commit bc8d0afbb78ac2153443677e7fdf4969ae29f119 +Subproject commit f9899571f670505a85486288cb1e255c2d16a870 diff --git a/src/rootfs/files/configs/etc/systemd/system/swarm-db.service b/src/rootfs/files/configs/etc/systemd/system/swarm-db.service index 5a621707..36f7c7fb 100644 --- a/src/rootfs/files/configs/etc/systemd/system/swarm-db.service +++ b/src/rootfs/files/configs/etc/systemd/system/swarm-db.service @@ -1,7 +1,7 @@ [Unit] Description=Swarm DB service -After=network-online.target local-fs.target -Wants=network-online.target +After=network-online.target local-fs.target pki-authority.service +Wants=network-online.target pki-authority.service RequiresMountsFor=/var /var/lib /var/lib/swarm-db ConditionPathExists=/usr/local/bin/swarm-db-linux-amd64 ConditionPathExists=/sp/swarm/node-db.yaml @@ -10,7 +10,11 @@ ConditionPathExists=/sp/swarm/node-db.yaml Type=simple WorkingDirectory=/ ExecStartPre=mkdir -p /var/lib/swarm-db/data -ExecStart=/usr/local/bin/swarm-db-linux-amd64 -config /sp/swarm/node-db.yaml +ExecStartPre=/usr/local/bin/prepare_swarm_db_config.py \ + --base-config /sp/swarm/node-db.yaml \ + --key-file /etc/swarm/swarm.key \ + --output-config /etc/swarm/swarm-db-config.yaml +ExecStart=/usr/local/bin/swarm-db-linux-amd64 -config /etc/swarm/swarm-db-config.yaml StandardOutput=append:/var/log/swarm-db.log StandardError=append:/var/log/swarm-db-err.log Restart=always diff --git a/src/rootfs/files/scripts/prepare_swarm_db_config.py b/src/rootfs/files/scripts/prepare_swarm_db_config.py new file mode 100644 index 00000000..bb3c5b54 --- /dev/null +++ b/src/rootfs/files/scripts/prepare_swarm_db_config.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Prepare swarm-db configuration by adding encryption key from swarm.key +""" + +import sys +import re +import yaml +import argparse +from pathlib import Path + + +def prepare_swarm_db_config(base_config_path: str, key_path: str, output_path: str) 
-> None: + """ + Read base config, add encryption section with key from key_path, save to output_path + + Args: + base_config_path: Path to base node-db.yaml template + key_path: Path to swarm.key file (must contain 64-char hex string) + output_path: Path to save final config + """ + # Check if key file exists + if not Path(key_path).exists(): + raise FileNotFoundError(f"Encryption key file not found: {key_path}") + + # Read base configuration + with open(base_config_path, 'r') as f: + config = yaml.safe_load(f) + + # Read and validate encryption key + with open(key_path, 'r') as f: + encryption_key = f.read().strip() + + # Validate key format: must be 64 hex characters + if not re.match(r'^[0-9a-fA-F]{64}$', encryption_key): + raise ValueError( + f"Invalid key format: must be 64 hex characters (0-9, a-f, A-F), " + f"got {len(encryption_key)} characters" + ) + + # Add encryption section to memberlist + if 'memberlist' not in config: + config['memberlist'] = {} + + config['memberlist']['encryption'] = { + 'mode': 'static', + 'static_value': encryption_key + } + + # Save final configuration + with open(output_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + print(f"Swarm DB config prepared successfully: {output_path}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Prepare swarm-db configuration by adding encryption key' + ) + parser.add_argument( + '--base-config', + required=True, + help='Path to base node-db.yaml template' + ) + parser.add_argument( + '--key-file', + required=True, + help='Path to swarm.key file (64-char hex string)' + ) + parser.add_argument( + '--output-config', + required=True, + help='Path to save final configuration' + ) + + args = parser.parse_args() + + try: + prepare_swarm_db_config(args.base_config, args.key_file, args.output_config) + except Exception as e: + print(f"Error preparing swarm-db config: {e}", file=sys.stderr) + sys.exit(1) From 8229d1cf27d98d45234e81b79b204cef36df8f17 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 5 Feb 2026 09:53:08 -0600 Subject: [PATCH 44/51] networkKey->networkID, do not sync auth_token --- src/Dockerfile | 4 ++-- .../pki-service/conf/secrets-config.yaml | 2 -- .../pki-service/scripts/pki_configure.py | 6 ++--- .../pki-service/scripts/pki_helpers.py | 23 ++++++++++--------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index 333a071b..528b6188 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -247,7 +247,7 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21253804163 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21716071201 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml" ADD rootfs/files/configs/pki-service/conf/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf" ADD rootfs/files/configs/pki-service/conf/lxc-net "${OUTPUTDIR}/etc/default/lxc-net" @@ -368,7 +368,7 @@ RUN chmod +x /buildroot/files/scripts/setup_runtime_tools.sh RUN --security=insecure 
/buildroot/files/scripts/setup_runtime_tools.sh # install pki-sync-client npm package globally (requires python3-venv from setup_runtime_tools) -ARG PKI_SYNC_CLIENT_VERSION=2.0.3 +ARG PKI_SYNC_CLIENT_VERSION=2.0.4 ADD rootfs/files/scripts/install_sync_client.sh /buildroot/files/scripts/ RUN chmod +x /buildroot/files/scripts/install_sync_client.sh RUN --security=insecure /buildroot/files/scripts/install_sync_client.sh "${PKI_SYNC_CLIENT_VERSION}" diff --git a/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml index 8c92a47e..6c719c9f 100644 --- a/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml +++ b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml @@ -1,7 +1,5 @@ # PKI Sync Client - Secrets Configuration secrets: - - secretName: auth_token - saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/auth_token - secretName: basic_certificate saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/basic_certificate - secretName: basic_privateKey diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py index eeb1faaa..ee4addd9 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -4,7 +4,6 @@ Configures the container with network, device access, and runtime settings. """ -import hashlib import sys from pathlib import Path @@ -43,7 +42,6 @@ def main(): # In swarm-normal mode: verify required files exist in swarm-storage # These files should be synced by pki-authority-sync.service before this script runs required_files = [ - "auth_token", "basic_certificate", "basic_privateKey", "lite_certificate", @@ -73,7 +71,7 @@ def main(): pki_domain = "localhost" log(LogLevel.INFO, f"Using default domain: {pki_domain}") - network_key = get_pki_authority_param("networkKey") + network_id = get_pki_authority_param("networkID") # Get or generate swarm key based on VM mode if vm_mode == VMMode.SWARM_INIT: @@ -91,7 +89,7 @@ def main(): vm_mode=vm_mode, network_type=network_type, pki_domain=pki_domain, - network_key_hash=hashlib.sha256(network_key.encode()).hexdigest(), + network_id=network_id, swarm_key=swarm_key ) log(LogLevel.INFO, "YAML config patched successfully") diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index a311c051..d67be515 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -478,7 +478,7 @@ def patch_yaml_config( vm_mode: VMMode, pki_domain: str, network_type: NetworkType, - network_key_hash: str, + network_id: str, swarm_key: str ): """Set own challenge type in LXC container configuration.""" @@ -543,16 +543,17 @@ def patch_yaml_config( log(LogLevel.INFO, f"Set swarmMode to: {mode_value}") # Set networkSettings - if network_type and network_key_hash: - config["pki"]["mode"]["networkSettings"] = { - "networkType": network_type.value, - "networkKeyHashHex": network_key_hash - } - log( - LogLevel.INFO, - f"Set networkSettings: networkType={network_type.value}, " - f"networkKeyHashHex={network_key_hash}" - ) + if network_type or network_id: + if "networkSettings" not in config["pki"]["mode"]: + config["pki"]["mode"]["networkSettings"] = {} + + if network_type: + config["pki"]["mode"]["networkSettings"]["networkType"] = network_type.value + 
log(LogLevel.INFO, f"Set networkSettings.networkType: {network_type.value}") + + if network_id: + config["pki"]["mode"]["networkSettings"]["networkID"] = network_id + log(LogLevel.INFO, f"Set networkSettings.networkID: {network_id}") # Set secretsStorage with swarmKey if swarm_key: From da60aade0387d6d76a89eceed64eed89425264d8 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 5 Feb 2026 10:03:50 -0600 Subject: [PATCH 45/51] small fixes --- .../pki-service/conf/secrets-config.yaml | 2 +- .../configs/usr/local/bin/hardening-vm.sh | 2 +- src/services/apps/pki-authority/helpers.py | 793 ------------------ src/services/apps/pki-authority/main.py | 761 ----------------- src/services/apps/pki-authority/manifest.yaml | 104 --- 5 files changed, 2 insertions(+), 1660 deletions(-) delete mode 100644 src/services/apps/pki-authority/helpers.py delete mode 100755 src/services/apps/pki-authority/main.py delete mode 100644 src/services/apps/pki-authority/manifest.yaml diff --git a/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml index 6c719c9f..f0e811e4 100644 --- a/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml +++ b/src/rootfs/files/configs/pki-service/conf/secrets-config.yaml @@ -9,4 +9,4 @@ secrets: - secretName: lite_privateKey saveTo: /var/lib/lxc/pki-authority/rootfs/app/swarm-storage/lite_privateKey - secretName: swarmKey - saveTo: /etc/swarm/swarm.key \ No newline at end of file + saveTo: /etc/swarm/swarm.key diff --git a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh index f0afce04..c148b9b3 100755 --- a/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh +++ b/src/rootfs/files/configs/usr/local/bin/hardening-vm.sh @@ -44,4 +44,4 @@ if grep -q 'sp-debug=true' /proc/cmdline; then iptables -A INPUT -p tcp --dport 22 -j ACCEPT systemctl start ssh -fi \ No newline at end of file +fi diff --git a/src/services/apps/pki-authority/helpers.py b/src/services/apps/pki-authority/helpers.py deleted file mode 100644 index b8b408c3..00000000 --- a/src/services/apps/pki-authority/helpers.py +++ /dev/null @@ -1,793 +0,0 @@ -#!/usr/bin/env python3 -""" -PKI Authority LXC container management helpers. 
-""" - -import os -import re -import secrets -import shutil -import subprocess -import sys -import urllib.request -from datetime import datetime -from enum import Enum -from pathlib import Path -from typing import List, Optional - -import yaml - -PKI_SERVICE_NAME = "pki-authority" -SERVICE_INSIDE_CONTAINER = "tee-pki" -BRIDGE_NAME = "lxcbr0" -PCCS_PORT = "8081" -PKI_SERVICE_EXTERNAL_PORT = "8443" -CONTAINER_IP = "10.0.3.100" -WIREGUARD_INTERFACE = "wg0" -STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") -IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" -SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" - - -class LogLevel(Enum): - """Log levels for structured logging.""" - INFO = "INFO" - WARN = "WARN" - ERROR = "ERROR" - DEBUG = "DEBUG" - - -def log(level: LogLevel, message: str): - """Log message with timestamp, service name and level.""" - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print(f"[{timestamp}] [{PKI_SERVICE_NAME}] [{level.value}] {message}", file=sys.stderr) - - -class VMMode(Enum): - """VM mode types.""" - SWARM_INIT = "swarm-init" - SWARM_NORMAL = "swarm-normal" - -class LXCContainer: - """Manager for LXC container operations.""" - - def __init__(self, container_name: str = PKI_SERVICE_NAME): - self.container_name = container_name - - def start(self, timeout: int = 30) -> int: - """Start LXC container. Returns exit code.""" - log(LogLevel.INFO, f"Starting LXC container {self.container_name}") - result = subprocess.run( - ["lxc-start", "-n", self.container_name], - capture_output=True, - text=True, - timeout=timeout, - check=False - ) - return result.returncode - - def stop(self, graceful_timeout: int = 30, command_timeout: int = 60) -> int: - """Stop LXC container gracefully. Returns exit code.""" - log(LogLevel.INFO, f"Stopping LXC container {self.container_name} gracefully") - result = subprocess.run( - ["lxc-stop", "-n", self.container_name, "-t", str(graceful_timeout)], - capture_output=True, - text=True, - timeout=command_timeout, - check=False - ) - return result.returncode - - def destroy(self) -> int: - """Destroy LXC container. 
Returns exit code.""" - log(LogLevel.INFO, f"Destroying LXC container {self.container_name}") - result = subprocess.run( - ["lxc-destroy", "-n", self.container_name, "-f"], - capture_output=True, - text=True, - timeout=60, - check=False - ) - - if result.returncode != 0: - log(LogLevel.ERROR, f"Failed to destroy container: {result.stderr}") - - return result.returncode - - def is_running(self) -> bool: - """Check if LXC container is running.""" - try: - result = subprocess.run( - ["lxc-ls", "--running"], - capture_output=True, - text=True, - check=False - ) - if self.container_name not in result.stdout: - log(LogLevel.INFO, f"LXC container {self.container_name} is not running") - return False - return True - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.ERROR, f"Failed to check LXC container status: {error}") - return False - - def get_ip(self) -> Optional[str]: - """Get container IP address.""" - try: - result = subprocess.run( - ["lxc-info", "-n", self.container_name, "-iH"], - capture_output=True, - text=True, - check=False - ) - container_ip = result.stdout.strip() if result.stdout.strip() else None - return container_ip - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.ERROR, f"Failed to get container IP: {error}") - return None - - def create( - self, - archive_path: str = "/etc/super/containers/pki-authority/pki-authority.tar" - ) -> bool: - """Create LXC container if it doesn't exist. - - Returns True if created or already exists. - """ - # Check if container already exists - result = subprocess.run( - ["lxc-info", "-n", self.container_name], - capture_output=True, - text=True, - check=False - ) - - if result.returncode == 0: - log(LogLevel.INFO, f"Container '{self.container_name}' already exists.") - return True - - log(LogLevel.INFO, f"Container '{self.container_name}' not found. 
Creating...") - try: - subprocess.run( - [ - "lxc-create", - "-n", self.container_name, - "-t", "oci", - "--", - "--url", f"docker-archive:{archive_path}" - ], - check=True - ) - log(LogLevel.INFO, f"Container '{self.container_name}' created.") - return True - except subprocess.CalledProcessError as error: - log(LogLevel.ERROR, f"Failed to create container: {error}") - return False - - def is_service_healthy(self, healthcheck_url: str = "/healthcheck") -> bool: - """Check if service inside container is running and healthy.""" - try: - # Check service status inside container - result = subprocess.run( - [ - "lxc-attach", "-n", self.container_name, "--", - "systemctl", "is-active", SERVICE_INSIDE_CONTAINER - ], - capture_output=True, - text=True, - check=False - ) - status = result.stdout.strip() - - if status != "active": - log(LogLevel.INFO, f"Service {SERVICE_INSIDE_CONTAINER} status: {status}") - return False - - # Service is active, check healthcheck endpoint - container_ip = self.get_ip() - if not container_ip: - log(LogLevel.INFO, "Could not get container IP") - return False - - # Perform HTTP healthcheck - try: - req = urllib.request.Request(f"http://{container_ip}{healthcheck_url}") - with urllib.request.urlopen(req, timeout=5) as response: - if response.status == 200: - return True - - log( - LogLevel.INFO, - f"Healthcheck returned status: {response.status}" - ) - return False - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.INFO, f"Healthcheck failed: {error}") - return False - - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.ERROR, f"Failed to check service health: {error}") - return False - - -def detect_cpu_type() -> str: - """Detect CPU type based on available devices.""" - if Path("/dev/tdx_guest").is_char_device(): - return "tdx" - if Path("/dev/sev-guest").is_char_device(): - return "sev-snp" - return "untrusted" - - -def detect_vm_mode() -> VMMode: - """Detect VM mode from kernel command line.""" - try: - with open("/proc/cmdline", "r", encoding="utf-8") as file: - cmdline = file.read() - - if "vm_mode=swarm-init" in cmdline: - return VMMode.SWARM_INIT - return VMMode.SWARM_NORMAL - except FileNotFoundError: - return VMMode.SWARM_NORMAL - - -def detect_network_type() -> str: - """Detect network type from kernel command line. - - Returns: - 'untrusted' if allow_untrusted=true is present in cmdline, otherwise 'trusted'. - """ - try: - with open("/proc/cmdline", "r", encoding="utf-8") as file: - cmdline = file.read() - - if "allow_untrusted=true" in cmdline: - return "untrusted" - return "trusted" - except FileNotFoundError: - return "trusted" - - -def read_yaml_config_param(param_path: str) -> Optional[str]: - """Read parameter from container's yaml configuration. - - Args: - param_path: Dot-separated path to parameter (e.g., 'pki.ownDomain'). - - Returns: - Parameter value as string, or None if not found or error. 
- """ - yaml_config_path = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") - - if not yaml_config_path.exists(): - log(LogLevel.DEBUG, f"YAML config not found: {yaml_config_path}") - return None - - try: - with open(yaml_config_path, "r", encoding="utf-8") as file: - config = yaml.safe_load(file) - - if not config: - log(LogLevel.DEBUG, f"Empty YAML config: {yaml_config_path}") - return None - - # Navigate through nested dictionary using dot-separated path - value = config - for key in param_path.split('.'): - if isinstance(value, dict): - value = value.get(key) - if value is None: - return None - else: - return None - - return str(value) if value is not None else None - - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.DEBUG, f"Failed to read {param_path} from YAML config: {error}") - return None - - -def get_pki_authority_param(param_name: str) -> str: - """Read PKI authority parameter from swarm-env.yaml. - - Args: - param_name: Name of the parameter under pki-authority section. - - Returns: - Parameter value as string. - - Raises: - FileNotFoundError: If swarm-env.yaml does not exist. - ValueError: If configuration is empty or parameter is not found. - Exception: For other errors during reading. - """ - swarm_env_path = Path(SWARM_ENV_YAML) - - if not swarm_env_path.exists(): - error_msg = f"Swarm environment config not found: {SWARM_ENV_YAML}" - log(LogLevel.ERROR, error_msg) - raise FileNotFoundError(error_msg) - - try: - with open(swarm_env_path, "r", encoding="utf-8") as file: - config = yaml.safe_load(file) - - if not config: - error_msg = f"Empty configuration in {SWARM_ENV_YAML}" - log(LogLevel.ERROR, error_msg) - raise ValueError(error_msg) - - param_value = config.get("pki-authority", {}).get(param_name) - if not param_value: - error_msg = ( - f"No {param_name} found in {SWARM_ENV_YAML} " - f"under pki-authority.{param_name}" - ) - log(LogLevel.ERROR, error_msg) - raise ValueError(error_msg) - - log(LogLevel.INFO, f"Read {param_name} from config: {param_value}") - return param_value - - except (FileNotFoundError, ValueError): - raise - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to read {param_name} from {SWARM_ENV_YAML}: {error}" - log(LogLevel.ERROR, error_msg) - raise Exception(error_msg) from error - - -def patch_yaml_config( - cpu_type: str, - vm_mode: VMMode, - pki_domain: str, - network_type: str, - network_key_hash: str -): - """Set own challenge type in LXC container configuration.""" - template_name = "lxc-swarm-template.yaml" - log( - LogLevel.INFO, - f"Detected {vm_mode.value} mode, using swarm template" - ) - - src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") - dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") - - if not src_yaml.exists(): - log(LogLevel.ERROR, f"Error: {src_yaml} not found.") - sys.exit(1) - - # Load YAML, modify, and save - with open(src_yaml, "r", encoding="utf-8") as file: - config = yaml.safe_load(file) - - # Set the CPU type in the configuration - if "pki" not in config: - config["pki"] = {} - if "ownChallenge" not in config["pki"]: - config["pki"]["ownChallenge"] = {} - config["pki"]["ownChallenge"]["type"] = cpu_type - - # For untrusted, generate random deviceIdHex (32 bytes) - if cpu_type == "untrusted": - # Check if untrusted CPU type is running in trusted network - if network_type != "untrusted": - error_msg = ( - "Cannot run untrusted machine in trusted network. 
" - f"CPU type: {cpu_type}, Network type: {network_type}" - ) - log(LogLevel.ERROR, error_msg) - raise ValueError(error_msg) - - device_id_hex = secrets.token_hex(32) - config["pki"]["ownChallenge"]["deviceIdHex"] = device_id_hex - log(LogLevel.INFO, f"Generated deviceIdHex for untrusted type: {device_id_hex}") - - # Add 'untrusted' to allowedChallenges if network type is untrusted - if network_type == "untrusted": - if "allowedChallenges" not in config["pki"]: - config["pki"]["allowedChallenges"] = [] - if "untrusted" not in config["pki"]["allowedChallenges"]: - config["pki"]["allowedChallenges"].append("untrusted") - log(LogLevel.INFO, "Added 'untrusted' to allowedChallenges") - - # Set ownDomain from parameter - if pki_domain: - config["pki"]["ownDomain"] = pki_domain - log(LogLevel.INFO, f"Set ownDomain to: {pki_domain}") - - # Set mode.swarmMode - if "mode" not in config["pki"]: - config["pki"]["mode"] = {} - - mode_value = "init" if vm_mode == VMMode.SWARM_INIT else "normal" - config["pki"]["mode"]["swarmMode"] = mode_value - log(LogLevel.INFO, f"Set swarmMode to: {mode_value}") - - # Set networkSettings - if network_type and network_key_hash: - config["pki"]["mode"]["networkSettings"] = { - "networkType": network_type, - "networkKeyHashHex": network_key_hash - } - log( - LogLevel.INFO, - f"Set networkSettings: networkType={network_type}, " - f"networkKeyHashHex={network_key_hash}" - ) - - # Ensure destination directory exists - dst_yaml.parent.mkdir(parents=True, exist_ok=True) - - # Write modified YAML - with open(dst_yaml, "w", encoding="utf-8") as file: - yaml.dump(config, file, default_flow_style=False) - - -def patch_lxc_config(cpu_type: str): - """Patch LXC container configuration.""" - config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") - config_bak = Path(f"{config_file}.bak") - - # Always restore config from backup if backup exists - if config_bak.exists(): - shutil.copy(config_bak, config_file) - else: - # Create backup before first patch - if config_file.exists(): - shutil.copy(config_file, config_bak) - - # Append MAC address configuration - with open(config_file, "a", encoding="utf-8") as file: - file.write("lxc.net.0.hwaddr = 4e:fc:0a:d5:2d:ff\n") - - # Add device-specific configuration - if cpu_type == "sev-snp": - dev_path = Path("/dev/sev-guest") - stat_info = dev_path.stat() - dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" - - with open(config_file, "a", encoding="utf-8") as file: - file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") - file.write( - "lxc.mount.entry = /dev/sev-guest dev/sev-guest " - "none bind,optional,create=file\n" - ) - - elif cpu_type == "tdx": - dev_path = Path("/dev/tdx_guest") - stat_info = dev_path.stat() - dev_id = f"{os.major(stat_info.st_rdev)}:{os.minor(stat_info.st_rdev)}" - - with open(config_file, "a", encoding="utf-8") as file: - file.write(f"lxc.cgroup2.devices.allow = c {dev_id} rwm\n") - file.write( - "lxc.mount.entry = /dev/tdx_guest dev/tdx_guest " - "none bind,optional,create=file\n" - ) - - if Path("/etc/tdx-attest.conf").exists(): - file.write( - "lxc.mount.entry = /etc/tdx-attest.conf etc/tdx-attest.conf " - "none bind,ro,create=file\n" - ) - - -def get_bridge_ip(bridge_name: str) -> str: - """Get host IP address on the LXC bridge.""" - result = subprocess.run( - ["ip", "-4", "addr", "show", bridge_name], - capture_output=True, - text=True, - check=False - ) - - if result.returncode != 0: - log( - LogLevel.ERROR, - f"Error: Could not determine IP address for bridge {bridge_name}" 
- ) - sys.exit(1) - - # Parse IP address from output - match = re.search(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout) - if not match: - log( - LogLevel.ERROR, - f"Error: Could not determine IP address for bridge {bridge_name}" - ) - sys.exit(1) - - return match.group(1) - - -def enable_route_localnet(bridge_name: str): - """Enable route_localnet for the bridge.""" - sysctl_key = f"net.ipv4.conf.{bridge_name}.route_localnet" - - result = subprocess.run( - ["sysctl", "-n", sysctl_key], - capture_output=True, - text=True, - check=False - ) - - if result.returncode == 0 and result.stdout.strip() == "1": - log(LogLevel.INFO, f"route_localnet already enabled for {bridge_name}") - else: - subprocess.run( - ["sysctl", "-w", f"{sysctl_key}=1"], - check=True - ) - log(LogLevel.INFO, f"Enabled route_localnet for {bridge_name}") - - -def delete_iptables_rules(): - """Delete all iptables rules for PKI container (NAT and filter tables).""" - # Delete rules from NAT table chains: PREROUTING, OUTPUT, POSTROUTING - for chain in ["PREROUTING", "OUTPUT", "POSTROUTING"]: - result = subprocess.run( - ["iptables", "-t", "nat", "-S", chain], - capture_output=True, text=True, check=True - ) - - rules = result.stdout.splitlines() - - for rule in rules: - # Delete rules that contain our comment - if IPTABLES_RULE_COMMENT in rule: - delete_rule = rule.replace("-A", "-D", 1) - subprocess.run(["iptables", "-t", "nat"] + delete_rule.split()[1:], check=True) - log(LogLevel.INFO, f"Deleted iptables NAT rule: {delete_rule}") - - # Delete rules from filter table (INPUT chain) - result = subprocess.run( - ["iptables", "-S", "INPUT"], - capture_output=True, text=True, check=True - ) - - rules = result.stdout.splitlines() - - for rule in rules: - # Delete rules that contain our comment - if IPTABLES_RULE_COMMENT in rule: - delete_rule = rule.replace("-A", "-D", 1) - subprocess.run(["iptables"] + delete_rule.split()[1:], check=True) - log(LogLevel.INFO, f"Deleted iptables INPUT rule: {delete_rule}") - - -def ensure_iptables_rule(check_args: List[str], add_args: List[str], description: str): - """Ensure iptables rule exists, add if missing.""" - log(LogLevel.INFO, f"Checking iptables rule: {description}") - - check_result = subprocess.run(check_args, capture_output=True, check=False) - - if check_result.returncode == 0: - log(LogLevel.INFO, "Rule already exists") - else: - subprocess.run(add_args, check=True) - log(LogLevel.INFO, "Rule added") - -def setup_iptables(wg_ip): - """Setup iptables NAT rules for LXC container access to host services.""" - host_ip = get_bridge_ip(BRIDGE_NAME) - - enable_route_localnet(BRIDGE_NAME) - - # Rule 1: PCCS DNAT - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-p", "tcp", - "-d", host_ip, - "--dport", PCCS_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"127.0.0.1:{PCCS_PORT}" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-p", "tcp", - "-d", host_ip, - "--dport", PCCS_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"127.0.0.1:{PCCS_PORT}" - ], - description=f"PCCS DNAT {host_ip}:{PCCS_PORT} -> 127.0.0.1:{PCCS_PORT}" - ) - - # Rule 2: WireGuard PREROUTING (HTTPS) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", 
f"{CONTAINER_IP}:443" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - description=f"PREROUTING WireGuard {PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" - ) - - # Rule 2a: WireGuard PREROUTING (HTTP) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "PREROUTING", - "-i", WIREGUARD_INTERFACE, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - description=f"PREROUTING WireGuard 8080 -> {CONTAINER_IP}:80" - ) - - # Rule 3: OUTPUT (HTTPS) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", PKI_SERVICE_EXTERNAL_PORT, - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:443" - ], - description=f"OUTPUT {wg_ip}:{PKI_SERVICE_EXTERNAL_PORT} -> {CONTAINER_IP}:443" - ) - - # Rule 3a: OUTPUT (HTTP) - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "OUTPUT", - "-d", wg_ip, - "-p", "tcp", - "--dport", "8080", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "DNAT", - "--to-destination", f"{CONTAINER_IP}:80" - ], - description=f"OUTPUT {wg_ip}:8080 -> {CONTAINER_IP}:80" - ) - - # Rule 4: MASQUERADE - ensure_iptables_rule( - check_args=[ - "iptables", "-t", "nat", "-C", "POSTROUTING", - "-s", f"{CONTAINER_IP}/32", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "MASQUERADE" - ], - add_args=[ - "iptables", "-t", "nat", "-A", "POSTROUTING", - "-s", f"{CONTAINER_IP}/32", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "MASQUERADE" - ], - description=f"POSTROUTING MASQUERADE for {CONTAINER_IP}/32" - ) - - # Rule 5: Allow port 8081 on lxcbr0 - ensure_iptables_rule( - check_args=[ - "iptables", "-C", "INPUT", - "-i", "lxcbr0", - "-p", "tcp", - "--dport", "8081", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "ACCEPT" - ], - add_args=[ - "iptables", "-A", "INPUT", - "-i", "lxcbr0", - "-p", "tcp", - "--dport", "8081", - "-m", "comment", "--comment", IPTABLES_RULE_COMMENT, - "-j", "ACCEPT" - ], - description="Allow TCP port 8081 on lxcbr0" - ) - - -def update_pccs_url(): - """Update PCCS URL in QCNL configuration.""" - qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") - qcnl_conf_bak = Path(f"{qcnl_conf}.bak") - - host_ip = get_bridge_ip(BRIDGE_NAME) - - pccs_url = f"https://{host_ip}:{PCCS_PORT}/sgx/certification/v4/" - - if not qcnl_conf.exists(): - log(LogLevel.ERROR, f"Error: {qcnl_conf} not found") - sys.exit(1) - - 
if not qcnl_conf_bak.exists(): - shutil.copy(qcnl_conf, qcnl_conf_bak) - - shutil.copy(qcnl_conf_bak, qcnl_conf) - - with open(qcnl_conf, "r", encoding="utf-8") as file: - content = file.read() - - content = re.sub( - r'"pccs_url":\s*"[^"]*"', - f'"pccs_url": "{pccs_url}"', - content - ) - - with open(qcnl_conf, "w", encoding="utf-8") as file: - file.write(content) - - - -def init_container(): - """Initialize LXC container for PKI Authority.""" - LXCContainer(PKI_SERVICE_NAME).create() - - -def get_node_tunnel_ip(node_id: str, wg_props: List[dict]) -> Optional[str]: - """Get tunnel IP for a node from WireGuard properties.""" - for prop in wg_props: - if prop.get("node_id") == node_id and prop.get("name") == "tunnel_ip": - return prop.get("value") - return None - - -def save_property_into_fs(file_name: str, content: bytes): - """Save property content to filesystem.""" - STORAGE_PATH.mkdir(parents=True, exist_ok=True) - file_path = STORAGE_PATH / file_name - file_path.write_bytes(content) - - -def read_property_from_fs(file_name: str) -> tuple[bool, bytes]: - """Read property content from filesystem.""" - file_path = STORAGE_PATH / file_name - if file_path.exists(): - content = file_path.read_bytes() - if content: - return (True, content) - return (False, b"") diff --git a/src/services/apps/pki-authority/main.py b/src/services/apps/pki-authority/main.py deleted file mode 100755 index e8a6299f..00000000 --- a/src/services/apps/pki-authority/main.py +++ /dev/null @@ -1,761 +0,0 @@ -#!/usr/bin/env python3 -"""PKI Authority service provisioning plugin.""" - -import base64 -import hashlib -import json -import sys -import time -from pathlib import Path - -from provision_plugin_sdk import ProvisionPlugin, PluginInput, PluginOutput -from redis import RedisCluster -from redis.cluster import ClusterNode - -# Import helpers -sys.path.insert(0, str(Path(__file__).parent)) -from helpers import ( - delete_iptables_rules, - detect_cpu_type, - detect_vm_mode, - detect_network_type, - patch_yaml_config, - patch_lxc_config, - setup_iptables, - update_pccs_url, - LXCContainer, - PKI_SERVICE_NAME, - get_node_tunnel_ip, - init_container, - VMMode, - save_property_into_fs, - read_property_from_fs, - LogLevel, - log, - get_pki_authority_param, - read_yaml_config_param, -) - -# Configuration -plugin = ProvisionPlugin() - - -class EventHandler: - """Handler for PKI Authority provisioning events.""" - - # Authority service property prefix and names - AUTHORITY_SERVICE_PREFIX = "pki_authority_" - AUTHORITY_SERVICE_PROPERTIES = [ - "auth_token", "basic_certificate", "basic_privateKey", - "lite_certificate", "lite_privateKey" - ] - PROP_INITIALIZED = f"{AUTHORITY_SERVICE_PREFIX}initialized" - PROP_PKI_DOMAIN = f"{AUTHORITY_SERVICE_PREFIX}pki_domain" - PROP_NETWORK_KEY_HASH = f"{AUTHORITY_SERVICE_PREFIX}network_key_hash" - PROP_NETWORK_TYPE = f"{AUTHORITY_SERVICE_PREFIX}network_type" - - def __init__(self, input_data: PluginInput): - self.input_data = input_data - self.local_node_id = input_data.local_node_id - self.state_json = input_data.state or {} - self.local_state = input_data.local_state or {} - self.cluster_info = self.state_json.get("cluster", {}) - leader_node_id = self.cluster_info.get("leader_node") - self.is_leader = self.local_node_id == leader_node_id - self.pki_cluster_nodes = self.state_json.get("clusterNodes", []) - self.wg_props = self.state_json.get("wgNodeProperties", []) - self.authority_props = self.state_json.get("authorityServiceProperties", []) - self.authority_config = {prop["name"]: 
prop["value"] for prop in self.authority_props} - - self.pki_domain = self.authority_config.get(self.PROP_PKI_DOMAIN, "") - self.network_key_hash = self.authority_config.get(self.PROP_NETWORK_KEY_HASH, "") - self.network_type = self.authority_config.get(self.PROP_NETWORK_TYPE, "") - self.initialized = self.authority_config.get(self.PROP_INITIALIZED, "") - self.vm_mode = detect_vm_mode() - - # Read current pki_node_ready value - pki_node_props = self.state_json.get("pkiNodeProperties", []) - self.current_pki_node_ready = None - for prop in pki_node_props: - if prop.get("node_id") == self.local_node_id and prop.get("name") == "pki_node_ready": - self.current_pki_node_ready = prop.get("value") - break - - # Output parameters - self.status = None - self.error_message = None - self.cluster_properties = {} - self.node_properties = {} - - def _get_redis_tunnel_ips(self) -> list[str]: - """Get list of Redis node tunnel IPs.""" - redis_node_props = self.state_json.get("redisNodeProperties", []) - - redis_hosts = [] - for prop in redis_node_props: - if prop.get("name") == "redis_node_ready" and prop.get("value") == "true": - node_id = prop.get("node_id") - tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) - if tunnel_ip: - redis_hosts.append(tunnel_ip) - - return sorted(set(redis_hosts)) - - def _get_redis_connection_info(self) -> list[tuple[str, int]]: - """Get Redis cluster connection endpoints. - - Returns list of (host, port) tuples for Redis nodes. - """ - redis_tunnel_ips = self._get_redis_tunnel_ips() - return [(ip, 6379) for ip in redis_tunnel_ips] - - def _get_current_endpoints(self) -> list[str]: - """Get list of tunnel IPs for PKI nodes that are ready. - - Returns list of tunnel IPs for nodes with pki_node_ready=true. - """ - pki_node_props = self.state_json.get("pkiNodeProperties", []) - current_endpoints = [] - - for cluster_node in self.pki_cluster_nodes: - node_id = cluster_node.get("node_id") - if not node_id: - continue - - # Find pki_node_ready property for this node - node_ready = False - for prop in pki_node_props: - if (prop.get("node_id") == node_id and - prop.get("name") == "pki_node_ready" and - prop.get("value") == "true"): - node_ready = True - break - - if node_ready: - tunnel_ip = get_node_tunnel_ip(node_id, self.wg_props) - if tunnel_ip: - current_endpoints.append(tunnel_ip) - - return current_endpoints - - def _create_gateway_endpoints(self): - """Create and update gateway endpoints in Redis.""" - current_endpoints = self._get_current_endpoints() - - # Get Redis connection info - redis_endpoints = self._get_redis_connection_info() - - if not redis_endpoints: - self.status = "postponed" - self.error_message = "No Redis nodes available to configure gateway routes" - return - - route_key = f"manual-routes:{self.pki_domain}" - startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] - - try: - redis_client = RedisCluster( - startup_nodes=startup_nodes, - decode_responses=True, - skip_full_coverage_check=True, - socket_connect_timeout=5, - ) - - # Read current route from Redis - registered_endpoints = [] - existing_route = redis_client.get(route_key) - if existing_route: - route_data = json.loads(existing_route) - # Extract IPs from targets URLs - for target in route_data.get("targets", []): - url = target.get("url", "") - # Parse https://IP:PORT format - if "://" in url: - ip_port = url.split("://")[1] - ip = ip_port.split(":")[0] - registered_endpoints.append(ip) - - # Compare endpoints regardless of order - if set(registered_endpoints) == 
set(current_endpoints): - log( - LogLevel.INFO, - f"Gateway endpoints are up to date: " - f"registered={registered_endpoints}, current={current_endpoints}" - ) - return - - log( - LogLevel.INFO, - f"Gateway endpoints changed: " - f"registered={registered_endpoints}, current={current_endpoints}" - ) - - # Build targets list from current endpoints - targets = [ - {"url": f"https://{endpoint}:8443", "weight": 1} - for endpoint in current_endpoints - ] - route_config = { - "targets": targets, - "policy": "rr", - "preserve_host": False, - "passthrough": True - } - route_json = json.dumps(route_config) - - # Retry logic for setting route in Redis - max_retries = 3 - retry_delay = 5 - - for attempt in range(1, max_retries + 1): - try: - redis_client.set(route_key, route_json) - log( - LogLevel.INFO, - f"Successfully set gateway route {route_key} in Redis Cluster" - ) - break # Success, exit retry loop - except Exception as set_error: # pylint: disable=broad-exception-caught - if attempt < max_retries: - log( - LogLevel.WARN, - f"Failed to set route (attempt {attempt}/{max_retries}): {set_error}. " - f"Retrying in {retry_delay}s..." - ) - time.sleep(retry_delay) - else: - log( - LogLevel.ERROR, - f"Failed to set route after {max_retries} attempts: {set_error}" - ) - raise - - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to set route in Redis Cluster: {str(error)}" - self.status = "postponed" - self.error_message = error_msg - log(LogLevel.ERROR, error_msg) - - def _create_output(self) -> PluginOutput: - """Create plugin output based on current status.""" - if self.status == "completed": - if self.is_leader: - self._create_gateway_endpoints() - elif self.status == "postponed": - log(LogLevel.INFO, f"Apply postponed: {self.error_message}") - elif self.status == "error": - log(LogLevel.ERROR, f"Apply error: {self.error_message}") - else: - log(LogLevel.ERROR, f"Apply ended with unknown status {self.status}") - - # Determine if cluster_properties should be included - should_update_cluster = ( - self.vm_mode == VMMode.SWARM_INIT or self.is_leader - ) and self.status in ("completed", "postponed") and self.cluster_properties - - return PluginOutput( - status=self.status, - local_state=self.local_state if self.status == "completed" else None, - error_message=self.error_message, - cluster_properties=self.cluster_properties if should_update_cluster else None, - node_properties=self.node_properties if self.node_properties else None - ) - - def apply(self) -> PluginOutput: - """Apply PKI Authority configuration.""" - try: - # Basic validation - if not isinstance(self.state_json, dict): - self.status = "error" - self.error_message = "Invalid state format" - return self._create_output() - - local_tunnel_ip = get_node_tunnel_ip(self.local_node_id, self.wg_props) - if not local_tunnel_ip: - self.status = "postponed" - self.error_message = "Waiting for WireGuard tunnel IP to be configured" - return self._create_output() - - if self.vm_mode == VMMode.SWARM_INIT: - self._handle_swarm_init(local_tunnel_ip) - else: - self._handle_swarm_normal(local_tunnel_ip) - - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Apply failed: {str(error)}" - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - - return self._create_output() - - def _stop_container_if_running(self, container: LXCContainer) -> None: - """Stop container if it's running.""" - if container.is_running(): - log(LogLevel.INFO, "Stopping 
existing container") - exit_code = container.stop(graceful_timeout=30, command_timeout=60) - if exit_code != 0: - raise Exception(f"Failed to stop container with exit code {exit_code}") - - def _ensure_container_running(self, container: LXCContainer, local_tunnel_ip: str) -> None: - """Ensure container is running with correct configuration. - - Checks for missing properties, restart requirements, - restores properties and starts container. - Sets self.status to 'completed' or 'error' and self.error_message on error. - """ - # Verify ALL required properties are present - missing = self._check_for_missing_properties() - - if missing: - error_msg = ( - f"Service marked as initialized but missing required properties: " - f"{', '.join(missing)}" - ) - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return - - # Check if restart is needed - if container.is_running(): - if self._is_restart_required(): - log(LogLevel.INFO, "Configuration changed, restarting contain er") - self._stop_container_if_running(container) - else: - # Everything matches, container running, nothing to do - log(LogLevel.INFO, "Container running, no changes detected") - self.status = "completed" - return - - # Restore properties from DB to filesystem - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - prop_value = self.authority_config.get(prop_name, "") - save_property_into_fs(prop, base64.b64decode(prop_value)) - - # Start container - self._configure_and_start_container(container, local_tunnel_ip) - self.status = "completed" - - def _configure_and_start_container( - self, container: LXCContainer, local_tunnel_ip: str - ) -> None: - """Configure and start container.""" - cpu_type = detect_cpu_type() - patch_yaml_config( - cpu_type, - self.vm_mode, - self.pki_domain, - self.network_type, - self.network_key_hash - ) - patch_lxc_config(cpu_type) - update_pccs_url() - setup_iptables(local_tunnel_ip) - - exit_code = container.start(timeout=30) - if exit_code != 0: - raise Exception(f"Failed to start container with exit code {exit_code}") - - is_healthy, err_msg = self.health(timeout=30, interval=5) - if is_healthy: - log( - LogLevel.INFO, - f"LXC container {PKI_SERVICE_NAME} started and health check passed" - ) - else: - log( - LogLevel.WARN, - f"LXC container {PKI_SERVICE_NAME} started but health check failed: {err_msg}" - ) - - def _check_for_missing_properties(self) -> list[str]: - """Check for missing required properties. 
- - Returns: - List of missing property names (empty if all present) - """ - missing = [] - - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - if not self.authority_config.get(prop_name, ""): - missing.append(prop_name) - - if not self.pki_domain: - missing.append(self.PROP_PKI_DOMAIN) - - if not self.network_key_hash: - missing.append(self.PROP_NETWORK_KEY_HASH) - - if not self.network_type: - missing.append(self.PROP_NETWORK_TYPE) - - return missing - - def _wait_for_properties_generation(self, timeout: int = 30, interval: int = 5) -> None: - """Wait for tee-pki service to generate ALL property files.""" - missing_properties = self.AUTHORITY_SERVICE_PROPERTIES.copy() - elapsed = 0 - collected_properties = {} - - while elapsed < timeout: - # Try to read each missing property - for prop in missing_properties[:]: - success, value = read_property_from_fs(prop) - - if success: - prop_key = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - collected_properties[prop_key] = base64.b64encode(value).decode() - missing_properties.remove(prop) - - # Check if ALL properties collected - if not missing_properties: - log( - LogLevel.INFO, - "All property files have been generated by tee-pki service" - ) - # Set initialized flag ONLY when all properties are ready - collected_properties[self.PROP_PKI_DOMAIN] = self.pki_domain - collected_properties[self.PROP_NETWORK_KEY_HASH] = self.network_key_hash - collected_properties[self.PROP_NETWORK_TYPE] = self.network_type - collected_properties[self.PROP_INITIALIZED] = "true" - self.initialized = "true" - - self.status = "completed" - self.cluster_properties = collected_properties - return - - log( - LogLevel.INFO, - f"Waiting for property files. Missing: " - f"{', '.join(missing_properties)} (elapsed: {elapsed}s)" - ) - - time.sleep(interval) - elapsed += interval - - # Timeout - NOT all properties collected, do NOT set initialized flag - self.status = "postponed" - self.error_message = ( - f"Timeout waiting for tee-pki to generate property files: " - f"{', '.join(missing_properties)}" - ) - - def _handle_swarm_init(self, local_tunnel_ip: str) -> None: - """Handle swarm-init mode: read external sources and initialize properties.""" - # Get pki_domain from external source (file) - if not self.pki_domain: - try: - self.pki_domain = get_pki_authority_param("domain") - log(LogLevel.INFO, f"Read PKI domain from external source: {self.pki_domain}") - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to get PKI domain from external source: {error}" - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return - - # Get network_key_hash from external source (file) - if not self.network_key_hash: - try: - network_key = get_pki_authority_param("networkKey") - self.network_key_hash = hashlib.sha256(network_key.encode()).hexdigest() - log( - LogLevel.INFO, - f"Calculated network key hash from external source: {self.network_key_hash}" - ) - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to get network key hash from external source: {error}" - log(LogLevel.ERROR, error_msg) - self.status = "error" - self.error_message = error_msg - return - - # Get network_type from kernel cmdline - if not self.network_type: - self.network_type = detect_network_type() - log(LogLevel.INFO, f"Detected network type: {self.network_type}") - - container = LXCContainer(PKI_SERVICE_NAME) - - if self.initialized == "true": - # Use 
common logic for ensuring container is running - self._ensure_container_running(container, local_tunnel_ip) - return - - # Not initialized - restart container and wait for properties generation - log(LogLevel.INFO, "Service not initialized, starting initialization process") - - # Restart container if running - if container.is_running(): - log(LogLevel.INFO, "Stopping container for initialization") - self._stop_container_if_running(container) - - # Start container - self._configure_and_start_container(container, local_tunnel_ip) - # Wait for properties generation - self._wait_for_properties_generation(timeout=30, interval=5) - - - def _handle_swarm_normal(self, local_tunnel_ip: str) -> None: - """Handle swarm-normal mode: read ONLY from properties (DB), no external sources.""" - # If not initialized - wait for swarm-init to complete - if self.initialized != "true": - self.status = "postponed" - self.error_message = "Waiting for authority service properties to be initialized" - return - - # All properties present - manage container - container = LXCContainer(PKI_SERVICE_NAME) - - # Use common logic for ensuring container is running - self._ensure_container_running(container, local_tunnel_ip) - - def _is_restart_required(self) -> bool: - """Check if container restart is required based on config changes.""" - # Check file-based properties - for prop in self.AUTHORITY_SERVICE_PROPERTIES: - prop_name = f"{self.AUTHORITY_SERVICE_PREFIX}{prop}" - config_value = self.authority_config.get(prop_name, "") - - if not config_value: - continue - - # Read current value from filesystem - success, fs_value = read_property_from_fs(prop) - - if not success: - # File doesn't exist in FS, restart required - log(LogLevel.INFO, f"Property {prop} not found in filesystem, restart required") - return True - - # Decode config value from base64 and compare with filesystem value - try: - decoded_config_value = base64.b64decode(config_value) - if decoded_config_value != fs_value: - log(LogLevel.INFO, f"Property {prop} has changed, restart required") - return True - except Exception as error: # pylint: disable=broad-exception-caught - log(LogLevel.ERROR, f"Failed to decode property {prop}: {error}") - return True - - # Check yaml config parameters - yaml_params = [ - ("pki.ownDomain", self.pki_domain, "domain"), - ( - "pki.mode.networkSettings.networkType", - self.network_type, - "network_type" - ), - ( - "pki.mode.networkSettings.networkKeyHashHex", - self.network_key_hash, - "network_key_hash" - ), - ] - - for yaml_path, expected_value, param_name in yaml_params: - if not expected_value: - continue - - yaml_value = read_yaml_config_param(yaml_path) - - if yaml_value != expected_value: - log( - LogLevel.INFO, - f"Parameter {param_name} changed " - f"(yaml: {yaml_value}, expected: {expected_value}), restart required" - ) - return True - - # No changes detected - log(LogLevel.INFO, "No configuration changes detected") - return False - - def _delete_route_from_redis(self) -> None: - """Delete the PKI Authority route from Redis Cluster. 
- - Raises: - Exception: If deletion fails - """ - redis_endpoints = self._get_redis_connection_info() - - if not redis_endpoints: - log(LogLevel.WARN, "No Redis endpoints available, skipping route deletion") - return - - route_key = f"routes:{self.pki_domain}" - startup_nodes = [ClusterNode(host, port) for host, port in redis_endpoints] - - redis_client = RedisCluster( - startup_nodes=startup_nodes, - decode_responses=True, - skip_full_coverage_check=True, - socket_connect_timeout=5, - ) - redis_client.delete(route_key) - log(LogLevel.INFO, f"Deleted route {route_key} from Redis Cluster") - - def destroy(self) -> PluginOutput: - """Destroy PKI Authority service and clean up.""" - try: - container = LXCContainer(PKI_SERVICE_NAME) - - # Stop container if running - if container.is_running(): - exit_code = container.stop(graceful_timeout=30, command_timeout=60) - if exit_code != 0: - log(LogLevel.WARN, "Failed to stop container gracefully") - - # Destroy container - exit_code = container.destroy() - if exit_code != 0: - error_msg = f"Failed to destroy container with exit code {exit_code}" - return PluginOutput( - status="error", - error_message=error_msg, - local_state=self.local_state - ) - - delete_iptables_rules() - - # If this is the last node and domain is configured, delete route from Redis - if len(self.pki_cluster_nodes) <= 1 and self.pki_domain: - log( - LogLevel.INFO, - "This is the last PKI Authority node, deleting route from Redis" - ) - self._delete_route_from_redis() - - log(LogLevel.INFO, "PKI Authority destroyed") - return PluginOutput( - status="completed", - local_state=self.local_state, - cluster_properties=( - self.cluster_properties if self.cluster_properties else None - ) - ) - - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Destroy failed: {str(error)}" - log(LogLevel.ERROR, error_msg) - return PluginOutput( - status="error", error_message=error_msg, local_state=self.local_state - ) - - def health(self, timeout: int = 0, interval: int = 5) -> tuple[bool, str]: - """Check health of PKI Authority service. - - Args: - timeout: Maximum time to wait for service to become healthy (0 = single check) - interval: Time between health check attempts - - Returns: - Tuple of (is_healthy, error_message). If healthy, error_message is empty string. 
- """ - is_healthy = False - error_msg = "" - - try: - container = LXCContainer(PKI_SERVICE_NAME) - elapsed = 0 - attempt = 0 - - while True: - attempt += 1 - if container.is_running() and container.is_service_healthy(): - is_healthy = True - break - - # If timeout is 0, only check once - if timeout == 0 or elapsed >= timeout: - error_msg = ( - f"PKI service is not healthy or container is not running " - f"(attempts: {attempt})" - ) - break - - # Wait before next attempt - time.sleep(interval) - elapsed += interval - - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Health check failed on attempt {attempt}: {str(error)}" - log(LogLevel.ERROR, error_msg) - - # Compare current pki_node_ready with new health status - current_healthy_status = "true" if is_healthy else "false" - if self.current_pki_node_ready != current_healthy_status: - log( - LogLevel.INFO, - f"PKI node ready status changed: " - f"{self.current_pki_node_ready} -> {current_healthy_status}" - ) - self.node_properties["pki_node_ready"] = current_healthy_status - - return (is_healthy, error_msg) - - -# Plugin commands -@plugin.command("init") -def handle_init(input_data: PluginInput) -> PluginOutput: - """Initialize PKI Authority service.""" - try: - log(LogLevel.INFO, "Running PKI initialization") - init_container() - log(LogLevel.INFO, "PKI initialization completed") - return PluginOutput(status="completed", local_state=input_data.local_state) - except Exception as error: # pylint: disable=broad-exception-caught - error_msg = f"Failed to initialize PKI: {str(error)}" - log(LogLevel.ERROR, error_msg) - return PluginOutput( - status="error", error_message=error_msg, local_state=input_data.local_state - ) - - -@plugin.command("apply") -def handle_apply(input_data: PluginInput) -> PluginOutput: - """Apply PKI Authority configuration and start service.""" - handler = EventHandler(input_data) - return handler.apply() - - -@plugin.command("health") -def handle_health(input_data: PluginInput) -> PluginOutput: - """Check health of PKI Authority service.""" - handler = EventHandler(input_data) - is_healthy, error_msg = handler.health() - - if is_healthy: - return PluginOutput( - status="completed", - local_state=input_data.local_state, - node_properties=handler.node_properties if handler.node_properties else None - ) - - return PluginOutput( - status="error", - error_message=error_msg, - local_state=input_data.local_state, - node_properties=handler.node_properties if handler.node_properties else None - ) - - -@plugin.command("finalize") -def handle_finalize(input_data: PluginInput) -> PluginOutput: - """Finalize PKI Authority service setup.""" - log(LogLevel.INFO, "PKI Authority finalized") - return PluginOutput(status="completed", local_state=input_data.local_state) - - -@plugin.command("destroy") -def handle_destroy(input_data: PluginInput) -> PluginOutput: - """Destroy PKI Authority service and clean up.""" - handler = EventHandler(input_data) - return handler.destroy() - - -if __name__ == "__main__": - plugin.run() diff --git a/src/services/apps/pki-authority/manifest.yaml b/src/services/apps/pki-authority/manifest.yaml deleted file mode 100644 index 4f7a914e..00000000 --- a/src/services/apps/pki-authority/manifest.yaml +++ /dev/null @@ -1,104 +0,0 @@ -name: pki-authority -version: 1.0.0 -commands: - - init - - apply - - health - - finalize - - destroy -healthcheckIntervalSecs: 60 -entrypoint: main.py -stateExpr: - engine: jq - query: | - ($swarmdb.clusters[] | select(.id == "{{ clusterId }}" and 
.deleted_ts == null)) as $cluster | - - ([$swarmdb.clusternodes[] | select(.cluster == "{{ clusterId }}" and .deleted_ts == null)]) as $pkiClusterNodes | - - ($pkiClusterNodes | map(.node)) as $pkiNodeIds | - - # Find Redis cluster - ( - $swarmdb.clusters[] | - select(.cluster_policy == "redis" and .deleted_ts == null) - ) as $redisCluster | - - # Get Redis cluster nodes - ([$swarmdb.clusternodes[] | select(.cluster == $redisCluster.id and .deleted_ts == null)]) as $redisClusterNodes | - - ($redisClusterNodes | map(.node)) as $redisNodeIds | - - # Find WireGuard cluster that contains PKI nodes - ( - $swarmdb.clusters[] | - select(.cluster_policy == "wireguard" and .deleted_ts == null) | - . as $currentCluster | - select( - ( - [$swarmdb.clusternodes[] | select(.cluster == $currentCluster.id and .deleted_ts == null and (.node | IN($pkiNodeIds[])))] | - length > 0 - ) - ) - ) as $wgCluster | - - { - cluster: { - id: $cluster.id, - cluster_policy: $cluster.cluster_policy, - leader_node: $cluster.leader_node - }, - - clusterNodes: [ - $pkiClusterNodes[] | - {id, node_id: .node, cluster} - ] | sort_by(.id, .node_id, .cluster), - - pkiNodeProperties: [ - $swarmdb.clusternodeproperties[] | - select( - (.cluster_node | startswith($cluster.id)) and - .deleted_ts == null and - .name == "pki_node_ready" - ) | - {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn and .deleted_ts == null)) | .node} - ] | sort_by(.cluster_node, .name, .value, .node_id), - - redisCluster: { - id: $redisCluster.id - }, - - redisNodeProperties: [ - $swarmdb.clusternodeproperties[] | - select( - (.cluster_node | startswith($redisCluster.id)) and - .deleted_ts == null and - .name == "redis_node_ready" - ) | - {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn and .deleted_ts == null)) | .node} - ] | sort_by(.cluster_node, .name, .value, .node_id), - - wgCluster: { - id: $wgCluster.id - }, - - wgNodeProperties: [ - $swarmdb.clusternodeproperties[] | - select( - (.cluster_node | startswith($wgCluster.id)) and - .deleted_ts == null and - .name == "tunnel_ip" - ) | - {cluster_node, name, value, node_id: ( .cluster_node as $cn | $swarmdb.clusternodes[] | select(.id == $cn)) | .node} - ] | sort_by(.cluster_node, .name, .value, .node_id), - - authorityServiceProperties: [ - $swarmdb.clusterproperties[] | - select( - .cluster == $cluster.id and - .deleted_ts == null and - (.name | startswith("pki_authority_")) - ) | - {name, value} - ] | sort_by(.name) - } - From bf881422adf2cf03ebd5a359fafee517c4e0f82a Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Thu, 5 Feb 2026 11:58:03 -0600 Subject: [PATCH 46/51] use latest pki components --- src/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Dockerfile b/src/Dockerfile index c4f40499..688fe5f1 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -247,7 +247,7 @@ ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority" -COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21716071201 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" +COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21720130629 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar" ADD 
rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml"
 ADD rootfs/files/configs/pki-service/conf/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf"
 ADD rootfs/files/configs/pki-service/conf/lxc-net "${OUTPUTDIR}/etc/default/lxc-net"
@@ -368,7 +368,7 @@ RUN chmod +x /buildroot/files/scripts/setup_runtime_tools.sh
 RUN --security=insecure /buildroot/files/scripts/setup_runtime_tools.sh
 
 # install pki-sync-client npm package globally (requires python3-venv from setup_runtime_tools)
-ARG PKI_SYNC_CLIENT_VERSION=2.0.4
+ARG PKI_SYNC_CLIENT_VERSION=2.0.5
 ADD rootfs/files/scripts/install_sync_client.sh /buildroot/files/scripts/
 RUN chmod +x /buildroot/files/scripts/install_sync_client.sh
 RUN --security=insecure /buildroot/files/scripts/install_sync_client.sh "${PKI_SYNC_CLIENT_VERSION}"

From 56466f4f409e4b14ddfb23403538a7ea46cb7056 Mon Sep 17 00:00:00 2001
From: Petr Evstifeev
Date: Fri, 6 Feb 2026 08:36:54 -0600
Subject: [PATCH 47/51] review fixes

---
 src/Dockerfile                                              | 3 ++-
 src/rootfs/files/configs/pki-service/scripts/pki_helpers.py | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index 546b14cb..9dcf1172 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -260,8 +260,9 @@ RUN mkdir -p "${OUTPUTDIR}/usr/local/bin";
 ADD rootfs/files/scripts/install_lxc_deps.sh /buildroot/files/scripts/
 RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh
 
+ARG PKI_AUTHORITY_SERVICE_LXC_TAG=build-21720130629
 RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority"
-COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21720130629 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar"
+COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:${PKI_AUTHORITY_SERVICE_LXC_TAG} /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar"
 ADD rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml"
 ADD rootfs/files/configs/pki-service/conf/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf"
 ADD rootfs/files/configs/pki-service/conf/lxc-net "${OUTPUTDIR}/etc/default/lxc-net"
diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py
index d67be515..0712f259 100755
--- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py
+++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py
@@ -269,8 +269,10 @@ def read_network_type_from_certificate(cert_path: Path = STORAGE_PATH / "basic_c
         cert_path: Path to PEM certificate file.
 
     Returns:
-        NetworkType.TRUSTED or NetworkType.UNTRUSTED based on OID 1.3.6.1.3.8888.4 value.
-        Defaults to NetworkType.TRUSTED if OID is not present or has other value.
+        NetworkType.TRUSTED or NetworkType.UNTRUSTED based on the value of the
+        custom extension identified by OID_CUSTOM_EXTENSION_NETWORK_TYPE.
+        Defaults to NetworkType.TRUSTED if the extension is not present or has
+        another value.
""" try: if not cert_path.exists(): From c64c0c8d642f933eef1e9ec49c85012710797764 Mon Sep 17 00:00:00 2001 From: Petr Evstifeev Date: Tue, 10 Feb 2026 09:37:26 -0600 Subject: [PATCH 48/51] nvidia gpu info --- .../pki-service/scripts/pki_configure.py | 5 +- .../pki-service/scripts/pki_helpers.py | 47 +++++++++++++++++-- .../systemd/pki-authority-sync.service | 4 +- .../pki-service/systemd/pki-authority.service | 4 +- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py index ee4addd9..692280f0 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_configure.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_configure.py @@ -10,7 +10,7 @@ sys.path.insert(0, str(Path(__file__).parent)) from pki_helpers import ( log, LogLevel, detect_cpu_type, detect_vm_mode, detect_network_type, - patch_yaml_config, patch_lxc_config, get_pki_authority_param, + patch_yaml_config, patch_lxc_config, mount_vm_certs, get_pki_authority_param, setup_iptables, update_pccs_url, generate_swarm_key, load_swarm_key, read_network_type_from_certificate, PKI_SERVICE_NAME, VMMode, NetworkType, STORAGE_PATH @@ -96,6 +96,9 @@ def main(): patch_lxc_config(cpu_type) log(LogLevel.INFO, "LXC config patched successfully") + + if vm_mode == VMMode.SWARM_NORMAL: + mount_vm_certs() # Setup iptables rules setup_iptables() diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py index 0712f259..0b18aae0 100755 --- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py +++ b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -27,9 +27,13 @@ CONTAINER_IP = "10.0.3.100" WIREGUARD_INTERFACE = "wg0" EXTERNAL_INTERFACE = "enp0s1" # Default external network interface -STORAGE_PATH = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/swarm-storage") +CONTAINER_ROOTFS = f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs" +STORAGE_PATH = Path(f"{CONTAINER_ROOTFS}/app/swarm-storage") IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" +VM_CERTS_HOST_DIR = "/etc/super/certs/vm" +VM_CERT_FILE_NAME = "vm_cert.pem" +VM_CERT_CONTAINER_FILE = f"/app/{VM_CERT_FILE_NAME}" SWARM_KEY_FILE = "/etc/swarm/swarm.key" OID_CUSTOM_EXTENSION_NETWORK_TYPE = "1.3.6.1.3.8888.4" @@ -320,7 +324,7 @@ def read_yaml_config_param(param_path: str) -> Optional[str]: Returns: Parameter value as string, or None if not found or error. 
""" - yaml_config_path = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + yaml_config_path = Path(f"{CONTAINER_ROOTFS}/app/conf/lxc.yaml") if not yaml_config_path.exists(): log(LogLevel.DEBUG, f"YAML config not found: {yaml_config_path}") @@ -491,7 +495,7 @@ def patch_yaml_config( ) src_yaml = Path(f"/etc/super/containers/pki-authority/{template_name}") - dst_yaml = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/app/conf/lxc.yaml") + dst_yaml = Path(f"{CONTAINER_ROOTFS}/app/conf/lxc.yaml") if not src_yaml.exists(): log(LogLevel.ERROR, f"Error: {src_yaml} not found.") @@ -622,6 +626,41 @@ def patch_lxc_config(cpu_type: str): "none bind,ro,create=file\n" ) +def mount_vm_certs(): + """Copy vm_cert.pem into rootfs and patch YAML config with vmCertificatePath.""" + src_cert = Path(VM_CERTS_HOST_DIR) / VM_CERT_FILE_NAME + if not src_cert.exists(): + log(LogLevel.ERROR, f"Error: {src_cert} not found") + sys.exit(1) + + dst_cert = Path(f"{CONTAINER_ROOTFS}{VM_CERT_CONTAINER_FILE}") + dst_cert.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src_cert, dst_cert) + + dst_yaml = Path(f"{CONTAINER_ROOTFS}/app/conf/lxc.yaml") + + if not dst_yaml.exists(): + log(LogLevel.ERROR, f"Error: {dst_yaml} not found") + sys.exit(1) + + with open(dst_yaml, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + if not config: + log(LogLevel.ERROR, f"Empty YAML config: {dst_yaml}") + sys.exit(1) + + if "pki" not in config: + config["pki"] = {} + if "mode" not in config["pki"]: + config["pki"]["mode"] = {} + + config["pki"]["mode"]["vmCertificatePath"] = VM_CERT_CONTAINER_FILE + log(LogLevel.INFO, f"Set vmCertificatePath to: {VM_CERT_CONTAINER_FILE}") + + with open(dst_yaml, "w", encoding="utf-8") as file: + yaml.dump(config, file, default_flow_style=False) + def get_bridge_ip(bridge_name: str) -> str: """Get host IP address on the LXC bridge.""" @@ -843,7 +882,7 @@ def setup_iptables(): def update_pccs_url(): """Update PCCS URL in QCNL configuration.""" - qcnl_conf = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/rootfs/etc/sgx_default_qcnl.conf") + qcnl_conf = Path(f"{CONTAINER_ROOTFS}/etc/sgx_default_qcnl.conf") qcnl_conf_bak = Path(f"{qcnl_conf}.bak") host_ip = get_bridge_ip(BRIDGE_NAME) diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service index a1f8e363..8ee6911f 100644 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service @@ -10,8 +10,8 @@ Environment=NODE_ENV=production ExecStart=/usr/bin/node /usr/bin/pki-sync-client sync \ --config /etc/super/pki-authority-sync/secrets-config.yaml \ --swarm-env /sp/swarm/swarm-env.yaml \ - --save-certs-to-dir /etc/super/certs/ \ - --save-certs-prefix gatekeeper \ + --save-certs-to-dir /etc/super/certs/vm \ + --save-certs-prefix vm \ --disable-server-identity-check \ -v Restart=on-failure diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service index d4fba21e..80144634 100644 --- a/src/rootfs/files/configs/pki-service/systemd/pki-authority.service +++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority.service @@ -1,8 +1,8 @@ [Unit] Description=PKI authority lxc-container -After=pki-authority-init.service pki-authority-sync.service nvidia-persistenced.service pccs.service +After=pki-authority-init.service pki-authority-sync.service 
pccs.service
 Requires=pki-authority-init.service pccs.service
-Wants=nvidia-persistenced.service pki-authority-sync.service
+Wants=pki-authority-sync.service
 
 [Service]
 Type=simple

From e8cadd8638add7bad564fe958333d9e957eb19b9 Mon Sep 17 00:00:00 2001
From: Petr Evstifeev
Date: Tue, 10 Feb 2026 09:41:25 -0600
Subject: [PATCH 49/51] sync client nvidia dependency

---
 .../configs/pki-service/systemd/pki-authority-sync.service | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service
index 8ee6911f..8707e39b 100644
--- a/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service
+++ b/src/rootfs/files/configs/pki-service/systemd/pki-authority-sync.service
@@ -1,8 +1,8 @@
 [Unit]
 Description=PKI authority sync client service
 ConditionKernelCommandLine=!vm_mode=swarm-init
-After=network-online.target pki-authority-init.service
-Wants=network-online.target
+After=network-online.target pki-authority-init.service nvidia-persistenced.service
+Wants=network-online.target nvidia-persistenced.service
 
 [Service]
 Type=oneshot

From bf6115a7bace37fe9844ec24cb5f8ae7e68681c9 Mon Sep 17 00:00:00 2001
From: Petr Evstifeev
Date: Tue, 10 Feb 2026 12:38:28 -0600
Subject: [PATCH 50/51] update to latest pki components

---
 src/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Dockerfile b/src/Dockerfile
index 9dcf1172..63a5ff10 100644
--- a/src/Dockerfile
+++ b/src/Dockerfile
@@ -262,7 +262,7 @@ RUN --security=insecure /buildroot/files/scripts/install_lxc_deps.sh
 
 ARG PKI_AUTHORITY_SERVICE_LXC_TAG=build-21720130629
 RUN mkdir -p "${OUTPUTDIR}/etc/super/containers/pki-authority"
-COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:${PKI_AUTHORITY_SERVICE_LXC_TAG} /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar"
+COPY --from=ghcr.io/super-protocol/tee-pki-authority-service-lxc:build-21875562504 /pki-authority.tar "${OUTPUTDIR}/etc/super/containers/pki-authority/pki-authority.tar"
 ADD rootfs/files/configs/pki-service/conf/lxc-swarm-template.yaml "${OUTPUTDIR}/etc/super/containers/pki-authority/lxc-swarm-template.yaml"
 ADD rootfs/files/configs/pki-service/conf/dnsmasq.conf "${OUTPUTDIR}/etc/lxc/dnsmasq.conf"
 ADD rootfs/files/configs/pki-service/conf/lxc-net "${OUTPUTDIR}/etc/default/lxc-net"
@@ -383,7 +383,7 @@ RUN chmod +x /buildroot/files/scripts/setup_runtime_tools.sh
 RUN --security=insecure /buildroot/files/scripts/setup_runtime_tools.sh
 
 # install pki-sync-client npm package globally (requires python3-venv from setup_runtime_tools)
-ARG PKI_SYNC_CLIENT_VERSION=2.0.5
+ARG PKI_SYNC_CLIENT_VERSION=2.0.6
 ADD rootfs/files/scripts/install_sync_client.sh /buildroot/files/scripts/
 RUN chmod +x /buildroot/files/scripts/install_sync_client.sh
 RUN --security=insecure /buildroot/files/scripts/install_sync_client.sh "${PKI_SYNC_CLIENT_VERSION}"

From 80bd1bf2e834836ec586cf32bf328b11f3bd83f7 Mon Sep 17 00:00:00 2001
From: Petr Evstifeev
Date: Fri, 13 Feb 2026 06:50:16 -0600
Subject: [PATCH 51/51] using mount instead of copy

---
 .../pki-service/scripts/pki_helpers.py | 32 ++++++++++++++-----
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py
index 0b18aae0..f6f91a78 100755
--- a/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py
+++ 
b/src/rootfs/files/configs/pki-service/scripts/pki_helpers.py @@ -32,8 +32,9 @@ IPTABLES_RULE_COMMENT = f"{PKI_SERVICE_NAME}-rule" SWARM_ENV_YAML = "/sp/swarm/swarm-env.yaml" VM_CERTS_HOST_DIR = "/etc/super/certs/vm" +VM_CERTS_CONTAINER_DIR = "app/vm-certs" # Relative path for lxc.mount.entry VM_CERT_FILE_NAME = "vm_cert.pem" -VM_CERT_CONTAINER_FILE = f"/app/{VM_CERT_FILE_NAME}" +VM_CERT_CONTAINER_FILE = f"/{VM_CERTS_CONTAINER_DIR}/{VM_CERT_FILE_NAME}" SWARM_KEY_FILE = "/etc/swarm/swarm.key" OID_CUSTOM_EXTENSION_NETWORK_TYPE = "1.3.6.1.3.8888.4" @@ -627,16 +628,31 @@ def patch_lxc_config(cpu_type: str): ) def mount_vm_certs(): - """Copy vm_cert.pem into rootfs and patch YAML config with vmCertificatePath.""" - src_cert = Path(VM_CERTS_HOST_DIR) / VM_CERT_FILE_NAME - if not src_cert.exists(): - log(LogLevel.ERROR, f"Error: {src_cert} not found") + """Mount vm certs directory into container and patch YAML config with vmCertificatePath.""" + src_dir = Path(VM_CERTS_HOST_DIR) + if not src_dir.exists(): + log(LogLevel.ERROR, f"Error: {src_dir} not found") sys.exit(1) - dst_cert = Path(f"{CONTAINER_ROOTFS}{VM_CERT_CONTAINER_FILE}") - dst_cert.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src_cert, dst_cert) + # Add mount entry to LXC config + config_file = Path(f"/var/lib/lxc/{PKI_SERVICE_NAME}/config") + mount_entry = f"lxc.mount.entry = {VM_CERTS_HOST_DIR} {VM_CERTS_CONTAINER_DIR} none bind,ro,create=dir\n" + + if config_file.exists(): + with open(config_file, "r", encoding="utf-8") as file: + content = file.read() + + if mount_entry.strip() not in content: + with open(config_file, "a", encoding="utf-8") as file: + file.write(mount_entry) + log(LogLevel.INFO, f"Added mount entry for {VM_CERTS_HOST_DIR}") + else: + log(LogLevel.INFO, f"Mount entry for {VM_CERTS_HOST_DIR} already exists") + else: + log(LogLevel.ERROR, f"Error: LXC config file {config_file} not found") + sys.exit(1) + # Update YAML config with vmCertificatePath dst_yaml = Path(f"{CONTAINER_ROOTFS}/app/conf/lxc.yaml") if not dst_yaml.exists():