diff --git a/apps/api/src/observability.test.ts b/apps/api/src/observability.test.ts new file mode 100644 index 0000000..879891f --- /dev/null +++ b/apps/api/src/observability.test.ts @@ -0,0 +1,137 @@ +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import type { FastifyInstance } from 'fastify'; + +import { buildServer } from './server.js'; + +type EnvSnapshot = Record; + +function snapshotEnv(keys: string[]): EnvSnapshot { + return Object.fromEntries(keys.map((key) => [key, process.env[key]])); +} + +function restoreEnv(snapshot: EnvSnapshot) { + for (const [key, value] of Object.entries(snapshot)) { + if (value === undefined) { + delete process.env[key]; + continue; + } + process.env[key] = value; + } +} + +const keysToSnapshot = [ + 'API_KEYS', + 'API_KEY_SCOPES', + 'REVOCATION_ISSUERS', + 'RATE_LIMIT_GLOBAL_MAX', + 'RATE_LIMIT_API_KEY_MAX', + 'RATE_LIMIT_WINDOW' +]; + +describe.sequential('observability: correlation IDs and metrics endpoint', () => { + let app: FastifyInstance; + let envSnapshot: EnvSnapshot; + + beforeAll(async () => { + envSnapshot = snapshotEnv(keysToSnapshot); + process.env.API_KEYS = 'obs-test-key'; + process.env.API_KEY_SCOPES = 'obs-test-key=read'; + process.env.RATE_LIMIT_GLOBAL_MAX = '200'; + process.env.RATE_LIMIT_API_KEY_MAX = '200'; + process.env.RATE_LIMIT_WINDOW = '1 minute'; + app = await buildServer(); + }); + + afterAll(async () => { + await app.close(); + restoreEnv(envSnapshot); + }); + + it('returns x-request-id header on health endpoint', async () => { + const response = await app.inject({ + method: 'GET', + url: '/api/v1/health' + }); + + expect(response.statusCode).toBe(200); + const requestId = response.headers['x-request-id']; + expect(requestId).toBeTypeOf('string'); + expect((requestId as string).length).toBeGreaterThan(0); + }); + + it('returns a distinct x-request-id for each request', async () => { + const first = await app.inject({ method: 'GET', url: '/api/v1/health' }); + const second = await app.inject({ method: 'GET', url: '/api/v1/health' }); + + const idFirst = first.headers['x-request-id']; + const idSecond = second.headers['x-request-id']; + + expect(idFirst).toBeTypeOf('string'); + expect(idSecond).toBeTypeOf('string'); + expect(idFirst).not.toBe(idSecond); + }); + + it('returns x-request-id header on metrics endpoint', async () => { + const response = await app.inject({ + method: 'GET', + url: '/api/v1/metrics' + }); + + expect(response.statusCode).toBe(200); + expect(response.headers['x-request-id']).toBeTypeOf('string'); + }); + + it('metrics endpoint exposes baseline HTTP request counters', async () => { + // Trigger a request to populate metrics + await app.inject({ method: 'GET', url: '/api/v1/health' }); + + const response = await app.inject({ + method: 'GET', + url: '/api/v1/metrics' + }); + + expect(response.statusCode).toBe(200); + expect(response.headers['content-type']).toContain('text/plain'); + const body = response.body; + expect(body).toContain('deedshield_http_requests_total'); + expect(body).toContain('deedshield_http_request_duration_seconds'); + }); + + it('metrics endpoint exposes business-level verification lifecycle counters', async () => { + const response = await app.inject({ + method: 'GET', + url: '/api/v1/metrics' + }); + + expect(response.statusCode).toBe(200); + const body = response.body; + expect(body).toContain('deedshield_receipts_issued_total'); + expect(body).toContain('deedshield_receipt_verifications_total'); + expect(body).toContain('deedshield_revocations_total'); + expect(body).toContain('deedshield_verify_duration_seconds'); + }); + + it('metrics endpoint exposes default Node.js process metrics', async () => { + const response = await app.inject({ + method: 'GET', + url: '/api/v1/metrics' + }); + + expect(response.statusCode).toBe(200); + const body = response.body; + // prom-client collectDefaultMetrics includes process_cpu_seconds_total + expect(body).toContain('deedshield_api_process_cpu_seconds_total'); + }); + + it('x-request-id is consistent in header and not a sensitive value', async () => { + const response = await app.inject({ + method: 'GET', + url: '/api/v1/status' + }); + + const requestId = response.headers['x-request-id'] as string; + // Must be a string without obvious secret patterns (no Bearer, no sha256= etc.) + expect(requestId).not.toMatch(/^(bearer|sha256=|eyJ)/i); + expect(requestId.length).toBeLessThan(128); + }); +}); diff --git a/apps/api/src/server.ts b/apps/api/src/server.ts index 22d4c4d..bd33160 100644 --- a/apps/api/src/server.ts +++ b/apps/api/src/server.ts @@ -873,6 +873,32 @@ export async function buildServer(options: BuildServerOptions = {}) { buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5], registers: [metricsRegistry] }); + // Business-level verification lifecycle metrics + const receiptsIssuedTotal = new Counter({ + name: 'deedshield_receipts_issued_total', + help: 'Total signed receipts issued by decision outcome', + labelNames: ['decision', 'policy_profile'] as const, + registers: [metricsRegistry] + }); + const receiptVerificationsTotal = new Counter({ + name: 'deedshield_receipt_verifications_total', + help: 'Total post-issuance receipt verifications by outcome', + labelNames: ['outcome'] as const, + registers: [metricsRegistry] + }); + const revocationsTotal = new Counter({ + name: 'deedshield_revocations_total', + help: 'Total receipt revocations processed', + labelNames: [] as const, + registers: [metricsRegistry] + }); + const verifyDurationSeconds = new Histogram({ + name: 'deedshield_verify_duration_seconds', + help: 'End-to-end duration of the verification and receipt issuance flow', + labelNames: ['decision'] as const, + buckets: [0.1, 0.25, 0.5, 1, 2, 5, 10], + registers: [metricsRegistry] + }); const perApiKeyRateLimit = { max: securityConfig.perApiKeyRateLimitMax, timeWindow: securityConfig.rateLimitWindow, @@ -883,6 +909,11 @@ export async function buildServer(options: BuildServerOptions = {}) { (request as any)[REQUEST_START] = Date.now(); }); + // Propagate Fastify's auto-generated request ID as x-request-id for correlation across logs + app.addHook('onSend', async (request, reply) => { + reply.header('x-request-id', request.id); + }); + app.addHook('onResponse', async (request, reply) => { const route = (request.routeOptions.url || request.url.split('?')[0] || 'unknown').toString(); const method = request.method; @@ -1100,6 +1131,7 @@ export async function buildServer(options: BuildServerOptions = {}) { preHandler: [requireApiKeyScope(securityConfig, 'verify')], config: { rateLimit: perApiKeyRateLimit } }, async (request, reply) => { + const verifyStartMs = Date.now(); const parsed = verifyInputSchema.safeParse(request.body); if (!parsed.success) { return reply.code(400).send({ error: 'Invalid payload', details: parsed.error.flatten() }); @@ -1255,6 +1287,20 @@ export async function buildServer(options: BuildServerOptions = {}) { riskScore: signedReceipt.riskScore }); + receiptsIssuedTotal.inc({ decision: signedReceipt.decision, policy_profile: input.policy.profile }); + verifyDurationSeconds.observe({ decision: signedReceipt.decision }, (Date.now() - verifyStartMs) / 1000); + app.log.info( + { + event: 'receipt_issued', + request_id: request.id, + receipt_id: record.id, + decision: signedReceipt.decision, + policy_profile: input.policy.profile, + duration_ms: Date.now() - verifyStartMs + }, + 'receipt_issued' + ); + return reply.send(body); }); @@ -1372,6 +1418,19 @@ export async function buildServer(options: BuildServerOptions = {}) { return reply.code(500).send({ error: 'Receipt reconstruction failed' }); } const verificationResult = await verifyStoredReceipt(receipt, record, securityConfig); + const verificationOutcome = verificationResult.verified ? 'verified' : 'not_verified'; + receiptVerificationsTotal.inc({ outcome: verificationOutcome }); + app.log.info( + { + event: 'receipt_verified', + request_id: request.id, + receipt_id: receiptId, + outcome: verificationOutcome, + signature_verified: verificationResult.signatureVerified, + integrity_verified: verificationResult.integrityVerified + }, + 'receipt_verified' + ); return reply.send({ verified: verificationResult.verified, @@ -1468,6 +1527,17 @@ export async function buildServer(options: BuildServerOptions = {}) { data: { revoked: true } }); + revocationsTotal.inc(); + app.log.info( + { + event: 'receipt_revoked', + request_id: request.id, + receipt_id: receiptId, + issuer_id: revocationVerification.issuerId + }, + 'receipt_revoked' + ); + return reply.send({ status: 'REVOKED', issuerId: revocationVerification.issuerId }); }); diff --git a/docs/ops/monitoring/README.md b/docs/ops/monitoring/README.md index 2603e87..63652be 100644 --- a/docs/ops/monitoring/README.md +++ b/docs/ops/monitoring/README.md @@ -74,9 +74,21 @@ Store screenshots in staging evidence (for example under `docs/evidence/staging/ ## Notes - The alert rules use metrics emitted by `apps/api/src/server.ts`: - - `deedshield_http_requests_total` - - `deedshield_http_request_duration_seconds` + - **HTTP infrastructure metrics:** + - `deedshield_http_requests_total` (labels: `method`, `route`, `status_code`) + - `deedshield_http_request_duration_seconds` (labels: `method`, `route`, `status_code`) + - **Verification lifecycle business metrics:** + - `deedshield_receipts_issued_total` (labels: `decision`, `policy_profile`) — incremented per signed receipt issued + - `deedshield_receipt_verifications_total` (labels: `outcome`: `verified` | `not_verified`) — incremented per post-issuance receipt verification + - `deedshield_revocations_total` — incremented per receipt revocation + - `deedshield_verify_duration_seconds` (labels: `decision`) — histogram of end-to-end verify+receipt-issue duration - Core latency scope follows baseline routes: - `/api/v1/verify` - `/api/v1/receipt/:receiptId` - `/api/v1/receipt/:receiptId/verify` +- Correlation IDs: every response includes `x-request-id` matching Fastify's `request.id`. Structured log entries for `receipt_issued`, `receipt_verified`, and `receipt_revoked` events include `request_id` for cross-log correlation. +- Cardinality guidance: + - `decision` label: low cardinality (ALLOW | FLAG | BLOCK) + - `policy_profile` label: low cardinality per deployment; monitor new profiles before adding + - `outcome` label: low cardinality (verified | not_verified) + - Avoid per-receipt or per-user labels to prevent cardinality explosion diff --git a/docs/ops/monitoring/alert-rules.yml b/docs/ops/monitoring/alert-rules.yml index b5e5c5f..fc0a4bd 100644 --- a/docs/ops/monitoring/alert-rules.yml +++ b/docs/ops/monitoring/alert-rules.yml @@ -33,6 +33,30 @@ groups: / clamp_min(sum by (job) (rate(deedshield_http_requests_total{route=~"/api/v1/.*",route!="/api/v1/metrics"}[24h])), 0.001) + # Verification lifecycle business metrics + - record: deedshield:receipts_issued:rate5m + expr: | + sum by (job) (rate(deedshield_receipts_issued_total[5m])) + + - record: deedshield:receipts_issued_allow_ratio:rate5m + expr: | + sum by (job) (rate(deedshield_receipts_issued_total{decision="ALLOW"}[5m])) + / + clamp_min(sum by (job) (rate(deedshield_receipts_issued_total[5m])), 0.001) + + - record: deedshield:revocations:rate1h + expr: | + sum by (job) (rate(deedshield_revocations_total[1h])) + + - record: deedshield:verify_p95_duration_seconds:rate5m + expr: | + histogram_quantile( + 0.95, + sum by (job, le) ( + rate(deedshield_verify_duration_seconds_bucket[5m]) + ) + ) + - name: deedshield-api-slo-alerts interval: 30s rules: @@ -179,3 +203,39 @@ groups: summary: "Traffic volume dropped >70% from 24h baseline" description: "Business-hours request volume for job {{ $labels.job }} is below 30% of the 24h baseline for 15m (UTC window 14:00-23:00)." runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md" + + # Verification lifecycle alerts + - alert: DeedShieldVerifyP95LatencyWarning + expr: | + ( + deedshield:verify_p95_duration_seconds:rate5m > 3.0 + ) + and on (job) + ( + sum by (job) (rate(deedshield_receipts_issued_total[5m])) > 0.01 + ) + for: 10m + labels: + severity: warning + incident_severity: SEV-3 + service: deedshield-api + slo: verify-latency + annotations: + summary: "Verification p95 duration above 3s for 10m" + description: "End-to-end verify+receipt-issue p95 duration exceeded 3s for job {{ $labels.job }}. Check upstream verifier dependencies." + runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md" + + - alert: DeedShieldRevocationSpike + expr: | + sum by (job) (increase(deedshield_revocations_total[5m])) > 5 + for: 0m + labels: + severity: warning + incident_severity: SEV-3 + service: deedshield-api + slo: revocation-integrity + annotations: + summary: "More than 5 receipt revocations in 5 minutes" + description: "Revocation spike detected: {{ $value }} revocations in last 5m for job {{ $labels.job }}. Verify this is an authorized batch operation." + runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md" + diff --git a/docs/ops/monitoring/grafana-dashboard-deedshield-api.json b/docs/ops/monitoring/grafana-dashboard-deedshield-api.json index c2b26b6..95d0869 100644 --- a/docs/ops/monitoring/grafana-dashboard-deedshield-api.json +++ b/docs/ops/monitoring/grafana-dashboard-deedshield-api.json @@ -589,6 +589,254 @@ ], "title": "Current 5xx Ratio by Route", "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 29 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum by (decision) (rate(deedshield_receipts_issued_total{job=~\"$job\"}[5m]))", + "legendFormat": "{{ decision }}", + "range": true, + "refId": "A" + } + ], + "title": "Receipt Issuance Rate by Decision (5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3.0 + }, + { + "color": "red", + "value": 5.0 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(deedshield_verify_duration_seconds_bucket{job=~\"$job\"}[5m])))", + "legendFormat": "p95 duration", + "range": true, + "refId": "A" + } + ], + "title": "Verify End-to-End P95 Duration (5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1.0 + }, + { + "color": "red", + "value": 5.0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(increase(deedshield_revocations_total{job=~\"$job\"}[5m]))", + "legendFormat": "revocations", + "range": true, + "refId": "A" + } + ], + "title": "Receipt Revocations (5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 37 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(rate(deedshield_receipt_verifications_total{job=~\"$job\",outcome=\"verified\"}[5m])) / clamp_min(sum(rate(deedshield_receipt_verifications_total{job=~\"$job\"}[5m])), 0.001)", + "legendFormat": "verified ratio", + "range": true, + "refId": "A" + } + ], + "title": "Receipt Verification Success Ratio (5m)", + "type": "timeseries" } ], "refresh": "30s", diff --git a/package-lock.json b/package-lock.json index 4d43cc9..b9c3106 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "deed-shield", - "version": "0.1.0", + "version": "0.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "deed-shield", - "version": "0.1.0", + "version": "0.2.0", "hasInstallScript": true, "workspaces": [ "apps/*", @@ -45,7 +45,7 @@ }, "apps/api": { "name": "@deed-shield/api", - "version": "0.1.0", + "version": "0.2.0", "hasInstallScript": true, "dependencies": { "@deed-shield/core": "file:../../packages/core", @@ -109,7 +109,7 @@ }, "apps/web": { "name": "@deed-shield/web", - "version": "0.1.0", + "version": "0.2.0", "dependencies": { "fastify": "5.8.1", "next": "^16.1.6", @@ -11834,7 +11834,7 @@ }, "packages/contracts": { "name": "@deed-shield/contracts", - "version": "0.1.0", + "version": "0.2.0", "dependencies": { "fastify": "5.8.1" }, @@ -12109,7 +12109,7 @@ }, "packages/core": { "name": "@deed-shield/core", - "version": "0.1.0", + "version": "0.2.0", "dependencies": { "ethers": "^6.12.0", "jose": "^5.2.4",