Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions apps/api/src/observability.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
import type { FastifyInstance } from 'fastify';

import { buildServer } from './server.js';

type EnvSnapshot = Record<string, string | undefined>;

function snapshotEnv(keys: string[]): EnvSnapshot {
return Object.fromEntries(keys.map((key) => [key, process.env[key]]));
}

function restoreEnv(snapshot: EnvSnapshot) {
for (const [key, value] of Object.entries(snapshot)) {
if (value === undefined) {
delete process.env[key];
continue;
}
process.env[key] = value;
}
}

const keysToSnapshot = [
'API_KEYS',
'API_KEY_SCOPES',
'REVOCATION_ISSUERS',
'RATE_LIMIT_GLOBAL_MAX',
'RATE_LIMIT_API_KEY_MAX',
'RATE_LIMIT_WINDOW'
];

describe.sequential('observability: correlation IDs and metrics endpoint', () => {
let app: FastifyInstance;
let envSnapshot: EnvSnapshot;

beforeAll(async () => {
envSnapshot = snapshotEnv(keysToSnapshot);
process.env.API_KEYS = 'obs-test-key';
process.env.API_KEY_SCOPES = 'obs-test-key=read';
process.env.RATE_LIMIT_GLOBAL_MAX = '200';
process.env.RATE_LIMIT_API_KEY_MAX = '200';
process.env.RATE_LIMIT_WINDOW = '1 minute';
app = await buildServer();
});

afterAll(async () => {
await app.close();
restoreEnv(envSnapshot);
});

it('returns x-request-id header on health endpoint', async () => {
const response = await app.inject({
method: 'GET',
url: '/api/v1/health'
});

expect(response.statusCode).toBe(200);
const requestId = response.headers['x-request-id'];
expect(requestId).toBeTypeOf('string');
expect((requestId as string).length).toBeGreaterThan(0);
});

it('returns a distinct x-request-id for each request', async () => {
const first = await app.inject({ method: 'GET', url: '/api/v1/health' });
const second = await app.inject({ method: 'GET', url: '/api/v1/health' });

const idFirst = first.headers['x-request-id'];
const idSecond = second.headers['x-request-id'];

expect(idFirst).toBeTypeOf('string');
expect(idSecond).toBeTypeOf('string');
expect(idFirst).not.toBe(idSecond);
});

it('returns x-request-id header on metrics endpoint', async () => {
const response = await app.inject({
method: 'GET',
url: '/api/v1/metrics'
});

expect(response.statusCode).toBe(200);
expect(response.headers['x-request-id']).toBeTypeOf('string');
});

it('metrics endpoint exposes baseline HTTP request counters', async () => {
// Trigger a request to populate metrics
await app.inject({ method: 'GET', url: '/api/v1/health' });

const response = await app.inject({
method: 'GET',
url: '/api/v1/metrics'
});

expect(response.statusCode).toBe(200);
expect(response.headers['content-type']).toContain('text/plain');
const body = response.body;
expect(body).toContain('deedshield_http_requests_total');
expect(body).toContain('deedshield_http_request_duration_seconds');
});

it('metrics endpoint exposes business-level verification lifecycle counters', async () => {
const response = await app.inject({
method: 'GET',
url: '/api/v1/metrics'
});

expect(response.statusCode).toBe(200);
const body = response.body;
expect(body).toContain('deedshield_receipts_issued_total');
expect(body).toContain('deedshield_receipt_verifications_total');
expect(body).toContain('deedshield_revocations_total');
expect(body).toContain('deedshield_verify_duration_seconds');
});

it('metrics endpoint exposes default Node.js process metrics', async () => {
const response = await app.inject({
method: 'GET',
url: '/api/v1/metrics'
});

expect(response.statusCode).toBe(200);
const body = response.body;
// prom-client collectDefaultMetrics includes process_cpu_seconds_total
expect(body).toContain('deedshield_api_process_cpu_seconds_total');
});

it('x-request-id is consistent in header and not a sensitive value', async () => {
const response = await app.inject({
method: 'GET',
url: '/api/v1/status'
});

const requestId = response.headers['x-request-id'] as string;
// Must be a string without obvious secret patterns (no Bearer, no sha256= etc.)
expect(requestId).not.toMatch(/^(bearer|sha256=|eyJ)/i);
Comment on lines +132 to +134
expect(requestId.length).toBeLessThan(128);
});
});
70 changes: 70 additions & 0 deletions apps/api/src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,32 @@ export async function buildServer(options: BuildServerOptions = {}) {
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5],
registers: [metricsRegistry]
});
// Business-level verification lifecycle metrics
const receiptsIssuedTotal = new Counter({
name: 'deedshield_receipts_issued_total',
help: 'Total signed receipts issued by decision outcome',
labelNames: ['decision', 'policy_profile'] as const,
registers: [metricsRegistry]
});
const receiptVerificationsTotal = new Counter({
name: 'deedshield_receipt_verifications_total',
help: 'Total post-issuance receipt verifications by outcome',
labelNames: ['outcome'] as const,
registers: [metricsRegistry]
});
const revocationsTotal = new Counter({
name: 'deedshield_revocations_total',
help: 'Total receipt revocations processed',
labelNames: [] as const,
registers: [metricsRegistry]
});
const verifyDurationSeconds = new Histogram({
name: 'deedshield_verify_duration_seconds',
help: 'End-to-end duration of the verification and receipt issuance flow',
labelNames: ['decision'] as const,
buckets: [0.1, 0.25, 0.5, 1, 2, 5, 10],
registers: [metricsRegistry]
});
const perApiKeyRateLimit = {
max: securityConfig.perApiKeyRateLimitMax,
timeWindow: securityConfig.rateLimitWindow,
Expand All @@ -883,6 +909,11 @@ export async function buildServer(options: BuildServerOptions = {}) {
(request as any)[REQUEST_START] = Date.now();
});

// Propagate Fastify's auto-generated request ID as x-request-id for correlation across logs
app.addHook('onSend', async (request, reply) => {
reply.header('x-request-id', request.id);
});

app.addHook('onResponse', async (request, reply) => {
const route = (request.routeOptions.url || request.url.split('?')[0] || 'unknown').toString();
const method = request.method;
Expand Down Expand Up @@ -1100,6 +1131,7 @@ export async function buildServer(options: BuildServerOptions = {}) {
preHandler: [requireApiKeyScope(securityConfig, 'verify')],
config: { rateLimit: perApiKeyRateLimit }
}, async (request, reply) => {
const verifyStartMs = Date.now();
const parsed = verifyInputSchema.safeParse(request.body);
if (!parsed.success) {
return reply.code(400).send({ error: 'Invalid payload', details: parsed.error.flatten() });
Expand Down Expand Up @@ -1255,6 +1287,20 @@ export async function buildServer(options: BuildServerOptions = {}) {
riskScore: signedReceipt.riskScore
});

receiptsIssuedTotal.inc({ decision: signedReceipt.decision, policy_profile: input.policy.profile });

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Bound policy profile labels before incrementing metrics

In the /api/v1/verify handler, deedshield_receipts_issued_total is incremented with policy_profile: input.policy.profile, which comes directly from request input and is only validated as a non-empty string. Any caller with verify scope can therefore generate unbounded (or very long) label values, causing high-cardinality time series growth and increased Prometheus memory/scrape cost that can degrade observability for the service. Restrict this label to a bounded allowlist (or remove it) before calling .inc.

Useful? React with 👍 / 👎.

verifyDurationSeconds.observe({ decision: signedReceipt.decision }, (Date.now() - verifyStartMs) / 1000);
app.log.info(
{
event: 'receipt_issued',
request_id: request.id,
receipt_id: record.id,
decision: signedReceipt.decision,
policy_profile: input.policy.profile,
duration_ms: Date.now() - verifyStartMs
},
'receipt_issued'
);

return reply.send(body);
});

Expand Down Expand Up @@ -1372,6 +1418,19 @@ export async function buildServer(options: BuildServerOptions = {}) {
return reply.code(500).send({ error: 'Receipt reconstruction failed' });
}
const verificationResult = await verifyStoredReceipt(receipt, record, securityConfig);
const verificationOutcome = verificationResult.verified ? 'verified' : 'not_verified';
receiptVerificationsTotal.inc({ outcome: verificationOutcome });
app.log.info(
{
event: 'receipt_verified',
request_id: request.id,
receipt_id: receiptId,
outcome: verificationOutcome,
signature_verified: verificationResult.signatureVerified,
integrity_verified: verificationResult.integrityVerified
},
'receipt_verified'
);

return reply.send({
verified: verificationResult.verified,
Expand Down Expand Up @@ -1468,6 +1527,17 @@ export async function buildServer(options: BuildServerOptions = {}) {
data: { revoked: true }
});

revocationsTotal.inc();
app.log.info(
{
event: 'receipt_revoked',
request_id: request.id,
receipt_id: receiptId,
issuer_id: revocationVerification.issuerId
},
'receipt_revoked'
);

return reply.send({ status: 'REVOKED', issuerId: revocationVerification.issuerId });
});

Expand Down
16 changes: 14 additions & 2 deletions docs/ops/monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,21 @@ Store screenshots in staging evidence (for example under `docs/evidence/staging/

## Notes
- The alert rules use metrics emitted by `apps/api/src/server.ts`:
- `deedshield_http_requests_total`
- `deedshield_http_request_duration_seconds`
- **HTTP infrastructure metrics:**
- `deedshield_http_requests_total` (labels: `method`, `route`, `status_code`)
- `deedshield_http_request_duration_seconds` (labels: `method`, `route`, `status_code`)
- **Verification lifecycle business metrics:**
- `deedshield_receipts_issued_total` (labels: `decision`, `policy_profile`) — incremented per signed receipt issued
- `deedshield_receipt_verifications_total` (labels: `outcome`: `verified` | `not_verified`) — incremented per post-issuance receipt verification
- `deedshield_revocations_total` — incremented per receipt revocation
- `deedshield_verify_duration_seconds` (labels: `decision`) — histogram of end-to-end verify+receipt-issue duration
- Core latency scope follows baseline routes:
- `/api/v1/verify`
- `/api/v1/receipt/:receiptId`
- `/api/v1/receipt/:receiptId/verify`
- Correlation IDs: every response includes `x-request-id` matching Fastify's `request.id`. Structured log entries for `receipt_issued`, `receipt_verified`, and `receipt_revoked` events include `request_id` for cross-log correlation.
- Cardinality guidance:
- `decision` label: low cardinality (ALLOW | FLAG | BLOCK)
- `policy_profile` label: low cardinality per deployment; monitor new profiles before adding
- `outcome` label: low cardinality (verified | not_verified)
- Avoid per-receipt or per-user labels to prevent cardinality explosion
60 changes: 60 additions & 0 deletions docs/ops/monitoring/alert-rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,30 @@ groups:
/
clamp_min(sum by (job) (rate(deedshield_http_requests_total{route=~"/api/v1/.*",route!="/api/v1/metrics"}[24h])), 0.001)

# Verification lifecycle business metrics
- record: deedshield:receipts_issued:rate5m
expr: |
sum by (job) (rate(deedshield_receipts_issued_total[5m]))

- record: deedshield:receipts_issued_allow_ratio:rate5m
expr: |
sum by (job) (rate(deedshield_receipts_issued_total{decision="ALLOW"}[5m]))
/
clamp_min(sum by (job) (rate(deedshield_receipts_issued_total[5m])), 0.001)

- record: deedshield:revocations:rate1h
expr: |
sum by (job) (rate(deedshield_revocations_total[1h]))

- record: deedshield:verify_p95_duration_seconds:rate5m
expr: |
histogram_quantile(
0.95,
sum by (job, le) (
rate(deedshield_verify_duration_seconds_bucket[5m])
)
)

- name: deedshield-api-slo-alerts
interval: 30s
rules:
Expand Down Expand Up @@ -179,3 +203,39 @@ groups:
summary: "Traffic volume dropped >70% from 24h baseline"
description: "Business-hours request volume for job {{ $labels.job }} is below 30% of the 24h baseline for 15m (UTC window 14:00-23:00)."
runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md"

# Verification lifecycle alerts
- alert: DeedShieldVerifyP95LatencyWarning
expr: |
(
deedshield:verify_p95_duration_seconds:rate5m > 3.0
)
and on (job)
(
sum by (job) (rate(deedshield_receipts_issued_total[5m])) > 0.01
)
for: 10m
labels:
severity: warning
incident_severity: SEV-3
service: deedshield-api
slo: verify-latency
annotations:
summary: "Verification p95 duration above 3s for 10m"
description: "End-to-end verify+receipt-issue p95 duration exceeded 3s for job {{ $labels.job }}. Check upstream verifier dependencies."
runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md"

- alert: DeedShieldRevocationSpike
expr: |
sum by (job) (increase(deedshield_revocations_total[5m])) > 5
for: 0m
labels:
severity: warning
incident_severity: SEV-3
service: deedshield-api
slo: revocation-integrity
annotations:
summary: "More than 5 receipt revocations in 5 minutes"
description: "Revocation spike detected: {{ $value }} revocations in last 5m for job {{ $labels.job }}. Verify this is an authorized batch operation."
runbook: "docs/final/10_INCIDENT_ESCALATION_AND_SLO_BASELINE.md"

Loading
Loading