diff --git a/.gitignore b/.gitignore index f8d0620..8d180d7 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,8 @@ npm-debug.log* # === Claude Code === .claude/settings.local.json +.claude/scheduled_tasks.lock +.claude/projects/ # === Factory (read-only reference, not deployed) === # factory/ is tracked but treated as read-only reference docs diff --git a/pulse/.env.example b/pulse/.env.example index 5397540..1123775 100644 --- a/pulse/.env.example +++ b/pulse/.env.example @@ -25,15 +25,25 @@ PULSE_DATA_PORT=8000 # -- Source Connector Tokens ------------------------------------------------ # GitHub Personal Access Token (repo, read:org scopes) GITHUB_TOKEN= +# GitHub org slug — REQUIRED. Used by discover_repos() — see ingestion-spec §2.3. +GITHUB_ORG= # GitLab Personal Access Token (read_api scope) GITLAB_TOKEN= # Jira API Token + email JIRA_API_TOKEN= JIRA_EMAIL= +# Jira base URL (e.g., https://your-org.atlassian.net) +JIRA_BASE_URL= +# JIRA_PROJECTS is intentionally absent. PULSE uses dynamic discovery +# (ingestion-spec §2.3); the active project list is maintained in +# `jira_project_catalog` table and resolved by ModeResolver. Do NOT add +# JIRA_PROJECTS unless you set DYNAMIC_JIRA_DISCOVERY_ENABLED=false (not +# recommended). +DYNAMIC_JIRA_DISCOVERY_ENABLED=true # Azure DevOps Personal Access Token (Code, Work Items read) AZURE_DEVOPS_PAT= # Jenkins API credentials (read-only: Overall/Read, Job/Read, Run/Read) -JENKINS_BASE_URL=https://jenkins.webmotors.com.br +JENKINS_BASE_URL= JENKINS_USERNAME= JENKINS_API_TOKEN= diff --git a/pulse/config/connections.yaml b/pulse/config/connections.yaml index 6605f00..e8c9b34 100644 --- a/pulse/config/connections.yaml +++ b/pulse/config/connections.yaml @@ -20,17 +20,13 @@ connections: token_env: GITHUB_TOKEN base_url: https://api.github.com sync_interval_minutes: 15 + # Per ingestion-spec §2.3 (Discovery-Only): NO explicit list of repos. + # The connector calls `discover_repos(active_months=12)` on each cycle + # via GraphQL `organization.repositories(orderBy: PUSHED_AT)` filtered + # by activity. New repos appear automatically; archived ones drop off + # without manual YAML edits. scope: - repositories: - - "webmotors-private/webmotors.next.ui" - - "webmotors-private/webmotors.portal.ui" - - "webmotors-private/webmotors.buyer.ui" - - "webmotors-private/webmotors.buyer.desktop.ui" - - "webmotors-private/webmotors.catalogo.next.ui" - - "webmotors-private/webmotors.fipe.next.ui" - - "webmotors-private/webmotors.pf" - - "webmotors-private/eleanor.flutter" - - "webmotors-private/webmotors.app.pf.search.bff" + active_months: 12 - name: "Webmotors Jenkins" source: jenkins @@ -41,13 +37,11 @@ connections: sync_interval_minutes: 15 scope: # Job list is loaded from config/jenkins-job-mapping.json (auto-generated). - # Generated 2026-04-14 by READ-ONLY SCM scan of 544 active PRD Jenkins - # jobs — each job's lastBuild → remoteUrls resolves the GitHub repo. - # Total: 577 PRD jobs across 283 repos. - # - # To regenerate: run scripts/discover_jenkins_jobs.py (READ-ONLY). - # The sync worker reads prd_jobs from the mapping file at startup. - jobs_from_mapping: true # Signals config.py to use jenkins-job-mapping.json + # Generated by READ-ONLY SCM scan of active PRD Jenkins jobs — each + # job's lastBuild → remoteUrls resolves the GitHub repo. Per + # ingestion-spec §3.6, regen via scripts/discover_jenkins_jobs.py + # when new repos appear (manual or weekly cron). 
+ jobs_from_mapping: true - name: "Webmotors Jira" source: jira @@ -56,19 +50,15 @@ connections: token_env: JIRA_API_TOKEN base_url: https://webmotors.atlassian.net sync_interval_minutes: 15 + # Per ingestion-spec §2.3 (Discovery-Only): NO explicit project list. + # `ProjectDiscoveryService` lists ALL Jira projects; `SmartPrioritizer` + # auto-activates projects with ≥3 PR references. Tenant config in + # `tenant_jira_config` controls discovery mode (must be 'smart' for + # auto-activation). PII-flagged projects require manual approval. scope: - projects: - # Canais Digitais Web (Kanban) - - "DESC" # PF - Descobrir veículo - - "ENO" # PF - Encontrar oferta - - "ANCR" # PF - Anunciar - - "PUSO" # PF - USO - # Canais Digitais App (Kanban) - - "APPF" # PF - Aplicativo - # Sprint-based projects - - "FID" # Fidelidade - - "CTURBO" # Consultor Turbo Lab - - "PTURB" # Portal Turbo Lab + mode: smart + smart_min_pr_references: 3 + smart_pr_scan_days: 90 # Issue status mapping — Webmotors Jira (Portuguese) → PULSE normalized # Primary source; overrides DEFAULT_STATUS_MAPPING in normalizer.py. diff --git a/pulse/docs/backlog/ops-backlog.md b/pulse/docs/backlog/ops-backlog.md index c3b7262..1b18bd1 100644 --- a/pulse/docs/backlog/ops-backlog.md +++ b/pulse/docs/backlog/ops-backlog.md @@ -787,3 +787,928 @@ clientes, não pela equipe. --- +## FDD-OPS-012 · Issue sync — batch-per-project (simetria com PRs) + +**Epic:** Data Pipeline Reliability · **Release:** R1 +**Priority:** **P1** · **Persona:** Engineering (visibility + memory safety) +**Owner class:** `pulse-data-engineer` +**Trigger:** 2026-04-28 — full re-ingestion travada por horas em fase +"search/jql" sem nenhuma issue persistida no DB. Diagnóstico: arquitetura +do `_sync_issues()` é bulk-fetch-then-persist, enquanto `_sync_pull_requests()` +foi migrada pra batch-per-repo em 2026-04-23 (commit `7f9f339`). Issues +ficou pra trás. + +### Problema + +`packages/pulse-data/src/workers/devlake_sync.py:_sync_issues()` segue o +padrão antigo: + +```python +raw_issues = await self._reader.fetch_issues(...) # ← BLOQUEIA até paginar TUDO +changelogs = await self._reader.fetch_issue_changelogs(ids) # ← + N calls extras +normalized = [normalize_issue(...) for raw in raw_issues] # ← TUDO em RAM +count = await self._upsert_issues(normalized) # ← upsert único +``` + +Para 32 projetos × ~12k issues médias = ~376k issues: +- **Tempo até primeira linha persistida**: 2-5h (pagination + changelogs serial) +- **Pico de memória**: ~1-2 GB de issue dicts (no atual setup, OK; se Webmotors crescer pra 1M+, OOM) +- **Visibilidade zero durante fetch**: `eng_issues.COUNT()` fica em 0 por horas — operadores acham que travou +- **Recovery se sync abortar mid-fetch**: zero progress preserved (toda paginação se perde) + +PRs já resolveram isso em `7f9f339`: + +```python +# devlake_sync.py:_sync_pull_requests() (post-7f9f339) +async for repo_name, raw_prs in self._reader.fetch_pull_requests_batched(since=since): + # 1 repo at a time → normalize → upsert → progress signal +``` + +Resultado: PRs persistem em batches de ~100 a cada poucos segundos, operador +vê COUNT crescendo, recovery preserva 95%+ do trabalho em caso de crash. + +### Solução + +Espelhar o padrão de PRs em issues: + +1. **Refactor `JiraConnector.fetch_issues()` em `fetch_issues_batched()`** — + AsyncIterator que yielda `(project_key, batch_of_issues)` por página JQL + (ou por projeto, granularidade a definir). + +2. 
**Refactor `_sync_issues()` em devlake_sync.py** — loop async sobre + batches, normaliza + upsert por batch, atualiza progress, publica Kafka + por batch. + +3. **Manter changelog fetch inline com expand=changelog** — não fazer call + separada `fetch_issue_changelogs(ids)`. JQL já suporta `expand=changelog` + inline (veja `jira_connector.py:212`). Verificar se está sendo usado. + +4. **Watermark batch-aware** — atualizar watermark a cada N batches (ex: 10), + não só no final. Permite resume após crash sem perder muito. + +### Acceptance Criteria + +``` +Given a fresh re-ingestion against a Webmotors-scale tenant (32 projects, 376k issues) + When _sync_issues() runs + Then eng_issues.COUNT() starts growing within 60 seconds (not after hours) + AND each batch persists ~100-500 issues + AND total memory peak stays below 800 MB (vs 2 GB current) + AND if the worker crashes mid-sync, ≥80% of fetched issues are already in DB + +Given the new batch-per-project mode is enabled + When operator queries `SELECT COUNT(*) FROM eng_issues` repeatedly + Then count increases monotonically during the sync (not 0 → 376k jump) + +Given Pipeline Monitor exposes /pipeline/ingestion-progress + When _sync_issues() is mid-run + Then progress endpoint shows current_source = "" and + records_ingested updates per batch (parity with PR sync) +``` + +### Anti-surveillance check +PASS — sem mudança em payload de métrica. Refactor é puramente sobre +fluxo de ingestão. + +### Dependencies +Nenhuma. Pode ser implementado isoladamente. + +### Estimate +**M (4-6h)**: +- 1.5h refactor `JiraConnector.fetch_issues_batched()` +- 1.5h refactor `_sync_issues()` em devlake_sync.py +- 1h ajustar progress tracking + watermarks +- 1-2h tests (unit pra batched fetcher + integration test contra fixture mock) + +### Riscos de não fazer + +- Cada full re-ingestion futura leva 3-5h cega (igual hoje) +- Quando Webmotors crescer ou primeiro tenant 2× maior chegar, OOM +- Operador não tem visibilidade durante o fetch — mascarando travas como + a que aconteceu hoje (cycle 2 falhou silenciosamente em 21:23 e ninguém + notou por 14h) + +### Bonus + +Esta FDD se conecta com **FDD-OPS-008** (per-endpoint perf budgets) — uma +vez que issues sync seja batched, fica viável adicionar performance +assertions: "batch persist deve completar em < 30s" → falha CI se regredir. + +--- + +## FDD-OPS-013 · Eliminate redundant `fetch_issue_changelogs` call in `_sync_issues` + +**Epic:** Data Pipeline Reliability · **Release:** R1 (P0 — fixes +24h+ blocking phase observed 2026-04-28) +**Priority:** **P0** · **Persona:** Data engineering, all customers +**Owner class:** `pulse-data-engineer` +**Trigger:** 2026-04-28 — full re-ingestion stuck for hours in +sequential `GET /rest/api/3/issue/{id}?expand=changelog` calls (~3 +calls/sec for 250k+ issues = ~24h estimated). Diagnosed as redundant. + +### Problema + +`_sync_issues()` faz duas chamadas que sobrepõem 100%: + +1. `fetch_issues()` — JQL search com `expand=changelog` inline. Já + retorna a changelog completa em `raw["changelog"]`. +2. `fetch_issue_changelogs(ids)` — chama `GET /issue/{id}?expand=changelog` + uma vez por issue. + +Resultado: 376k issues × ~300ms latência = **~31 horas de chamadas +redundantes** + pressão sobre rate limit Atlassian. + +O próprio connector documenta o problema (`jira_connector.py:267`): + +```python +def fetch_issue_changelogs(...): + """... 
+ Since fetch_issues already includes changelogs via expand=changelog, + this method is used for issues fetched WITHOUT expand (e.g., sprint issues). + """ +``` + +Mas em `devlake_sync.py:614`: + +```python +issue_ids = [str(raw["id"]) for raw in raw_issues] +changelogs_by_issue = await self._reader.fetch_issue_changelogs(issue_ids) # ← redundante +``` + +E `normalize_issue` recebe `changelogs=changelogs_by_issue.get(id, [])` em +vez de extrair `raw["changelog"]` direto. + +### Solução + +**1 mudança código + 1 teste:** + +```python +# devlake_sync.py:_sync_issues() +# REMOVER: +# issue_ids = [str(raw["id"]) for raw in raw_issues] +# changelogs_by_issue = await self._reader.fetch_issue_changelogs(issue_ids) + +# SUBSTITUIR por: +# (changelogs já estão em raw["changelog"] via expand) +for raw in raw_issues: + issue_changelogs = raw.get("changelog", {}).get("histories", []) + issue_data = normalize_issue( + raw, self._tenant_id, self._status_mapping, + changelogs=issue_changelogs, + ) + normalized.append(issue_data) +``` + +`fetch_issue_changelogs` permanece existindo — é usado SOMENTE para +sprint issues que vêm sem `expand` (esse caminho fica intocado). + +### Acceptance Criteria + +``` +Given full re-ingestion against Webmotors (32 projects, 376k issues) + When _sync_issues() runs + Then NO calls are made to GET /rest/api/3/issue/{id}?expand=changelog + (verify via httpx logs / mock) + AND eng_issues.status_transitions JSONB is populated correctly + (parity with current behavior — verified by domain-level tests) + AND total wall time for issues phase drops from ~24h to ~5min + +Given a fresh tenant has 1000 issues across 5 projects + When sync runs + Then changelogs are extracted from inline expand response + AND status_transitions field has same content as before +``` + +### Regression test + +Adicionar test em `packages/pulse-data/tests/integration/`: + +```python +def test_sync_issues_uses_inline_changelogs_only(): + # Mock JiraConnector.fetch_issues returning raw with "changelog" inline + # Mock fetch_issue_changelogs to record calls + # Run _sync_issues + # Assert mock_fetch_issue_changelogs.call_count == 0 + # Assert eng_issues.status_transitions populated correctly +``` + +Trava regressão futura (alguém pode "consertar" reintroduzindo a call). + +### Anti-surveillance check +PASS — sem mudança em payload/normalização, só elimina I/O redundante. + +### Estimate +**XS (1-2h)**: +- 30min: code change in `_sync_issues()` +- 30min: regression test +- 30min: validate against real Webmotors data (compare status_transitions before/after) +- ~30min margin + +### Dependencies +Nenhuma. Pode ser shipped imediatamente. + +### Risco de não fazer +Cada full re-ingestion (Webmotors hoje, novos tenants amanhã) leva 24h+ +em vez de minutos. SaaS-blocker. + +### Conexão com v2 architecture +Este é o "quick win Phase 1" do `docs/ingestion-architecture-v2.md`. Não +substitui Phases 2/3, mas elimina o pior gargalo single-handedly. + +--- + +## FDD-OPS-014 · Per-source workers + per-scope watermarks + +**Epic:** Data Pipeline Architecture · **Release:** R1 +**Priority:** **P1** · **Persona:** SaaS engineering team +**Owner class:** `pulse-data-engineer` + `pulse-engineer` +**Trigger:** 2026-04-27/28 incidents — sync-worker monolítico travado +em Jenkins (VPN off) bloqueando GitHub e Jira que estavam saudáveis. +Global watermark causando full backfill ao adicionar projetos novos. 
+ +### Problema (dois sintomas, uma causa) + +**Sintoma 1 — sem source isolation (AP-4):** + +`DataSyncWorker` é um único processo que roda 4 fases sequenciais +(`issues → PRs → deploys → sprints`). Todas as 4 fontes (GitHub, Jira, +Jenkins) compartilham: + +- Mesmo event loop +- Mesma cadence de sync +- Mesmo cycle order +- Mesmo failure handling + +Consequência: **Jenkins offline (VPN drop)** ou **Jira blip** travam +todo o ciclo, mesmo que GitHub esteja saudável. Onboarding de GitLab/ADO +significa ainda mais código no mesmo loop monolítico. + +A simétrica fica esquisita: `discovery-worker` JÁ é processo separado +(boa decisão em ADR-014). `sync-worker` ficou para trás. + +**Sintoma 2 — global watermark (AP-3):** + +`pipeline_watermarks` tem 1 row por `entity_type`, sem dimensão de +scope: + +```sql +entity_type='issues', last_synced_at='2026-04-26' -- aplica a TODOS os 32 projetos +``` + +Consequência: quando discovery ativa um novo projeto, a única forma de +backfill é resetar watermark para `2020-01-01`, o que **re-fetcha +TODOS os 200k+ issues dos projetos existentes** sem necessidade. + +### Solução (2 partes coesas) + +**Parte 1 — split sync-worker em 3 workers:** + +``` +docker-compose.yml: + sync-worker → REMOVE + github-sync-worker → NEW (apenas GitHub PRs) + jira-sync-worker → NEW (apenas Jira issues + sprints) + jenkins-sync-worker → NEW (apenas Jenkins deploys) +``` + +Cada worker: +- Próprio event loop +- Cadence configurável independente +- Health-aware: pre-flight check antes de iniciar fase +- Logging com tag de source para grep/filter + +**Parte 2 — per-scope watermarks:** + +Migration nova adiciona `scope_key` em `pipeline_watermarks`: + +```sql +ALTER TABLE pipeline_watermarks + ADD COLUMN scope_key VARCHAR(255) NOT NULL DEFAULT '*'; + +-- Drop unique on entity_type alone, replace: +ALTER TABLE pipeline_watermarks + ADD CONSTRAINT uq_watermark_scope + UNIQUE (tenant_id, entity_type, scope_key); +``` + +Watermarks viram: + +| tenant_id | entity_type | scope_key | last_synced_at | +|---|---|---|---| +| ...001 | issues | jira:project:BG | 2026-04-26 | +| ...001 | issues | jira:project:OKM | 2026-04-26 | +| ...001 | pull_requests | github:repo:foo/bar | 2026-04-26 | +| ...001 | deployments | jenkins:job:deploy-X | 2026-04-26 | + +Connector-side: `fetch_issues(project_key=..., since=watermark[scope_key])`. + +### Acceptance Criteria + +``` +Given Jenkins is unreachable (VPN off) + When the daily ingestion cycle runs + Then jenkins-sync-worker logs "unhealthy, skipping cycle" + AND github-sync-worker continues normally + AND jira-sync-worker continues normally + AND VPN reconnect → jenkins-sync-worker resumes from last per-scope watermark + +Given a NEW Jira project is auto-activated by discovery + When jira-sync-worker runs the next cycle + Then ONLY the new project's issues are backfilled (since 2020-01-01) + AND existing projects' issues are NOT re-fetched + AND pipeline_watermarks has a new row with scope_key=jira:project:NEW + +Given Webmotors has 32 active Jira projects + When jira-sync-worker runs incremental sync + Then 32 watermarks are queried (1 per scope) + AND each project syncs from its own last_synced_at + AND total cycle time scales linearly with new data, not historical data +``` + +### Anti-surveillance check +PASS — sem mudança em campos persistidos. 
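+
+Esboço ilustrativo da Parte 2 (resolução de watermark por escopo): o worker tenta o escopo
+específico, cai para a linha legada `'*'` e, por último, para o início do backfill. Nome de
+função e assinatura são hipotéticos; a implementação real fica no watermark repository.
+
+```python
+from datetime import datetime
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+_WATERMARK_SQL = text(
+    "SELECT last_synced_at FROM pipeline_watermarks "
+    "WHERE tenant_id = :tenant_id AND entity_type = :entity_type "
+    "  AND scope_key = :scope_key"
+)
+
+
+async def resolve_since(
+    session: AsyncSession,
+    tenant_id: str,
+    entity_type: str,
+    scope_key: str,          # ex.: "jira:project:BG"
+    backfill_start: datetime,
+) -> datetime:
+    # Escopo específico → linha legada '*' → início do backfill (escopo novo).
+    for key in (scope_key, "*"):
+        result = await session.execute(
+            _WATERMARK_SQL,
+            {"tenant_id": tenant_id, "entity_type": entity_type, "scope_key": key},
+        )
+        value = result.scalar_one_or_none()
+        if value is not None:
+            return value
+    return backfill_start
+```
+
+Assim, um projeto recém-ativado faz backfill só do próprio escopo, enquanto os 32 projetos
+existentes seguem incrementais a partir do próprio `last_synced_at`.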
+ +### Estimate +**M-L (1 semana)**: +- 1 dia: extract per-source workers (refactor `DataSyncWorker`) +- 0.5 dia: docker-compose + Dockerfile per-source +- 1 dia: schema migration + watermark repo refactor +- 1 dia: connector-side scope filtering (Jira `project_keys` already there; GitHub repo-by-repo already there; Jenkins per-job) +- 1 dia: testes (especialmente o cenário VPN drop simulation) +- 0.5 dia: Pipeline Monitor UI per-source breakdown +- ~1 dia margin + +### Dependencies +- FDD-OPS-013 (deve shipping antes pra simplificar refactor) +- FDD-OPS-012 (issue batch-per-project) idealmente ships antes — mas + pode ser paralelo + +### Risco de não fazer +- Cada outage de fonte (VPN, rate-limit, Atlassian incident) trava todo + o pipeline +- Onboarding de GitLab/ADO/Linear adiciona código na monolita já + frágil +- SaaS multi-tenant inviável sem isolation entre tenants → entre sources + é o primeiro passo + +### Conexão com v2 architecture +Este é o "Phase 2" de `docs/ingestion-architecture-v2.md`. Phase 3 (job +queue + worker pool) constrói em cima. + +--- + +## FDD-OPS-015 · Observable ingestion: pre-flight estimates + per-scope progress + ETA + +**Epic:** Data Pipeline / Ops Visibility · **Release:** R1 +**Priority:** **P1** · **Persona:** Operators (you, on-call), data engineering +**Owner class:** `pulse-data-engineer` + `pulse-engineer` (UI) +**Trigger:** 2026-04-27/28 — 5 cycles where I gave estimates ("ETA +45min") that were wrong by 10×+. Operator (você) cannot answer "is it +stuck?" without diving into logs. `COUNT(*)` is useless during +bulk-fetch. + +### Problema + +Atualmente: + +1. **Sem pre-flight count.** Worker não pergunta "quantas issues match + esse JQL?" antes de iniciar. Apenas começa. +2. **Sem rate-aware ETA.** Pace medido (ex: 27 calls/min) não é + usado pra calcular tempo restante. +3. **Sem per-scope progress.** Quando preso, impossível distinguir + "BG (197k) ainda não terminou" de "estamos no projeto X". +4. **Pipeline Monitor mostra agregado per-entity_type**, não per-scope. + +Consequência operacional: **5 falsos alarmes de progresso esta semana**. + +### Solução (3 entregas coesas) + +**1. Pre-flight estimate per scope:** + +Em cada início de fase, o worker chama o source pra contar: + +```python +# Jira: count via JQL count +estimate = await jira.count_issues(project_key=BG, since=watermark) +# logs: "[scope=jira:project:BG] estimated 12,450 issues since 2026-04-26" +``` + +Se a count call em si for muito cara (alguns sources não suportam), +heuristic: "X items since Y, extrapolated." + +**2. Per-batch progress with rate-aware ETA:** + +Cada batch persistido emite progress event: + +```python +{ + "scope": "jira:project:BG", + "phase": "fetching", + "items_done": 1200, + "items_total_estimate": 12450, + "items_per_second": 18.5, + "eta_seconds": 608, + "started_at": "...", + "current_high_water": "2026-04-27T10:23:00Z" +} +``` + +Tabela nova `pipeline_progress` (live + historical): + +```sql +CREATE TABLE pipeline_progress ( + id UUID PRIMARY KEY, + tenant_id UUID, + scope_key VARCHAR(255), + entity_type VARCHAR(64), + phase VARCHAR(32), -- fetching | normalizing | persisting | done | failed + items_done INT, + items_estimate INT, + items_per_second DOUBLE PRECISION, + eta_seconds INT, + started_at TIMESTAMPTZ, + last_progress_at TIMESTAMPTZ, + status VARCHAR(16), -- running | done | failed | paused + last_error TEXT +); +``` + +**3. 
Endpoint `/pipeline/jobs` + Pipeline Monitor UI per-scope:** + +``` +GET /data/v1/pipeline/jobs + +[ + { + "scope": "jira:project:BG", + "entity_type": "issues", + "status": "running", + "items_done": 1200, + "items_estimate": 12450, + "progress_pct": 9.6, + "eta_seconds": 608, + "rate_per_sec": 18.5, + "started_at": "...", + "errors": [] + }, + ... +] +``` + +Pipeline Monitor UI ganha tab "Per-scope progress" com tabela tipo Top Hat: +scope, status, %, ETA, current rate, errors. + +### Acceptance Criteria + +``` +Given a fresh ingestion against 32 projects + When operator queries /pipeline/jobs after 30s + Then response includes 32 rows (1 per active scope) + AND each row has status, items_done, ETA, rate + AND ETA accuracy: actual_completion_time within ±20% of estimate + (measured: ETA at 10% complete vs actual completion at 100%) + +Given an ingestion job stalls (network blip, source down) + When 60 seconds pass without progress + Then job's last_progress_at falls > 60s behind now() + AND UI displays "stalled" badge + AND on-call gets clear signal "scope X is stuck" + +Given operator wants to investigate a slow source + When opens Pipeline Monitor → Per-scope tab + Then can sort by items_per_second + AND can filter by entity_type/source + AND can see error history per scope +``` + +### Anti-surveillance check +PASS — progress data is metadata about ingestion, not user activity. + +### Estimate +**M (3-5 dias)**: +- 0.5 dia: schema migration `pipeline_progress` +- 1 dia: pre-flight count helpers (Jira count JQL, GitHub repo count, Jenkins job count) +- 1 dia: per-batch progress emission + ETA calculation +- 0.5 dia: `/pipeline/jobs` endpoint +- 1 dia: Pipeline Monitor UI tab per-scope +- 0.5 dia: tests + dashboard polish + +### Dependencies +- FDD-OPS-014 (per-scope watermarks) é pré-requisito do per-scope + progress +- FDD-OPS-012 (batch-per-project) facilita progress emit per-batch + +### Riscos +- Pre-flight count aumenta tempo total se overhead alto. Mitigar: se + count > 5s, usar heuristic +- Estimate ruim no início (até medir rate real) — aceitar e refinar a + cada batch + +### Conexão com v2 architecture +Este é o "Phase 1.5" de `docs/ingestion-architecture-v2.md`. Crítico +para evitar repetir o ciclo de "estimar 45min, esperar 4h, descobrir +que travou". + +--- + +## FDD-OPS-016 · Effort estimation fallback chain (Story Points → T-shirt → Hours → Count) + +**Epic:** Data Quality · **Release:** R1 +**Priority:** **P1** · **Persona:** Data consumer / metric layer +**Owner class:** `pulse-data-engineer` · **Status:** SHIPPED 2026-04-28 + +### Problema confirmado + +Panorama do Pulse DB em 2026-04-28 mostrou **`story_points = 0` em todas +as 311.007 issues**. 
Investigação na instância Jira da Webmotors revelou: + +- **`customfield_10004` ("Story Points")**: 0% populado em todos os 69 projetos +- **`customfield_18524` ("Story point estimate")**: 0% populado também +- Webmotors **não usa Story Points como método de estimativa** + +Distribuição real por projeto (amostra de 50 issues): + +| Projeto | T-Shirt Size | Original Estimate (h) | Tamanho/Impacto | Padrão | +|---------|--------------|------------------------|------------------|--------| +| ENO | 24% | 52% | 4% | Horas + tshirt | +| DESC | 26% | 34% | 6% | Horas + tshirt | +| APPF | 0% | 12% | 0% | Horas (raro) | +| OKM | 4% | 8% | 0% | Quase Kanban | +| BG, FID, PTURB | 0% | 0% | 0% | **Kanban puro** | + +Sem fallback, métricas de velocity, throughput-by-effort e forecast +ficavam zeradas para 100% das issues — bloqueando todo o pilar Lean. + +### Solução implementada + +Cadeia de fallback em `JiraConnector._extract_story_points`: + +1. **Story Points / Story point estimate** (numérico) — uso direto +2. **T-Shirt Size** (option) — mapa Fibonacci: PP=1, P=2, M=3, G=5, GG=8, GGG=13 +3. **Tamanho/Impacto** (option) — mesmo mapa +4. **`timeoriginalestimate`** (segundos) — buckets: ≤4h=1, ≤8h=2, ≤16h=3, ≤24h=5, ≤40h=8, ≤80h=13, >80h=21 +5. **`None`** — issue genuinamente não estimada + +Discovery automático via `_discover_custom_fields` casa por nome +("t-shirt size", "tamanho/impacto") — não hardcode customfield IDs. + +Telemetria de origem (`_effort_source_counts`) loggada por batched run: +operadores conseguem ver se o squad migrou de horas pra t-shirt sem +combar logs. + +### Quando `story_points = None` (Kanban puro) + +Quando nada está populado, a métrica downstream **DEVE contar items** +em vez de somar pontos. Esta decisão fica na camada de métricas, **não** +no normalizer. O normalizer só extrai o que existe. + +### Regras de mapeamento — escolhas e por quê + +- **Fibonacci-aligned**: comum na indústria, métricas downstream já + esperam essa escala +- **Hours buckets calibrados** contra valores observados na Webmotors + (2h–124h, múltiplos de 4) — cada valor comum cai num bucket sensato +- **Skipa SP = 0**: sentinel comum para "não estimado", trata como falta + +### Validação live + +Projeto CRMC (1.375 issues, ingestão completa pós-fix): +- **52,3% com effort estimado** (719/1.375 issues) +- Distribuição de valores: 1, 2, 3, 5, 8 — confirma escala Fibonacci aplicada + +### Migração dos 311k issues legados + +Como o upsert sobrescreve `story_points` em re-sync, os 311k issues +existentes vão receber o effort correto **conforme cada projeto recebe +updates incrementais**. Para acelerar, op pode resetar watermarks +por projeto via SQL — custo: re-fetch da API Jira. + +### Arquivos +- `pulse/packages/pulse-data/src/connectors/jira_connector.py`: + - Constants `TSHIRT_TO_POINTS`, `_hours_to_points`, patterns + - `_discover_custom_fields` agora detecta tshirt fields + - `_extract_story_points` reescrito com cadeia de fallback + - Telemetria via `_effort_source_counts` + log no fim de batched fetch +- `pulse/packages/pulse-data/tests/unit/test_effort_fallback_chain.py`: + 34 testes cobrindo cada hop, cada size, cada bucket de horas + +### Anti-surveillance check +PASS — apenas valores agregados de effort são extraídos; nenhum dado +identificador de pessoa é coletado. + +### Próximo passo (deferido) +Adicionar coluna `effort_source` em `eng_issues` para auditoria por +issue (qual hop produziu o valor). Útil para debugging mas não +bloqueante. 
Cobertura atual via telemetria batched é suficiente +para R1. + +--- + +## FDD-OPS-017 · Status normalization with statusCategory fallback + +**Epic:** Data Quality (foundational) · **Release:** R1 +**Priority:** **P0** (corrupts every flow metric) · **Persona:** All metric consumers +**Owner class:** `pulse-data-engineer` · **Status:** SHIPPED 2026-04-29 + +### Problema confirmado + +Audit do panorama em 2026-04-28 mostrou distribuição absurda de +`normalized_status` em 311k issues: + + - 96,5% `done` · 3,3% `todo` · 0,2% `in_progress` · 0,1% `in_review` + +A Webmotors tem **104 status raw distintos** em workflows ativos. Nosso +`DEFAULT_STATUS_MAPPING` cobria ~50, então 50+ status caíam silenciosamente +no fallback "Unknown → todo" — incluindo: + +| Status raw | Issues afetadas | Bucket atual | Bucket correto | +|---|---|---|---| +| `FECHADO EM PROD` | 2.881 | todo | done | +| `Em Progresso` | 6 | todo | in_progress | +| `Em desenv` | 4 | todo | in_progress | +| `Em Deploy Produção` | 14 | todo | in_progress | +| `Em Monitoramento Produção` | 3 | todo | done | +| `Homologação` | 9 | todo | in_review | +| `Em Verificação` | 4 | todo | in_review | +| ... | ... | ... | ... | + +**Impacto em CASCATA**: status_transitions herdam a classificação errada, +então o último estado de uma issue concluída ficava registrado como +`todo`. Resultado: + +- **Cycle Time** infinito (não há transição para `done`) +- **Throughput** sub-conta (issues entregues não aparecem) +- **WIP** super-conta (issues finalizadas continuam "em fluxo") +- **CFD** distorcido (área de "todo" inflada) +- **Lead Time** indeterminado + +Sem o fix, **todo o pilar Lean** está comprometido para qualquer projeto +que use status PT-BR fora do nosso mapping. + +### Solução implementada + +**Estratégia híbrida** em 3 camadas: + +1. **Mapping textual** (`DEFAULT_STATUS_MAPPING`) — preserva a + granularidade `in_progress` vs `in_review` que as métricas curadas + precisam. Expandido para cobrir os top 80+ status PT-BR observados. + +2. **Fallback `statusCategory.key` da Jira** — fonte autoritativa para + a dimensão `done` vs `não-done`. Descoberto via `/rest/api/3/status` + (chamada única por lifetime do conector, ~326 status definitions na + Webmotors). + - `done` → `done` + - `indeterminate` → `in_progress` + - `new` → `todo` + +3. **Default final** `todo` com WARN log — só atinge status sem + categoria (extremamente raro). + +### Arquivos modificados + +- `pulse/packages/pulse-data/src/connectors/jira_connector.py`: + - `_discover_status_categories()` — descobre + cacheia `name → category` + - `_map_issue` anexa `status_category` (current) e + `status_categories_map` (todos, para histórico de transitions) +- `pulse/packages/pulse-data/src/contexts/engineering_data/normalizer.py`: + - `normalize_status(raw, mapping, status_category=...)` — assinatura nova + - `build_status_transitions(..., status_categories_map=...)` — classifica + cada `to_status` histórica via map + - `DEFAULT_STATUS_MAPPING` expandido (~80 entradas novas PT-BR) +- `pulse/packages/pulse-data/tests/unit/test_status_normalization.py`: + 44 testes novos (textual ganha quando definido, category fallback, + Webmotors regression cases, transitions integração) + +### Validação live + +Cross-check do mapping contra DB atual mostrou que **3.151 issues +reclassificarão** quando o sync re-tocar (1% do total): + + - 2.923 `todo → done` (FECHADO EM PROD/HML, etc.) + - 161 `todo → in_review` (Homologação, Verificação, etc.) 
+ - 67 `todo → in_progress` (Em Progresso, Em desenv, etc.) + +Esses 3.151 representam o "long tail" cuja má classificação distorcia +métricas individuais. Os ~300k issues `done` corretos continuam corretos. + +### Backfill dos legados + +Como o upsert sobrescreve `normalized_status` e `status_transitions`, +issues vão se reclassificar conforme cada projeto receber updates +incrementais. Para acelerar há duas opções: + +1. **Reset watermark por projeto** (custo: re-fetch da API Jira) +2. **Migration script futuro** — recalcular `normalized_status` e + `status_transitions[].status` direto via SQL (sem refetch). Decidido + deixar para issue separada — muda dado em produção, requer plano. + +### Anti-surveillance check +PASS — apenas valores de status agregados; nenhum dado pessoal. + +### Test coverage +116/116 verde (44 novos + 72 existentes). Cobertura inclui: +- Textual mapping ganha sobre category mismatch +- Cada categoria Jira fallback (`done` / `indeterminate` / `new`) +- Casos PT-BR Webmotors regressão +- Backward compat (legacy callers sem category) +- `build_status_transitions` integrado com category map + +### Decisão de produto registrada + +`FECHADO EM HML` foi mapeado como `done` (segue Jira) em vez de +`in_review`. Workflow author classifica como done; respeitamos. Se +Webmotors quiser mantê-lo em fluxo, pode renomear para "Aguardando +Deploy Produção" (já mapeado como in_progress). + +--- + +## FDD-OPS-018 · Sprint status pipeline — 4-layer cheese fix + +**Epic:** Data Quality (sprint metrics) · **Release:** R1 +**Priority:** **P1** · **Persona:** Sprint metric consumers +**Owner class:** `pulse-data-engineer` · **Status:** SHIPPED 2026-04-29 + +### Problema confirmado + +100% das 216 sprints na Webmotors estavam com `status=''` no `eng_sprints`. +O `goal` também totalmente vazio. Investigação revelou um clássico +"swiss cheese alignment" — **4 bugs independentes** em camadas diferentes, +cada um sozinho garantia que o status nunca fosse populado: + +| Camada | Bug | Sintoma sozinho | +|---|---|---| +| 1. Connector | `_map_sprint` mapeava OK (ACTIVE/CLOSED/FUTURE) | — | +| 2. Normalizer | `normalize_sprint` retornava dict SEM `status` | Status nunca chega no upsert | +| 3. Worker upsert | `_upsert_sprints` ON CONFLICT não atualizava `status`/`goal` | Sprints existentes nunca atualizam | +| 4. Connector watermark | `_fetch_board_sprints` filtrava por `started_date < since` | Sprints antigas nunca re-fetchadas | +| 5. ORM model | `EngSprint` no SQLAlchemy não tinha campo `status` (schema drift) | `Unconsumed column names: status` | + +A camada 4 é particularmente insidiosa: sprint state transitions +(`active` → `closed`) acontecem em `endDate`, não `startDate`. Filtrar +por started_date significa que uma sprint que começou em março e +fechou em maio nunca tem o status atualizado depois de março. + +### Impacto métrico (atual e futuro) + +Atualmente nenhum métrico consome `eng_sprints.status` diretamente — +por isso o bug ficou silencioso. Mas: +- **Sprint Comparison / Velocity Trend** (já em código) precisa filtrar + sprints `closed` para excluir sprints em andamento da regressão linear +- **Dashboard "current sprint"** (planejado) precisa de `status='active'` +- **Carryover Rate** já usa heurística de `endDate < now()` mas o ideal + é confiar em status='closed' +- **Goal** é input visual importante para a página da sprint + +### Solução implementada + +**Fix em todas as 4 camadas**: + +1. `JiraConnector._map_sprint` agora também passa `goal` adiante +2. 
`normalizer.normalize_sprint` inclui `status` (lowercase: `active`/ + `closed`/`future`/None) e `goal` (com strip de null bytes) +3. `_upsert_sprints` ON CONFLICT atualiza `status` + `goal` +4. `_fetch_board_sprints` removeu o filtro de watermark (volume baixo, + sprints mudam estado ao longo do tempo, sempre re-fetch é correto) +5. `EngSprint` model adiciona `status: Mapped[str|None]` (corrige drift) + +Helper `_normalize_sprint_status` mapeia aliases comuns (open→active, +completed→closed, planned→future) e devolve `None` para valores +desconhecidos — não bucketiza silenciosamente. + +### Validação live + +Após o fix + ad-hoc backfill direto: + +| Status | Quantidade | Tem goal? | +|---|---|---| +| `closed` | 187 | sim | +| `active` | 3 | sim | +| `future` | 5 | sim | +| (vazio) | 22 | — board órfão 873 sem projeto ativo | + +**195/217 = 89,9%** das sprints com status correto + 70% com goal real +(ex: "Gestão de banner no backoffice de CNC e TEMPO para novas +especificações técnicas"). As 22 vazias são de board órfão, fora do +escopo deste fix. + +### Tests +- `tests/unit/test_sprint_normalization.py` — 26 testes novos: + - status field presente no dict (5 cenários) + - unknown values retornam None (4) + - aliases (13 mapeamentos) + - goal passthrough (3) + - structural anti-regression: `_upsert_sprints.set_` inclui status + goal +- 142/142 verde (pyramid completo) + +### Lição aprendida — guard against future drift + +ORM model drift was the most insidious of the 4 bugs. The DB had the +column for ages; only the SQLAlchemy `EngSprint` was missing it. Any +upsert path that included `status` would crash; any path that omitted +it would silently produce empty data. Prevention going forward: + +- Pyramid test step "schema introspection vs ORM model" (deferred — + candidate for FDD-OPS-001 line of defense) +- Migration review checklist: every new column → corresponding + Mapped column in SQLAlchemy model + +### Anti-surveillance check +PASS — `goal` is squad/sprint-level free text, no individual attribution. + +--- + +## FDD-DEV-METRICS-001 · Codename "dev-metrics" — proprietary estimation & forecasting model + +**Epic:** Product Differentiation · **Release:** R3+ (codename "dev-metrics") +**Priority:** **P3** (large-scope, visionary) · **Persona:** Eng Manager + Squad Lead +**Owner class:** `pulse-product-director` + `pulse-data-scientist` + `pulse-engineer` +**Status:** PLANNED — capture only, do not start + +> **Marcador estratégico**: este FDD reserva o espaço no backlog do projeto +> codinome **"dev-metrics"**, que vai reescrever completamente a UX/UI do +> PULSE adicionando dezenas de features proprietárias e únicas na indústria. +> Documentação completa virá no próprio release plan do "dev-metrics" — esta +> entrada apenas garante que o tema **não se perde** entre R1 e R3. + +### Por que existe este card + +Hoje (R1) usamos uma cadeia de fallback **automática e implícita** para +extrair effort estimation (FDD-OPS-016). Isso resolve o problema imediato +mas **assume convenções** (Fibonacci scale, hours-bucket mapping). Squads +diferentes têm filosofias diferentes: + +- "Story Points são nosso golden standard" +- "Horas são mais honestas" +- "Tamanho de camisa só é útil pra refinement, não pra forecast" +- "Não estimamos. Throughput by item é nosso único KPI" + +Cada filosofia gera métricas diferentes. Hoje somos opinionados; +amanhã queremos ser **configuráveis** por squad e ainda **proativos**: +sugerir ao squad qual método cabe melhor com base no histórico real. 
+ +### Visão (R3 — projeto "dev-metrics") + +1. **Per-squad estimation method** (admin UI): + - Squad escolhe: SP nativo, T-shirt, Hours, Count-only, ou "Auto" + - PULSE respeita a escolha em **toda** a métrica (velocity, forecast, + CFD por effort, scatterplot) + - Auto-mode: usa fallback chain atual + telemetria + +2. **Modelo proprietário de previsão e insights** (vantagem competitiva): + - Identifica drift de estimativa (squad marcando tudo como "M" há + 6 sprints) + - Calibra automaticamente: "Vocês marcaram esse card como P, mas + histórico de issues do tipo 'bug' com label 'auth' nesta squad + teve 73% de chance de virar G/GG" + - Insight de método: "73% das squads kanban-puras como vocês têm + throughput estável; vocês não — possível causa: variabilidade no + refinement" + - Forecast com Monte Carlo usando o método nativo do squad + - **Anti-surveillance**: insights são sobre o squad/processo, + **nunca** sobre indivíduos + +3. **UX completa rescritia**: + - Dashboard reescrito ao redor do método escolhido + - Painel "estimation health" novo + - Drill-down comparativo: "como seria sua velocity se vocês tivessem + adotado method X há 3 sprints?" + +### Diferenciador + +Concorrentes (LinearB, Jellyfish, Swarmia, Athenian) hoje são opinionados +em SP. PULSE será o **único** que respeita filosofia da squad e usa +isso como entrada de modelo, não como ruído a ser normalizado. + +### Pré-requisitos (capturar agora) + +Quando "dev-metrics" começar: +1. **`effort_source`** já estar em `eng_issues` (next step do + FDD-OPS-016) — sem isso, modelo proprietário não tem feature de método +2. **Histórico estatístico** mínimo de ~6 sprints por squad (ou ~30 + ciclos de Cycle Time pra Kanban) — bootstrap funciona em paralelo +3. **Multi-tenant scope_key** (FDD-OPS-014) — consolidado, OK +4. **Anti-surveillance review** rigoroso — modelo NÃO pode personalizar + por indivíduo, só por squad/repo + +### Lembrete operacional (CRÍTICO) + +**Não esquecer ao chegar em R2/R3.** Este FDD existe especificamente +para resgatar o tema. Reviewer de release plan deve checar: +- ✅ FDD-DEV-METRICS-001 ainda apontado no roadmap? +- ✅ `effort_source` adicionado antes do R3 começar? +- ✅ Telemetria do fallback chain ainda gerando dados utilizáveis? + +### Anti-surveillance check +PASS by design — modelo opera em agregado por squad/issue-type, nunca +por pessoa. Precisa review formal do CISO antes do release. + +### Estimate +**XL (multi-sprint, R3)** — escopo de release inteiro, não card único. + +### Dependencies +- FDD-OPS-016 (effort fallback chain) — base hoje +- FDD-OPS-014 (per-scope) — entregue +- Future migration: adicionar coluna `effort_source` em `eng_issues` + +--- + diff --git a/pulse/docs/ingestion-architecture-v2.md b/pulse/docs/ingestion-architecture-v2.md new file mode 100644 index 0000000..98bf48e --- /dev/null +++ b/pulse/docs/ingestion-architecture-v2.md @@ -0,0 +1,657 @@ +# PULSE Ingestion Architecture — v2 Review + +**Status:** Proposal · **Author:** orchestrator (post-mortem of 2026-04-27/28 incidents) +**Audience:** `pulse-data-engineer`, `pulse-engineer`, `pulse-product-director` +**Companion docs:** `ingestion-spec.md` (current architecture), `metrics/metrics-inconsistencies.md` (data quality history) + +--- + +## 1. Why this document exists + +This week's full re-ingestion against the Webmotors tenant exposed +structural defects in PULSE's ingestion pipeline that **cannot be +fixed by patches**. 
Five distinct failures in five days: + +| # | Date | Failure | Time wasted | +|---|---|---|---| +| 1 | 2026-04-23 | Snapshot drift (FDD-OPS-001) — workers running stale code | hours of debugging across 3 incidents | +| 2 | 2026-04-27 | `make seed-reset` wiped 442k rows of real Webmotors data without explicit gate | full re-ingestion required | +| 3 | 2026-04-27 | `metrics_snapshots` 50× perf regression at 7M rows — partial index missing | dashboard erro, ~2h diagnose+fix | +| 4 | 2026-04-27 21:23 | Cycle 2 failed silently — Jira ConnectionError (network blip) → 0 issues persisted → unnoticed for 14h | 14h × engineer attention | +| 5 | 2026-04-28 | Sync stuck 1.5h in JQL pagination, then hours in `fetch_issue_changelogs` (estimated 24-28h to converge) | currently running, decision pending | + +Each was **rational locally** when shipped. The sum is **not viable +for SaaS**. When we onboard the second tenant, every problem above +multiplies; when we onboard tenant N, we never finish. + +The user-stated target: **at least 10× improvement in speed, +simplicity, resilience, and security.** + +This document is the proposal. + +--- + +## 2. The five anti-patterns we keep hitting + +### AP-1: Bulk-fetch-then-persist (issues only) + +**Symptom (today):** `eng_issues.COUNT() = 0` for **3+ hours** while +sync worker buffers 250k+ issues in memory before any DB write. + +**Code:** `packages/pulse-data/src/workers/devlake_sync.py:_sync_issues()` +lines 605-635: + +```python +raw_issues = await self._reader.fetch_issues(...) # blocks until ALL 32 projects paginated +changelogs = await self._reader.fetch_issue_changelogs(ids) # 1 GET per issue (250k+ HTTP calls) +normalized = [normalize_issue(...) for raw in raw_issues] # all in memory +count = await self._upsert_issues(normalized) # single bulk upsert +``` + +**Why it's wrong:** +- Time-to-first-row (TTFR): hours, not seconds +- Memory: 1.5+ GB peak (manageable today, OOM at 2× scale) +- Visibility: operator queries `COUNT(*)`, sees 0, can't tell if working or stuck +- Recovery: crash mid-sync = lose 100% of fetched work + +**Notable:** PRs ALREADY escaped this pattern via commit `7f9f339` +(2026-04-23), which made `_sync_pull_requests` batch-per-repo. PR sync +now persists ~100 rows every few seconds — operator sees `COUNT(*)` +growing in real-time. Issues was missed in that refactor. + +**Tracked:** FDD-OPS-012 (created 2026-04-28). + +--- + +### AP-2: Redundant API calls + +**Symptom (today):** worker is hitting `GET /rest/api/3/issue/{id}?expand=changelog&fields=status` +once per issue — ~3 calls/sec. For 250k issues this is ~24 hours of +blocking HTTP work. + +**Code:** `devlake_sync.py:614`: + +```python +issue_ids = [str(raw["id"]) for raw in raw_issues] +changelogs_by_issue = await self._reader.fetch_issue_changelogs(issue_ids) +``` + +**Why it's wrong:** `fetch_issues()` already requests `expand=changelog` +on the JQL search (`jira_connector.py:240`). The changelog data is +**already in `raw_issues`** — the separate fetch is duplicate work. + +The connector itself documents this: + +```python +# jira_connector.py:267 +def fetch_issue_changelogs(...): + """... + Since fetch_issues already includes changelogs via expand=changelog, + this method is used for issues fetched WITHOUT expand (e.g., sprint issues). + """ +``` + +**Why it survives:** there's no test asserting "main issues sync uses +inline changelogs". The redundant call is invisible until production +scale exposes it. 
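+
+A minimal sketch of that missing guard, using pytest and `unittest.mock`. The
+`make_sync_worker` fixture is hypothetical; it stands in for whatever harness the
+`devlake_sync` integration suite already uses to wire a worker against a stubbed reader.
+
+```python
+from unittest.mock import AsyncMock
+
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_sync_issues_uses_inline_changelogs_only(make_sync_worker):
+    reader = AsyncMock()
+    # The JQL search already carries changelogs inline via expand=changelog.
+    reader.fetch_issues.return_value = [
+        {"id": "1", "key": "BG-1", "fields": {}, "changelog": {"histories": []}}
+    ]
+    worker = make_sync_worker(reader=reader)
+
+    await worker._sync_issues()
+
+    # The main issues path must never fall back to the per-issue changelog endpoint.
+    reader.fetch_issue_changelogs.assert_not_called()
+```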
+ +**Cost:** 376k HTTP calls × ~300ms = ~31 hours of pure API latency, +plus Atlassian rate-limit pressure. + +**Fix:** one-line — replace the separate call with read from +`raw["changelog"]` field already present in JQL response. + +**Tracked:** to be opened as FDD-OPS-013. + +--- + +### AP-3: Sequential phases with global watermark + +**Symptom (yesterday):** cycle 2 hit a Jira ConnectionError at 21:23, +issues sync errored silently with 0 results, sync moved on to PRs/deploys/ +sprints (which succeeded), watermark for issues never advanced. Next +14 hours of cycles wasted because the worker kept trying issues with +the same scope, hitting the same ordering issue, never producing data. + +**Code:** `devlake_sync.py:DataSyncWorker.sync()` runs phases in fixed +order: + +```python +1. _sync_issues() # fails silently → 0 issues +2. _sync_pull_requests() # ok → 63131 PRs +3. _sync_deployments() # ok → 1376 deploys +4. _sync_sprints() # ok → 216 sprints +``` + +`pipeline_watermarks` has ONE row per `entity_type` regardless of scope: + +```sql +entity_type='issues', last_synced_at='2020-01-01' (when reset) +``` + +**Why it's wrong:** + +1. **Single failure point**: failure in any phase doesn't degrade + gracefully; watermark stays where it was, next cycle reruns same + work, no signal that "issues broke at 21:23, PRs were fine". + +2. **Global watermark = full backfill on scope expansion**: when + discovery activates a new project, we have to reset watermark to + 2020-01-01 to backfill — but this also re-fetches the 200k + already-ingested issues from existing projects. Wasteful. + +3. **No bulkheads**: if Jira has a hiccup, issues phase blocks. No + timeout, no skip, no degraded mode. + +**Tracked:** to be opened as FDD-OPS-014 (per-scope watermarks + +phase isolation). + +--- + +### AP-4: No source isolation + +**Symptom (today AM):** sync worker stuck retrying Jenkins jobs +(VPN was off overnight) — every cycle would burn ~10s × 200 dead jobs += 30+ minutes on Jenkins timeouts before getting to anything else. + +**Code:** all four sources (GitHub, Jira, Jenkins, future GitLab) +share **one process**, **one event loop**, **one cycle order**. + +**Why it's wrong:** + +- Jenkins outage (VPN, infra) blocks GitHub sync (which works fine) +- Jira rate-limited → blocks deployment ingestion that doesn't touch Jira +- One slow source = global throughput floor +- Adding GitLab/ADO/Linear means more code in the same shared loop + +**The asymmetry:** discovery already has its OWN worker +(`discovery_scheduler.py`). The sync side wasn't given the same +treatment. + +**Tracked:** FDD-OPS-014 (covers per-source workers). + +--- + +### AP-5: Estimate-and-pray (no real observability) + +**Symptom (every cycle):** I tell you "ETA 45min", we wait 4h, find +out it's stuck, restart, lose work. We've done this **5 times this +week**. Each time my estimate is plausible at start, wrong by an +order of magnitude after exposure. + +**Why estimates fail:** + +1. **No pre-flight cost estimate.** We don't ask Jira "how many issues + match this JQL?" before fetching. We don't ask GitHub "how many PRs + in active repos last 12 months?" We just start and hope. + +2. **Progress proxy is `COUNT(*)`** — but in bulk-fetch mode (AP-1), + COUNT stays 0 until the very end. Useless during the long phase. + +3. **No rate-aware ETA.** When pace is 27 calls/min for 10 minutes, + we don't multiply by remaining work to get a real ETA. + +4. 
**No per-scope visibility.** When stuck, we can't tell "is BG + project taking forever, or is OKM done and we're on a small one?" + +**Tracked:** FDD-OPS-015 (observable ingestion: pre-flight estimate + +per-scope progress + rate-aware ETA). + +--- + +## 3. Target Principles for v2 (the 10× envelope) + +These are non-negotiable design constraints. Every code change in +ingestion lands or is rejected against these. + +### P-1: Stream by default — Time-to-first-row (TTFR) ≤ 60s + +Every fetcher is an `AsyncIterator` yielding small batches (50-200 +items). Each batch: +- normalize → upsert → emit Kafka event → ack → advance watermark + +Memory bound: ~10 MB max in flight at any time, regardless of total volume. + +**Effect:** operator sees row count growing from minute 1. Crash +recovery loses ≤1 batch. + +### P-2: Source-isolated workers (bulkheads) + +One worker process **per source** (github-sync-worker, jira-sync-worker, +jenkins-sync-worker, future gitlab-sync-worker). Independent: + +- Event loop +- Cycle cadence +- Watermarks +- Failure handling +- Rate-limit budget + +**Effect:** Jira down ≠ GitHub down ≠ Jenkins down. Onboarding GitLab +adds a worker; doesn't touch the others. + +### P-3: Per-scope watermarks (kill global) + +`pipeline_watermarks` keyed by `(source, entity_type, scope_key)`: + +```sql +(jira, issues, project_key=BG) last=2026-04-26 18:33 +(jira, issues, project_key=OKM) last=2026-04-26 18:35 +(github, prs, repo=foo/bar) last=2026-04-26 18:40 +``` + +**Effect:** new project activated = backfill ONLY that scope. Existing +work preserved. Per-scope progress and ETA become trivial. + +### P-4: Job queue + worker pool (not in-process loops) + +Discovery emits jobs ("ingest scope X, since Y") onto a queue +(Redis-backed or Kafka topic). Worker pool consumes with configurable +concurrency per source. + +``` +Discovery → enqueue jobs → Queue → Worker[1..N] → DB streaming +``` + +**Effect:** + +- Concurrency scales with hardware (5 parallel JQL queries vs 1) +- Failure = job retried, not whole cycle restarted +- New tenant = new jobs in queue, no orchestrator change +- SaaS-ready: 100 tenants = 100× jobs but same code + +### P-5: Backpressure + rate-limit awareness + +Read API rate-limit headers (`X-RateLimit-Remaining`, `Retry-After`). +Adapt automatically: + +- 90% of limit consumed → slow down (sleep proportional to remaining budget) +- 429 / Retry-After → exponential backoff with jitter (per source) +- GitHub GraphQL cost: track query cost vs hourly budget (5000) + +**Effect:** never hit hard limits. Sustained throughput is `~80% of +limit`, not `100% then 429 storm then crash`. + +### P-6: Saga pattern per batch (idempotent + recoverable) + +Each batch is a transactional unit: + +``` +BEGIN + INSERT/UPDATE rows (ON CONFLICT DO UPDATE) + INSERT pipeline_event (kafka_emitted=false) + UPDATE pipeline_watermarks SET last_synced_at = max(batch) +COMMIT + +ASYNC: emit Kafka event, mark pipeline_event.kafka_emitted=true +``` + +If crash before COMMIT: nothing changes, watermark unchanged, on +restart the worker re-fetches the same batch. + +If crash after COMMIT but before Kafka emit: outbox pattern catches +unemitted events on next cycle. + +**Effect:** zero data loss, zero duplicates (upsert idempotent), zero +silent skips. 
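+
+One way to sketch that transactional unit with SQLAlchemy async sessions. This is
+illustrative only: `upsert_items` is a hypothetical stand-in for the existing `_upsert_*`
+helpers, and the outbox/watermark column names are assumptions on top of the §4 table sketch.
+
+```python
+import json
+import uuid
+from typing import Awaitable, Callable
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+
+async def persist_batch(
+    session: AsyncSession,
+    batch,  # Batch contract from §4: .scope, .items, .source_high_water
+    upsert_items: Callable[[AsyncSession, list], Awaitable[int]],
+) -> None:
+    async with session.begin():  # one COMMIT covers rows + outbox + watermark
+        await upsert_items(session, batch.items)  # ON CONFLICT DO UPDATE → idempotent
+        await session.execute(
+            text(
+                "INSERT INTO pipeline_events_outbox (id, scope, payload, kafka_emitted) "
+                "VALUES (:id, :scope, :payload, false)"
+            ),
+            {
+                "id": str(uuid.uuid4()),
+                "scope": batch.scope,
+                "payload": json.dumps({"items": len(batch.items)}),
+            },
+        )
+        await session.execute(
+            text(
+                "UPDATE pipeline_watermarks SET last_synced_at = :hw "
+                "WHERE scope_key = :scope"
+            ),
+            {"hw": batch.source_high_water, "scope": batch.scope},
+        )
+    # Crash before COMMIT: nothing moved, the batch is simply re-fetched on restart.
+    # Crash after COMMIT: the outbox drainer emits any kafka_emitted=false rows next cycle.
+```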
+ +### P-7: Observable by default + +Every job emits structured progress: + +```json +{ + "scope": "jira:project:BG", + "phase": "fetching", + "items_total_estimate": 197043, + "items_done": 12500, + "items_per_second": 84, + "eta_seconds": 2200, + "started_at": "...", + "errors": [] +} +``` + +Exposed via: +- `GET /pipeline/jobs` — current state of all jobs +- Prometheus metrics: `pulse_ingestion_items_total{source,scope,entity}`, + `pulse_ingestion_duration_seconds`, `pulse_ingestion_error_rate` +- Pipeline Monitor UI — already exists, gets per-scope breakdown + +**Effect:** "is it stuck?" answered in 5 seconds, not 4 hours. + +### P-8: Health-aware orchestration + +Before each batch: + +```python +if not source.is_reachable(): + self.mark_unhealthy(source) + return +``` + +When source unhealthy, jobs go to "paused" queue. Periodic health +ping (1/min) re-tests; on recovery, jobs resume from where they were. + +**Effect:** VPN drop = jobs pause cleanly, no error storm, no time +wasted retrying. VPN back = automatic resume. + +--- + +## 4. Proposed Architecture v2 + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ Discovery Service (per source) │ +│ github-discovery jira-discovery jenkins-discovery │ +│ (org-scan) (project-scan) (job-scan via SCM) │ +│ │ │ │ │ +│ └────────┬───────────┴───────────────────┘ │ +│ ▼ │ +│ emits jobs: { source, scope, entity, since, priority } │ +└──────────────┬───────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Job Queue (Redis Streams or Kafka topic) │ +│ jira:issues:BG since=2026-04-26 priority=high │ +│ jira:issues:OKM since=2026-04-26 │ +│ github:prs:foo/bar since=2026-04-26 │ +│ jenkins:deploys:job-X since=2026-04-26 │ +└──────────────┬───────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ Worker Pool (configurable concurrency per source) │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ jira-worker[1..5] │ │ +│ │ pick job → BatchedFetcher → for batch in stream: │ │ +│ │ normalize → upsert → emit_event → advance_watermark │ │ +│ │ emit progress event │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ github-worker[1..3] │ │ +│ │ jenkins-worker[1..3] │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└──────────────┬───────────────────────────────────────────────────┘ + │ writes + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PULSE DB │ +│ eng_pull_requests, eng_issues, eng_deployments, eng_sprints │ +│ pipeline_watermarks (source, entity, scope_key) → last_at │ +│ pipeline_jobs (job state: pending/running/done/failed) │ +│ pipeline_events_outbox (Kafka emit guarantee) │ +│ pipeline_progress (per-scope progress + ETA) │ +└──────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────┐ +│ Metrics Worker (unchanged) │ +│ consumes Kafka events → recomputes snapshots │ +└──────────────────────────────────────────────────────────────────┘ +``` + +### Key API contracts + +```python +# A fetcher is just an AsyncIterator yielding small batches +class BatchedFetcher(Protocol): + def fetch(self, scope: str, since: datetime | None) -> AsyncIterator[Batch]: + ... 
+ +@dataclass +class Batch: + scope: str # e.g., "BG" + items: list[dict] # 50-200 raw items + source_high_water: datetime # for watermark advancement + estimated_total: int | None # if pre-flight known, for ETA + rate_limit: RateLimitInfo | None # adaptive throttling +``` + +```python +# Job worker is a generic loop, source-agnostic +class IngestionJobWorker: + async def run_job(self, job: Job): + fetcher = registry.get_fetcher(job.source, job.entity) + async for batch in fetcher.fetch(job.scope, job.since): + await self.persist_batch(batch) # transactional + await self.emit_progress(job, batch) # per batch + await self.check_health() # circuit breaker +``` + +--- + +## 5. The 10× envelope, decomposed + +| Lever | Today | v2 | Speedup | Notes | +|---|---|---|---|---| +| Stream vs bulk-then-persist | 250k issues × 1.5h fetch + 0.5h normalize+upsert = 2h | 100 items every ~3s = constant-time TTFR | **30×** TTFR | AP-1 + FDD-OPS-012 | +| Kill redundant changelog fetch | 376k × 1 HTTP call (~24h) | 0 (use inline) | **∞** (eliminates phase) | AP-2 + FDD-OPS-013 | +| Source isolation (parallel) | 4 phases sequential | 3 source workers concurrent | **3-4×** wall time | AP-4 + FDD-OPS-014 | +| Per-source concurrency | 1 connector active | 3-5 workers per source | **5×** sustained throughput | P-4 | +| Adaptive rate limits | naive retries, sometimes 429-banned | stay 80% of limit | **2×** sustained, **0** ban | P-5 | +| Per-scope watermarks | new project = full reset = full backfill | new scope = scope-only backfill | **10×** for incremental ops | AP-3 + FDD-OPS-014 | +| Health-aware (skip unreachable) | block whole cycle on Jenkins outage | pause source, others continue | qualitative — turns hours of wasted retry into 0 | P-8 | +| Pre-flight estimate | guess | actual API count | qualitative — answers "stuck?" in seconds | P-7 + FDD-OPS-015 | + +**Aggregate effect on the workload that's running RIGHT NOW** (376k +issues across 32 projects, fresh tenant): + +- **Today's path:** 24-30h+ (potentially infinite if changelog fetch + rate-limits) +- **v2 Phase 1 path** (just AP-1+AP-2 fixes): 30-45 minutes +- **v2 Phase 2 path** (+ source isolation): same 30-45 min for issues, + but now happens in parallel with PR sync, deploy sync — total cycle + ~45 min vs ~3h + +--- + +## 6. Migration Path — non-bigbang, in 3 phases + +I will NOT propose a clean-room rewrite. The codebase has 1 year of +hard-won correctness (status mapping, anti-surveillance, edge cases). +Throwing it out is the wrong reflex. + +Each phase delivers value standalone and is reversible. + +### Phase 1: Quick Wins — fixes the immediate pain (1-2 days, P0) + +**Scope:** correct existing code, no architecture change. 
+ +| Item | Effort | Effect | +|---|---|---| +| **AP-2 fix** — comment out redundant `fetch_issue_changelogs` call in `_sync_issues`; teach normalizer to read inline `raw["changelog"]` | XS (1h code + tests) | 24h+ → ~5 min for changelog phase (eliminated) | +| **AP-1 fix** (FDD-OPS-012) — refactor `_sync_issues` to batch-per-project, mirror `_sync_pull_requests` pattern from `7f9f339` | M (4-6h) | TTFR for issues: hours → seconds; memory: 1.5GB → 50MB | +| **Pre-flight estimate logging** — before each `_sync_*`, log "I will fetch ~N items based on JQL count / GraphQL nodeId / Jenkins job count" | XS (1h) | Operator gets actual ETA vs guess | + +**Total Phase 1: ~1-2 dev-days.** +**Result on Webmotors workload: 24h → ~30-45 min for full re-ingest.** + +### Phase 2: Source Isolation (3-5 days, P1) + +**Scope:** structural — split sync-worker into per-source workers. + +| Item | Effort | +|---|---| +| Extract `JiraSyncWorker`, `GithubSyncWorker`, `JenkinsSyncWorker` from monolithic `DataSyncWorker` | M (1 day) | +| docker-compose: 3 services instead of 1 | XS | +| Per-source watermarks: schema migration + repo update | M (1 day) | +| Health-aware pre-flight check before each cycle | S (2-3h) | +| Update Pipeline Monitor UI for per-source breakdown | S (existing surface) | + +**Total Phase 2: 3-5 dev-days.** +**Result: failure isolation, parallel execution, correct watermarks +under scope expansion.** + +### Phase 3: Job Queue + Pool (1-2 weeks, R1) + +**Scope:** the SaaS-ready pattern. + +| Item | Effort | +|---|---| +| Choose job queue (Redis Streams vs Kafka topic — both already running) | XS (decision) | +| Job state schema (`pipeline_jobs` table) | S | +| Generic `IngestionJobWorker` consuming jobs | M (1-2 days) | +| Refactor each source to expose `BatchedFetcher` interface | M (1 day per source) | +| Discovery emits jobs (no longer triggers sync directly) | S | +| Retry policy + dead-letter | M | +| Tests + chaos eng (kill worker mid-job, verify resume) | M | + +**Total Phase 3: 1-2 dev-weeks.** +**Result: SaaS-ready ingestion. Adding 100 tenants = 100× more jobs, +not 100× more code paths.** + +--- + +## 7. What we are NOT doing (out of scope) + +- **No connector rewrites.** GitHub/Jira/Jenkins connectors stay as-is; + they have well-tested correctness logic. Only the orchestration layer + changes. +- **No DevLake re-introduction.** ADR-005 is settled. +- **No event sourcing.** Outbox pattern (Phase 1.5+) is sufficient + for our Kafka guarantee. +- **No SaaS multi-tenant orchestration.** Phase 3 makes it possible; + full multi-tenant rollout is R1 product work, separate spec. + +--- + +## 8. Decisions to make NOW + +For the team. These are not code decisions; they need product/eng +alignment. + +### D-1: Phase 1 NOW vs after current sync converges? + +**Option A:** Stop the current sync (lose ~3h of work), apply Phase 1 +fixes (~1-2 days), restart. Total: 2 days + 30 min final ingestion. +Sustainable code lands. + +**Option B:** Wait for current sync to converge (24-30h+), then start +Phase 1. Total: 1-2 days waste + 1-2 days Phase 1. + +**Recommendation:** A. Even with restart cost, A finishes faster AND +ships durable code. Continuing with the broken pipeline is sunk cost. + +### D-2: Phase 2 + 3 timing + +Phase 2 is a clear R1 commitment. Phase 3 is the SaaS gate — must +ship before second tenant goes live. Suggest committing both to R1 +sprint planning explicitly. 
+ +### D-3: Backlog FDDs + +Three new FDDs come out of this: + +- **FDD-OPS-013** Kill redundant `fetch_issue_changelogs` (Phase 1 quick win, XS) +- **FDD-OPS-014** Per-source workers + per-scope watermarks (Phase 2, M-L) +- **FDD-OPS-015** Observable ingestion: pre-flight estimates + per-scope progress + ETA (Phase 1.5) + +(FDD-OPS-012 — issue batch-per-project — was already opened 2026-04-28.) + +--- + +## 9. Success criteria — how we know v2 worked + +Lock these as acceptance for the migration: + +1. **TTFR ≤ 60s for any source/entity** (measured: time from cycle + start to first row in `eng_*` table) — ✅ **ATINGIDO (Phase 1, commit `4d1c9b4`)**: `_sync_issues` agora streams per-project; primeira issue persistida em <30s tipicamente +2. **Full re-ingestion at Webmotors scale (376k issues, 64k PRs, 1.4k + deploys, 200 sprints) completes in ≤ 90 minutes** — ⚠️ **PARCIAL**: backfill BG (197k issues em projeto único) ainda é o gargalo dominante. Demais projetos rápidos. Estimativa total ~2-3h, não 90min — projeto BG sozinho consome maioria do tempo +3. **Memory peak ≤ 200 MB per worker** (vs 1.5 GB today) — ✅ **ATINGIDO**: Phase 1 streaming reduz para ~50-100 MB peak observado em produção +4. **Zero silent failures** — every error is logged with scope and + visible via `GET /pipeline/jobs` endpoint — ⚠️ **PARCIAL**: per-batch logs detalhados existem; `pipeline_ingestion_progress` tracking OK; falta `GET /pipeline/jobs` endpoint dedicado (FDD-OPS-015 pendente) +5. **VPN drop simulation**: kill jenkins network in test, GitHub + + Jira ingestion continues unaffected, Jenkins resumes on reconnect — ❌ **NÃO ATINGIDO**: Phase 2-A/B per-scope watermarks shippadas mas worker still monolítico. P-2 source isolation requer Step 2.6 (docker-compose split em workers per-source) — pendente +6. **Adding 1 fake project to Jira catalog** triggers backfill ONLY + for that scope (not full rerun of existing 32 projects) — ✅ **ATINGIDO (Phase 2-A + 2-B, commits `c2c6e5d`..`c628528`)**: per-scope watermarks `(tenant, entity, scope_key)` + read-side resolution `since_by_project`/`since_by_repo` enviam since correto por escopo +7. **Crash recovery test**: SIGKILL worker mid-batch, restart, verify + ≥99% of fetched data persisted (not 0, like today) — ✅ **ATINGIDO (Phase 1)**: cada batch persiste imediatamente via `_upsert_*` antes de avançar watermark; crash recovery loses ≤1 batch (~50-100 issues) + +**Status agregado v2 (2026-04-29):** + +| Phase | Status | Commits | +|---|---|---| +| Phase 1 (Quick Wins — AP-1 + AP-2 + pre-flight) | ✅ SHIPPED | `4d1c9b4`, `62c183f` | +| Phase 2-A (writes per-scope watermarks) | ✅ SHIPPED | `c2c6e5d`, `a2d5850`, `f357d05`, `15574a7`, `4f86fd2` | +| Phase 2-B (reads per-scope watermarks) | ✅ SHIPPED | `4478f13`, `c628528` | +| Phase 2.6 (docker-compose split per-source workers) | ⏳ PENDING | next session | +| Phase 3 (job queue + worker pool — SaaS-ready) | ⏳ PENDING | R1 | +| **Bonus data-quality fixes descobertos durante v2** | ✅ SHIPPED | `177830e` (changelog), `172f3f2` (effort), `0c7124d` (status), `649ed78` (sprint) | + +**Observação importante:** durante a engenharia Phase 1+2 emergiram 4 bugs estruturais de data quality (status_transitions=0, story_points=0, status normalization skew, sprint status vazio) que **não estavam no escopo original** mas ficaram visíveis quando começamos a olhar dados frescos pós-Phase 1. Documentados como INC-020..023 / FDD-OPS-016..018. Fix de cada um expandiu o escopo do v2 — mas todos foram resolvidos ainda dentro da janela de 2 dias. 
+ +These are testable. Phase 3 acceptance hinges on items 4-7. **Item 5 (VPN simulation)** é o gating não-resolvido para confiar em SaaS multi-source. + +--- + +## 10. The honest risk + +This document advocates for stopping a 3-hour-old sync to start a +2-day refactor. That is itself a "another patch" pattern — promise +something better, ask to throw away the work in flight. + +**Why I think this time it's different:** + +- The diagnosis is structural, not a one-off (5 distinct failures, all + same root cause family) +- Phase 1 alone is small enough to verify in 1-2 days, not 1-2 weeks +- The 10× number is decomposed and falsifiable — if we ship Phase 1 + and don't see TTFR drop from hours to seconds, we made a wrong + diagnosis and need to revise +- The current sync's 24h ETA is itself a falsifiable claim that I'm + putting in writing now — if it converges in <2h, I was wrong and + Phase 1's urgency is reduced + +But the user's frustration is correct. The default should be: "until +proven otherwise, every ingestion run is doomed at this scale." Phase +1 disproves that for issues. Phase 2 disproves it for cross-source +failures. Phase 3 disproves it for SaaS multi-tenant. + +If we don't take this seriously now, we will rediscover all of it +when the second tenant onboards, with much more visibility and +political cost. + +--- + +## Appendix A: Why the current architecture exists + +This is not blame. The current state is the natural accretion of: + +- ADR-005 (replace DevLake): the focus was correctness, not throughput. + Bulk-then-persist was acceptable when datasets were small and we were + proving feasibility. +- Commit `7f9f339` (PR batch refactor): proved the streaming pattern + works. Should have generalized then; didn't because PRs were the + pain at the time. +- Discovery service (ADR-014): correctly built as separate worker. + The lesson didn't propagate to sync. +- 60+ status mappings (PT-BR): hard-won correctness. Don't break. +- Schema-drift monitor (FDD-OPS-001 line 3): smart, defensive, + belongs in v2 unchanged. + +v2 is **not** "throw away the work." It's "promote streaming + +isolation from local optimization in 1-2 places to architectural +default." + +--- + +## Appendix B: Counter-arguments I considered + +- "Just optimize the current code, don't restructure" — 5 incidents + in 5 days argue against. Optimization without isolation = endless + whack-a-mole. +- "Wait until 2nd customer pays, then build SaaS-ready ingestion" — + building SaaS infra under customer time pressure is how outages + happen at acquisition demos. +- "Use a 3rd-party data platform (Airbyte, Fivetran)" — explicitly + rejected in ADR-005 (DevLake had the same coverage gap on Postgres). + Adding another opaque layer doesn't solve our problems. +- "The 10× number is hand-wavy" — fair, but each lever is decomposed + in §5. Falsifiable acceptance criteria in §9. + +--- + +**Status of this document:** PROPOSAL. Awaiting review by +`pulse-data-engineer`, `pulse-engineer`, `pulse-product-director`, +and final approval from the user before any implementation. diff --git a/pulse/docs/ingestion-spec.md b/pulse/docs/ingestion-spec.md index 749122b..21d05d4 100644 --- a/pulse/docs/ingestion-spec.md +++ b/pulse/docs/ingestion-spec.md @@ -13,18 +13,24 @@ This document captures every adjustment, problem, and solution encountered during PULSE's data ingestion buildout — from initial DevLake-based pipeline to current proprietary connectors with dynamic discovery. 
It serves as the **single source of truth** for understanding ingestion behavior and as the **specification baseline** for building a fully autonomous SaaS ingestion engine. -### Current State (2026-04-14) - -| Metric | Value | -|--------|-------| -| Jira projects active | 69 | -| Issues ingested | 373,872 | -| PRs ingested | 63,647 | -| PR-Issue link rate | 21.9% (13,966 PRs) | -| Deployments (Jenkins) | 83 | -| Sprints | 215 | -| GitHub repos discovered | 754 (active), 1,429 (total) | -| Ingestion cycle time | ~3h (full backfill), ~7min (incremental) | +### Current State (2026-04-29 — pós-Phase-1 v2 + data-quality fixes) + +| Metric | Value | Note | +|--------|-------|------| +| Jira projects active | 32 (de 69 totais descobertos) | Subset ativo via discovery dinâmica (ADR-014) | +| Issues ingested | 311.068 | Re-ingestão pós-`seed_dev` revert (commit `40ca7e4`); diff vs. 373k anterior é por escopo de projetos ativos | +| PRs ingested | 63.131 | Estável desde 2026-04-27 | +| PR-Issue link rate | ~5% (em recovery após reset) | Baixo temporariamente — re-link pós-ingestão completa restaura ~22% | +| Deployments (Jenkins) | 1.376 | Auto-discovery via SCM scan (commit `d1aebf7`) | +| Sprints | 195/217 com status correto (89,9%) | 22 vazias = board órfão 873 sem projeto ativo. Pós-FDD-OPS-018 (commit `649ed78`) | +| GitHub repos discovered | 754 (active), 1.429 (total) | Estável | +| Status definitions discovered | 326 (117 new + 181 indeterminate + 28 done) | Pós-FDD-OPS-017 (commit `0c7124d`) | +| Distinct status names em uso | 104 | DEFAULT_STATUS_MAPPING expandido para ~80; fallback `statusCategory` cobre o resto | +| Squads ativos | 27 | FID + PTURB usam Sprint; **25 são Kanban-pure** (sem sprints) | +| Story Points usage | 0% (todos os 69 projetos) | Webmotors NÃO usa SP — fallback chain T-shirt/Hours/Count em FDD-OPS-016 | +| Ingestion cycle time | TTFR <60s (Phase 1 v2) | Backfill BG ~197k issues continua o gargalo. Pre-fix bulk: 24-30h. 
Pós-fix: ~30-45 min issues + paralelo PR/deploy | +| Coverage de `status_transitions` | ~0% legacy / 100% fresh | Rolling forward: cada incremental sync corrige; backfill retroativo opcional via watermark reset | +| Coverage de `story_points` (effort) | 52,3% em projetos novos (CRMC), ~0% legacy | Mesma rolling-forward dinâmica que status_transitions | --- @@ -42,13 +48,71 @@ This document captures every adjustment, problem, and solution encountered durin | Characteristic | Detail | Impact on Ingestion | |---------------|--------|-------------------| -| Org size | ~750 active repos, 69 Jira projects | High volume, need batch processing | -| Jira project scale | 197K issues in single project (BG) | Single JQL query can return massive payloads | -| Custom fields | Sprint = `customfield_10007`, Story Points = `customfield_18524` | Must discover dynamically per tenant | -| Jenkins patterns | No corporate standard; each repo has unique pipeline config | Cannot use single regex for deployment detection | -| Language mix | Portuguese status names ("Em Desenvolvimento", "Concluido") | Status normalizer needs i18n mapping | -| Jira reserved words | Project key "DESC" is SQL reserved word | Must quote project keys in JQL | -| Archived projects | Some keys referenced in PRs (e.g., "RC") don't exist in Jira API | Graceful handling of orphan references | +| Org size | ~750 active repos, 69 Jira projects, 27 squads ativos | High volume, need batch processing | +| Squad shape | 25 de 27 squads são **Kanban-puros** (sem sprints); apenas FID + PTURB usam Scrum | Sprint metrics aplicam-se a 7% das squads — métricas de fluxo (Cycle Time, CFD, Throughput) são as primárias | +| Jira project scale | 197K issues em projeto único (BG) | Single JQL query can return massive payloads — exige streaming per-project | +| Custom fields | Sprint = `customfield_10007`, Story Points = `customfield_18524` (+ legacy `customfield_10004`) | Must discover dynamically per tenant via `/rest/api/3/field` | +| Effort estimation method | **Webmotors NÃO usa Story Points** (0% dos 69 projetos). Padrões heterogêneos por squad: T-shirt size (P/M/G), `timeoriginalestimate` em horas, ou nada (Kanban-puro) | FDD-OPS-016 — fallback chain SP→T-shirt→Hours→None com discovery dinâmico de campos T-shirt/Tamanho | +| T-shirt size fields | `customfield_18762` ("T-Shirt Size") + `customfield_15100` ("Tamanho/Impacto") | Mapeados em escala Fibonacci: PP=1, P=2, M=3, G=5, GG=8, GGG=13. 
Discovery por nome (case-insensitive) | +| Status workflows | 326 status definitions descobertas; 104 raw distintos em uso ativo | DEFAULT_STATUS_MAPPING curado com ~80 PT-BR; resto via fallback `statusCategory.key` da Jira | +| Jenkins patterns | No corporate standard; each repo has unique pipeline config | Cannot use single regex for deployment detection — auto-discovery via SCM scan (`d1aebf7`) descobriu 577 PRD jobs em 283 repos | +| Language mix | Portuguese status names ("Em Desenvolvimento", "Concluído", "FECHADO EM PROD") | Status normalizer requer i18n mapping + `statusCategory` fallback como rede de segurança | +| Jira reserved words | Project key "DESC" é SQL reserved word | Must quote project keys in JQL | +| Archived projects | Some keys referenced in PRs (e.g., "RC") don't exist in Jira API | Graceful handling of orphan references — RC tem 1.348 PR refs sem Jira project correspondente | +| NULL bytes em texto | Observado 2026-04-28 em ENO-3296 (description) | Postgres `text` rejeita 0x00; helper `_strip_null_bytes` aplicado a title/description/assignee no normalizer | +| Network dependency | Acesso à Jira/GitHub/Jenkins via VPN corporativa | VPN drops causaram silent failures (FDD-OPS-001 / FDD-OPS-014 §AP-3, AP-4); health-aware orchestration é P-8 do v2 | + +### 2.3 Source Configuration Philosophy — Discovery Only + +**Decisão fundamental (locked-in 2026-04-27):** PULSE **NÃO mantém listas +explícitas** de repos GitHub ou projetos Jira em `connections.yaml` ou em +qualquer outro lugar. **Todo source é descoberto dinamicamente.** + +**Por quê** — três razões: + +1. **Listas explícitas envelhecem mal**: cada novo squad/repo/projeto + exige edição manual + redeploy. Webmotors evoluiu de 8 → 69 projetos + Jira em poucas semanas; manter sincronizado à mão não escala. +2. **Falham silenciosamente**: PRs referenciando `SECOM-1234` ficam + "linkados a nada" se SECOM não está na lista. Resultado: 5.27% de + link rate. Após discovery: 21.9% (4× melhor) com 96-100% per active + project. +3. **Não fazem sentido pra SaaS**: o produto precisa funcionar em + tenant novo sem que ninguém edite YAML. Discovery é a única forma de + "zero-config onboarding" (princípio §6.1). + +**O que é mantido em `connections.yaml`** (não-discoverable): + +| Campo | Razão | +|---|---| +| `connections[].source` (github/jira/jenkins) | Identifica tipo de conector pra usar | +| `connections[].base_url` | Endpoint da source (Jira tenant URL, GitHub Enterprise vs Cloud) | +| `connections[].token_env`/`username_env` | Onde achar credenciais (env var) | +| `connections[].sync_interval_minutes` | Cadência de sync (decisão operacional, não discoverable) | +| `status_mapping` (60+ entries PT-BR/EN) | Mapeamento de workflow Jira customizado → estados normalizados (todo/in_progress/in_review/done). Pode ser parcialmente AI-discovered no futuro (§6.4) | +| `teams` (squad → repos/projects mapping) | Decisão de organização, não topologia de source — pertence ao produto | + +**O que foi REMOVIDO em 2026-04-27:** + +- `connections[].scope.repositories` (lista de 9 repos GitHub explícitos) +- `connections[].scope.projects` (lista de 8 projetos Jira explícitos) + +Eram artefatos de bootstrap (teste de viabilidade no início do projeto). +Agora dispensáveis. 
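+
+A minimal sketch of how this decision can look on the connector side. Attribute and
+method names here (`_explicit_repos`, `repos_for_sync`) are illustrative assumptions;
+`discover_repos` is the documented discovery call, and the concrete mechanisms per
+source are listed in the table below:
+
+```python
+# Sketch only: discovery is the default path; explicit lists survive solely as a
+# deprecated escape hatch for the old bootstrap mode.
+class GitHubConnector:
+    def __init__(self, scope: dict):
+        self._explicit_repos = scope.get("repositories")      # legacy bootstrap lists only
+        self._active_months = scope.get("active_months", 12)
+
+    async def repos_for_sync(self) -> list[str]:
+        if self._explicit_repos is None:
+            # Normal path: re-discover every cycle, so new repos appear and
+            # archived ones drop off without any YAML edit or redeploy.
+            return await self.discover_repos(active_months=self._active_months)
+        return self._explicit_repos
+```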
+
+**Como cada source descobre:**
+
+| Source | Mecanismo | Resultado |
+|---|---|---|
+| **GitHub** | `discover_repos(active_months=12)` via GraphQL `organization.repositories(orderBy: PUSHED_AT)` filtrado por atividade | ~283 repos com atividade nos últimos 12 meses |
+| **Jira** | `ProjectDiscoveryService.run_discovery()` lista todos projetos via REST `/rest/api/3/project`, marca como `discovered`. `SmartPrioritizer.auto_activate(threshold=3)` promove pra `active` projetos com ≥3 references em PR titles | 69 projetos descobertos, ~9 dos quais auto-ativados na primeira passada (cresce conforme novos PRs chegam) |
+| **Jenkins** | `discover_jenkins_jobs.py` faz SCM scan READ-ONLY em todos os jobs, gera `config/jenkins-job-mapping.json`. Sync worker lê esse JSON. Re-rodar quando novos repos aparecem (semanal/sob demanda) | 577 PRD jobs em 283 repos |
+
+**Quando re-discovery acontece:**
+
+- Jira: cron `0 3 * * *` UTC (configurável via `tenant_jira_config.discovery_schedule_cron`); manual via `POST /admin/jira/discovery/run`
+- GitHub: a cada ciclo de sync (15min) — o `discover_repos` é chamado pelo connector se `_explicit_repos is None`
+- Jenkins: regen do JSON é manual (script `discover_jenkins_jobs.py`); idempotente

---

@@ -84,15 +148,146 @@ async def sync(self):

### 3.3 Key Design Decisions

-| Decision | Rationale | ADR |
-|----------|-----------|-----|
+| Decision | Rationale | ADR / Commit |
+|----------|-----------|--------------|
+| **Discovery-only source configuration** | See §2.3 — explicit lists kill SaaS scalability and link rate | 2026-04-27 |
| Replaced DevLake with proprietary connectors | 99.3% issue data loss in DevLake PostgreSQL layer | ADR-005 |
| GraphQL primary for GitHub, REST fallback | 40x faster PR fetch (50 PRs + reviews + stats in 1 call) | Commit `60fe576` |
| Per-repo batch upsert (not all-at-end) | Memory efficiency + real-time progress visibility | Commit `7f9f339` |
-| Global watermark per entity (not per-project) | Simpler model, but requires reset for project scope expansion | Migration 002 |
+| Global watermark per entity (not per-project) | Simpler model, but requires reset for project scope expansion. **Tradeoff documented in §3.7 + Problem 5.** | Migration 002 |
| JSONB for `linked_issue_ids` and `status_transitions` | Flexible schema, supports variable-length arrays | Migration 001 |
| Row-Level Security on all tables | Multi-tenant isolation at DB level | Migration 001 |
| Kafka event backbone | Decouples ingestion from metric calculation | ADR-004 |
+| **Partial index for snapshots `(tenant, metric_type, calculated_at DESC) WHERE team_id IS NULL`** | 50× perf regression on `/metrics/home` once `metrics_snapshots` >5M rows; non-partial index doesn't help due to B-tree NULL semantics | Commit `80f1796` (2026-04-27) |
+| **Worker schema-drift monitor (FDD-OPS-001 line 3)** | Detects payload-vs-dataclass mismatch when bytecode is stale; tags rows with `_schema_drift` for Pipeline Monitor surfacing | Commit `5d71618` |
+
+### 3.4 Worker Lifecycle Guarantees
+
+**Origin:** FDD-OPS-001 incidents (2026-04-16/17/18) — Python workers running
+stale code in memory while updated source was on disk. Resulted in 3
+production-local incidents in 3 days where snapshots persisted with
+obsolete logic.
+
+**Four lines of defense (three SHIPPED, one still planned):**
+
+1. **Hot-reload em dev (planned, not yet shipped)** — `docker compose
+   watch` to auto-reload workers on file change
+2. 
**Admin recalc force-reload** — `POST /admin/metrics/recalculate` + calls `importlib.reload()` on domain/service modules before recalc +3. **Snapshot schema-drift monitor (SHIPPED 2026-04-23)** — pós-write, + compara payload com dataclass corrente. Missing fields → log WARN + `FDD-OPS-001/L3` + Prometheus counter `pulse_snapshot_schema_drift_total` + + anota `_schema_drift` no JSONB. Pipeline Monitor consome via + `GET /pipeline/schema-drift?hours=N` +4. **CI/CD force-restart on deploy (SHIPPED 2026-04-23)** — + `.github/workflows/deploy.yml` sempre roda + `docker compose up -d --force-recreate` nos 4 workers Python pós + build (deploy step ainda é TODO, mas o template existe) + +**Operacional fora do CI:** após edit em `domain/service` files local, +o operator deve rodar `make rotate-secrets` (que faz `up -d +--force-recreate` em 5 serviços) — `docker compose restart` NÃO relê +o `.env` nem força reimport de módulos. Documentado em +`docs/testing-playbook.md` §8.9. + +### 3.5 DB Index Strategy for Snapshots + +**Origin:** 2026-04-27 incident — dashboard error 30s timeout porque +`/metrics/home` levava 54s. Causa raiz: `metrics_snapshots` cresceu +pra 7M rows e a query `WHERE tenant_id=? AND metric_type=? AND team_id +IS NULL ORDER BY calculated_at DESC LIMIT 200` regrediu de Index Scan +pra Parallel Seq Scan (10s/query × 8 queries por home request = 50s+). + +**Indexes mantidos** (em `metrics_snapshots`): + +| Index | Definição | Cobre | +|---|---|---| +| `metrics_snapshots_pkey` | `(id)` | Primary key — sempre | +| `uq_metrics_snapshots_*` | `UNIQUE(tenant, team, type, name, period_start, period_end)` | Upsert constraint | +| `idx_metrics_snapshots_lookup` | `(tenant, type, name, period_start, period_end)` | Specific metric+window queries | +| **`idx_metrics_snapshots_tenant_latest`** | `(tenant, type, calculated_at DESC) WHERE team_id IS NULL` | **`/metrics/home` tenant-wide aggregations** (NEW 2026-04-27, migration 009) | + +**Por que partial index** (não non-partial): B-tree não usa índice +quando filtro inclui `IS NULL` em coluna não-NULL-aware. Partial +index `WHERE team_id IS NULL` resolve isso e mantém o índice menor +(exclui linhas team-scoped que têm padrão de acesso diferente). + +**Resultado medido**: query 10.3s → 2.4ms (**~4000× faster**). `/metrics/home` +total: 54s → 0.6s. + +**Princípio pra futuro**: toda nova query crítica que faz `ORDER BY ... +LIMIT N` em tabela >1M rows precisa de índice **explicitamente +ordenado** pela coluna do ORDER BY. EXPLAIN ANALYZE durante PR review. +Tracked como FDD-OPS-009 (DB query plan regression tests). + +### 3.6 Jenkins Job Mapping Workflow + +**Por que mapping em vez de discovery contínua:** Jenkins não tem +endpoint nativo eficiente pra "list todos os PRD jobs com seus repos +GitHub correspondentes". Precisaríamos consultar `lastBuild.remoteUrls` +de cada job individualmente — pra 1400+ jobs Webmotors, isso é caro +e lento. + +**Solução:** SCM scan one-shot, output em JSON, sync worker lê o JSON +no boot. + +**Fluxo:** + +``` +1. Operator (humano ou cron) roda: + docker compose exec sync-worker python -m scripts.discover_jenkins_jobs + +2. Script faz READ-ONLY scan via Jenkins API: + - GET /api/json?tree=jobs[name,fullName,url,lastBuild[url]] + - Para cada job: lastBuild → workflow_run → SCM remoteUrls + - Classifica jobs por padrão (PRD vs DEV vs HML) + - Casa cada job com repo GitHub (heurísticas: nome, SCM URL) + - Output: config/jenkins-job-mapping.json (committed) + +3. 
sync-worker lê o JSON no startup (config flag jobs_from_mapping=true) + - Mantém em memória: dict[repo_full_name, list[prd_jobs]] + - Pra cada deploy event do Jenkins: usa o mapping pra resolver repo + +4. Quando regenerar: + - Novo repo Webmotors aparece (esperado: poucas vezes/mês) + - Mudança de pattern de naming dos jobs + - Cron sugerido (futuro): semanal, sábado 04:00 UTC +``` + +**Resultado atual** (`jenkins-job-mapping.json` versão 2026-04-14): +283 repos × 577 PRD jobs. + +**Idempotência:** script é READ-ONLY. Re-rodar a qualquer momento é +seguro. Dois runs consecutivos produzem JSONs equivalentes (modulo +mudanças genuínas em Jenkins). + +### 3.7 Post-Ingestion Mandatory Steps + +Após qualquer **full re-ingestion** (DB wipe + sync from scratch), +quatro passos pós-ingestão são **obrigatórios** pra ter dashboard +correto. Skip qualquer um → métricas incompletas ou inconsistentes. + +| # | Operação | Endpoint / Comando | Tempo | Por quê | +|---|---|---|---|---| +| 1 | Backfill description | `POST /data/v1/admin/issues/refresh-descriptions?scope=all` | ~43min | `description` não é puxada no fetch padrão de issues (custo de payload Jira); endpoint admin busca via `GET /rest/api/3/issue/{key}`. Necessário pro Flow Health drawer mostrar contexto da issue. Cobertura final esperada ~62% (~38% das issues genuinamente sem description no Jira). | +| 2 | Re-link PRs↔Issues | `psql < scripts/relink_prs_to_issues.sql` | ~5s | Sync worker linka PRs durante ingestão usando o snapshot de issues no momento. Discovery dinâmica pode ativar projetos depois — re-link captura PRs que ficaram sem match na primeira passada. Idempotente. | +| 3 | Force snapshot recalc | `POST /data/v1/admin/metrics/recalculate` | ~10s | Garante que todos os 6 períodos (7d/14d/30d/60d/90d/120d) e 4 metric types (dora/lean/cycle_time/throughput) têm snapshot fresco. Workers rodam por evento Kafka, mas alguns períodos podem ficar stale se o evento não disparou em algum bucket. | +| 4 | (Conditional) Backfill `first_commit_at` | `POST /data/v1/admin/prs/refresh-first-commits?scope=stale` | varies | **Skip se ingestão usou código pós-INC-003 fix (2026-04-17+).** Validar via SQL: se ≥90% dos PRs têm `first_commit_at < created_at`, não rodar. Se <90%, rodar com `scope=stale` (filtro `first_commit_at == created_at`). | + +**Validação pós-step 4:** + +```sql +SELECT + COUNT(*) AS total, + COUNT(*) FILTER (WHERE first_commit_at < created_at) AS correct, + COUNT(*) FILTER (WHERE first_commit_at = created_at) AS stale, + ROUND(100.0 * COUNT(*) FILTER (WHERE first_commit_at < created_at) + / NULLIF(COUNT(*),0), 1) AS pct_correct +FROM eng_pull_requests WHERE source = 'github'; +``` + +Esperado: `pct_correct >= 90%` (alguns PRs muito pequenos onde commit +e abertura acontecem no mesmo segundo são casos legítimos de igualdade). --- @@ -255,41 +450,92 @@ WHERE entity_type = 'issues'; --- -### Problem 6: Status Normalization — Portuguese and Custom Workflows +### Problem 6: Status Normalization — Hybrid Textual + Jira statusCategory Fallback -**Context:** Jira workflows vary wildly across orgs and even across projects within the same org. Webmotors uses Portuguese status names. +**Context:** Jira workflows variam selvagemente entre orgs e até entre projects do mesmo tenant. Webmotors usa status names em PT-BR (e.g., "Em Desenvolvimento", "FECHADO EM PROD"). Audit em 2026-04-28 (FDD-OPS-017 / INC-022) mostrou que a abordagem **textual-only** original era catastroficamente insuficiente. 
-**Symptoms:** -- "Em Desenvolvimento" not mapping to `in_progress` -- "Concluido" (without accent) not mapping to `done` -- Custom statuses like "Aguardando Deploy", "Em Code Review" unrecognized +**Symptoms quantificados (2026-04-28):** -**Solution:** Extensive DEFAULT_STATUS_MAPPING with 60+ entries covering English, Portuguese, and common custom workflows. +Distribuição de `normalized_status` em 311.068 issues: +- 96,5% `done` · 3,3% `todo` · 0,2% `in_progress` · 0,1% `in_review` + +Investigação revelou que a Webmotors tem **104 status raw distintos** em workflows ativos. O `DEFAULT_STATUS_MAPPING` original cobria ~50 → 50+ status caíam silenciosamente no fallback "Unknown → todo". Casos sistêmicos: + +| Status raw | Issues afetadas | Bucket atual (errado) | Bucket correto | +|---|---|---|---| +| `FECHADO EM PROD` | 2.881 | todo | done | +| `FECHADO EM HML` | 14 | todo | done | +| `Em Progresso` | 6 | todo | in_progress | +| `Em desenv` | 4 | todo | in_progress | +| `Em Deploy Produção` | 14 | todo | in_progress | +| `Em Monitoramento Produção` | 3 | todo | done | +| `Homologação` | 9 | todo | in_review | +| `Em Verificação` | 4 | todo | in_review | +| (50+ outros) | dezenas | todo | varia | + +**Cascada CRÍTICA**: status_transitions herdam classificação errada. A última transição registrada de uma issue concluída ficava com `status: "todo"` em vez de `done`. Resultado em CASCATA: + +- **Cycle Time** infinito (não há transição final para `done`) +- **Throughput** sub-conta (issues entregues não aparecem) +- **WIP** super-conta (issues finalizadas continuam "em fluxo") +- **CFD / Lead Time** distorcidos +- **Flow Efficiency** indeterminado + +Sem o fix, **todo o pilar Lean** está comprometido para qualquer projeto que use status fora do mapping curado. + +**Solução: Hybrid normalization em 3 camadas** (FDD-OPS-017, commit `0c7124d`): ```python -DEFAULT_STATUS_MAPPING = { - # English - "open": "todo", "to do": "todo", "backlog": "todo", - "in progress": "in_progress", "in development": "in_progress", - "done": "done", "closed": "done", "resolved": "done", - # Portuguese - "em desenvolvimento": "in_progress", "em progresso": "in_progress", - "concluído": "done", "concluido": "done", "finalizado": "done", - "a fazer": "todo", "pendente": "todo", - # Custom patterns - "code review": "in_progress", "em code review": "in_progress", - "aguardando deploy": "in_progress", "ready for qa": "in_progress", - "em teste": "in_progress", "testing": "in_progress", - ... -} +def normalize_status(raw_status, status_mapping=None, status_category=None): + # Camada 1: Textual mapping curado (granularidade in_progress vs in_review) + mapping = {**DEFAULT_STATUS_MAPPING} # ~80 PT-BR + EN entries + if status_mapping: + mapping.update({k.lower(): v for k, v in status_mapping.items()}) + normalized = mapping.get(raw_status.lower().strip()) + if normalized: + return normalized + + # Camada 2: Jira statusCategory.key fallback (autoritativo done/não-done) + if status_category: + cat = status_category.lower().strip() + if cat == "done": return "done" + if cat == "indeterminate": return "in_progress" # NB: collapses in_review + if cat == "new": return "todo" + + # Camada 3: Default 'todo' com WARN log (extremamente raro agora) + logger.warning("Unknown status %r — defaulting to 'todo'", raw_status) + return "todo" ``` -**Result:** 99%+ status normalization accuracy for Webmotors workflows. 
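+
+To make the three layers concrete, a small usage sketch. The expected outcomes follow
+the curated mapping and the status categories described above; this is illustrative,
+not captured test output:
+
+```python
+# Layer 1: curated textual mapping keeps the in_review granularity
+normalize_status("Em Code Review")                            # expected: "in_review"
+
+# Layer 2: statusCategory fallback decides done/not-done for unmapped names
+normalize_status("FECHADO EM PROD", status_category="done")   # expected: "done"
+
+# Layer 3: unknown name and no category -> WARN log + "todo"
+normalize_status("Status custom sem categoria")               # expected: "todo"
+```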
+**Discovery da camada 2** (`_discover_status_categories`): conector chama `/rest/api/3/status` 1× por lifetime e cacheia `name → category` para todos os 326 status defs do tenant. Webmotors: 117 new + 181 indeterminate + 28 done. + +**Por que híbrido (não pure textual nem pure category):** + +- **Textual ganha** quando definido — preserva granularidade `in_progress` vs `in_review` que o Cycle Time Breakdown precisa. Jira `statusCategory.indeterminate` colapsa os dois. +- **Category fallback** captura o long tail tenant-custom sem manutenção contínua. Workflow author é fonte de verdade sobre done/não-done. +- **Default 'todo'** com WARN só atinge agora status sem category — extremamente raro pós-fix. + +**`build_status_transitions` integrado**: `status_categories_map` (todos status → categoria) é passado adiante para classificar cada `to_status` histórico via map. O bug de cascada acima é corrigido na fonte. + +**Result quantificado:** + +3.151 issues reclassificarão na re-ingestão (1% do total) — long tail catastrófico. Distribuição já correta para os 97% restantes. -**SaaS Implication:** Static mapping won't scale. Need: -1. **Learning-based mapper**: observe workflow transitions to infer categories -2. **Per-tenant overrides**: allow admin to map custom statuses -3. **AI fallback**: LLM classifies unknown statuses into todo/in_progress/done +| Transição | Quantidade | +|---|---| +| `todo → done` (FECHADO EM PROD/HML, etc.) | 2.923 | +| `todo → in_review` (Homologação, Verificação) | 161 | +| `todo → in_progress` (Em Progresso, Em desenv) | 67 | + +**Decisão de produto registrada** (FDD-OPS-017 backlog): `FECHADO EM HML` mapeado como `done` (segue Jira `statusCategory.key='done'` + nome literal "FECHADO"). Workflow author classifica como done; respeitamos. Se Webmotors quiser tratar como ainda em fluxo, pode renomear para "Aguardando Deploy Produção" (mapeado para `in_progress`). + +**SaaS Implication:** Hybrid approach é SaaS-ready out-of-the-box. Cada novo tenant: +1. Conector descobre seus 100-300 status defs via `/rest/api/3/status` (1 chamada) +2. Textual mapping curado (PT-BR + EN + ~80 PT-BR variants) cobre majoritário +3. Status category fallback captura o long tail proprietário +4. Operadores adicionam mappings textuais específicos APENAS quando precisam de granularidade `in_review` (raro) + +**Future** (FDD-OPS já catalogado): AI-fallback para status que faltam category — observar workflow transitions para inferir categoria (Section 6.4.2). --- @@ -382,6 +628,217 @@ async def _fetch_repo_prs_graphql(self, repo_name, since): --- +### Problem 11: Inline Changelog Lost in Connector Mapping (`_map_issue` drop) + +**Context:** FDD-OPS-013 (commit `4d1c9b4`, 2026-04-28) eliminou o redundant `fetch_issue_changelogs` round-trip extraindo changelogs **inline** do JQL response (`expand=changelog`). Função nova `extract_status_transitions_inline(raw)` no sync worker fez `raw.get("changelog", {}).get("histories", [])`. Pareceu funcionar (testes passaram). Entretanto, audit em 2026-04-28 mostrou `status_transitions = []` em **100% das 311.007 issues** — mesmo problema que Phase 1 era para resolver. 
+ +**Symptoms:** + +- 311.007 issues no DB (todas as ingeridas pós-Phase-1) com `status_transitions = []` +- Cycle Time não fechava — sem transição para `done` +- Throughput sub-contava — issues `done` apareciam como em fluxo +- WIP super-contava — issues finalizadas no bucket de "ativo" +- Lean metrics todas comprometidas + +**Root Cause** (descoberto via tracing connector → worker em 2026-04-29): + +`JiraConnector._map_issue` (commit ancestral) extraía o changelog para um cache lateral (`self._last_changelogs[internal_id]`) **mas NÃO incluía o campo `changelog` no dict mapeado de retorno**: + +```python +# Ancestral code (BUG): +def _map_issue(self, jira_issue): + changelogs = self._extract_changelogs(internal_id, jira_issue) + if changelogs: + self._last_changelogs[internal_id] = changelogs # cache lateral + return { + "id": internal_id, + "title": fields.get("summary", ""), + # ... outros campos ... + # ❌ NO changelog field aqui + } +``` + +O `_sync_issues` (worker) chamava `extract_status_transitions_inline(raw)` no dict mapeado — `raw.get("changelog", {})` retornava `{}` sempre porque o key não existia. Resultado: lista vazia para toda issue. + +**Por que escapou dos testes:** Os 10 testes em `test_inline_changelog_extraction.py` testavam `extract_status_transitions_inline` **isoladamente** contra dicts sintéticos que JÁ tinham `changelog`. O contrato entre `_map_issue` e o extractor nunca foi testado end-to-end. + +**Solution** (commit `177830e`, 2026-04-29): + +```python +return { + "id": internal_id, + # ... outros campos ... + # FDD-OPS-013 — preserve raw changelog from `expand=changelog` so + # extract_status_transitions_inline() in the sync worker can read it. + "changelog": jira_issue.get("changelog", {}), +} +``` + +Test guard novo: `TestMapIssuePreservesChangelogForInlineExtraction` instancia o connector, alimenta payload Jira-shaped com `expand=changelog`, asserta que o pipe end-to-end (mapper → extractor) produz transitions não-vazias. + +**Result:** Validado live no projeto BG: 1.994 issues re-sincados todos com 3-8 transitions normalizadas (BG-202188: 5 transitions; BG-202413: 3 transitions). Pré-fix: 0 transitions em 311k issues. Pós-fix: 100% das issues recém-tocadas carregam transitions. + +**Lição genérica** — *cache lateral vs return value anti-pattern*: + +> Connector mappers devem retornar **dados completos** no dict mapeado. +> Esconder dados em side caches (`self._last_*`) que outros call sites +> não conhecem é um anti-pattern. Quando outro path tenta acessar via +> "interface natural" (dict access), o dado está invisível mas o cache +> técnico-correto está silently populated. + +Test pyramid lição: testar **contratos entre componentes**, não só cada componente isolado. + +**SaaS Implication:** Padrão "connector retorna dados completos no return value" deve ser doc-policy ao adicionar conectores futuros (GitLab, ADO, Linear). E todo connector → worker pipe precisa de pelo menos 1 test end-to-end que use a SHAPE real da API source. + +--- + +### Problem 12: Effort Estimation Without Story Points (Webmotors-style heterogeneity) + +**Context:** Métricas como Velocity, Throughput-by-effort, Forecast Monte Carlo dependem de "esforço" agregado. Padrão da indústria: Story Points. Audit em 2026-04-28 (FDD-OPS-016 / INC-021): **`story_points = 0` em 100% das 311.007 issues** da Webmotors. 
+ +**Symptoms:** + +- Sample em todos os 69 projetos ativos: `customfield_10004` ("Story Points") + `customfield_18524` ("Story point estimate") **0% populados** +- Webmotors **não usa Story Points** como método de estimativa (decisão organizacional) +- Velocity sempre zerada, throughput-by-effort impossível, forecast sem input + +**Investigação em squads** (samples de 50 issues por projeto): + +| Projeto | T-Shirt Size | Original Estimate (h) | Tamanho/Impacto | Padrão observado | +|---------|--------------|------------------------|------------------|--------| +| ENO | 24% | 52% | 4% | Horas + tshirt | +| DESC | 26% | 34% | 6% | Horas + tshirt | +| APPF | 0% | 12% | 0% | Horas (raro) | +| OKM | 4% | 8% | 0% | Quase Kanban | +| BG, FID, PTURB | 0% | 0% | 0% | **Kanban puro — não estimam** | + +**Conclusão:** padrão heterogêneo entre squads — algumas usam horas, algumas T-shirt size, várias não estimam (Kanban-puro). Single-method approach não funciona. + +**Solution** (commit `172f3f2`, 2026-04-29) — **Effort Fallback Chain**: + +Discovery dinâmico em `_discover_custom_fields`: +- Casa por nome (case-insensitive) os patterns `"t-shirt size"` e `"tamanho/impacto"` +- Webmotors: descobriu `customfield_18762` ("T-Shirt Size") + `customfield_15100` ("Tamanho/Impacto") +- Funciona em qualquer tenant (não hardcode customfield IDs) + +`_extract_story_points` (renomeado conceitualmente para "effort") com cadeia em ordem de prioridade: + +```python +# 1+2. Native numeric Story Points (preferred — no conversion) +for field_id in (story_points_field_id, *FALLBACK_STORY_POINTS_FIELDS, "story_points"): + if value > 0: return float(value) # source: 'story_points' + +# 3+4. T-shirt sized fields → Fibonacci scale +TSHIRT_TO_POINTS = {"PP": 1, "P": 2, "M": 3, "G": 5, "GG": 8, "GGG": 13, + "XS": 1, "S": 2, "L": 5, "XL": 8, "XXL": 13} +for fid in self._tshirt_field_ids: + if (label := unwrap(fields[fid])) and (mapped := TSHIRT_TO_POINTS.get(label.upper())): + return mapped # source: 'tshirt_to_sp' + +# 5. Original Estimate (seconds) → SP equivalent buckets +def _hours_to_points(h): + if h <= 4: return 1 + if h <= 8: return 2 # ~1d + if h <= 16: return 3 # ~2d + if h <= 24: return 5 + if h <= 40: return 8 # ~1w + if h <= 80: return 13 # ~2w + return 21 +# source: 'hours_to_sp' + +# 6. None — issue genuinamente não estimada (Kanban-puro) +# source: 'unestimated' +# CONSUMER MUST count items rather than sum points +``` + +**Telemetria** (`_effort_source_counts`): por batched run, log da distribuição de qual hop produziu o valor. Operadores veem drift ("squad migrou de horas para t-shirt em maio") sem combar logs. + +**Quando `None` (Kanban-puro):** decisão de **count vs sum** fica na camada de métrica, **não** no normalizer. Métrica downstream precisa contar items rather than sum points. Documentado em §8.12. + +**Result:** + +Validado live em CRMC (1.375 issues, projeto novo full-history pós-fix): +- **52,3% com effort estimado** (719/1.375) +- Distribuição de valores: 1, 2, 3, 5, 8 — confirma escala Fibonacci aplicada +- 47,7% com `story_points = None` → métrica counta items + +**Future (codename "dev-metrics" R3+)** — FDD-DEV-METRICS-001: + +Hoje a fallback chain é **automática e implícita**. Diferentes filosofias produzem métricas diferentes. 
R3 vai entregar: +- Per-squad estimation method choice (admin UI: SP / T-shirt / Hours / Count-only / Auto) +- Modelo proprietário de previsão e insights (drift detection, calibração contra histórico, Monte Carlo com método nativo) +- UX completa rescritta ao redor da escolha +- Anti-surveillance by design (insights por squad/processo, nunca individual) + +**Diferenciador competitivo:** LinearB / Jellyfish / Swarmia / Athenian são opinionated em SP. PULSE é o **único** que respeita filosofia da squad e usa isso como entrada de modelo, não como ruído a normalizar. + +**SaaS Implication:** Effort fallback chain é SaaS-ready (descoberta dinâmica). Para "dev-metrics" (R3+), precisa adicionar: +- Coluna `effort_source` em `eng_issues` (auditoria por issue) +- Migration deferred — registrado como prerequisite no FDD-DEV-METRICS-001 + +--- + +### Problem 13: Sprint Status Pipeline — 4-Layer Swiss Cheese + +**Context:** 100% das 216 sprints no `eng_sprints` da Webmotors com `status=''`. `goal` também totalmente vazio. Audit (FDD-OPS-018 / INC-023, 2026-04-29) revelou clássico **swiss cheese alignment** — quatro bugs independentes em camadas diferentes, cada um sozinho garantindo o resultado. + +**Symptoms:** + +- `SELECT status, COUNT(*) FROM eng_sprints` → `('', 216)` +- Sprint Comparison / Velocity Trend não pode filtrar `closed` para excluir sprints em andamento da regressão +- "Current sprint" planejado precisa `status='active'` — impossível sem dado + +**Os 4 bugs (cada um suficiente para causar o sintoma):** + +| # | Camada | Bug | Como sozinho garantia status vazio | +|---|---|---|---| +| 1 | `connectors/jira_connector.py:_map_sprint` | Mapeava OK (ACTIVE/CLOSED/FUTURE) | (não era bug — fonte estava certa) | +| 2 | `engineering_data/normalizer.py:normalize_sprint` | Retornava dict **sem** o campo `status` | Status nunca chega no upsert | +| 3 | `workers/devlake_sync.py:_upsert_sprints` | ON CONFLICT `set_={...}` não incluía `status`/`goal` | Sprints existentes (que existem) nunca atualizam | +| 4 | `connectors/jira_connector.py:_fetch_board_sprints` | Filtrava `started_date < since` | State transitions acontecem em `endDate` — sprint que começou em março e fechou em maio nunca tem update após março | +| 5 | `engineering_data/models.py:EngSprint` | Schema da DB tinha `status` mas ORM SQLAlchemy não tinha o `Mapped[str\|None]` correspondente | **Path que omitia status funcionava silently empty; path que tentava popular crashava com `Unconsumed column names: status`** | + +**Bug #5 (ORM schema drift) é o mais insidioso.** Coluna existia no DB há tempos (alguma migration anterior); ORM nunca foi atualizado. O sintoma é assimétrico: quem **omite** o campo passa silenciosamente; quem **inclui** crashar. Ninguém investiga porque "tá vazio mas não dá erro". + +**Solution** (commit `649ed78`, 2026-04-29) — fix em todas as camadas: + +1. `_map_sprint` agora também passa `goal` adiante (Jira API o retorna) +2. `normalize_sprint` inclui `status` (lowercase: `active`/`closed`/`future`/None) + `goal` (com strip null bytes) +3. `_upsert_sprints` ON CONFLICT `set_` atualiza ambos +4. `_fetch_board_sprints` removeu filtro de watermark (volume baixo, ~216 total / ~5 ativas; sempre re-fetch é correto pois state transitions) +5. 
`EngSprint` model adiciona `status: Mapped[str | None] = mapped_column(String(50), nullable=True)` + +Helper `_normalize_sprint_status` mapeia aliases comuns: +- `open → active` · `in_progress → active` +- `completed/complete/ended → closed` +- `planned/upcoming → future` +- **Unknown values → None** (não bucketiza silenciosamente — operador investiga via NULL no DB) + +**Por que NÃO bucketizar unknown:** Velocity / Carryover logic precisa saber QUAIS sprints estão de fato fechadas. Mapear "?" para `closed` corromperia o cálculo. Fail-loud é melhor que fail-silent aqui. + +**Result:** + +Validado live (ad-hoc backfill cobrindo 31 projetos ativos): + +| Status | Quantidade | Tem goal? | +|---|---|---| +| `closed` | 187 | sim | +| `active` | 3 | sim | +| `future` | 5 | sim | +| (vazio) | 22 | board órfão 873 sem projeto ativo | + +**195/217 = 89,9%** das sprints com status correto + 70% com goal real (e.g., "Gestão de banner no backoffice de CNC e TEMPO para novas especificações técnicas"). As 22 vazias são de board órfão, fora do escopo deste fix. + +**Lição genérica — `Schema drift detection pattern`:** + +> Adicionar guard test "DB columns vs ORM Mapped fields" — candidato a 5ª linha de defesa do FDD-OPS-001 (eliminação de drift). +> Migration review checklist deve incluir: toda nova coluna → Mapped column correspondente no SQLAlchemy. +> ORM drift é o tipo de bug onde "alguns paths funcionam, outros crashern" — não tem sintoma uniforme observável, então fica oculto até alguém tentar exatamente o path quebrado. + +**SaaS Implication:** Sprint pipeline pós-fix está SaaS-ready. Para tenants futuros: discovery automático de boards Scrum (já existe), normalização lowercase consistente com convenção PULSE, fail-loud em status desconhecidos — operador onboarding vê NULL e investiga ao invés de receber dado silenciosamente errado. + +--- + ## 5. Entity Relationship Map ### 5.1 Cross-Source Entity Linking @@ -566,6 +1023,75 @@ STATUS_PATTERNS = { - Cross-worker coordination (Redis-based token bucket) - Graceful degradation (reduce batch size on rate limit, don't fail) +#### 6.3.6 Effort Extraction (Deterministic Core + Discovery Fallback) + +**Problem:** Story Points não são universais — Webmotors validou 0% de uso em 69 projetos. Squads usam métodos heterogêneos: T-shirt size (P/M/G), `timeoriginalestimate` em horas, ou nada (Kanban-puro). Single-method extraction quebra para esses tenants. Implementado em FDD-OPS-016 (commit `172f3f2`). 
+ +**Discovery dinâmico** (deterministic, zero-config): + +```python +# JiraConnector._discover_custom_fields() +EFFORT_NAME_PATTERNS_TSHIRT = ("t-shirt size", "tshirt size", "tamanho/impacto") + +for field in fields_list: + name = field.get("name", "").strip().lower() + fid = field.get("id", "") + + # Story Points (numeric) + if name in ("story points", "story point estimate"): + self._story_points_field_id = fid + + # T-shirt sized fields (option-typed) + elif any(p in name for p in EFFORT_NAME_PATTERNS_TSHIRT): + self._tshirt_field_ids.append(fid) +``` + +**Fallback chain (priority order):** + +| # | Source | Conversão | Source label | +|---|---|---|---| +| 1 | `customfield_*` ("Story Points") | uso direto (numeric) | `story_points` | +| 2 | `customfield_*` ("Story point estimate") | uso direto | `story_points` | +| 3 | `customfield_*` ("T-Shirt Size") | mapa Fibonacci PP=1, P=2, M=3, G=5, GG=8, GGG=13 (PT-BR) ou XS/S/M/L/XL/XXL (EN) | `tshirt_to_sp` | +| 4 | `customfield_*` ("Tamanho/Impacto") | mesmo mapa | `tshirt_to_sp` | +| 5 | `timeoriginalestimate` (segundos) | buckets: ≤4h=1, ≤8h=2, ≤16h=3, ≤24h=5, ≤40h=8, ≤80h=13, >80h=21 | `hours_to_sp` | +| 6 | None | sem estimativa — **métrica downstream conta items (Kanban-puro)** | `unestimated` | + +**Hour bucket calibration:** alinhado com "1 ideal day = ~6h productive". Buckets calibrados contra valores observados na Webmotors (2h–124h, múltiplos de 4) para que cada valor comum caia em um bucket sensato. Output já na escala SP que métricas downstream esperam. + +**Skip SP = 0:** sentinel comum para "não estimado", trata como falta. Cai para próximo hop da chain ao invés de retornar `0.0`. + +**Telemetria** (`_effort_source_counts`): incrementa contador por `source` label (incluindo `'unestimated'`). Logado per batched run: + +``` +[batched] effort source distribution (1375 issues): + tshirt_to_sp=521 (37.9%), hours_to_sp=198 (14.4%), unestimated=656 (47.7%) +``` + +Operadores spotam estimation drift sem combar logs. + +**Anti-pattern evitado** — bucketização silenciosa de unknown values: + +> Ao receber um T-shirt size desconhecido (ex: "JUMBO"), o connector +> NÃO mapeia silenciosamente para algum default. Cai para o próximo +> hop. Se nenhum produzir valor, retorna `None` com source label +> `'unestimated'`. Métrica downstream sabe que tem que counta items. + +**SaaS Implication:** Já SaaS-ready. Cada tenant onboarda com: +1. Discovery automático de fields T-shirt e Tamanho via match de nome +2. Story Points classico funciona out-of-the-box se usado +3. `timeoriginalestimate` é Jira built-in (não custom field) — sempre disponível +4. Telemetria revela qual método o tenant usa nas primeiras horas pós-onboarding + +**Future (FDD-DEV-METRICS-001 / codename "dev-metrics" R3+)** — promote esta cadeia automática a uma escolha **explícita por squad**: + +- Admin UI permite escolher método: SP / T-shirt / Hours / Count-only / Auto (current) +- Modelo proprietário: detecta drift de estimativa, calibra contra histórico, surfaces insights ("squad marcando tudo como M há 6 sprints") +- Forecast Monte Carlo usa o método nativo do squad (não força SP como LinearB / Jellyfish / Swarmia / Athenian fazem) +- Anti-surveillance by design: insights por squad/processo, **nunca** individual + +Pré-requisito (deferred): adicionar coluna `effort_source` em `eng_issues` para auditoria por issue. + ### 6.4 Non-Deterministic Components (Implement with AI) These problems have ambiguous inputs and require contextual understanding. 
An embedded AI agent ("Ingestion Intelligence Agent") handles them. @@ -875,3 +1401,368 @@ IngestionPipeline: | `efaeba7` | Discovery service, mode resolver, guardrails | | `bea8b13` | Admin API + React UI for discovery | | `c5350dc` | Security hardening, PII gating, Phase 4 rollout | +| `5d71618` | Snapshot drift monitor (FDD-OPS-001 line 3) + deploy workflow | +| `0a1050c` | FDD-OPS-001 lines 1+2 — eliminate stale-code-in-workers drift | +| `dd10d34` | FDD-OPS-002 — full Jira description backfill (61.74% coverage) | +| `80f1796` | Partial index for snapshots — fixes 50× perf regression on `/metrics/home` | +| `c5e38bb` | docs(architecture): ingestion v2 — diagnostic + 10× target + migration path | +| `4d1c9b4` | FDD-OPS-012 + FDD-OPS-013 — Phase 1 v2: issues sync streams per-project + inline changelog (eliminates redundant `fetch_issue_changelogs`) | +| `62c183f` | Strip NULL bytes (0x00) from text fields before persist — Webmotors `ENO-3296` description had 0x00 | +| `4c2c1c5` | docs(ingestion): Phase 2 drafts — per-source workers + per-scope watermarks (FDD-OPS-014) | +| `c2c6e5d` | Phase 2 step 2.1 — apply scope_key migration | +| `a2d5850` | Phase 2 step 2.2 — per-scope watermark API | +| `f357d05` | Phase 2 step 2.3 — `_sync_issues` uses per-project watermarks | +| `15574a7` | Phase 2 steps 2.4 + 2.5 — per-repo watermark writes for PRs and deploys | +| `4f86fd2` | FDD-OPS-014 step 2.7 (urgent) — drop legacy `uq_watermark_entity` (Postgres enforces ALL UniqueConstraints; legacy blocked per-scope inserts) | +| `4478f13` | Phase 2-B step 2.4-B — read per-repo watermarks for PRs | +| `c628528` | Phase 2-B step 2.5-B — read per-repo watermarks for deployments | +| `177830e` | INC-020 / FDD-OPS-013 follow-up — preserve Jira changelog in `_map_issue` so inline extraction works (status_transitions=[] em 311k issues) | +| `172f3f2` | INC-021 / FDD-OPS-016 — effort estimation fallback chain (Story Points → T-shirt → Hours → None) + FDD-DEV-METRICS-001 placeholder for R3+ | +| `0c7124d` | INC-022 / FDD-OPS-017 — status normalization with `statusCategory.key` fallback (96.5% done skew + 50+ PT-BR statuses unmapped) | +| `649ed78` | INC-023 / FDD-OPS-018 — sprint status pipeline 4-layer cheese fix (normalizer + upsert + watermark + ORM drift) | + +### D. Webmotors-Discovered Patterns (training material para futuros tenants) + +Capturados durante a engenharia 2026-04 — servem como **base de comparação** quando onboardar novos tenants e como **alvo de discoveries automáticas** para o Ingestion Intelligence Agent (Section 6.5). + +**D.1 — Estimação de esforço heterogênea entre squads:** + +- Webmotors **não usa Story Points** (0% nos 69 projetos) +- Distribuição de método por squad sample: + - Squads que estimam: ENO (52% horas + 24% T-shirt), DESC (34% horas + 26% T-shirt) + - Squads que estimam pouco: APPF (12% horas), OKM (8% horas) + - **Squads Kanban-puros** (não estimam): BG, FID, PTURB, e ~22 outros (25 de 27 squads totais) +- Fields descobertos: `customfield_18762` (T-Shirt: P/M/G), `customfield_15100` (Tamanho/Impacto: PP/P/M/G) +- **Implicação para futuros tenants:** rodar discovery por nome ("t-shirt", "tamanho", "size") e logar telemetria de método usado por squad. Provável que tenants Kanban-pesados tenham padrão similar. 
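+
+The per-project sampling behind this distribution can be reproduced with a short
+read-only script. A sketch, assuming the field ids named above and the standard Jira
+Cloud REST v3 search endpoint; the actual audit script is not reproduced here:
+
+```python
+import requests
+
+FIELDS = ["customfield_18524", "customfield_18762", "customfield_15100", "timeoriginalestimate"]
+
+def sample_estimation_usage(base_url: str, auth, project_key: str, sample: int = 50) -> dict:
+    """Return the % of sampled issues with each estimation field populated."""
+    resp = requests.get(
+        f"{base_url}/rest/api/3/search",
+        params={
+            "jql": f'project = "{project_key}" ORDER BY updated DESC',  # quote keys ("DESC" is reserved)
+            "fields": ",".join(FIELDS),
+            "maxResults": sample,
+        },
+        auth=auth,
+        timeout=30,
+    )
+    resp.raise_for_status()
+    issues = resp.json().get("issues", [])
+    populated = {f: 0 for f in FIELDS}
+    for issue in issues:
+        for f in FIELDS:
+            if issue["fields"].get(f) not in (None, 0):
+                populated[f] += 1
+    return {f: round(100 * n / max(len(issues), 1), 1) for f, n in populated.items()}
+```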
+ +**D.2 — Workflow status diversity:** + +- 326 status definitions descobertos via `/rest/api/3/status` +- 104 status raw distintos populados em issues ativas +- DEFAULT_STATUS_MAPPING curado precisa de ~80 entries para cobrir granularidade `in_review` específica de PT-BR +- Resto cai no fallback `statusCategory.key` (autoritativo done/não-done) +- Padrões PT-BR observados: + - "FECHADO EM PROD", "FECHADO EM HML", "Concluído", "Cancelado" → `done` + - "Em Desenvolvimento", "Em imersão", "Em andamento", "Em Progresso" → `in_progress` + - "Em Code Review", "Em Teste HML", "Homologação", "Aguardando Code Review" → `in_review` + - "BACKLOG", "Refinado", "PAUSADO" → `todo` +- **Implicação:** mapping curado é por idioma + cultura organizacional. AI fallback (Section 6.4.2) deve aprender **por tenant** após primeiros 1k transitions observados. + +**D.3 — Squad shape:** + +- 27 squads ativos +- **25 são Kanban-puros** (sem sprints) — métricas Lean (CFD, Throughput, WIP, Cycle Time) são primárias +- 2 squads (FID, PTURB) usam Sprint — métricas Scrum (Velocity, Carryover) aplicam +- **Implicação:** UX padrão deve assumir Kanban-first. Sprint metrics aparecem condicionalmente quando `eng_sprints` tem dados ativos para a squad. + +**D.4 — Repo & deploy scale:** + +- 754 GitHub repos active / 1.429 total descobertos +- 283 repos com Jenkins config descoberto via SCM scan (commit `d1aebf7`) +- 577 PRD jobs auto-classificados por pattern matching +- 197.043 issues no projeto único BG (concentração extrema — single JQL retorna massive payload) +- **Implicação:** SaaS engine deve assumir distribuição power-law (alguns projetos enormes, muitos pequenos). Streaming per-project (P-1 do v2) é não-negociável. + +**D.5 — Operational realities:** + +- VPN drops causam silent failures sem health-aware orchestration (P-8) +- Project keys com palavras-reservadas SQL ("DESC") exigem quoting em JQL +- Orphan project keys em PR titles ("RC" tem 1.348 references sem Jira project) — alias resolution AI necessário (Section 6.4.5) +- NULL bytes (0x00) em descriptions PT-BR — `_strip_null_bytes` defensivo +- Jenkins SHAs são build IDs, não git SHAs — PR↔Deploy linking via temporal correlation, não SHA match + +**D.6 — Anti-pattern de dev process descobertos:** + +- **Cache lateral vs return value** (INC-020): connector mappers escondendo dados em `self._last_*` que outros call sites não acessam +- **Schema drift entre migration e ORM** (INC-023): coluna existe no DB mas SQLAlchemy `Mapped` ausente — paths que omitem campo passam, paths que incluem crashern +- **Swiss cheese alignment** (INC-023): feature inteira zerada por 4+ bugs independentes em camadas diferentes; cada um sozinho garantia o sintoma +- **Watermark filter dimension errado** (INC-023 #3): sprint state transitions em `endDate` não `startDate` — escolher dimensão correta de watermark é crítico +- **Bucketização silenciosa de unknown values**: anti-pattern. Sempre fail-loud (None/WARN) — operador investiga via NULL no DB + +--- + +## 8. Metric Field Decisions — Master Table + +Esta seção consolida **as decisões de qual timestamp/field é usado pra +cada métrica**, ancorando-se nos incidentes documentados em +`docs/metrics/metrics-inconsistencies.md`. Quando uma métrica produz +um número estranho, comece por aqui — provavelmente é decisão de +campo, não bug de código. 
+ +### 8.1 Lead Time for Changes (DORA) + +**Fórmula canônica:** `deployed_at - first_commit_at` (em horas) + +| Field | Source | Decisão | Referência | +|---|---|---|---| +| `eng_pull_requests.first_commit_at` | GitHub GraphQL `commits(first:1).authoredDate` | Real authored date do primeiro commit no branch — **NÃO** a data de abertura do PR | INC-003 fix 2026-04-17, commit `c5350dc` | +| `eng_pull_requests.deployed_at` | Temporal linking PR → Jenkins deploy via SHA matching | Populado por `link_pr_deploys()` quando deploy chega; null pra PRs sem deploy linkado | INC-004 fix 2026-04-17 | + +**Variantes expostas pelo backend** (decisão FDD-DSH-082, 2026-04-17): + +- `lead_time_for_changes_hours` (inclusive): inclui PRs sem `deployed_at` usando `merged_at` como fallback. Maior cobertura, mas não-canônico DORA. +- `lead_time_for_changes_hours_strict`: SOMENTE PRs com `deployed_at != NULL`. Canônico DORA. Cobertura menor (depende de Jenkins linking). +- Frontend mostra ambos em cards separados. Usuário escolhe a interpretação. + +**Edge case**: PR aberto-e-fechado-sem-merge → excluído do cálculo (`is_merged = false`). + +### 8.2 Cycle Time + +**Fórmula:** `merged_at - first_commit_at` (em horas) — INC-007 fix 2026-04-17 + +**Phases breakdown** (`cycle_time/breakdown` snapshot): + +| Phase | De | Para | +|---|---|---| +| `coding` | `first_commit_at` | `pr_opened_at` (created_at) | +| `pickup` | `pr_opened_at` | `first_review_at` | +| `review` | `first_review_at` | `merged_at` | +| `merge_to_deploy` | `merged_at` | `deployed_at` | + +**Edge case INC-012 (parcial)**: `merge_to_deploy` é null quando +`deployed_at` é null. Stacked bar mostra 3 fases em vez de 4. Documentado +como aceitável até full Jenkins linking (depende de FDD-DSH-050). + +### 8.3 Deployment Frequency + +**Fórmula:** `count(eng_deployments WHERE environment='production' AND deployed_at IN [period])` por unidade de tempo + +| Decisão | Referência | +|---|---| +| Filtro `environment='production'` (não staging/dev) | INC-008 fix 2026-04-17 | +| Source = jenkins (Webmotors) | `connections.yaml` | +| `is_failure` derivado de `result != 'SUCCESS'` no Jenkins build | normalizer `_extract_jenkins_result()` | +| **Aberto INC-016**: builds UNSTABLE (testes falham mas compila) contam como falha — comportamento mais rigoroso que padrão DORA, sem flag pra desabilitar | P2, aceitável | + +### 8.4 Change Failure Rate + +**Fórmula:** `count(deploys WHERE is_failure) / count(deploys)` no período + +**Decisões:** mesmas de §8.3 (escopo de deploys idêntico). + +### 8.5 MTTR (Mean Time to Recovery) + +**Status:** ❌ **AINDA NÃO IMPLEMENTADO** + +`recovery_time_hours` é always null (INC-005). Calculation function existe +e está correta, mas não há pipeline de incidents para alimentar. Card +"Time to Restore" mostra `null` + badge "R1" + tooltip explicativo. + +Tracking: FDD-DSH-050 (P1, L, multi-agent — data scientist define sinal +de incidente → data engineer cria tabela `eng_incidents` → backend → frontend). 
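+
+Looking back at §8.1, a minimal sketch of the inclusive vs strict Lead Time variants, assuming merged-PR rows exposed as plain dicts with the fields named in that table (the real computation lives in the metrics backend):
+
+```python
+from datetime import datetime
+
+def lead_time_hours(prs: list[dict], strict: bool) -> list[float]:
+    """Per-PR lead time in hours (§8.1).
+
+    strict=True  → DORA-canonical: only PRs with a linked deploy.
+    strict=False → inclusive variant: falls back to merged_at.
+    """
+    samples: list[float] = []
+    for pr in prs:
+        if not pr.get("is_merged") or not pr.get("first_commit_at"):
+            continue  # open / closed-without-merge PRs are excluded (§8.1 edge case)
+        end: datetime | None = pr.get("deployed_at")
+        if end is None:
+            if strict:
+                continue  # strict variant drops PRs without deployed_at
+            end = pr.get("merged_at")
+        if end is None:
+            continue
+        samples.append((end - pr["first_commit_at"]).total_seconds() / 3600.0)
+    return samples
+```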
+
+### 8.6 Throughput (PRs merged per period)
+
+**Fórmula:** `count(PRs WHERE is_merged AND merged_at IN [period])`
+
+| Decisão | Referência |
+|---|---|
+| Fetch por `merged_at` (não `created_at`) | INC-001 fix 2026-04-16 — antes, PRs com lifecycle longo eram subcontados |
+| `pr_analytics.total_merged` no payload `throughput/pr_analytics` | usado por `/metrics/home` |
+| Cycle time per-week sparkline computed inline | INC-007 fix |
+
+### 8.7 WIP (Work in Progress)
+
+**Fórmula:** `count(eng_issues WHERE normalized_status IN ('in_progress','in_review'))` no momento do snapshot
+
+**Decisões importantes:**
+
+- Status `todo` **excluído do WIP** — apenas trabalho tocado conta. Documentado em `kanban-formulas-v1.md` §2
+- "aguardando deploy produção" mapeado pra `done` (INC-019 P2 — debatível, porém fixo no `connections.yaml` status_mapping)
+- WIP é tenant-aggregate por default; per-squad é cálculo on-demand via `squad_key` query param
+
+### 8.8 Lead Time Distribution / CFD / Scatterplot (Lean)
+
+**Fonte de verdade:** `eng_issues` com `status_transitions` JSONB populado pelo Jira changelog.
+
+| Métrica | Fórmula | Edge case |
+|---|---|---|
+| Lead Time Distribution | histograma de `completed_at - created_at` por bin | INC-010 fix 2026-04-16: inclui issues longas que atravessam o período |
+| CFD | contagem por status × dia, banda `done` usa `MAX(done_so_far)` | INC-009 P1 — protege contra reopens |
+| Scatterplot | um ponto por issue concluída no período (P50/85/95 lines) | mesmo escopo de fetch que LT distribution |
+
+### 8.9 Anti-Surveillance Invariant
+
+**Decisão fundamental, INVIOLÁVEL:**
+
+> Author/assignee/reporter **NUNCA** entram em payloads de métrica.
+
+**Onde está garantido:**
+
+1. **Domain dataclasses** (`pulse-data/src/contexts/metrics/domain/`): nenhum field tipo `author`, `assignee`, `reporter` ou similar
+2. **Schema registry** (FDD-OPS-001 line 3): payload-vs-dataclass diff loga `_schema_drift` se algo novo aparece
+3. **Frontend contract tests** (`tests/contract/anti-surveillance-schemas.test.ts`): meta-test que injeta payload tainted em cada um dos 6 schemas Zod e verifica rejeição
+4. **Underlying tables** (`eng_pull_requests.author`, `eng_issues.assignee`) — campos existem (necessários pra ingestão e linking), mas **nunca atravessam a fronteira de agregação**
+
+**Snapshot anonimizado (PR #2.1 / future):** quando construirmos pipeline
+de snapshot pra distribuir entre devs, aggregate-only não é suficiente —
+o DB ainda tem PII nos raw fields. Anonimização determinística de
+author/assignee → hash + `@example.invalid` é necessária. Detalhes em
+`docs/onboarding.md` (PR #2.1).
+
+### 8.10 Status Normalization (hybrid textual + statusCategory)
+
+**Fonte primária:** hybrid em 3 camadas (FDD-OPS-017 / INC-022 / commit `0c7124d`):
+
+1. **Textual mapping curado** — `DEFAULT_STATUS_MAPPING` em `engineering_data/normalizer.py`, ~80 entries PT-BR Webmotors-curated + EN. Preserva granularidade `in_progress` vs `in_review`.
+2. **Jira `statusCategory.key` fallback** — autoritativo done/não-done. Connector descobre via `/rest/api/3/status` (1 chamada/lifetime, cacheada). Webmotors: 326 status defs descobertas.
+3. **Default 'todo' com WARN log** — extremamente raro pós-fix (só status sem categoria).
+
+**Categorias normalizadas produzidas:** `todo | in_progress | in_review | done` (4 categorias). Métricas downstream em `domain/lean.py:_ACTIVE_STATUSES = {"in_progress", "in_review"}` tratam ambos como WIP/active para Cycle Time.
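+
+Putting the three layers together, the resolution order could look roughly like the sketch below; the actual `normalize_status` in `engineering_data/normalizer.py` may differ in detail:
+
+```python
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Category fallback per INC-022: done→done, indeterminate→in_progress, new→todo.
+_CATEGORY_TO_NORMALIZED = {"done": "done", "indeterminate": "in_progress", "new": "todo"}
+
+def normalize_status(raw: str, mapping: dict[str, str], status_category: str | None = None) -> str:
+    """Three-layer hybrid (§8.10): curated textual map → statusCategory.key → 'todo' + WARN."""
+    key = (raw or "").strip().lower()
+    # Layer 1: curated textual mapping, preserves in_progress vs in_review granularity.
+    if key in mapping:
+        return mapping[key]
+    # Layer 2: Jira statusCategory fallback, authoritative done / not-done.
+    if status_category in _CATEGORY_TO_NORMALIZED:
+        return _CATEGORY_TO_NORMALIZED[status_category]
+    # Layer 3: conservative default, loud in the logs so the operator can investigate.
+    logger.warning("Unknown status %r with no statusCategory; defaulting to 'todo'", raw)
+    return "todo"
+```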
+ +**Discovery cacheado por instância de connector:** + +```python +# JiraConnector +self._status_categories: dict[str, str] = {} # name (lowercase) → category key +self._status_categories_discovered: bool = False + +async def _discover_status_categories(self): + data = await self._client.get(f"{REST_API}/status") + for s in data: + name = (s.get("name") or "").strip().lower() + cat = ((s.get("statusCategory") or {}).get("key") or "").strip().lower() + if name and cat in ("new", "indeterminate", "done"): + self._status_categories[name] = cat +``` + +**`_map_issue` anexa ao dict mapeado:** + +- `status_category`: a categoria do status atual +- `status_categories_map`: o dict completo (mesma referência para todas as issues do batch) + +**Histórico (`build_status_transitions`)** usa o `status_categories_map` para classificar cada `to_status` histórica: + +```python +for cl in changelogs: + cat = status_categories_map.get(cl["to_status"].strip().lower()) + normalized = normalize_status(cl["to_status"], status_mapping, cat) +``` + +**Edge cases conhecidos & decisões:** + +| Status | Mapping | Justificativa | +|---|---|---| +| `FECHADO EM PROD` | `done` | Jira category=done; nome literal "FECHADO" | +| `FECHADO EM HML` | `done` | Jira category=done. Workflow author classifica como done; respeitamos. Se squad quer "ainda em fluxo", renomeia para "Aguardando Deploy Produção" | +| `aguardando deploy produção` | `in_progress` | INC-019 P2 reverso — quando deploy é o gargalo, item ainda está em fluxo | +| `em teste azul/hml` | `in_review` | Webmotors-specific QA stages; granularidade preservada via textual | +| `construção de hipótese` | `in_progress` | Kanban upstream — trabalho ativo de discovery | +| `Aguardando Code Review` | `in_review` | Trabalho ativo aguardando reviewer (textual ganha sobre Jira `new` neste tenant) | +| Status sem mapping E sem category | `todo` (com WARN log) | Conservador — operador investiga via WARN | + +**Princípio**: textual ganha quando definido (granularidade); category ganha sobre default (autoridade). Tudo que cai em "todo" sem ambos é log-visible — raro, mas observável. + +**Por que mantemos 4 categorias (não 3 como Jira)** — métricas Lean precisam distinguir `in_progress` (development active) de `in_review` (waiting on review/test) para Cycle Time Breakdown. Jira `statusCategory.indeterminate` colapsa os dois; nosso textual mapping preserva quando a squad nomeia. + +### 8.11 PR ↔ Issue Linking + +**Mecanismo:** regex `[A-Z][A-Z0-9]+-\d+` em `pr.title`, `pr.head_ref`, `pr.base_ref` + +**Sequência:** + +1. Sync worker carrega `(issue_key, external_id)` do tenant **antes** de sincronizar PRs (issues vêm 1º no ciclo) +2. Pra cada PR, regex extrai possíveis keys (multi-match suportado) +3. Filtra keys que existem em `jira_project_catalog` com status `active|discovered` +4. Popula `linked_issue_ids` JSONB do PR + +**Per-project link rate observado** (Webmotors, post-discovery): + +- Top performers (96-100%): SDI, PUSO, DSP, FID, CRMC +- Tenant-wide médio: 21.9% +- Falsos positivos: HOTFIX-123, RELEASE-1, BUGFIX-42, lib names (LODASH-4) — filtrados via `IN (jira_project_catalog)` clause +- Orphans conhecidos: RC (1348 references, projeto archived no Jira) + +**Re-relink pós-ingestão:** script `scripts/relink_prs_to_issues.sql` +re-aplica em PRs antigos quando novos projetos são ativados via discovery +dinâmica. 
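+
+A minimal sketch of the extract-and-filter part of §8.11 (the helper name and the PR dict shape are illustrative; the real logic lives in the sync worker and persists the result to `linked_issue_ids`):
+
+```python
+import re
+
+ISSUE_KEY_RE = re.compile(r"[A-Z][A-Z0-9]+-\d+")
+
+def extract_linked_issue_keys(pr: dict, active_projects: set[str]) -> list[str]:
+    """Regex over title + refs, then filter against the active project catalog
+    to drop false positives like HOTFIX-123 or LODASH-4."""
+    text = " ".join(str(pr.get(field) or "") for field in ("title", "head_ref", "base_ref"))
+    keys: list[str] = []
+    for candidate in ISSUE_KEY_RE.findall(text):
+        project = candidate.split("-", 1)[0]
+        if project in active_projects and candidate not in keys:
+            keys.append(candidate)
+    return keys
+```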
+ +### 8.12 Effort Estimation (story_points field) + +**Fonte primária:** `eng_issues.story_points` (numeric, nullable) — populado pelo `_extract_story_points` no connector via fallback chain (FDD-OPS-016 / INC-021 / commit `172f3f2`). Detalhes na §6.3.6. + +**Hops em ordem de prioridade** (telemetria via `_effort_source_counts`): + +| Hop | Source | Conversão | Source label | +|---|---|---|---| +| 1 | `customfield_10004` ("Story Points") | numeric direto (skip se = 0) | `story_points` | +| 2 | `customfield_18524` ("Story point estimate") | numeric direto | `story_points` | +| 3 | T-shirt size field (discovered) | Fibonacci: PP=1, P=2, M=3, G=5, GG=8, GGG=13 | `tshirt_to_sp` | +| 4 | `customfield_15100` ("Tamanho/Impacto") | mesmo mapa | `tshirt_to_sp` | +| 5 | `timeoriginalestimate` (segundos) | buckets ≤4h=1, ≤8h=2, ≤16h=3, ≤24h=5, ≤40h=8, ≤80h=13, >80h=21 | `hours_to_sp` | +| 6 | None | `null` em `eng_issues.story_points` | `unestimated` | + +**Decisão downstream — quando `story_points IS NULL`:** + +- Métricas baseadas em soma (Velocity, Story Point Throughput): **NÃO somar** issues `null` +- Métricas baseadas em count (Throughput by issue, WIP, Cycle Time): **incluir** issues `null` normalmente +- **Para tenants Kanban-puros** (Webmotors: 25/27 squads), `story_points` é `null` para 100% — **a métrica primária deve ser count, não sum** + +**Anti-pattern evitado:** + +> NÃO defaultar para `story_points = 1` (ou outro valor sentinel) +> quando não há estimativa. Seria silently wrong para Velocity. +> Métrica precisa saber explicitamente que aquela issue não foi +> estimada. `null` é fail-loud (NULL no DB visível) vs `1` que é +> fail-silent. + +**Webmotors-observed coverage** pós-fix (CRMC, projeto novo full-history): + +- 52,3% com effort estimado (sample de 1.375 issues) +- Distribuição valores: 1, 2, 3, 5, 8 (Fibonacci aplicado) +- 47,7% `null` → métrica conta items + +**Future:** R3 codename "dev-metrics" (FDD-DEV-METRICS-001) entrega: +- Coluna `effort_source` em `eng_issues` para auditoria por issue +- Per-squad estimation method choice (admin UI) +- Modelo proprietário de previsão usando método nativo do squad + +### 8.13 Sprint Status & Goal + +**Fonte primária:** `eng_sprints.status` (varchar(50), nullable) + `eng_sprints.goal` (text, nullable). Populados pelo `normalize_sprint` (FDD-OPS-018 / INC-023 / commit `649ed78`). + +**Status normalization:** + +| Raw value (Jira) | Aliases aceitos | Normalized | +|---|---|---| +| ACTIVE | active, open, in_progress | `active` | +| CLOSED | closed, completed, complete, ended | `closed` | +| FUTURE | future, planned, upcoming | `future` | +| (qualquer outro) | — | `None` (fail-loud, operador investiga) | + +**Por que NULL para unknown** (não bucketizar): Sprint Velocity e Carryover logic precisam saber QUAIS sprints estão de fato fechadas. Bucketizar "?" para `closed` corromperia a regressão linear de tendência. NULL torna o problema visível. 
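+
+A sketch consistent with the alias table above; the real `_normalize_sprint_status` helper in the normalizer may differ in detail:
+
+```python
+_SPRINT_STATUS_ALIASES = {
+    "active": "active", "open": "active", "in_progress": "active",
+    "closed": "closed", "completed": "closed", "complete": "closed", "ended": "closed",
+    "future": "future", "planned": "future", "upcoming": "future",
+}
+
+def _normalize_sprint_status(raw: str | None) -> str | None:
+    """Map Jira sprint-state aliases to active/closed/future; unknown values
+    return None (fail-loud) instead of being silently bucketed."""
+    if not raw:
+        return None
+    return _SPRINT_STATUS_ALIASES.get(raw.strip().lower())
+```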
+ +**Goal field:** + +- Source: `sprint.goal` da Jira API (string, free-text setado por squad lead) +- Normalizer aplica `_strip_null_bytes` (Postgres rejeita 0x00) +- Webmotors observed: 70% das sprints têm goal real (e.g., "Gestão de banner no backoffice de CNC e TEMPO para novas especificações técnicas") + +**Re-fetch policy crítica** — sprints **não usam watermark filter** (decisão de FDD-OPS-018): + +- State transitions acontecem em `endDate`, não `startDate` +- Volume baixo (~216 total / ~5 ativas em qualquer momento) +- Sempre re-fetch é correto E barato +- Se quiser otimizar no futuro: filtrar por `endDate < since` (não `startDate`) + +**ON CONFLICT update obrigatório:** + +```python +# _upsert_sprints +.on_conflict_do_update( + index_elements=["tenant_id", "external_id"], + set_={ + "name": sd["name"], + "status": sd.get("status"), # FDD-OPS-018: era omitido + "goal": sd.get("goal"), # FDD-OPS-018: era omitido + "started_at": sd["started_at"], + "completed_at": sd["completed_at"], + # ... outros campos métricos + "updated_at": datetime.now(timezone.utc), + }, +) +``` + +**Lição** — quando o ON CONFLICT `set_` omite um campo, sprints existentes nunca recebem update mesmo se o normalizer está correto. Pattern: `set_` deve incluir TODOS os campos que podem mudar entre syncs, exceto `external_id` e `tenant_id`. + +--- diff --git a/pulse/docs/ingestion-v2-phase-2-plan.md b/pulse/docs/ingestion-v2-phase-2-plan.md new file mode 100644 index 0000000..b4037a0 --- /dev/null +++ b/pulse/docs/ingestion-v2-phase-2-plan.md @@ -0,0 +1,374 @@ +# Ingestion v2 — Phase 2 Plan (FDD-OPS-014) + +**Status:** PARTIAL — foundation shipped 2026-04-28, read-side refactor + worker split deferred. +**Companion docs:** `ingestion-architecture-v2.md` (overall design), +`ingestion-spec.md` (current architecture). +**Sister artifact (applied):** `alembic/versions/010_pipeline_watermarks_scope_key.py` + +--- + +## 0. Shipping summary (2026-04-28 status) + +What landed in this iteration vs. what carries forward: + +### ✅ Shipped (production-ready, validated against live tenant) + +| Step | Commit | What | +|---|---|---| +| **2.1** | `f357d05` | Migration 010 applied: `pipeline_watermarks.scope_key VARCHAR(255) NOT NULL DEFAULT '*'` + `uq_watermark_entity_scope` UNIQUE coexisting with legacy `uq_watermark_entity` | +| **2.2** | `f357d05` | Per-scope watermarks API: `GLOBAL_SCOPE`, `make_scope_key(source, dim, value)`, `_get_watermark(scope_key=...)`, `_set_watermark(scope_key=...)`, `_list_watermarks_by_scope(scope_keys=[...])`. Default `'*'` preserves all legacy callers. | +| **2.3** | `f357d05` | `_sync_issues()` reads + writes per-project watermarks (`jira:project:`). Logs "watermark plan: N backfill, M incremental" pre-flight. Per-project advance fires on project transition. Legacy global '*' kept for compat. | +| **2.4** | `15574a7` | `_sync_pull_requests()` writes per-repo watermarks (`github:repo:/`) on each batch persist. **Write-side only** — connector still uses single `since` for fetch. | +| **2.5** | `15574a7` | `_sync_deployments()` writes per-repo watermarks (`jenkins:repo:`) post-upsert. Per-repo not per-job (Q2 decision: matches PR↔deploy linking dimension). **Write-side only.** | + +Test coverage shipped: 19 unit tests (`test_watermark_scope_keys.py` 9, `test_inline_changelog_extraction.py` 10 — re-validated alongside). 
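+
+For orientation, the step 2.2 scope-key helper presumably looks roughly like this (only the names listed in the table above come from the shipped code; everything else is an assumption):
+
+```python
+GLOBAL_SCOPE = "*"  # legacy tenant-wide watermark rows keep this value
+
+def make_scope_key(source: str, dimension: str, value: str) -> str:
+    """Build a '<source>:<dimension>:<value>' scope key.
+
+    Per open question Q2 below, the format is a convention enforced in code,
+    not a database constraint.
+    """
+    return f"{source}:{dimension}:{value}"
+
+# Examples matching the formats used elsewhere in this plan:
+#   make_scope_key("jira", "project", "BG")        -> "jira:project:BG"
+#   make_scope_key("github", "repo", "org/repo")   -> "github:repo:org/repo"
+#   make_scope_key("jenkins", "repo", "some-repo") -> "jenkins:repo:some-repo"
+```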
+ +### 🟡 Deferred to next iteration (sister FDD) + +| Step | What's missing | Why deferred | +|---|---|---| +| **2.4-B / 2.5-B** | Connector signature refactor: accept `since_by_repo` / `since_by_project` so per-scope watermarks are READ during fetch (not just written) | Required for new-repo backfill correctness — without it, adding a repo only fetches PRs newer than the global `*` watermark. Significant connector code change (~M effort), warranted in a dedicated PR with thorough tests. | +| **2.6** | docker-compose split into per-source workers (jira/github/jenkins) | Architectural value of split (per-source isolation, parallel cycles) only realizes when combined with 2.4-B + 2.5-B. Splitting alone = 3 containers running same global-watermark logic — zero throughput win. | +| **2.7** | Migration 011: drop legacy `uq_watermark_entity` constraint | Plan §3 explicitly requires "after one successful per-source cycle". Per-source doesn't exist yet (deferred above). Legacy constraint coexists harmlessly until then. | +| **Health-aware pre-flight** (P-8 in v2 doc) | Pre-cycle source reachability check (skip cycle if source unhealthy) | Belongs with worker-split work (each per-source worker owns its health-check). Without split, a single sync still has interleaved phases. | + +### 🟢 Foundation shipped means + +- New scope rows accumulate every cycle. When the read-side refactor lands, every active repo/project already has its own watermark — no schema migration, no backfill of historic data. +- Migration 010 is rollback-safe via `downgrade()`. The legacy unique constraint coexists with the new one for as long as needed. +- All Phase 1 wins (FDD-OPS-012 batched persistence, FDD-OPS-013 inline changelogs) remain intact and continue working. + +### 📅 Suggested next iteration + +Open as `feat/ingestion-v2-phase-2b` branch: + +1. Refactor `JiraConnector.fetch_issues_batched` to accept `since_by_project` dict (already does — done in Phase 1). Just verify wired correctly. +2. Refactor `GithubConnector.fetch_pull_requests_batched` to accept `since_by_repo: dict[str, datetime | None]` and use per-repo since when provided. +3. Refactor `JenkinsConnector` deployments fetch to accept per-repo since. +4. Update `_sync_*` methods to pass `since_by_` from `_list_watermarks_by_scope` results. +5. Smoke test: add new project to Jira catalog → confirm only that scope backfills. +6. THEN: docker-compose split (Step 2.6) + companion migration 011. + +Estimated effort for Phase 2-B: **M-L (~3-5 dev-days)**. Honest scoping based on actual time spent on Phase 2-A (much faster than originally estimated due to clean foundation). + +--- + +## 1. Goals (acceptance criteria) + +The migration is "done" when **all 5** acceptance items hold: + +1. **Per-source isolation**: Jenkins outage (or Jira slowness, or GitHub + rate-limit) does not block the other two sources. Each source has its + own worker process, event loop, and cycle cadence. +2. **Per-scope watermarks**: a new Jira project activation does not + trigger a full re-fetch of existing 200k+ issues. Each scope_key + advances independently. +3. **Health-aware pre-flight**: each cycle checks source reachability + before starting any I/O. VPN drop = mark unhealthy + skip cycle, not + block-and-retry-forever. +4. **Backwards-compat**: existing `pipeline_watermarks` rows keep working + during the transition (scope_key='*' default). +5. **Tests pass**: 100% of existing unit/integration suites + new tests + for per-source and per-scope behavior. 
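+
+To make acceptance item 2 concrete, a hedged sketch of how a per-project fetch plan could be assembled from the step 2.2 repository API (the method name comes from §0; its exact signature and return shape are assumptions):
+
+```python
+from datetime import datetime
+
+async def build_since_by_project(
+    watermarks, tenant_id: str, project_keys: list[str],
+) -> dict[str, datetime | None]:
+    """Assemble the per-project fetch plan (acceptance item 2).
+
+    Known scopes reuse their own watermark; a freshly activated project has
+    no row yet, so it gets None, which means a full backfill of that scope only.
+    """
+    scope_keys = [f"jira:project:{key}" for key in project_keys]
+    # _list_watermarks_by_scope is the step 2.2 API; the row attributes used
+    # below (scope_key, last_synced_at) are assumed here for illustration.
+    rows = await watermarks._list_watermarks_by_scope(
+        tenant_id=tenant_id, entity_type="issues", scope_keys=scope_keys,
+    )
+    by_scope = {row.scope_key: row.last_synced_at for row in rows}
+    return {
+        key: by_scope.get(f"jira:project:{key}")  # missing row → None → backfill
+        for key in project_keys
+    }
+```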
+
+Non-goals (deferred to Phase 3):
+- Job queue / worker pool
+- Pre-flight cost estimation via API count call
+- `/pipeline/jobs` per-job endpoint
+
+---
+
+## 2. Architecture diff (current → target)
+
+### Current
+
+```
+docker-compose.yml:
+  sync-worker (one process, one event loop, runs:
+    _sync_issues → _sync_prs → _sync_deploys → _sync_sprints
+    sequentially, every 15 min)
+
+pipeline_watermarks:
+  (tenant, entity_type) UNIQUE  ← GLOBAL across all scopes
+  e.g. row: (tenant=001, entity='issues', last_synced_at='2026-04-26')
+```
+
+### Target
+
+```
+docker-compose.yml:
+  jira-sync-worker     (entity: issues, sprints, sprint-issues)
+  github-sync-worker   (entity: pull_requests, repos)
+  jenkins-sync-worker  (entity: deployments)
+
+  All independent: own event loop, cron schedule, retry policy,
+  health-check, watermark scope, container.
+
+  discovery-worker (unchanged — already separate)
+
+pipeline_watermarks:
+  (tenant, entity_type, scope_key) UNIQUE  ← PER-SCOPE
+  e.g. rows:
+    (tenant=001, entity='issues', scope='jira:project:BG', last_synced='...')
+    (tenant=001, entity='issues', scope='jira:project:OKM', last_synced='...')
+    (tenant=001, entity='prs', scope='github:repo:foo', last_synced='...')
+```
+
+---
+
+## 3. Implementation order (dependencies)
+
+The order minimizes risk and allows early rollback.
+
+### Step 2.1 — Schema migration (010, sister file)
+
+Add `scope_key` column with default `'*'` + companion unique constraint.
+Existing rows continue to work (read by `(tenant, entity_type)` matches
+the `'*'` row exactly).
+
+**Risk:** very low. Default value preserves all existing reads/writes.
+**Rollback:** `alembic downgrade -1`.
+**Validation:** smoke against existing sync flow — should produce
+identical behavior.
+
+### Step 2.2 — Repository layer: per-scope watermark API
+
+Add `get_watermark(tenant, entity, scope_key='*')` and
+`set_watermark(tenant, entity, scope_key, ts, count)` to the watermarks
+repo. Default `'*'` keeps current callers untouched.
+
+**Risk:** low. Existing call sites untouched; new ones opt in.
+**Validation:** unit tests for default vs explicit scope_key.
+
+### Step 2.3 — JiraSyncWorker (extract from monolith)
+
+New module `src/workers/jira_sync_worker.py` containing:
+
+```python
+class JiraSyncWorker:
+    """Single-source worker. Owns: issues, sprints, sprint-issues."""
+
+    async def cycle(self):
+        if not await self._check_jira_health():
+            logger.info("Jira unhealthy this cycle; skipping")
+            return
+
+        await self._sync_issues()        # uses per-project scope keys
+        await self._sync_sprints()       # scope='jira:board:<board_id>'
+        await self._sync_sprint_issues() # scope='jira:sprint:<sprint_id>'
+
+    async def _check_jira_health(self) -> bool:
+        # GET /rest/api/3/myself with 5s timeout
+        ...
+```
+
+`_sync_issues` becomes a per-project loop with per-project watermark
+read/write. The PR loop pattern from Phase 1 transfers directly.
+
+**Risk:** medium. Monolithic worker still works; new worker is opt-in
+via env flag `PULSE_USE_PER_SOURCE_WORKERS=true`.
+
+### Step 2.4 — GithubSyncWorker
+
+Same pattern. Owns: pull_requests, repos discovery.
+scope_key format: `github:repo:<org>/<repo>`.
+
+### Step 2.5 — JenkinsSyncWorker
+
+Same pattern. Owns: deployments.
+scope_key format: `jenkins:job:<job_name>`.
+
+Health check: `GET /api/json` with 5s timeout. If VPN off → unhealthy
+this cycle; resume on next.
+
+### Step 2.6 — docker-compose.yml: 3 workers replace 1
+
+```yaml
+sync-worker:
+  # REMOVED. Replaced by 3 specific workers below.
+ +jira-sync-worker: + image: pulse-jira-sync-worker + command: python -m src.workers.jira_sync_worker + ... + +github-sync-worker: + ... + +jenkins-sync-worker: + ... +``` + +**Risk:** low — Dockerfiles unchanged (single image, 3 different commands). +**Rollback:** revert compose, restart sync-worker. + +### Step 2.7 — Companion migration 011: drop legacy unique constraint + +After all workers are emitting per-scope writes for >1 successful cycle, +drop `uq_watermark_entity` constraint. Coexistence period prevents cutover +surprises. + +--- + +## 4. Test plan + +Each item lists the test type and what it asserts. + +### Unit tests (no DB, no network) + +| Test | What it asserts | +|---|---| +| `test_watermarks_repo_default_scope_compat` | `get_watermark(t, e)` returns same row as `get_watermark(t, e, scope_key='*')` | +| `test_watermarks_repo_set_per_scope` | Setting scope=`'jira:project:BG'` doesn't affect global `'*'` row | +| `test_jira_health_check_returns_false_on_timeout` | Mock httpx returning timeout → health=False | +| `test_jira_sync_skips_cycle_when_unhealthy` | `_check_jira_health()=False` → `_sync_issues()` not called | +| `test_github_sync_per_repo_watermark` | Each repo has independent watermark | +| `test_jenkins_sync_per_job_watermark` | Each job has independent watermark | + +### Integration tests (DB, mocked HTTP) + +| Test | What it asserts | +|---|---| +| `test_jira_full_cycle_uses_per_project_watermarks` | After cycle, every active project has its own watermark row | +| `test_jira_new_project_activation_only_backfills_that_scope` | Activate new project → only that scope_key gets full backfill, others unchanged | +| `test_jira_one_project_failure_does_not_block_others` | Mock 401 on project X → other projects still complete | +| `test_companion_migration_011_safe_after_workers_migrated` | Verify constraint drop doesn't break existing reads | + +### End-to-end (Webmotors-scale, manual run) + +| Test | What it asserts | +|---|---| +| Boot 3 workers, full re-ingestion against Webmotors | Convergence in <90 min total (parallel sources) | +| Disable VPN mid-Jenkins-sync | Jenkins worker pauses gracefully; Jira+GitHub continue | +| Add new Jira project to catalog | Only that project backfilled in next cycle; others skipped | +| Kill jira-sync-worker mid-cycle | On restart, ≥80% of fetched issues already persisted (per Phase 1) AND watermarks reflect work done | + +### Regression tests (must keep passing) + +- All 52 unit tests from Phase 1 connector/aggregator suite +- `test_inline_changelog_extraction.py` (10 tests, FDD-OPS-013 anti-regression) +- All existing dora/lean/cycle_time domain tests + +--- + +## 5. Rollout sequence (in production / staging) + +When this Phase 2 code is ready: + +1. **Pre-flight**: announce maintenance window (~30 min for safety even + though zero-downtime is the design goal). +2. **Run migration 010** (additive) → verify no errors, queries unchanged. +3. **Deploy new worker images** with `PULSE_USE_PER_SOURCE_WORKERS=false` + (still the monolith). No behavior change. +4. **Validate** monolith still works with new schema column present. +5. **Flip flag** to `=true`. Three new workers start. Old `sync-worker` + container is replaced. +6. **Watch one full cycle** (~30 min). All three sources should run + independently with per-scope watermarks. +7. **Run migration 011** → drop legacy constraint. +8. **Remove backwards-compat code paths** (separate cleanup PR). 
+ +If anything misbehaves at any step, rollback path: +- Steps 1-4: `alembic downgrade -1` + redeploy old image +- Steps 5-6: flip flag back to `false`, kill new workers, restart monolith +- Step 7: requires manual constraint recreation; coordinate carefully + +--- + +## 6. Estimate (effort) + +Honest scoping: + +| Step | Effort | Owner | +|---|---|---| +| 2.1 Schema migration | XS (1h, already drafted) | data-engineer | +| 2.2 Watermarks repo per-scope API | S (2-3h) | data-engineer | +| 2.3 JiraSyncWorker extraction | M (1 day) | data-engineer | +| 2.4 GithubSyncWorker extraction | S (4-6h, simpler since PRs already streaming) | data-engineer | +| 2.5 JenkinsSyncWorker extraction | S (4h, simplest) | data-engineer | +| 2.6 docker-compose split | XS (1h) | engineer | +| 2.7 Companion migration 011 | XS (30min) | data-engineer | +| Tests (unit + integration) | M (1 day total) | test-engineer | +| Rollout + validation | S (half day) | engineer + data-engineer | +| **Total** | **~1 week of focused engineering** | | + +This matches the `ingestion-architecture-v2.md` Phase 2 estimate (3-5 days). + +--- + +## 7. Open questions (for review) + +These need a decision before implementation starts. Captured here so +they don't block the technical work. + +### Q1: Health-check policy for workers + +Question: when a source is unhealthy, should the worker: +- (a) Skip the cycle entirely (current Phase 1 behavior — simple) +- (b) Run with cached data only (more code, useful for read-heavy tasks) +- (c) Pause the worker (no retry until manual restart) + +Recommendation: **(a) skip + log + retry next cycle**. Matches what the +v2 doc implies. Operators can grep for "unhealthy this cycle". + +### Q2: Scope-key format — strict schema or freeform string? + +Question: should `scope_key` follow a strict pattern like +`::` (e.g., `jira:project:BG`) or stay as +opaque text? + +Recommendation: **convention enforced in code, not constraint**. +String column is flexible; helper functions like +`make_scope_key(source, dimension, value)` enforce shape. Allows +future scopes (e.g., `jira:tenant-rule:bg-only`) without migration. + +### Q3: What happens to the global `*` rows after migration 011? + +Question: keep them as "tenant-wide aggregate watermarks" (informational) +or delete? + +Recommendation: **delete in a separate cleanup PR after 1 month of +stable per-scope operation**. Removes cognitive load. If someone wants +"latest across scopes", that's a `MAX(last_synced_at)` query, trivial. + +### Q4: Alembic chain — single migration or two? + +Question: keep migration split (010 add, 011 drop) or combine? + +Recommendation: **keep split**. The risk of dropping the old constraint +before workers are confirmed writing per-scope is high; the cost of +keeping both for a month is zero. Two migrations provide a safe rollback +window. + +--- + +## 8. What this plan does NOT cover (explicitly out of scope) + +- **Job queue + worker pool** — Phase 3, separate plan +- **Pre-flight item count via API** — FDD-OPS-015 full version, separate +- **Pipeline Monitor UI per-scope tab** — needs FDD-OPS-015's data layer + first +- **GitLab / Azure DevOps / Linear connectors** — R2+, separate work +- **MTTR pipeline** — FDD-DSH-050, completely independent track + +--- + +## Status + +**Status of this document:** PARTIAL IMPLEMENTATION (2026-04-28). + +Phase 2-A foundation shipped — see §0 for the breakdown of what landed +vs. what was deferred to Phase 2-B. 
The architectural pattern (per-scope +watermarks coexisting with legacy global '*' rows) is in production use +and validated against the Webmotors tenant. + +Phase 2-B (read-side connector refactor + docker-compose split + drop +legacy constraint) opens as a separate effort — see §0 "Suggested next +iteration" for the concrete roadmap. + +### Document changelog + +- **2026-04-28 evening** — PARTIAL status. Steps 2.1–2.5 (write-side) + shipped. Steps 2.4-B, 2.5-B, 2.6, 2.7 deferred with rationale. +- **2026-04-28 afternoon** — DRAFT 1 produced in parallel while Phase 1 + ingestion converged. diff --git a/pulse/docs/metrics/metrics-inconsistencies.md b/pulse/docs/metrics/metrics-inconsistencies.md index eb4092d..2961590 100644 --- a/pulse/docs/metrics/metrics-inconsistencies.md +++ b/pulse/docs/metrics/metrics-inconsistencies.md @@ -31,6 +31,10 @@ Gravidade: | INC-017 | Lead Time | P2 | A API DORA retorna `lead_time_for_changes_hours` como numero bruto. A `change_failure_rate` e retornada como ratio (0.0-1.0). A conversao de CFR para % acontece apenas no frontend. Consumidores diretos da API podem confundir CFR = 0.22 com 22% ou com 0.22%. | `routes.py:214`; `transforms.ts:305` | Usuarios integrando a API diretamente podem misinterpretar CFR. | Documentar explicitamente no OpenAPI spec que CFR e ratio 0.0-1.0. Ou unificar: retornar como porcentagem (0-100) diretamente da API. | | INC-018 | Cycle Time — benchmarks | P2 | Thresholds de Cycle Time na UI (`< 2h = elite`, `< 24h = high`, `< 72h = medium`) sao definidos no frontend como "PULSE-internal" mas aparecem ao lado de metricas DORA sem distinguir que nao sao da DORA 2023. | `transforms.ts:171-176`; `BENCHMARKS['cycle_time']` | Usuarios podem crer que "Cycle Time elite < 2h" e uma definicao DORA oficial. | Adicionar label "PULSE benchmark" vs "DORA 2023" nos cards da UI. | | INC-019 | WIP — "aguardando deploy producao" | P2 | O status Jira "aguardando deploy producao" esta mapeado para `done` no normalizer. Semanticamente, o item ainda esta aguardando ser entregue — nao foi concluido. Isso subestima WIP e o throughput nao contabiliza o delay de deploy. | `normalizer.py:77` | WIP esta subestimado; itens aguardando deploy aparecem como "done" antes de realmente chegarem a producao. | Mapear "aguardando deploy producao" para `in_review` ou criar um 5o status `awaiting_deploy`. | +| INC-020 | Lean — `status_transitions` | P0 | `_map_issue` no `JiraConnector` extraía o changelog para um cache lateral (`self._last_changelogs`) mas NÃO incluía o campo `changelog` no dict mapeado. O `_sync_issues` chama `extract_status_transitions_inline(raw)` que faz `raw.get("changelog", {}).get("histories", [])` — sempre vazio. **Resultado: 311.007 issues (100%) com `status_transitions=[]`.** Cycle Time não fechava (sem transição final para done), Throughput sub-contava, WIP super-contava, CFD distorcido, Lead Time indeterminado. | `jira_connector.py:_map_issue` (changelog não retornado); `workers/devlake_sync.py:extract_status_transitions_inline` lê do dict mapeado. | Todo o pilar Lean comprometido para qualquer projeto que use o pipeline batched (Phase 1 v2). | Incluir `"changelog": jira_issue.get("changelog", {})` no return de `_map_issue`. Adicionar test guard `TestMapIssuePreservesChangelogForInlineExtraction` end-to-end (mapper → extractor) — o gap de cobertura era exatamente esse: testes do extractor isolado não pegavam o drop no mapper. 
| +| INC-021 | Lean / Sprint — `story_points = 0` em 100% issues | P0 | Audit em 2026-04-28 (FDD-OPS-016): `story_points = 0` para todas as 311.007 issues. Investigação na API Jira da Webmotors revelou: `customfield_10004` ("Story Points") e `customfield_18524` ("Story point estimate") **0% populados** em todos os 69 projetos ativos. Webmotors **não usa Story Points como método de estimativa**. Squads usam padrões heterogêneos: T-shirt size, original estimate em horas, ou nada (Kanban-puro). | `jira_connector.py:_extract_story_points` (só consultava campos numéricos clássicos). | Velocity sempre zerada, throughput-by-effort impossível, forecast Monte Carlo sem input. Bloqueia toda métrica que dependa de "esforço" como agregação. | **Fallback chain implementada**: SP nativo → T-shirt (P=2/M=3/G=5… escala Fibonacci) → `timeoriginalestimate` (buckets de horas) → `None`. Discovery dinâmico via `_discover_custom_fields` casa por nome ("t-shirt size", "tamanho/impacto"). Telemetria `_effort_source_counts` por batched run. **Quando `None`, métrica downstream DEVE contar items (Kanban-puro)** — decisão fica na camada de métrica, não no normalizer. | +| INC-022 | Lean / Flow — Status normalization 96.5% done skew | P0 | Audit em 2026-04-28 mostrou distribuição absurda: 96,5% `done` / 3,3% `todo` / 0,2% `in_progress` / 0,1% `in_review`. A Webmotors tem **104 status raw distintos** em workflows ativos; `DEFAULT_STATUS_MAPPING` cobria ~50 → 50+ status caíam silenciosamente no fallback "Unknown → todo". Casos sistêmicos: `FECHADO EM PROD` (2.881 issues) ia para `todo` em vez de `done`; `Em Progresso`, `Em desenv` (in_progress) idem; `Homologação`, `Em Verificação` (in_review) idem. **Impacto em CASCATA**: status_transitions herdam a classificação errada → último estado de issue concluída ficava `todo`. Cycle Time infinito, Throughput sub-contava, WIP super-contava, CFD/Lead Time corrompidos. | `engineering_data/normalizer.py:normalize_status` (default `'todo'` em status desconhecido); `build_status_transitions` propagava o erro. | **Todo o pilar Lean** corrompido para qualquer tenant com status fora do mapping curado. SaaS-ready zero. | **Hybrid normalization em 3 camadas**: (1) Textual `DEFAULT_STATUS_MAPPING` expandido com ~80 PT-BR Webmotors-curated (preserva granularidade `in_progress` vs `in_review`); (2) Fallback `statusCategory.key` da Jira (autoritativo done/não-done) — connector descobre via `/rest/api/3/status` (1 chamada/lifetime, 326 status defs Webmotors, cacheado); (3) Default `todo` com WARN (extremamente raro agora). Quantificado: 3.151 issues reclassificarão (1% — long tail catastrófico); distribuição já correta para os 97% restantes. | +| INC-023 | Sprint — `status` sempre vazio | P0 | 100% das 216 sprints na Webmotors com `status=''` no `eng_sprints`. `goal` também totalmente vazio. Investigação revelou clássico **swiss cheese alignment** — 4 bugs independentes em camadas diferentes, cada um sozinho garantindo o resultado: (1) `normalize_sprint` retornava dict SEM o campo `status`; (2) `_upsert_sprints.on_conflict_do_update.set_` não atualizava `status`/`goal` (sprints existentes nunca recebiam update); (3) `_fetch_board_sprints` filtrava por `started_date < since` — sprint state transitions acontecem em `endDate`, não `startDate` (filtro errado de dimensão); (4) **ORM model `EngSprint` não tinha o campo `status`** apesar do schema do DB ter — drift coluna existe há tempos no DB, ORM nunca atualizado. 
Path que omitia status funcionava silently empty; path que tentava popular crashava com `Unconsumed column names: status`. | `jira_connector.py:_map_sprint`, `normalizer.py:normalize_sprint`, `workers/devlake_sync.py:_upsert_sprints`, `engineering_data/models.py:EngSprint`. | Sprint Comparison / Velocity Trend não pode filtrar `closed`; "current sprint" planejado precisa `active`; Carryover heurística baseada em `endDate < now()` em vez do status correto. | Fix nas 4 camadas: (1) `_map_sprint` passa `goal` adiante; (2) normalizer inclui `status` (lowercase `active`/`closed`/`future`/None) + `goal` com strip de null bytes; (3) ON CONFLICT atualiza ambos; (4) removeu filtro de watermark (volume baixo ~216 total / ~5 ativas, sempre re-fetch é correto pois state transitions); (5) `EngSprint.status: Mapped[str\|None]` adicionado (corrige drift). Helper `_normalize_sprint_status` mapeia aliases (open→active, completed→closed, planned→future) e devolve `None` para desconhecidos — não bucketiza silenciosamente. **Lição genérica**: adicionar guard test "DB columns vs ORM Mapped fields" — schema drift é o bug mais insidioso porque alguns paths funcionam e outros crashern. | --- @@ -38,10 +42,10 @@ Gravidade: | Gravidade | Quantidade | Impacto | |-----------|-----------|---------| -| P0 | 7 | Numeros errados exibidos ao usuario | +| P0 | 11 | Numeros errados exibidos ao usuario | | P1 | 8 | Numeros subotimizados/incompletos | | P2 | 4 | Apresentacao/documentacao | -| **Total** | **19** | | +| **Total** | **23** | | ## IDs P0 listados por ordem de impacto @@ -52,6 +56,10 @@ Gravidade: 5. **INC-003** — `first_commit_at` = data de abertura do PR — ✅ **FIXED 2026-04-17** (GraphQL `commits(first:1).authoredDate` no `github_connector`; normalizer consome `_first_commit_at` com fallback para `created_date`; admin endpoint `POST /data/v1/admin/prs/refresh-first-commits` faz backfill. Backfill `scope=last-60d`: 5020 processados, 4653 atualizados, 0 erros, 459s. Resultado: P50 Cycle Time 0,28h → 5,94h em 60d; 90,1% dos PRs agora com `first_commit_at < created_at`. Pendente: backfill histórico (~59k PRs) com `scope=stale`.) 6. **INC-007** — Cycle time em throughput trend sempre None 7. **INC-005** — MTTR sempre null (documentado, mas DORA overall fica incompleto) +8. **INC-022** — Status normalization 96.5% done skew (afeta Cycle Time / Throughput / WIP / CFD / Lead Time em CASCATA via status_transitions) — ✅ **FIXED 2026-04-29** (hybrid: textual mapping curado + Jira `statusCategory.key` fallback + 326 status defs descobertos; commit `0c7124d`). +9. **INC-020** — `status_transitions=[]` em 311k issues (changelog drop em `_map_issue`) — ✅ **FIXED 2026-04-29** (`jira_connector._map_issue` preserva `changelog`; commit `177830e`). +10. **INC-021** — `story_points=0` em 100% issues (Webmotors não usa SP) — ✅ **FIXED 2026-04-29** (effort fallback chain SP→T-shirt→Hours→None; commit `172f3f2`). +11. **INC-023** — Sprint status sempre vazio (4-layer cheese: normalizer + upsert + watermark + ORM drift) — ✅ **FIXED 2026-04-29** (fix nas 4 camadas; commit `649ed78`). ### Status bar @@ -65,6 +73,10 @@ Gravidade: | INC-003 | ✅ Fixed | 2026-04-17 | `connectors/github_connector.py` GraphQL query includes `commits(first:1).authoredDate` + REST fallback via `_fetch_first_commit_date`; `engineering_data/normalizer.py` reads `_first_commit_at` with fallback; `engineering_data/services/backfill_first_commits.py` + admin endpoint `POST /data/v1/admin/prs/refresh-first-commits` backfills historical PRs. 
60d scope: 5020 processados, 4653 atualizados, P50 Cycle Time 0,28h → 5,94h. | | INC-004 | ✅ Fixed | 2026-04-17 | Temporal linking PR→deploy: `engineering_data/services/backfill_deployed_at.py` (one-shot CTE com LATERAL join em `repo = split_part(pr.repo,'/',2)` e janela 30d) + admin `POST /data/v1/admin/prs/refresh-deployed-at` + forward-path hook em `workers/devlake_sync._sync_deployments` chamando `link_recent_deploys_to_prs` após upsert. SHA match descartado (Jenkins `sha` é build ID, não git SHA). Também corrige INC-012 (Deploy phase) como consequência. **Fix adicional em `domain/cycle_time.breakdown_single_pr`**: `total_hours` agora usa `first_commit_at → merged_at` (Cycle Time canônico); antes caía no mesmo endpoint `deployed_at` do DORA e colapsava Lead Time ≡ Cycle Time assim que `deployed_at` fosse populado. Backfill `scope=last-60d`: 5104 processados, 2037 linkados (40% cobertura — limitado pelos 126/390 repos com Jenkins prod), duração 0,93s. `scope=stale` histórico: 3706 linkados adicionais. Resultado home (60d): LT=65,51h vs CT P50=5,92h (diff=59,59h = fila de deploy). | | INC-012 | ✅ Fixed | 2026-04-17 | Resolvido como efeito colateral do INC-004: com `deployed_at` populado em 2037 PRs (60d), a fase Deploy do Cycle Time Breakdown passa a ter dados reais (merge→deploy P50 ≈ 136h em 60d). | +| INC-020 | ✅ Fixed | 2026-04-29 | `connectors/jira_connector.py:_map_issue` agora inclui `"changelog": jira_issue.get("changelog", {})` no return. Test guard `tests/unit/test_inline_changelog_extraction.py::TestMapIssuePreservesChangelogForInlineExtraction` exercita end-to-end (mapper → extractor) — gap original era teste do extractor isolado. Validado live no projeto BG: 1.994 issues re-sincados todos com 3-8 transitions normalizadas. Commit `177830e`. | +| INC-021 | ✅ Fixed | 2026-04-29 | `connectors/jira_connector.py`: discovery dinâmico de `customfield_18762` ("T-Shirt Size") + `customfield_15100` ("Tamanho/Impacto") via `/rest/api/3/field`; `_extract_story_points` reescrito como fallback chain (SP nativo → T-shirt mapping Fibonacci → `timeoriginalestimate` buckets → None). Telemetria `_effort_source_counts` loggada por batched run. 34 testes em `tests/unit/test_effort_fallback_chain.py`. Validado live em CRMC (1.375 issues): 52,3% com effort estimado, valores 1/2/3/5/8 (Fibonacci aplicado). Backlog FDD-DEV-METRICS-001 reservado para R3+ (per-squad estimation choice + proprietary forecasting). Commit `172f3f2`. | +| INC-022 | ✅ Fixed | 2026-04-29 | (1) `_discover_status_categories()` no `JiraConnector` cacheia `name → category` via `/rest/api/3/status` (326 defs Webmotors: 117 new + 181 indeterminate + 28 done). (2) `_map_issue` anexa `status_category` (current) e `status_categories_map` (todos, para histórico). (3) `normalize_status(raw, mapping, status_category=...)` aceita category fallback antes do default todo: `done→done`, `indeterminate→in_progress`, `new→todo`. (4) `build_status_transitions(..., status_categories_map=...)` classifica cada to_status histórica via map. (5) `DEFAULT_STATUS_MAPPING` expandido com ~80 PT-BR Webmotors-curated. 44 testes em `tests/unit/test_status_normalization.py`. Quantificado pré-fix: 3.151 issues reclassificarão (2.923 todo→done, 161 todo→in_review, 67 todo→in_progress). Commit `0c7124d`. 
| +| INC-023 | ✅ Fixed | 2026-04-29 | Fix nas 4 camadas do swiss cheese: (1) `jira_connector._map_sprint` passa `goal` adiante; (2) `normalizer.normalize_sprint` inclui `status` (lowercase: `active`/`closed`/`future`/None via helper `_normalize_sprint_status` com aliases) + `goal` com strip de null bytes; (3) `_upsert_sprints` ON CONFLICT atualiza `status`+`goal`; (4) `_fetch_board_sprints` removeu filtro `started_date < since` (volume baixo, state transitions em `endDate`); (5) `EngSprint.status: Mapped[str\|None]` adicionado (corrige schema drift). 26 testes em `tests/unit/test_sprint_normalization.py` (incl. structural anti-regression para upsert set_). Validado live: 195/217 sprints (89,9%) com status correto, 70% com goal real. As 22 vazias = board órfão 873 sem projeto ativo (fora de escopo). Commit `649ed78`. | ### Admin recalc endpoint (2026-04-17) Forced refresh without waiting for Kafka events: diff --git a/pulse/packages/pulse-data/alembic/versions/010_pipeline_watermarks_scope_key.py b/pulse/packages/pulse-data/alembic/versions/010_pipeline_watermarks_scope_key.py new file mode 100644 index 0000000..c88f374 --- /dev/null +++ b/pulse/packages/pulse-data/alembic/versions/010_pipeline_watermarks_scope_key.py @@ -0,0 +1,165 @@ +"""pipeline_watermarks: add scope_key (FDD-OPS-014 Phase 2, Step 2.1). + +Promoted from DRAFT 2026-04-28 after `docs/ingestion-v2-phase-2-plan.md` +review approval. + +============================================================================== +Why this migration exists (FDD-OPS-014, Phase 2 of ingestion-architecture-v2) +============================================================================== + +Today `pipeline_watermarks` has ONE row per (tenant, entity_type). Adding +a single new Jira project means resetting the watermark to bring its +historical data — but that ALSO re-fetches the existing 200k+ issues from +all other projects unnecessarily. + +After this migration: rows are keyed by (tenant, entity_type, scope_key). +A new project starts with `scope_key = "jira:project:NEWKEY"` watermark +at NULL → backfills only that scope. Other scopes' watermarks unchanged. + +Same pattern for repos (github), jobs (jenkins), and future sources. + +============================================================================== +Migration plan (zero-downtime, multi-step) +============================================================================== + +This migration is INTENTIONALLY conservative — it adds the new column +with a default WITHOUT removing the old constraint. A second migration +(011, after the worker code switches to writing per-scope rows) drops +the old global constraint. + +Step 010 (this file): + 1. ADD COLUMN scope_key VARCHAR(255) NOT NULL DEFAULT '*' + - existing rows get scope_key='*' (means "global, all scopes") + - workers can keep reading existing rows by querying scope_key='*' + 2. CREATE INDEX on (tenant_id, entity_type, scope_key) + 3. CREATE UNIQUE CONSTRAINT uq_watermark_scope on + (tenant_id, entity_type, scope_key) ← coexists with old global one + 4. 
KEEP existing uq_watermark_entity (tenant_id, entity_type) UNTIL + workers migrate + +Step 011 (separate file, AFTER worker code is deployed): + - DROP CONSTRAINT uq_watermark_entity + - At this point all writes use scope_key, the global '*' rows can + be removed too (or kept as "backwards-compat aggregate") + +============================================================================== +Rollback strategy +============================================================================== + +`downgrade()` removes only what `upgrade()` adds. It does NOT touch the +old constraint (since this migration didn't drop it). Safe to revert +if the new column proves problematic. + +============================================================================== +What this DOES NOT change +============================================================================== + +- No worker code changes (those go in Phase 2 PR). +- No queries change yet — workers still read by (tenant, entity_type) + which now matches the global '*' row. +- No data backfill — existing rows just inherit '*' default. + +Revision ID: 010_pipeline_watermarks_scope_key +Revises: 009_metrics_snapshots_tenant_latest_index +Create Date: 2026-04-28 +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +revision: str = "010_watermarks_scope_key" +down_revision: Union[str, None] = "009_metrics_snapshots_tenant_latest_index" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add scope_key column + new unique constraint (coexists with old).""" + + # 1. Add the column with default '*' so existing rows get a value. + op.add_column( + "pipeline_watermarks", + sa.Column( + "scope_key", + sa.String(length=255), + nullable=False, + server_default="*", + comment=( + "Scope identifier within an entity_type. " + "Format: '::' " + "(e.g., 'jira:project:BG', 'github:repo:foo/bar', " + "'jenkins:job:deploy-X'). Value '*' means global " + "(legacy global watermark). FDD-OPS-014." + ), + ), + ) + + # 2. Index for the per-scope lookup pattern. Replaces nothing yet — + # keeps the old (tenant, entity_type) index for backwards-compat. + op.create_index( + "ix_watermarks_tenant_entity_scope", + "pipeline_watermarks", + ["tenant_id", "entity_type", "scope_key"], + unique=False, + ) + + # 3. New UNIQUE constraint covering scope_key. Coexists with the old + # `uq_watermark_entity` constraint until step 011 drops it. + op.create_unique_constraint( + "uq_watermark_entity_scope", + "pipeline_watermarks", + ["tenant_id", "entity_type", "scope_key"], + ) + + # 4. Defensive: any RLS policies on the table apply to the new column + # automatically (policies are at table level, not column level). + # No change needed. + + +def downgrade() -> None: + """Reverse: drop new constraint + index + column. Old constraints stay.""" + op.drop_constraint( + "uq_watermark_entity_scope", + "pipeline_watermarks", + type_="unique", + ) + op.drop_index( + "ix_watermarks_tenant_entity_scope", + table_name="pipeline_watermarks", + ) + op.drop_column("pipeline_watermarks", "scope_key") + + +# ============================================================================ +# Companion migration that should follow (011) — KEEP IN SYNC HERE for review +# ============================================================================ +# +# def upgrade(): +# # Drop the legacy global-watermark constraint now that all writes use +# # scope_key. 
Safe to run only after Phase 2 worker code is deployed. +# op.drop_constraint( +# "uq_watermark_entity", +# "pipeline_watermarks", +# type_="unique", +# ) +# op.drop_index( +# "ix_watermarks_tenant_entity", +# table_name="pipeline_watermarks", +# ) +# +# def downgrade(): +# op.create_unique_constraint( +# "uq_watermark_entity", +# "pipeline_watermarks", +# ["tenant_id", "entity_type"], +# ) +# op.create_index( +# "ix_watermarks_tenant_entity", +# "pipeline_watermarks", +# ["tenant_id", "entity_type"], +# ) +# +# ============================================================================ diff --git a/pulse/packages/pulse-data/alembic/versions/011_drop_legacy_watermark_constraint.py b/pulse/packages/pulse-data/alembic/versions/011_drop_legacy_watermark_constraint.py new file mode 100644 index 0000000..1eaac43 --- /dev/null +++ b/pulse/packages/pulse-data/alembic/versions/011_drop_legacy_watermark_constraint.py @@ -0,0 +1,68 @@ +"""Drop legacy uq_watermark_entity constraint (FDD-OPS-014, Phase 2 step 2.7). + +Promoted earlier than originally planned because the assumption in +migration 010 ("legacy and new UNIQUE constraints coexist harmlessly") +was wrong: Postgres enforces ALL UniqueConstraints on every INSERT. +Trying to insert a per-scope row like (tenant, 'issues', +'jira:project:OKM', ...) failed with: + + UniqueViolationError: duplicate key value violates unique + constraint "uq_watermark_entity" + DETAIL: Key (tenant_id, entity_type)=(..., issues) already exists. + +The legacy constraint treats (tenant, entity_type) as the unique key +regardless of scope_key, so the existing '*' row blocked every +attempt to insert a scoped row. + +Resolution: drop the legacy constraint. The new +`uq_watermark_entity_scope` (tenant, entity_type, scope_key) +correctly handles both '*' and scoped rows. + +This was discovered immediately after Phase 2-A deployment (Steps +2.1-2.5) when sync cycles started failing with "status=failed" on the +first scope advance attempt. Documenting the root cause here so +future migrations don't repeat the dual-constraint assumption. + +Revision ID: 011_drop_legacy_watermark +Revises: 010_watermarks_scope_key +Create Date: 2026-04-28 +""" + +from typing import Sequence, Union + +from alembic import op + + +revision: str = "011_drop_legacy_watermark" +down_revision: Union[str, None] = "010_watermarks_scope_key" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Drop legacy unique-on-(tenant, entity) constraint and index.""" + # Use IF EXISTS for safety — this migration was applied via raw SQL + # before the file existed, so the actual DROP may have already run. + op.execute( + "ALTER TABLE pipeline_watermarks " + "DROP CONSTRAINT IF EXISTS uq_watermark_entity" + ) + op.execute("DROP INDEX IF EXISTS ix_watermarks_tenant_entity") + + +def downgrade() -> None: + """Restore legacy constraint + index. + + WARNING: this only works if no two rows have the same + (tenant_id, entity_type) — i.e., either you're back to a single '*' + row per tenant+entity, or you've collapsed scope rows first. 
+ """ + op.execute( + "CREATE INDEX IF NOT EXISTS ix_watermarks_tenant_entity " + "ON pipeline_watermarks (tenant_id, entity_type)" + ) + op.execute( + "ALTER TABLE pipeline_watermarks " + "ADD CONSTRAINT uq_watermark_entity " + "UNIQUE (tenant_id, entity_type)" + ) diff --git a/pulse/packages/pulse-data/src/connectors/aggregator.py b/pulse/packages/pulse-data/src/connectors/aggregator.py index eb28e34..f2c2b1a 100644 --- a/pulse/packages/pulse-data/src/connectors/aggregator.py +++ b/pulse/packages/pulse-data/src/connectors/aggregator.py @@ -78,10 +78,16 @@ async def get_pull_request_source_count(self) -> int: return total async def fetch_pull_requests_batched( - self, since: datetime | None = None, + self, + since: datetime | None = None, + since_by_repo: dict[str, datetime | None] | None = None, ) -> AsyncIterator[tuple[str, list[dict[str, Any]] | None]]: """Yield PRs in batches (per repo) from all code-hosting connectors. + FDD-OPS-014 step 2.4-B: forwards since_by_repo to connectors that + support it. Connectors without the parameter (older shape) fall + back to single-`since` behavior. + Each yield is (repo_name, prs_or_none): - prs is None → "starting" signal for this repo (UI progress hint) - prs is list → completed batch ready to persist @@ -90,7 +96,17 @@ async def fetch_pull_requests_batched( connector = self._connectors.get(source) if connector and hasattr(connector, "fetch_pull_requests_batched"): try: - async for repo_name, prs in connector.fetch_pull_requests_batched(since): + # Detect if connector supports since_by_repo (graceful + # for connectors not yet updated in newer codebases). + import inspect + sig = inspect.signature(connector.fetch_pull_requests_batched) + if "since_by_repo" in sig.parameters: + gen = connector.fetch_pull_requests_batched( + since=since, since_by_repo=since_by_repo, + ) + else: + gen = connector.fetch_pull_requests_batched(since) + async for repo_name, prs in gen: yield repo_name, prs except Exception: logger.exception("Error fetching batched PRs from %s", source) @@ -102,6 +118,10 @@ async def fetch_issues( ) -> list[dict[str, Any]]: """Fetch issues from all work-tracking connectors (Jira, GitHub Issues). + DEPRECATED for new code paths — use fetch_issues_batched() which streams + per-project and persists incrementally (FDD-OPS-012). This bulk-fetch + method is retained for backward compatibility (sprint sync etc.). + Args: since: Watermark for incremental sync. project_keys: If provided, passed to Jira connector to scope which @@ -122,6 +142,34 @@ async def fetch_issues( logger.exception("Error fetching issues from %s", source) return all_issues + async def fetch_issues_batched( + self, + project_keys: list[str], + since_by_project: dict[str, datetime | None] | None = None, + ): + """Stream issues per-project from work-tracking connectors (FDD-OPS-012). + + Yields (project_key, batch) tuples per page. Caller normalizes, + upserts, emits Kafka, advances watermark per batch — bounded memory, + crash-safe. + + Currently only Jira implements batched issues. GitHub/Azure issues + sync remains bulk (low volume, can be migrated later if needed). 
+ """ + connector = self._connectors.get("jira") + if connector is None or not hasattr(connector, "fetch_issues_batched"): + logger.warning("No Jira connector with batched fetch — skipping") + return + + try: + async for project_key, batch in connector.fetch_issues_batched( + project_keys=project_keys, + since_by_project=since_by_project, + ): + yield project_key, batch + except Exception: + logger.exception("Error during batched issue fetch from Jira") + async def fetch_issue_changelogs( self, issue_ids: list[str], ) -> dict[str, list[dict[str, Any]]]: @@ -171,15 +219,29 @@ async def fetch_issue_changelogs( return all_changelogs async def fetch_deployments( - self, since: datetime | None = None, + self, + since: datetime | None = None, + since_by_repo: dict[str, datetime | None] | None = None, ) -> list[dict[str, Any]]: - """Fetch deployments from all CI/CD connectors (Jenkins, GitHub Actions).""" + """Fetch deployments from all CI/CD connectors (Jenkins, GitHub Actions). + + FDD-OPS-014 step 2.5-B: forwards since_by_repo to connectors that + support it. Connectors without the parameter fall back to single + `since` behavior. + """ + import inspect all_deploys: list[dict[str, Any]] = [] for source in ("jenkins", "github", "gitlab", "azure"): connector = self._connectors.get(source) if connector: try: - deploys = await connector.fetch_deployments(since) + sig = inspect.signature(connector.fetch_deployments) + if "since_by_repo" in sig.parameters: + deploys = await connector.fetch_deployments( + since=since, since_by_repo=since_by_repo, + ) + else: + deploys = await connector.fetch_deployments(since) all_deploys.extend(deploys) logger.info("Fetched %d deployments from %s", len(deploys), source) except Exception: diff --git a/pulse/packages/pulse-data/src/connectors/github_connector.py b/pulse/packages/pulse-data/src/connectors/github_connector.py index 44c2535..99e245d 100644 --- a/pulse/packages/pulse-data/src/connectors/github_connector.py +++ b/pulse/packages/pulse-data/src/connectors/github_connector.py @@ -197,23 +197,57 @@ async def get_source_count(self) -> int: return len(repos) async def fetch_pull_requests_batched( - self, since: datetime | None = None, + self, + since: datetime | None = None, + since_by_repo: dict[str, datetime | None] | None = None, ) -> AsyncIterator[tuple[str, list[dict[str, Any]] | None]]: """Yield PRs in batches, one batch per repo — parallelized via GraphQL. Processes REPO_CONCURRENCY repos at a time. Each repo uses a single GraphQL query per page (50 PRs) instead of 1+2N REST calls. + FDD-OPS-014 step 2.4-B: per-repo watermarks. When `since_by_repo` + is provided, each repo uses its own `since` timestamp: + - Found in dict, value is datetime → incremental from that point + - Found in dict, value is None → full backfill (new repo) + - NOT in dict → falls back to bulk `since` + Backwards-compat: if `since_by_repo` is None, all repos use the + single `since` parameter (legacy behavior, preserved for callers + not yet updated). + For each repo, emits: 1. (repo_full_name, None) — "starting" signal for UI progress 2. (repo_full_name, list_of_prs) — completed batch (only if non-empty) """ repos = await self._get_repos() total_repos = len(repos) - logger.info( - "Starting parallel PR fetch: %d repos, concurrency=%d, page_size=%d", - total_repos, REPO_CONCURRENCY, GRAPHQL_PAGE_SIZE, - ) + + # Resolve effective `since` per repo. Calling with explicit + # since_by_repo wins; otherwise everyone gets the bulk `since`. 
+ def _resolve_since(repo: str) -> datetime | None: + if since_by_repo is not None and repo in since_by_repo: + return since_by_repo[repo] + return since + + # Pre-flight summary so operator sees the per-repo plan up front. + if since_by_repo is not None: + backfill = sum( + 1 for r in repos + if since_by_repo.get(r, since) is None + ) + incremental = total_repos - backfill + logger.info( + "Starting parallel PR fetch: %d repos (per-repo plan: " + "%d backfill, %d incremental), concurrency=%d, page_size=%d", + total_repos, backfill, incremental, + REPO_CONCURRENCY, GRAPHQL_PAGE_SIZE, + ) + else: + logger.info( + "Starting parallel PR fetch: %d repos, concurrency=%d, " + "page_size=%d (single since=%s)", + total_repos, REPO_CONCURRENCY, GRAPHQL_PAGE_SIZE, since, + ) semaphore = asyncio.Semaphore(REPO_CONCURRENCY) # Queue holds outputs from worker coroutines so we can yield them @@ -224,12 +258,14 @@ async def worker(repo_full_name: str) -> None: async with semaphore: # Emit "starting" as soon as we acquire the slot await queue.put(("start", repo_full_name, None)) + repo_since = _resolve_since(repo_full_name) try: - prs = await self._fetch_repo_prs_graphql(repo_full_name, since) + prs = await self._fetch_repo_prs_graphql(repo_full_name, repo_since) if prs: logger.info( - "Batch: %d PRs from %s (GraphQL)", + "Batch: %d PRs from %s (GraphQL, since=%s)", len(prs), repo_full_name, + repo_since.isoformat() if repo_since else "full-history", ) await queue.put(("batch", repo_full_name, prs)) else: @@ -240,7 +276,7 @@ async def worker(repo_full_name: str) -> None: repo_full_name, ) try: - prs = await self._fetch_repo_prs(repo_full_name, since) + prs = await self._fetch_repo_prs(repo_full_name, repo_since) await queue.put(("batch", repo_full_name, prs or [])) except Exception: logger.exception("REST fallback also failed for %s", repo_full_name) diff --git a/pulse/packages/pulse-data/src/connectors/jenkins_connector.py b/pulse/packages/pulse-data/src/connectors/jenkins_connector.py index 5e02429..95adee1 100644 --- a/pulse/packages/pulse-data/src/connectors/jenkins_connector.py +++ b/pulse/packages/pulse-data/src/connectors/jenkins_connector.py @@ -116,17 +116,43 @@ async def test_connection(self) -> dict[str, Any]: # ------------------------------------------------------------------ async def fetch_deployments( - self, since: datetime | None = None, + self, + since: datetime | None = None, + since_by_repo: dict[str, datetime | None] | None = None, ) -> list[dict[str, Any]]: """Fetch builds from configured Jenkins jobs. Each build is mapped to a deployment record. Only jobs configured in connections.yaml are fetched (not all Jenkins jobs). + + FDD-OPS-014 step 2.5-B: per-repo `since` resolution. Jenkins has + no native "repo" concept — we use the job→repo mapping (built + from SCM scan, see `discover_jenkins_jobs.py`) to map each job + to its source repo and look up the repo's watermark. + + Resolution order per job: + 1. since_by_repo[mapped_repo] (if mapped_repo in dict) + 2. fall back to bulk `since` (single-watermark behavior) + + Backwards compat: if since_by_repo is None, all jobs use + single `since` (legacy bulk behavior preserved). """ if not self._jobs: logger.warning("No Jenkins jobs configured — skipping deployment fetch") return [] + # Pre-flight: log per-repo plan when since_by_repo is provided. 
+ if since_by_repo is not None: + jobs_with_scope = sum( + 1 for j in self._jobs + if self._job_to_repo.get(j.get("fullName", ""), "") in since_by_repo + ) + logger.info( + "Jenkins fetch: %d jobs total, %d jobs with per-repo watermark, " + "rest use bulk since=%s", + len(self._jobs), jobs_with_scope, since, + ) + all_builds: list[dict[str, Any]] = [] for job_config in self._jobs: @@ -134,8 +160,15 @@ async def fetch_deployments( if not job_name: continue + # Resolve per-repo since via job→repo mapping. + repo = self._job_to_repo.get(job_name, job_name) + if since_by_repo is not None and repo in since_by_repo: + job_since = since_by_repo[repo] + else: + job_since = since + try: - builds = await self._fetch_job_builds(job_name, since) + builds = await self._fetch_job_builds(job_name, job_since) all_builds.extend(builds) except Exception: logger.exception("Failed to fetch builds for job: %s", job_name) diff --git a/pulse/packages/pulse-data/src/connectors/jira_connector.py b/pulse/packages/pulse-data/src/connectors/jira_connector.py index bdc2a67..9844a16 100644 --- a/pulse/packages/pulse-data/src/connectors/jira_connector.py +++ b/pulse/packages/pulse-data/src/connectors/jira_connector.py @@ -57,6 +57,59 @@ FALLBACK_STORY_POINTS_FIELDS = ("customfield_10016", "customfield_10028") FALLBACK_SPRINT_FIELDS = ("customfield_10020", "customfield_10010") +# --------------------------------------------------------------------------- +# Effort estimation fallback chain (FDD-OPS-016) +# +# Webmotors and many enterprise tenants do NOT use story points (validated +# 2026-04-28: 0% population across all 69 active Jira projects). Different +# squads use different estimation methods, or none at all. We discover and +# extract from a fallback chain in priority order: +# +# 1. Story Points (numeric) → use raw value +# 2. Story point estimate → use raw value +# 3. T-Shirt Size (option) → map P/M/G... to Fibonacci scale +# 4. Tamanho/Impacto (option) → map PP/P/M/G... to Fibonacci scale +# 5. Original Estimate (sec) → bucket hours into Fibonacci-aligned points +# 6. None → consumer falls back to count-of-items +# (Kanban-pure mode) +# +# When `story_points` lands as None, downstream metrics (Lean throughput, +# velocity) MUST count items rather than sum points. The decision to count +# vs sum lives in the metric layer, not here. +# +# Future (codename "dev-metrics"): admin UI to opt into a specific method +# per source/squad + proprietary forecasting model. See FDD-DEV-METRICS-001 +# in ops-backlog.md. +# --------------------------------------------------------------------------- + +# Field-name keywords used by `_discover_effort_fields` (case-insensitive, +# matched against Jira `fields` API "name" property). +EFFORT_NAME_PATTERNS_TSHIRT = ("t-shirt size", "tshirt size", "tamanho/impacto") +EFFORT_NAME_PATTERNS_TIME = ("original estimate",) # core field, not custom + +# Fibonacci-like mapping for option-typed effort fields. Covers the values +# observed in Webmotors data + common defaults (XS/S/M/L/XL/XXL). +TSHIRT_TO_POINTS: dict[str, float] = { + # Portuguese sizes + "PP": 1.0, "P": 2.0, "M": 3.0, "G": 5.0, "GG": 8.0, "GGG": 13.0, + # English sizes + "XS": 1.0, "S": 2.0, "L": 5.0, "XL": 8.0, "XXL": 13.0, +} + +# Hour-based estimation buckets → SP equivalent. +# Aligned with "1 ideal day = ~6h productive, 1 SP ≈ small task < 0.5d" so +# the steps stay roughly Fibonacci. Calibrated against Webmotors observed +# values (2h–124h, multiples of 4) so each common value lands in a sensible +# bucket. 
Rounded to the SP scale that downstream metrics already speak. +def _hours_to_points(hours: float) -> float: + if hours <= 4: return 1.0 + if hours <= 8: return 2.0 + if hours <= 16: return 3.0 + if hours <= 24: return 5.0 + if hours <= 40: return 8.0 + if hours <= 80: return 13.0 + return 21.0 + class JiraConnector(BaseConnector): """Fetches issues, sprints, and changelogs from Jira Cloud REST API v3. @@ -102,7 +155,23 @@ def __init__( # _discover_custom_fields() on first fetch_issues() call. self._sprint_field_id: str | None = None self._story_points_field_id: str | None = None + # FDD-OPS-016: discovered effort-fallback field IDs (T-shirt size, + # Tamanho/Impacto). Many tenants don't use story points at all. + self._tshirt_field_ids: list[str] = [] self._custom_fields_discovered: bool = False + # Telemetry for `_extract_effort` — counts how often each strategy + # was the one that produced a value, plus how many issues fell + # through to None. Logged at end of each batched fetch so operators + # can spot estimation mode shifts without combing through traces. + self._effort_source_counts: dict[str, int] = {} + # FDD-OPS-017 — status→category map cached from /rest/api/3/status. + # Keys are lowercased status names (e.g., "fechado em prod"); values + # are statusCategory.key ("new" | "indeterminate" | "done"). Used + # by the normalizer as the authoritative fallback when a textual + # mapping isn't found. Populated by `_discover_status_categories()` + # on first fetch. + self._status_categories: dict[str, str] = {} + self._status_categories_discovered: bool = False @property def source_type(self) -> str: @@ -206,6 +275,7 @@ async def fetch_issues( # Discover tenant-specific custom field IDs (sprint, story points) await self._discover_custom_fields() + await self._discover_status_categories() # Quote each project key in JQL — some keys like "DESC" are reserved words quoted_projects = ", ".join(f'"{p}"' for p in effective_projects) @@ -223,6 +293,13 @@ async def fetch_issues( fields_to_fetch.append(self._sprint_field_id) if self._story_points_field_id: fields_to_fetch.append(self._story_points_field_id) + # FDD-OPS-016: include effort fallback fields (T-shirt size, + # Tamanho/Impacto, original estimate) + for f in self._tshirt_field_ids: + if f not in fields_to_fetch: + fields_to_fetch.append(f) + if "timeoriginalestimate" not in fields_to_fetch: + fields_to_fetch.append("timeoriginalestimate") # Always include fallbacks to survive mis-discovery for f in FALLBACK_SPRINT_FIELDS + FALLBACK_STORY_POINTS_FIELDS: if f not in fields_to_fetch: @@ -259,6 +336,128 @@ async def fetch_issues( logger.info("Fetched %d issues from Jira (%d projects, %d pages)", len(all_issues), len(effective_projects), page) return all_issues + async def fetch_issues_batched( + self, + project_keys: list[str], + since_by_project: dict[str, datetime | None] | None = None, + ): + """Stream issues PER PROJECT, yielding (project_key, batch) per page. + + FDD-OPS-012 — replaces the bulk-fetch-all-then-persist pattern of + fetch_issues(). Yields each page (~50 issues) as it arrives, so the + caller can normalize → upsert → emit_event → advance_watermark + immediately. Memory bound: ~one page in flight; crash recovery loses + at most one page of work. + + Per-project pagination (one JQL per project) instead of `project IN + (...)` makes per-scope watermarks possible (each project advances + its own last_synced_at independently — see FDD-OPS-014). It also + means failure on one project doesn't lose progress on others. 
+ + Args: + project_keys: Projects to sync. Must be explicit; no fallback + to env var (caller MUST resolve via ModeResolver). + since_by_project: Optional per-project watermark. Missing keys + default to None (full backfill for that project). + + Yields: + (project_key, list_of_normalized_raw_issues) tuples. + Each list has SEARCH_PAGE_SIZE items (50 by default), except + the last page of each project which may be smaller. + """ + if not project_keys: + logger.warning("fetch_issues_batched: empty project_keys, nothing to do") + return + + # Discover tenant-specific custom field IDs once (cached for reuse). + await self._discover_custom_fields() + await self._discover_status_categories() + + # Build fields list: base + discovered custom fields + fallbacks. + fields_to_fetch = list(SEARCH_FIELDS) + if self._sprint_field_id: + fields_to_fetch.append(self._sprint_field_id) + if self._story_points_field_id: + fields_to_fetch.append(self._story_points_field_id) + # FDD-OPS-016: effort fallback fields + for f in self._tshirt_field_ids: + if f not in fields_to_fetch: + fields_to_fetch.append(f) + if "timeoriginalestimate" not in fields_to_fetch: + fields_to_fetch.append("timeoriginalestimate") + for f in FALLBACK_SPRINT_FIELDS + FALLBACK_STORY_POINTS_FIELDS: + if f not in fields_to_fetch: + fields_to_fetch.append(f) + + since_by_project = since_by_project or {} + # FDD-OPS-016: reset effort telemetry per batched call so the + # summary log reflects only this run. + self._effort_source_counts = {} + + for project_key in project_keys: + since = since_by_project.get(project_key) + # Keys like "DESC" collide with SQL reserved words — quote always. + jql = f'project = "{project_key}"' + if since: + since_str = since.strftime("%Y-%m-%d %H:%M") + jql += f' AND updated >= "{since_str}"' + jql += " ORDER BY updated DESC" + + logger.info( + "[batched] %s: starting JQL fetch (since=%s)", + project_key, since.isoformat() if since else "full-history", + ) + + next_page_token: str | None = None + page = 0 + total_yielded = 0 + + while True: + body: dict[str, Any] = { + "jql": jql, + "maxResults": SEARCH_PAGE_SIZE, + "fields": fields_to_fetch, + "expand": "changelog", # critical: keeps changelog inline (FDD-OPS-013) + } + if next_page_token: + body["nextPageToken"] = next_page_token + + data = await self._client.post( + f"{REST_API}/search/jql", json_body=body, + ) + + issues = data.get("issues", []) + if issues: + mapped_batch = [self._map_issue(issue) for issue in issues] + yield project_key, mapped_batch + total_yielded += len(mapped_batch) + + page += 1 + next_page_token = data.get("nextPageToken") + if not next_page_token or not issues: + break + + logger.info( + "[batched] %s: complete (%d issues, %d pages)", + project_key, total_yielded, page, + ) + + # FDD-OPS-016 — log effort-source distribution so operators can spot + # which fields the squad uses (or that they don't estimate at all). 
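+        # Example of the summary line emitted below (numbers illustrative):
+        #   [batched] effort source distribution (1250 issues):
+        #   unestimated=900 (72.0%), hours_to_sp=200 (16.0%), tshirt_to_sp=150 (12.0%)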
+ if self._effort_source_counts: + total = sum(self._effort_source_counts.values()) + breakdown = ", ".join( + f"{src}={cnt} ({100.0*cnt/total:.1f}%)" + for src, cnt in sorted( + self._effort_source_counts.items(), + key=lambda kv: -kv[1], + ) + ) + logger.info( + "[batched] effort source distribution (%d issues): %s", + total, breakdown, + ) + async def fetch_issue_changelogs( self, issue_ids: list[str], ) -> dict[str, list[dict[str, Any]]]: @@ -393,6 +592,17 @@ def _map_issue(self, jira_issue: dict[str, Any]) -> dict[str, Any]: sprint_id = self._extract_sprint_id(fields) status_name = (fields.get("status") or {}).get("name", "") + # FDD-OPS-017 — read statusCategory from Jira's own `status` field + # (always inline in the issue response, no extra HTTP). Fallback to + # the cached `name → category` map if the issue payload lacks it + # (older Jira REST APIs / odd workflows). + status_cat_inline = ( + ((fields.get("status") or {}).get("statusCategory") or {}).get("key") + ) + status_category = ( + status_cat_inline.lower() if isinstance(status_cat_inline, str) + else self._status_categories.get(status_name.strip().lower()) + ) # Store changelogs inline (extracted separately for the sync worker) self._last_changelogs = self._last_changelogs if hasattr(self, "_last_changelogs") else {} @@ -420,6 +630,20 @@ def _map_issue(self, jira_issue: dict[str, Any]) -> dict[str, Any]: "assignee_name": (fields.get("assignee") or {}).get("displayName"), "type": (fields.get("issuetype") or {}).get("name", "Task"), "sprint_id": sprint_id, + # FDD-OPS-017 — Jira's authoritative classification of THIS issue's + # current status. The normalizer uses it as the fallback when the + # textual DEFAULT_STATUS_MAPPING doesn't recognize the status name. + "status_category": status_category, + # FDD-OPS-017 — full name→category map so build_status_transitions + # can classify each historical to_status, not just the current one. + # Same dict reference for every issue (cached on the connector); + # downstream upsert ignores extra keys. + "status_categories_map": self._status_categories, + # FDD-OPS-013 — preserve raw changelog from `expand=changelog` so + # `extract_status_transitions_inline()` in the sync worker can read + # it. Without this, mapped dict drops the changelog and ALL issues + # land with status_transitions=[] in eng_issues. + "changelog": jira_issue.get("changelog", {}), } def _map_sprint_issue( @@ -469,6 +693,52 @@ def _extract_changelogs( transitions.sort(key=lambda t: t.get("created_date") or "") return transitions + async def _discover_status_categories(self) -> None: + """FDD-OPS-017 — fetch all status definitions and cache name→category. + + Jira's `/rest/api/3/status` returns every status defined in the + tenant, each tagged with a `statusCategory.key` of "new", + "indeterminate", or "done". This is the AUTHORITATIVE classification + of "is this status considered finished by the workflow author". + + Used by the normalizer as the fallback when our textual + DEFAULT_STATUS_MAPPING doesn't recognize a status name. Without + this, exotic Webmotors statuses like "FECHADO EM PROD" silently + defaulted to "todo", catastrophically polluting flow metrics + (Cycle Time, Throughput, WIP, CFD all read from `normalized_status`). + + Discovery is one HTTP call per connector lifetime — cached on + instance. Failures degrade gracefully: we just lose the fallback. 
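+
+        Example cache entry (payload shape illustrative): a status defined as
+        {"name": "Fechado em Prod", "statusCategory": {"key": "done"}} is
+        stored as {"fechado em prod": "done"}.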
+ """ + if self._status_categories_discovered: + return + + try: + data = await self._client.get(f"{REST_API}/status") + except Exception: + logger.exception( + "Failed to fetch Jira status catalog — normalization will " + "rely solely on textual DEFAULT_STATUS_MAPPING" + ) + self._status_categories_discovered = True + return + + statuses = data if isinstance(data, list) else data.get("values", []) + for s in statuses: + name = (s.get("name") or "").strip().lower() + cat = ((s.get("statusCategory") or {}).get("key") or "").strip().lower() + if name and cat in ("new", "indeterminate", "done"): + self._status_categories[name] = cat + + self._status_categories_discovered = True + logger.info( + "Discovered %d Jira status definitions (new=%d, indeterminate=%d, done=%d)", + len(self._status_categories), + sum(1 for v in self._status_categories.values() if v == "new"), + sum(1 for v in self._status_categories.values() if v == "indeterminate"), + sum(1 for v in self._status_categories.values() if v == "done"), + ) + # ------------------------------------------------------------------ # Internal: Custom field discovery + extraction helpers # ------------------------------------------------------------------ @@ -502,12 +772,18 @@ async def _discover_custom_fields(self) -> None: self._sprint_field_id = fid elif name in ("story points", "story point estimate") and not self._story_points_field_id: self._story_points_field_id = fid + elif any(p in name for p in EFFORT_NAME_PATTERNS_TSHIRT): + # FDD-OPS-016 — option-typed effort fallback (P/M/G…) + if fid not in self._tshirt_field_ids: + self._tshirt_field_ids.append(fid) self._custom_fields_discovered = True logger.info( - "Discovered Jira custom fields — sprint=%s, story_points=%s", + "Discovered Jira custom fields — sprint=%s, story_points=%s, " + "effort_tshirt_fields=%s", self._sprint_field_id or "(none — using fallback)", self._story_points_field_id or "(none — using fallback)", + self._tshirt_field_ids or "(none)", ) def _extract_sprint_id(self, fields: dict[str, Any]) -> str | None: @@ -622,20 +898,89 @@ def _collect_leaf_texts(node: Any) -> list[str]: return flat def _extract_story_points(self, fields: dict[str, Any]) -> float | None: - """Extract story points, preferring the discovered custom field.""" - candidates: list[str] = [] + """Extract effort estimate, falling back through Story Points → + T-shirt size → Original Estimate hours → None. + + Returns a float on the SP scale so downstream metrics (velocity, + throughput) can sum it. Returns None when the issue is genuinely + unestimated; the metric layer must then count items rather than + sum points (Kanban-pure mode). See FDD-OPS-016. + + Side effect: increments `self._effort_source_counts[source]` so + `fetch_issues_batched` can log the distribution per run. The source + label is recorded even on None ("unestimated") so coverage can be + observed end-to-end. + """ + # 1+2. Native numeric story-point fields (preferred — no conversion). 
+ sp_candidates: list[str] = [] if self._story_points_field_id: - candidates.append(self._story_points_field_id) - candidates.extend(FALLBACK_STORY_POINTS_FIELDS) - candidates.append("story_points") - - for c in candidates: + sp_candidates.append(self._story_points_field_id) + sp_candidates.extend(FALLBACK_STORY_POINTS_FIELDS) + sp_candidates.append("story_points") + for c in sp_candidates: value = fields.get(c) - if value is not None: - try: - return float(value) - except (TypeError, ValueError): - continue + if value is None or value == "": + continue + try: + points = float(value) + except (TypeError, ValueError): + continue + if points > 0: + self._effort_source_counts["story_points"] = ( + self._effort_source_counts.get("story_points", 0) + 1 + ) + return points + + # 3+4. T-shirt sized fields → map P/M/G… to Fibonacci scale. + for fid in self._tshirt_field_ids: + raw = fields.get(fid) + label = self._unwrap_option(raw) + if not label: + continue + mapped = TSHIRT_TO_POINTS.get(label.upper()) + if mapped is not None: + self._effort_source_counts["tshirt_to_sp"] = ( + self._effort_source_counts.get("tshirt_to_sp", 0) + 1 + ) + return mapped + # Unknown size value — don't silently mis-map; fall through. + + # 5. Original Estimate (hours) → SP equivalent buckets. + secs = fields.get("timeoriginalestimate") + if secs: + try: + hours = float(secs) / 3600.0 + if hours > 0: + self._effort_source_counts["hours_to_sp"] = ( + self._effort_source_counts.get("hours_to_sp", 0) + 1 + ) + return _hours_to_points(hours) + except (TypeError, ValueError): + pass + + # 6. Genuinely unestimated. Track for telemetry; metric layer counts items. + self._effort_source_counts["unestimated"] = ( + self._effort_source_counts.get("unestimated", 0) + 1 + ) + return None + + @staticmethod + def _unwrap_option(raw: Any) -> str | None: + """Extract the string label from a Jira option-typed field. + + Jira returns option fields as `{"value": "P", "id": "..."}` but + legacy/edge cases sometimes use "name" or a bare string. Be lenient. + """ + if raw is None: + return None + if isinstance(raw, str): + label = raw.strip() + return label or None + if isinstance(raw, dict): + for key in ("value", "name", "displayName"): + v = raw.get(key) + if isinstance(v, str) and v.strip(): + return v.strip() return None # ------------------------------------------------------------------ @@ -711,17 +1056,18 @@ async def _fetch_board_sprints( for sprint in sprints: mapped = self._map_sprint(sprint, board_id) - # Apply watermark filter - if since: - start_date = mapped.get("started_date") - if start_date and isinstance(start_date, str): - try: - dt = datetime.fromisoformat(start_date.replace("Z", "+00:00")) - if dt < since: - continue - except ValueError: - pass - + # FDD-OPS-018 — DELIBERATELY NOT applying a `since` watermark + # filter here. Sprint state transitions (future→active→closed) + # happen on `endDate`, not `startDate`. The previous filter + # `if started_date < since: continue` meant a sprint that + # started in March and closed in May would never have its + # status updated past March's snapshot — every Webmotors + # sprint landed with empty status because the watermark was + # advanced past their start date. + # + # Volume is bounded (~216 total, ~5 active at any time across + # 27 squads), so always re-fetching every sprint per cycle + # is cheap and correct. 
all_sprints.append(mapped) if data.get("isLast", True) or not sprints: @@ -749,6 +1095,9 @@ def _map_sprint(self, sprint: dict[str, Any], board_id: int) -> dict[str, Any]: "name": sprint.get("name", ""), "url": self._base_url, "status": status, + # FDD-OPS-018 — sprint goal (free-text, set by squad lead). + # Jira returns this as a string; pass through for normalizer. + "goal": sprint.get("goal"), "started_date": sprint.get("startDate"), "ended_date": sprint.get("endDate"), "completed_date": sprint.get("completeDate"), diff --git a/pulse/packages/pulse-data/src/contexts/engineering_data/models.py b/pulse/packages/pulse-data/src/contexts/engineering_data/models.py index fa7fa75..a6e3a14 100644 --- a/pulse/packages/pulse-data/src/contexts/engineering_data/models.py +++ b/pulse/packages/pulse-data/src/contexts/engineering_data/models.py @@ -168,6 +168,12 @@ class EngSprint(TenantModel): source: Mapped[str] = mapped_column(String(32), nullable=False) name: Mapped[str] = mapped_column(String(256), nullable=False) board_id: Mapped[str] = mapped_column(String(128), nullable=False) + # FDD-OPS-018 — sprint lifecycle: active | closed | future | NULL. + # Was missing from the ORM model despite existing in the DB schema + # (schema drift). Without this Mapped column, every attempt to upsert + # `status` raised "Unconsumed column names: status" and the field + # silently stayed empty for all 216 Webmotors sprints. + status: Mapped[str | None] = mapped_column(String(50), nullable=True) started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) completed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) goal: Mapped[str | None] = mapped_column(Text, nullable=True) diff --git a/pulse/packages/pulse-data/src/contexts/engineering_data/normalizer.py b/pulse/packages/pulse-data/src/contexts/engineering_data/normalizer.py index daedbe6..5016ed4 100644 --- a/pulse/packages/pulse-data/src/contexts/engineering_data/normalizer.py +++ b/pulse/packages/pulse-data/src/contexts/engineering_data/normalizer.py @@ -86,15 +86,107 @@ "aguardando desenvolvimento": "todo", "priorizado gp": "todo", "pronto para o gp": "todo", + "em progresso": "in_progress", + "em desenv": "in_progress", + "em deploy hml": "in_progress", + "em deploy produção": "in_progress", + "em deploy azul": "in_progress", # Active work / pre-dev analysis "construção de hipótese": "in_progress", "desenvolvimento": "in_progress", "design": "in_progress", "analise": "in_progress", + "análise": "in_progress", + "em análise": "in_progress", "discovery": "in_progress", "entendimento": "in_progress", - # Post-deploy + # FDD-OPS-017 — Webmotors PT-BR status names that need the in_review + # granularity (Jira's `indeterminate` category collapses these into + # in_progress, but for Cycle Time breakdown we want the split). + "em verificação": "in_review", + "em teste": "in_review", + "em teste regressão": "in_review", + "em teste integrado hml": "in_review", + "em testes integrados": "in_review", + "em teste try": "in_review", + "homologação": "in_review", + "para verificação": "in_review", + "pronto para teste": "in_review", + "aguardando teste": "in_review", + "aguardando teste regressão": "in_review", + "aguardando teste hml": "in_review", + "aguardando teste try": "in_review", + "aguardando review": "in_review", + "aguardando deploy": "in_review", + "aguardando deploy hml": "in_review", + "aguardando deploy azul": "in_review", + "aguardando merge": "in_review", + "valid. 
azul": "in_review", + "validação": "in_review", + "validação infosec": "in_review", + "revisão de negócio": "in_review", + "em design review": "in_review", + # Post-deploy / monitoring → done (issue is shipped, monitoring is + # passive observation, not active dev work) "pós-implantação": "done", + "fechado em prod": "done", + # NOTE: "fechado em hml" — Jira's own statusCategory is "done" and the + # name literally says FECHADO. We respect that. If a workflow author + # later wants to keep these issues in WIP (e.g., pending prod rollout), + # they should rename the status to "Aguardando Deploy Produção" which + # already maps to in_progress. + "fechado em hml": "done", + "em monitoramento produção": "done", + "feito": "done", + "finalizado": "done", + "publicado": "done", + "resolvido": "done", + "entregue": "done", + "envio para loja": "done", + "itens concluídos": "done", + "fechada": "done", + # Cancelled / rejected variations observed in Webmotors + "recusado": "done", + "reprovado": "done", + "solicitação reprovada": "done", + "falha": "done", + "arquivo morto": "done", + "estacionamento": "done", + # Common backlog/refinement aliases + "novo": "todo", + "a fazer": "todo", + "aberto": "todo", + "esboçando": "todo", + "ideação": "todo", + "exploração": "todo", + "descoberta": "todo", + "descobrindo": "todo", + "mapeando": "todo", + "desenhando": "todo", + "prototipando": "todo", + "novo chamado": "todo", + "em refinamento": "todo", + "em refinamento de negócio": "todo", + "em refinamento técnico": "todo", + "pré refinamento": "todo", + "aguardando refinamento": "todo", + "aguardando refinamento técnico": "todo", + "aguardando refinamento tecnico": "todo", + "aguardando análise": "todo", + "aguardando definição e refinamento": "todo", + "aguardando handover": "todo", + "aguardando terceiro": "todo", + "aguardando ideação": "todo", + "aguardando aprovação": "todo", + "aguardando validação": "todo", + "priorizado": "todo", + "priorização técnica": "todo", + "priorizando o negócio": "todo", + "preparando o trabalho": "todo", + "ajustes do trabalho": "todo", + "revisando trabalho": "todo", + "pausado": "todo", + "não aplicável": "todo", } # Regex to find issue keys in branch names (e.g., "feature/BACK-123-add-login") @@ -168,31 +260,74 @@ def _extract_project_key(issue_key: str | None, url: str | None) -> str: return "UNKNOWN" -def normalize_status(raw_status: str, status_mapping: dict[str, str] | None = None) -> str: - """Normalize a raw issue status to one of: todo, in_progress, done. +def normalize_status( + raw_status: str, + status_mapping: dict[str, str] | None = None, + status_category: str | None = None, +) -> str: + """Normalize a raw issue status to one of: todo | in_progress | in_review | done. Args: raw_status: The original status string from the source system. status_mapping: Optional custom mapping overriding defaults. + status_category: FDD-OPS-017 — Jira's own statusCategory.key value + ("new" | "indeterminate" | "done") for this status. Used as the + authoritative fallback when our textual mapping doesn't recognize + the status name. Without it, custom Jira workflows (e.g., + "FECHADO EM PROD") silently default to "todo" — corrupting + every flow metric (Cycle Time, Throughput, WIP, CFD). Returns: - Normalized status string. + Normalized status string. 
Granularity: + - `todo` — work not started + - `in_progress` — actively being worked on + - `in_review` — code/test review (subset of "active" for WIP) + - `done` — completed (workflow author classified as done) + + Resolution order: + 1. Custom + DEFAULT_STATUS_MAPPING textual lookup (preserves + the in_progress/in_review distinction we hand-curated) + 2. status_category fallback ("done" → done, "indeterminate" → + in_progress, "new" → todo) + 3. Final default "todo" with WARN log (visible in pipeline_events) """ mapping = {**DEFAULT_STATUS_MAPPING} if status_mapping: mapping.update({k.lower(): v for k, v in status_mapping.items()}) - normalized = mapping.get(raw_status.lower().strip()) + key = raw_status.lower().strip() + normalized = mapping.get(key) if normalized: return normalized - logger.warning("Unknown status '%s' — defaulting to 'todo'", raw_status) + # FDD-OPS-017 — fall back to Jira's own statusCategory before defaulting + # to "todo". This is the safety net for the long tail of tenant-custom + # workflow states (104 distinct statuses observed in Webmotors alone). + if status_category: + cat = status_category.lower().strip() + if cat == "done": + return "done" + if cat == "indeterminate": + # Active work. We can't distinguish in_progress vs in_review at + # this level — that's intentional, since `_ACTIVE_STATUSES` + # treats both equivalently for WIP/Cycle Time. Operators who + # want the finer split must add the status to DEFAULT_STATUS_MAPPING. + return "in_progress" + if cat == "new": + return "todo" + + logger.warning( + "Unknown status %r (no textual mapping, no statusCategory) " + "— defaulting to 'todo'", + raw_status, + ) return "todo" def build_status_transitions( changelogs: list[dict[str, Any]], status_mapping: dict[str, str] | None = None, + status_categories_map: dict[str, str] | None = None, ) -> list[dict[str, Any]]: """Convert DevLake issue_changelogs into PULSE status_transitions JSONB. @@ -200,6 +335,11 @@ def build_status_transitions( changelogs: Sorted list of changelog dicts with keys: from_status, to_status, created_date status_mapping: Optional custom mapping for normalization. + status_categories_map: FDD-OPS-017 — name→category dict (lowercased + keys) from the Jira connector. Lets each historical to_status + fall back to its statusCategory when not in the textual mapping. + Without this, a status no longer in active Jira workflows + (legacy / archived) defaults to "todo" → bogus Cycle Time. 
Returns: List of transition dicts: @@ -208,11 +348,13 @@ def build_status_transitions( if not changelogs: return [] + cats = status_categories_map or {} transitions: list[dict[str, Any]] = [] for i, cl in enumerate(changelogs): entered_at = _parse_datetime(cl["created_date"]) to_status_raw = cl.get("to_status", "") - normalized = normalize_status(to_status_raw, status_mapping) + cat = cats.get(to_status_raw.strip().lower()) + normalized = normalize_status(to_status_raw, status_mapping, cat) # exited_at is the entered_at of the next transition, or None if current exited_at = None @@ -283,8 +425,8 @@ def normalize_pull_request( "tenant_id": tenant_id, "source": source, "repo": repo, - "title": devlake_pr.get("title", ""), - "author": devlake_pr.get("author_name", "unknown"), + "title": _strip_null_bytes(devlake_pr.get("title", "")), + "author": _strip_null_bytes(devlake_pr.get("author_name", "unknown")), "state": state, "is_merged": is_merged, "first_commit_at": first_commit_at, # INC-003: real authored_date when enriched @@ -321,7 +463,11 @@ def normalize_issue( Dict matching EngIssue model columns. """ raw_status = devlake_issue.get("original_status") or devlake_issue.get("status", "") - normalized = normalize_status(raw_status, status_mapping) + # FDD-OPS-017 — pull Jira's authoritative category from the connector + # so the normalizer can fall back to it when textual mapping misses. + status_category = devlake_issue.get("status_category") + status_categories_map = devlake_issue.get("status_categories_map") or {} + normalized = normalize_status(raw_status, status_mapping, status_category) issue_key = devlake_issue.get("issue_key", "") project_key = _extract_project_key(issue_key, devlake_issue.get("url")) @@ -330,7 +476,9 @@ def normalize_issue( resolution_date = _parse_datetime(devlake_issue.get("resolution_date")) # Build status transitions from changelog data (populated by Jira plugin) - transitions = build_status_transitions(changelogs or [], status_mapping) + transitions = build_status_transitions( + changelogs or [], status_mapping, status_categories_map, + ) # Derive started_at from first transition to an active state started_at = None @@ -370,18 +518,23 @@ def normalize_issue( else None ) + # Strip NULL bytes (0x00) from any text field. Postgres `text`/`varchar` + # rejects them with `CharacterNotInRepertoireError: invalid byte sequence + # for encoding "UTF8": 0x00`. Real-world Jira data has them — observed + # 2026-04-28 in ENO-3296 description (likely paste from buggy source). + # Without this, a single bad row breaks the whole batch upsert. return { "external_id": str(devlake_issue["id"]), "tenant_id": tenant_id, "source": _detect_source(devlake_issue), "project_key": project_key, "issue_key": (issue_key or None), - "title": devlake_issue.get("title", ""), - "description": description, + "title": _strip_null_bytes(devlake_issue.get("title", "")), + "description": _strip_null_bytes(description), "issue_type": issue_type, "status": raw_status, "normalized_status": normalized, - "assignee": devlake_issue.get("assignee_name"), + "assignee": _strip_null_bytes(devlake_issue.get("assignee_name")), "story_points": devlake_issue.get("story_point"), "sprint_id": sprint_id, "status_transitions": transitions, @@ -392,6 +545,19 @@ def normalize_issue( } +def _strip_null_bytes(value: Any) -> Any: + """Remove NULL bytes (0x00) from a string. Pass-through for non-strings. + + Postgres rejects 0x00 in `text`/`varchar` with + `CharacterNotInRepertoireError`. 
Real-world Jira data sometimes contains + them (copy-paste from binary sources, malformed encoding upstream). + Stripping is the conservative choice — preserves all readable content. + """ + if isinstance(value, str) and "\x00" in value: + return value.replace("\x00", "") + return value + + def normalize_deployment( devlake_deploy: dict[str, Any], tenant_id: UUID, @@ -507,9 +673,19 @@ def normalize_sprint( "source": _detect_source(devlake_sprint), "name": devlake_sprint.get("name", ""), "board_id": str(devlake_sprint.get("original_board_id", "")), + # FDD-OPS-018 — sprint lifecycle status, lowercase to match the + # convention used elsewhere in PULSE (`normalized_status`, + # `issue_type`, etc.). The connector emits ACTIVE/CLOSED/FUTURE; + # we normalize here so consumers can rely on a stable casing. + # Was previously DROPPED entirely → all 216 Webmotors sprints + # landed with status='' in eng_sprints, breaking any future + # filter for "active sprint" / "completed sprints in quarter". + "status": _normalize_sprint_status(devlake_sprint.get("status")), + # FDD-OPS-018 — sprint goal text (set by squad lead in Jira). Was + # hardcoded None; now passed through from the connector. + "goal": _strip_null_bytes(devlake_sprint.get("goal")), "started_at": started_date, "completed_at": ended_date, - "goal": None, # Not in DevLake domain table "committed_items": committed_items, "committed_points": committed_points, "added_items": 0, # Requires tracking scope changes over time @@ -522,6 +698,39 @@ def normalize_sprint( } +# Sprint lifecycle states accepted by `_normalize_sprint_status`. Anything +# else falls through to None (better than guessing) — operators see NULLs +# in eng_sprints.status and can investigate. +_SPRINT_STATUS_ALIASES: dict[str, str] = { + "active": "active", + "closed": "closed", + "future": "future", + # Common aliases observed across Jira variants + "open": "active", + "in_progress": "active", + "completed": "closed", + "complete": "closed", + "ended": "closed", + "planned": "future", + "upcoming": "future", +} + + +def _normalize_sprint_status(raw: Any) -> str | None: + """Map a sprint state string to one of: active | closed | future | None. + + Lowercased; whitespace stripped. Unknown values return None — we don't + silently bucket them into one of the known states, since Sprint Velocity + / Carryover logic relies on knowing which sprints are actually closed. + """ + if not isinstance(raw, str): + return None + key = raw.strip().lower() + if not key: + return None + return _SPRINT_STATUS_ALIASES.get(key) + + def build_issue_key_map( issue_rows: list[tuple[str | None, str]], ) -> dict[str, str]: diff --git a/pulse/packages/pulse-data/src/contexts/pipeline/models.py b/pulse/packages/pulse-data/src/contexts/pipeline/models.py index 550f907..c528fee 100644 --- a/pulse/packages/pulse-data/src/contexts/pipeline/models.py +++ b/pulse/packages/pulse-data/src/contexts/pipeline/models.py @@ -19,20 +19,44 @@ class PipelineWatermark(TenantModel): - """Stores sync watermarks per entity type for incremental sync. + """Stores sync watermarks per (tenant, entity, scope) for incremental sync. Replaces the in-memory _WATERMARKS dict with persistent DB storage, so watermarks survive worker restarts and scale across replicas. + + FDD-OPS-014 (migration 010): added `scope_key` so a single entity_type + can have multiple scopes. 
E.g.: + scope_key='*' → legacy global (one row, all sources) + scope_key='jira:project:BG' → Jira project BG + scope_key='github:repo:foo/bar' → specific GitHub repo + scope_key='jenkins:job:deploy-X'→ specific Jenkins job + + The legacy `uq_watermark_entity` constraint could not coexist with the + new `uq_watermark_entity_scope` UNIQUE and was dropped in migration 011 + (it blocked every per-scope insert; see the __table_args__ note below). """ __tablename__ = "pipeline_watermarks" __table_args__ = ( - UniqueConstraint("tenant_id", "entity_type", name="uq_watermark_entity"), + # Per-scope constraint (active from migration 010 onward). + # Legacy uq_watermark_entity (without scope_key) was dropped in + # migration 011 — Postgres enforces all UniqueConstraints on every + # INSERT, so "harmless coexistence" was impossible: legacy blocked + # any per-scope insert because the (tenant, entity) tuple already + # existed via the '*' row. Discovered immediately after Phase 2-A + # deployment. + UniqueConstraint( + "tenant_id", "entity_type", "scope_key", + name="uq_watermark_entity_scope", + ), ) entity_type: Mapped[str] = mapped_column( String(64), nullable=False, ) # pull_requests | issues | deployments | sprints + scope_key: Mapped[str] = mapped_column( + String(255), nullable=False, server_default="*", + ) # see class docstring for format last_synced_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), nullable=False, ) diff --git a/pulse/packages/pulse-data/src/workers/devlake_sync.py b/pulse/packages/pulse-data/src/workers/devlake_sync.py index c3902f1..5ffb4f1 100644 --- a/pulse/packages/pulse-data/src/workers/devlake_sync.py +++ b/pulse/packages/pulse-data/src/workers/devlake_sync.py @@ -66,15 +66,81 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Changelog helpers +# --------------------------------------------------------------------------- + +def extract_status_transitions_inline(raw_issue: dict[str, Any]) -> list[dict[str, Any]]: + """Extract status transitions from a Jira issue's INLINE changelog. + + FDD-OPS-013 — replaces the previous round-trip to + `fetch_issue_changelogs(issue_ids)` which made one HTTP GET per issue. + The JQL search uses `expand=changelog`, so the changelog is already + present in the response payload. + + Always returns a list (possibly empty for issues with no status changes + in their history). The empty-list case is what fixed the 24h hang in + production: previously the cache lookup on `_last_changelogs` skipped + entries with empty transitions, causing a downstream cache-miss that + triggered the redundant individual GET. + + Output shape mirrors `JiraConnector._extract_changelogs` so that + `normalize_issue(..., changelogs=...)` doesn't need to change. + """ + issue_id = str(raw_issue["id"]) + transitions: list[dict[str, Any]] = [] + for history in raw_issue.get("changelog", {}).get("histories", []): + created = history.get("created") + for item in history.get("items", []): + if item.get("field", "").lower() == "status": + transitions.append({ + "issue_id": issue_id, + "from_status": item.get("fromString", ""), + "to_status": item.get("toString", ""), + "created_date": created, + }) + transitions.sort(key=lambda t: t.get("created_date") or "") + return transitions + + # --------------------------------------------------------------------------- # Watermark helpers — persistent DB storage via pipeline_watermarks +# +# FDD-OPS-014 (migration 010): watermarks are keyed by (tenant, entity, scope).
+# `scope_key='*'` is the legacy "global" key — kept as default for backwards +# compatibility during the rollout. Per-source workers (steps 2.3-2.5) will +# pass explicit scope_keys like 'jira:project:BG' or 'github:repo:foo/bar'. # --------------------------------------------------------------------------- -async def _get_watermark(session, tenant_id: UUID, entity: str) -> datetime | None: - """Get the last sync timestamp for an entity type from the DB.""" +# Scope-key conventions (free-form string per Q2 of phase-2 plan, but helpers +# enforce shape). Format: '::'. +GLOBAL_SCOPE = "*" + + +def make_scope_key(source: str, dimension: str, value: str) -> str: + """Build a canonical scope_key. Convention enforced via helper, not DB. + + Examples: + make_scope_key("jira", "project", "BG") -> "jira:project:BG" + make_scope_key("github", "repo", "foo/bar") -> "github:repo:foo/bar" + """ + return f"{source}:{dimension}:{value}" + + +async def _get_watermark( + session, tenant_id: UUID, entity: str, scope_key: str = GLOBAL_SCOPE, +) -> datetime | None: + """Get the last sync timestamp for (entity_type, scope_key) from the DB. + + Default scope_key='*' preserves legacy callers (one global row per + entity_type). Per-source workers pass an explicit scope_key. + """ result = await session.execute( select(PipelineWatermark.last_synced_at) - .where(PipelineWatermark.entity_type == entity) + .where( + PipelineWatermark.entity_type == entity, + PipelineWatermark.scope_key == scope_key, + ) ) row = result.scalar_one_or_none() return row @@ -82,19 +148,24 @@ async def _get_watermark(session, tenant_id: UUID, entity: str) -> datetime | No async def _set_watermark( session, tenant_id: UUID, entity: str, ts: datetime, count: int, + scope_key: str = GLOBAL_SCOPE, ) -> None: - """Upsert the watermark for an entity type using ON CONFLICT.""" + """Upsert the watermark for (entity_type, scope_key) using ON CONFLICT. + + Default scope_key='*' upserts the legacy global row. + """ stmt = ( pg_insert(PipelineWatermark) .values( id=uuid.uuid4(), tenant_id=tenant_id, entity_type=entity, + scope_key=scope_key, last_synced_at=ts, records_synced=count, ) .on_conflict_do_update( - constraint="uq_watermark_entity", + constraint="uq_watermark_entity_scope", set_={ "last_synced_at": ts, "records_synced": count, @@ -103,7 +174,33 @@ async def _set_watermark( ) ) await session.execute(stmt) - logger.debug("Updated watermark for %s to %s (count=%d)", entity, ts, count) + logger.debug( + "Updated watermark for %s/%s to %s (count=%d)", + entity, scope_key, ts, count, + ) + + +async def _list_watermarks_by_scope( + session, tenant_id: UUID, entity: str, scope_keys: list[str], +) -> dict[str, datetime | None]: + """Bulk-fetch watermarks for a list of scopes. Returns {scope_key: ts}. + + Missing scopes return None (no watermark = full backfill on first sync). + Used by per-source workers (Phase 2 step 2.3+) to feed + `since_by_project={...}` into batched fetchers. + """ + if not scope_keys: + return {} + + result = await session.execute( + select(PipelineWatermark.scope_key, PipelineWatermark.last_synced_at) + .where( + PipelineWatermark.entity_type == entity, + PipelineWatermark.scope_key.in_(scope_keys), + ) + ) + found = {row[0]: row[1] for row in result.all()} + return {scope: found.get(scope) for scope in scope_keys} # --------------------------------------------------------------------------- @@ -428,11 +525,52 @@ async def _sync_pull_requests(self) -> int: published to Kafka immediately — no accumulation in memory. 
+ If the process crashes mid-sync, all previously persisted repos are safe. + + FDD-OPS-014 step 2.4-B: PER-REPO watermarks now READ + WRITTEN. + Each repo has scope_key='github:repo:<owner>/<name>'. Adding a new + repo = backfill ONLY that scope. Existing repos continue from their + own last_synced_at, not the global '*' value. + + The global '*' watermark is still updated at end-of-cycle for any + remaining legacy reads (Pipeline Monitor UI etc.). Migration 011 + already dropped the legacy unique constraint that conflicted with + per-scope inserts. + Progress is tracked in pipeline_ingestion_progress for real-time visibility in the Pipeline Monitor dashboard. """ + # Load ALL existing per-repo watermarks for pull_requests. We don't + # know which repos the connector will emit yet, so fetch the full + # set keyed by scope_key. The connector will look up each repo's + # since via since_by_repo[repo] (None = backfill on first sync). async with get_session(self._tenant_id) as session: - since = await _get_watermark(session, self._tenant_id, "pull_requests") + global_since = await _get_watermark( + session, self._tenant_id, "pull_requests", + ) + # Returns rows where scope_key starts with 'github:repo:'. + from sqlalchemy import select as _select + result = await session.execute( + _select( + PipelineWatermark.scope_key, + PipelineWatermark.last_synced_at, + ).where( + PipelineWatermark.entity_type == "pull_requests", + PipelineWatermark.scope_key.like("github:repo:%"), + ) + ) + since_by_repo: dict[str, datetime | None] = {} + for scope_key_str, last_synced in result.all(): + # 'github:repo:owner/name' → 'owner/name' + repo = scope_key_str[len("github:repo:"):] + since_by_repo[repo] = last_synced + + logger.info( + "[prs] watermark plan: %d repos with per-scope rows, global '*' fallback=%s", + len(since_by_repo), + global_since.isoformat() if global_since else "None (full backfill)", + ) + # Pass single fallback for compatibility — repos not in + # since_by_repo (newly discovered) inherit it. + since = global_since # Build issue-key lookup for PR linking. Loading all issue external_ids # from the tenant is cheap (~30k strings) and lets us link each batch @@ -473,7 +611,10 @@ repos_done = 0 try: - async for repo_name, raw_prs in self._reader.fetch_pull_requests_batched(since=since): + async for repo_name, raw_prs in self._reader.fetch_pull_requests_batched( + since=since, + since_by_repo=since_by_repo, + ): # "Starting" signal: connector emits (repo_name, None) before # any API calls so the UI can show progress immediately. if raw_prs is None: @@ -527,6 +668,19 @@ events.append((str(pr["external_id"]), event)) await publish_batch(self._producer, TOPIC_PR_NORMALIZED, events) + # FDD-OPS-014 step 2.4: advance this repo's scope watermark. + # Reads already honor these rows through the since_by_repo dict + # loaded at the top of this method; the global '*' row stays as + # the fallback for repos without a per-scope entry yet.
+ if batch_count > 0: + repo_scope = make_scope_key("github", "repo", repo_name) + async with get_session(self._tenant_id) as session: + await _set_watermark( + session, self._tenant_id, "pull_requests", + started_at, batch_count, + scope_key=repo_scope, + ) + # Update progress in DB (queryable by API) await _update_ingestion_progress( self._tenant_id, "pull_requests", @@ -580,92 +734,293 @@ async def _sync_pull_requests(self) -> int: return total_count async def _sync_issues(self) -> int: - """Read issues from source connectors, upsert to PULSE DB, publish to Kafka.""" - async with get_session(self._tenant_id) as session: - since = await _get_watermark(session, self._tenant_id, "issues") - - # Resolve project keys via dynamic discovery or env var fallback - project_keys: list[str] | None = None + """Stream issues from Jira PER PROJECT, persisting each batch immediately. + + FDD-OPS-012 — replaces the previous bulk-fetch-then-persist pattern + (everything in RAM until JQL pagination + ALL changelog HTTP calls + complete, then single upsert) with per-page streaming. Mirrors the + pattern that PRs adopted in commit 7f9f339. + + FDD-OPS-014 step 2.3 — uses PER-PROJECT watermarks. Each project has + its own scope_key='jira:project:' row in pipeline_watermarks. + Adding a new project = backfill ONLY that scope. Per-project failures + don't reset other projects' watermarks. The legacy global '*' + watermark is also updated at end-of-cycle for backwards compat. + + Properties: + - Time-to-first-row: < 10s + - Memory: ~one page in flight, not all-projects + - Crash recovery: lose ≤ 1 batch of work + - Per-project incremental sync: only fetch new since last project run + """ + # Resolve project keys via dynamic discovery (kill-switch via env var). + # No fallback to a static env var list — that path was deprecated when + # we landed discovery-only (ingestion-spec §2.3). Empty list = nothing + # to sync this cycle. + project_keys: list[str] = [] if settings.dynamic_jira_discovery_enabled: try: async with get_session(self._tenant_id) as session: resolver = ModeResolver(session) project_keys = await resolver.resolve_active_projects(self._tenant_id) logger.info( - "Dynamic discovery resolved %d Jira projects for tenant %s", + "[issues] resolved %d active Jira projects for tenant %s", len(project_keys), self._tenant_id, ) except Exception: logger.exception( - "ModeResolver failed for tenant %s, falling back to env var", + "[issues] ModeResolver failed for tenant %s — skipping cycle", self._tenant_id, ) - project_keys = None - - fetch_kwargs: dict[str, Any] = {"since": since} - if project_keys is not None: - fetch_kwargs["project_keys"] = project_keys - raw_issues = await self._reader.fetch_issues(**fetch_kwargs) - if not raw_issues: - logger.info("No new issues to sync") + return 0 + + if not project_keys: + logger.info("[issues] no active projects, nothing to sync") return 0 - # Fetch status changelogs for all issues in this batch (Jira only) - issue_ids = [str(raw["id"]) for raw in raw_issues] - changelogs_by_issue = await self._reader.fetch_issue_changelogs(issue_ids) + # FDD-OPS-014 step 2.3: load per-project watermarks (scope_key per + # project). Missing rows return None = full backfill for that scope. 
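+        # Resulting plan (illustrative): {"ENO": <last cycle's timestamp>,
+        # "DESC": None} means ENO syncs incrementally while DESC gets a
+        # full backfill.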
+ project_scopes = [ + make_scope_key("jira", "project", pk) for pk in project_keys + ] + async with get_session(self._tenant_id) as session: + scope_to_wm = await _list_watermarks_by_scope( + session, self._tenant_id, "issues", project_scopes, + ) + since_by_project: dict[str, datetime | None] = { + pk: scope_to_wm[make_scope_key("jira", "project", pk)] + for pk in project_keys + } + + # Log which projects need backfill vs which have an existing watermark + backfill_count = sum(1 for v in since_by_project.values() if v is None) + incremental_count = len(project_keys) - backfill_count + logger.info( + "[issues] watermark plan: %d projects backfill (no scope), " + "%d projects incremental", + backfill_count, incremental_count, + ) - # Normalize - normalized = [] - for raw in raw_issues: - try: - issue_id = str(raw["id"]) - issue_changelogs = changelogs_by_issue.get(issue_id, []) - issue_data = normalize_issue( - raw, - self._tenant_id, - self._status_mapping, - changelogs=issue_changelogs, + # FDD-OPS-015 lite: pre-flight progress signal so operators see the + # scope BEFORE we start hammering the API. + started_at = datetime.now(timezone.utc) + await _update_ingestion_progress( + self._tenant_id, "issues", + status="running", + total_sources=len(project_keys), + sources_done=0, + records_ingested=0, + current_source=None, + started_at=started_at, + ) + + total_count = 0 + projects_done: set[str] = set() + current_project: str | None = None + per_project_count: dict[str, int] = {pk: 0 for pk in project_keys} + + async def _advance_project_watermark(project_key: str) -> None: + """Update watermark for `jira:project:` after that project finishes. + + Only advances when count > 0 — empty syncs (incremental with no + changes) leave the watermark unchanged so a subsequent failed + cycle doesn't accidentally claim "synced through now()". + """ + count_for_project = per_project_count.get(project_key, 0) + if count_for_project == 0: + return + scope_key = make_scope_key("jira", "project", project_key) + async with get_session(self._tenant_id) as session: + await _set_watermark( + session, self._tenant_id, "issues", + started_at, count_for_project, scope_key=scope_key, ) - normalized.append(issue_data) - except Exception: - logger.exception("Error normalizing issue: %s", raw.get("id")) + logger.info( + "[issues] watermark advanced: %s → %s (%d issues this cycle)", + scope_key, started_at.isoformat(), count_for_project, + ) - # Upsert to PULSE DB - count = await self._upsert_issues(normalized) + try: + async for project_key, raw_batch in self._reader.fetch_issues_batched( + project_keys=project_keys, + since_by_project=since_by_project, + ): + # Project change marker for ingestion progress + watermark advance + if project_key != current_project: + if current_project is not None: + # Previous project finished — advance its scope watermark + await _advance_project_watermark(current_project) + projects_done.add(current_project) + current_project = project_key + await _update_ingestion_progress( + self._tenant_id, "issues", + status="running", + sources_done=len(projects_done), + records_ingested=total_count, + current_source=project_key, + ) - # Publish to Kafka - events = [] - for issue in normalized: - events.append((str(issue["external_id"]), issue)) - await publish_batch(self._producer, TOPIC_ISSUE_NORMALIZED, events) + # FDD-OPS-013: changelogs are INLINE from JQL expand=changelog. + # No extra HTTP round-trip per issue. 
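+                # Shape consumed below (illustrative): raw["changelog"]["histories"]
+                # entries whose items include {"field": "status",
+                # "fromString": "Em Desenv", "toString": "Em Teste"} are what
+                # extract_status_transitions_inline() pulls out.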
+ normalized: list[dict[str, Any]] = [] + for raw in raw_batch: + try: + issue_changelogs = extract_status_transitions_inline(raw) + issue_data = normalize_issue( + raw, + self._tenant_id, + self._status_mapping, + changelogs=issue_changelogs, + ) + normalized.append(issue_data) + except Exception: + logger.exception( + "[issues] normalize error in project %s: id=%s", + project_key, raw.get("id"), + ) - # Update watermark in DB - async with get_session(self._tenant_id) as session: - await _set_watermark( - session, self._tenant_id, "issues", - datetime.now(timezone.utc), count, + if not normalized: + continue + + # Persist this batch immediately (FDD-OPS-012) + batch_count = await self._upsert_issues(normalized) + total_count += batch_count + per_project_count[project_key] = per_project_count.get(project_key, 0) + batch_count + + # Emit Kafka events for this batch only + events = [ + (str(issue["external_id"]), issue) + for issue in normalized + ] + await publish_batch( + self._producer, TOPIC_ISSUE_NORMALIZED, events, + ) + + # Per-batch progress update (operator can grep the log to + # confirm forward progress) + logger.info( + "[issues] batch persisted: %s +%d (project total: %d, " + "tenant total: %d)", + project_key, batch_count, + per_project_count[project_key], total_count, + ) + + await _update_ingestion_progress( + self._tenant_id, "issues", + records_ingested=total_count, + current_source=project_key, + ) + + # Final project after the loop: advance its watermark + mark done + if current_project is not None: + await _advance_project_watermark(current_project) + projects_done.add(current_project) + + logger.info( + "[issues] sync complete: %d issues across %d projects " + "(per-project counts: %s)", + total_count, len(projects_done), + {k: v for k, v in per_project_count.items() if v > 0}, ) - # Record sync outcome per project for guardrails (dynamic discovery only) - if settings.dynamic_jira_discovery_enabled and project_keys: - try: + # Update legacy global '*' watermark for backwards compat. Some + # monitoring queries / Pipeline Monitor still read by entity + # without scope. Migration 011 (FDD-OPS-014 step 2.7) will drop + # the legacy unique constraint after a successful per-source + # cycle; until then both keep updating. 
+ if total_count > 0: async with get_session(self._tenant_id) as session: - guardrails = Guardrails(session) - for pk in project_keys: - await guardrails.record_sync_outcome( - self._tenant_id, pk, success=True, - ) - except Exception: - logger.exception("Failed to record sync outcomes for guardrails") + await _set_watermark( + session, self._tenant_id, "issues", + started_at, total_count, + # default scope_key='*' — legacy global row + ) - return count + # Record per-project sync outcome for guardrails (success only — + # batches that errored mid-stream are logged but don't block) + if settings.dynamic_jira_discovery_enabled and projects_done: + try: + async with get_session(self._tenant_id) as session: + guardrails = Guardrails(session) + for pk in projects_done: + await guardrails.record_sync_outcome( + self._tenant_id, pk, success=True, + ) + except Exception: + logger.exception( + "[issues] failed to record guardrail outcomes", + ) + + await _update_ingestion_progress( + self._tenant_id, "issues", + status="completed", + sources_done=len(projects_done), + records_ingested=total_count, + current_source=None, + finished_at=datetime.now(timezone.utc), + ) + + except Exception as exc: + await _update_ingestion_progress( + self._tenant_id, "issues", + status="failed", + sources_done=len(projects_done), + records_ingested=total_count, + current_source=current_project, + finished_at=datetime.now(timezone.utc), + error_message=str(exc)[:500], + ) + logger.exception("[issues] sync cycle failed") + raise + + return total_count async def _sync_deployments(self) -> int: - """Read deployments from source connectors, upsert to PULSE DB, publish to Kafka.""" + """Read deployments from source connectors, upsert to PULSE DB, publish to Kafka. + + FDD-OPS-014 step 2.5 — writes per-repo scope watermarks alongside + the legacy global '*' row. Per-repo READ + per-job streaming are + follow-ups; this commit accumulates the rows so they're available + when the connector refactor lands. + + Granularity choice (Q2 of phase-2-plan): repo-level scope rather + than per-job. Volume is low (~1.4k deploys at Webmotors scale); the + repo dimension matches the cross-source linking model (PR↔deploy + is by repo+sha) and avoids an explosion of scope rows for + ephemeral Jenkins jobs. + """ + started_at = datetime.now(timezone.utc) + # FDD-OPS-014 step 2.5-B: read per-repo watermarks for deployments. + # Pre-load all rows where scope_key starts with 'jenkins:repo:' so + # the connector can resolve each job's `since` via job→repo mapping. 
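+        # Illustrative resolution path (the repo name here is hypothetical):
+        #   Jenkins job "PI-Money/money-prd" → repo "acme/money" via the
+        #   job→repo mapping; since_by_repo["acme/money"] then overrides the
+        #   global '*' fallback read below for that job's fetch window.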
async with get_session(self._tenant_id) as session: since = await _get_watermark(session, self._tenant_id, "deployments") + from sqlalchemy import select as _select + result = await session.execute( + _select( + PipelineWatermark.scope_key, + PipelineWatermark.last_synced_at, + ).where( + PipelineWatermark.entity_type == "deployments", + PipelineWatermark.scope_key.like("jenkins:repo:%"), + ) + ) + since_by_repo: dict[str, datetime | None] = {} + for scope_key_str, last_synced in result.all(): + # 'jenkins:repo:owner/name' → 'owner/name' + repo = scope_key_str[len("jenkins:repo:"):] + since_by_repo[repo] = last_synced + + logger.info( + "[deployments] watermark plan: %d repos with per-scope rows, " + "global '*' fallback=%s", + len(since_by_repo), + since.isoformat() if since else "None (full backfill)", + ) - raw_deployments = await self._reader.fetch_deployments(since=since) + raw_deployments = await self._reader.fetch_deployments( + since=since, since_by_repo=since_by_repo, + ) if not raw_deployments: logger.info("No new deployments to sync") return 0 @@ -679,9 +1034,31 @@ async def _sync_deployments(self) -> int: except Exception: logger.exception("Error normalizing deployment: %s", raw.get("id")) + # Group per repo to track per-scope counts for watermark writes. + per_repo_count: dict[str, int] = {} + for d in normalized: + repo = d.get("repo") or "unknown" + per_repo_count[repo] = per_repo_count.get(repo, 0) + 1 + # Upsert to PULSE DB count = await self._upsert_deployments(normalized) + # FDD-OPS-014 step 2.5: advance per-repo deploy watermarks. Reads + # still use global '*' until the fetcher refactor lands. + async with get_session(self._tenant_id) as session: + for repo, repo_count in per_repo_count.items(): + if repo_count == 0: + continue + repo_scope = make_scope_key("jenkins", "repo", repo) + await _set_watermark( + session, self._tenant_id, "deployments", + started_at, repo_count, scope_key=repo_scope, + ) + logger.info( + "[deployments] advanced %d per-repo watermarks (jenkins:repo:*)", + len([c for c in per_repo_count.values() if c > 0]), + ) + # INC-004 — forward-path linker: bind newly ingested deploys back to # any merged PRs in the same repo that were still missing # `deployed_at`. Scoped to the min deployed_at in this batch so the @@ -877,6 +1254,12 @@ async def _upsert_sprints(self, sprints: list[dict[str, Any]]) -> int: index_elements=["tenant_id", "external_id"], set_={ "name": sprint_data["name"], + # FDD-OPS-018 — status + goal were missing from + # this ON CONFLICT set, so existing sprints kept + # their stale (empty) status forever. Active + # sprints transitioning to closed never updated. + "status": sprint_data.get("status"), + "goal": sprint_data.get("goal"), "started_at": sprint_data["started_at"], "completed_at": sprint_data["completed_at"], "committed_items": sprint_data["committed_items"], diff --git a/pulse/packages/pulse-data/tests/unit/test_effort_fallback_chain.py b/pulse/packages/pulse-data/tests/unit/test_effort_fallback_chain.py new file mode 100644 index 0000000..11a8de2 --- /dev/null +++ b/pulse/packages/pulse-data/tests/unit/test_effort_fallback_chain.py @@ -0,0 +1,224 @@ +"""Regression tests for FDD-OPS-016 — effort estimation fallback chain. + +Webmotors and many enterprise tenants don't use Story Points. Different +squads use T-shirt sizes (P/M/G…), original estimate hours, or simply +don't estimate. 
The connector's `_extract_story_points` walks a priority +chain so downstream metrics get a usable number when one exists, and +None when the issue is genuinely unestimated. + +These tests exercise the chain end-to-end against Jira-shaped payloads. +If a future refactor reorders the chain or drops a fallback, multiple +tests fail with messages naming the broken hop. +""" + +from __future__ import annotations + +import pytest + +from src.connectors.jira_connector import ( + TSHIRT_TO_POINTS, + JiraConnector, + _hours_to_points, +) + + +@pytest.fixture +def connector() -> JiraConnector: + """A connector instance with effort discovery already populated. + + We bypass __init__ so tests don't hit env vars / the network. + """ + c = JiraConnector.__new__(JiraConnector) + c._connection_id = 1 + c._base_url = "https://example.atlassian.net" + c._sprint_field_id = None + c._story_points_field_id = "customfield_10004" + c._tshirt_field_ids = ["customfield_18762", "customfield_15100"] + c._custom_fields_discovered = True + c._effort_source_counts = {} + return c + + +# --------------------------------------------------------------------------- +# 1. Native Story Points — highest priority +# --------------------------------------------------------------------------- + +class TestStoryPointsTakesPriority: + def test_uses_discovered_story_points_field_when_set(self, connector): + result = connector._extract_story_points({"customfield_10004": 5}) + assert result == 5.0 + assert connector._effort_source_counts == {"story_points": 1} + + def test_skips_zero_story_points_and_falls_through(self, connector): + """0 SP is a common sentinel for "not yet estimated" — skip it.""" + result = connector._extract_story_points({ + "customfield_10004": 0, + "customfield_18762": {"value": "P"}, + }) + assert result == TSHIRT_TO_POINTS["P"] + assert connector._effort_source_counts == {"tshirt_to_sp": 1} + + def test_native_sp_wins_over_tshirt(self, connector): + result = connector._extract_story_points({ + "customfield_10004": 8, + "customfield_18762": {"value": "P"}, # would map to 2 + "timeoriginalestimate": 14400, # would map via hours + }) + assert result == 8.0 + + +# --------------------------------------------------------------------------- +# 2. 
T-shirt sizing — second priority +# --------------------------------------------------------------------------- + +class TestTshirtSizing: + @pytest.mark.parametrize( + "size,expected", + [("PP", 1.0), ("P", 2.0), ("M", 3.0), ("G", 5.0), ("GG", 8.0), ("GGG", 13.0)], + ) + def test_portuguese_sizes_map_correctly(self, connector, size, expected): + result = connector._extract_story_points({ + "customfield_18762": {"value": size}, + }) + assert result == expected + + @pytest.mark.parametrize( + "size,expected", + [("XS", 1.0), ("S", 2.0), ("M", 3.0), ("L", 5.0), ("XL", 8.0), ("XXL", 13.0)], + ) + def test_english_sizes_map_correctly(self, connector, size, expected): + result = connector._extract_story_points({ + "customfield_18762": {"value": size}, + }) + assert result == expected + + def test_lowercase_size_is_normalized(self, connector): + """Be lenient: Jira sometimes returns 'p' instead of 'P'.""" + result = connector._extract_story_points({ + "customfield_18762": {"value": "p"}, + }) + assert result == TSHIRT_TO_POINTS["P"] + + def test_unknown_size_falls_through_to_hours(self, connector): + result = connector._extract_story_points({ + "customfield_18762": {"value": "JUMBO"}, + "timeoriginalestimate": 28800, # 8h → 2 SP + }) + assert result == 2.0 + assert connector._effort_source_counts == {"hours_to_sp": 1} + + def test_secondary_tshirt_field_used_when_first_empty(self, connector): + """Tamanho/Impacto picks up where T-Shirt Size is empty.""" + result = connector._extract_story_points({ + "customfield_18762": None, + "customfield_15100": {"value": "G"}, + }) + assert result == TSHIRT_TO_POINTS["G"] + + def test_bare_string_option_value(self, connector): + """Some legacy responses give a string directly, not a dict.""" + result = connector._extract_story_points({ + "customfield_18762": "M", + }) + assert result == TSHIRT_TO_POINTS["M"] + + +# --------------------------------------------------------------------------- +# 3. Original Estimate (hours) — third priority +# --------------------------------------------------------------------------- + +class TestOriginalEstimateHours: + @pytest.mark.parametrize( + "seconds,expected_hours,expected_sp", + [ + (3600, 1.0, 1.0), # ≤4h + (14400, 4.0, 1.0), # exactly 4h + (28800, 8.0, 2.0), # ≤8h (1 day) + (57600, 16.0, 3.0), # ≤16h (2 days) + (86400, 24.0, 5.0), # ≤24h + (115200, 32.0, 8.0), # ≤40h + (288000, 80.0, 13.0), # ≤80h (2 weeks) + (446400, 124.0, 21.0), # >80h — observed Webmotors max + ], + ) + def test_seconds_to_sp_buckets( + self, connector, seconds, expected_hours, expected_sp, + ): + # Direct check of the helper for clarity + assert _hours_to_points(expected_hours) == expected_sp + # End-to-end: connector picks up timeoriginalestimate + result = connector._extract_story_points({ + "timeoriginalestimate": seconds, + }) + assert result == expected_sp + assert connector._effort_source_counts == {"hours_to_sp": 1} + + def test_zero_seconds_falls_through_to_unestimated(self, connector): + result = connector._extract_story_points({"timeoriginalestimate": 0}) + assert result is None + assert connector._effort_source_counts == {"unestimated": 1} + + +# --------------------------------------------------------------------------- +# 4. 
Unestimated — final fallback +# --------------------------------------------------------------------------- + +class TestUnestimatedReturnsNone: + def test_no_fields_returns_none(self, connector): + """Kanban-pure mode: metric layer must count items, not sum SP.""" + result = connector._extract_story_points({}) + assert result is None + assert connector._effort_source_counts == {"unestimated": 1} + + def test_empty_strings_treated_as_missing(self, connector): + result = connector._extract_story_points({ + "customfield_10004": "", + "customfield_18762": {"value": ""}, + "customfield_15100": None, + }) + assert result is None + + def test_telemetry_aggregates_across_calls(self, connector): + """Operators rely on the breakdown log to spot estimation shifts.""" + connector._extract_story_points({"customfield_10004": 5}) + connector._extract_story_points({"customfield_18762": {"value": "M"}}) + connector._extract_story_points({"timeoriginalestimate": 14400}) + connector._extract_story_points({}) + connector._extract_story_points({}) + assert connector._effort_source_counts == { + "story_points": 1, + "tshirt_to_sp": 1, + "hours_to_sp": 1, + "unestimated": 2, + } + + +# --------------------------------------------------------------------------- +# 5. Webmotors-shaped real-world cases +# --------------------------------------------------------------------------- + +class TestWebmotorsShapeIntegration: + """Sanity check against the field combos actually observed in production.""" + + def test_eno_typical_issue(self, connector): + """ENO sample: T-shirt 'P' + 8h original estimate. T-shirt wins.""" + result = connector._extract_story_points({ + "customfield_18762": {"value": "P"}, + "timeoriginalestimate": 28800, + }) + assert result == 2.0 # P → 2 + + def test_desc_typical_issue(self, connector): + """DESC sample: T-shirt 'G' only.""" + result = connector._extract_story_points({ + "customfield_18762": {"value": "G"}, + }) + assert result == 5.0 + + def test_bg_typical_issue(self, connector): + """BG (Kanban-pure): nothing populated — None forces item count.""" + result = connector._extract_story_points({ + "summary": "do the thing", + "status": {"name": "Done"}, + }) + assert result is None diff --git a/pulse/packages/pulse-data/tests/unit/test_inline_changelog_extraction.py b/pulse/packages/pulse-data/tests/unit/test_inline_changelog_extraction.py new file mode 100644 index 0000000..f750171 --- /dev/null +++ b/pulse/packages/pulse-data/tests/unit/test_inline_changelog_extraction.py @@ -0,0 +1,356 @@ +"""Regression tests for FDD-OPS-013 — inline changelog extraction. + +Locks in the contract that `_sync_issues` extracts status transitions from +the JQL response payload (`raw_issue["changelog"]["histories"]`) WITHOUT +making additional HTTP round-trips per issue. + +Why this matters: the previous implementation called +`self._reader.fetch_issue_changelogs(issue_ids)` after `fetch_issues`, +which performed one `GET /issue/{id}?expand=changelog` per issue. For +Webmotors-scale tenants (~376k issues), this took 24+ hours of pure +HTTP latency. After this fix, the same data is extracted from the +already-loaded JQL response in a few milliseconds. + +If a future refactor reintroduces the round-trip pattern, these tests +should fail and force the author to confront the cost. 
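+
+Contract under test, in rough shape (keys are the ones asserted below):
+
+    transitions = extract_status_transitions_inline(raw_issue)
+    # -> [{"issue_id": ..., "from_status": ..., "to_status": ...,
+    #      "created_date": ...}, ...]   (Status-field events only, sorted)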
+""" + +from __future__ import annotations + +import pytest + +from src.workers.devlake_sync import extract_status_transitions_inline + + +# --------------------------------------------------------------------------- +# Fixtures — shape mirrors real Jira JQL `expand=changelog` response +# --------------------------------------------------------------------------- + +def _jira_issue_with_changelog(issue_id: str, histories: list[dict]) -> dict: + """Build a fake Jira JQL response item with inline changelog.""" + return { + "id": issue_id, + "key": f"TEST-{issue_id}", + "fields": {"status": {"name": "In Progress"}}, + "changelog": {"histories": histories}, + } + + +@pytest.fixture +def issue_with_two_status_transitions() -> dict: + """Realistic case: a typical issue moves through To Do → In Progress → Done.""" + return _jira_issue_with_changelog( + issue_id="100200", + histories=[ + { + "created": "2026-01-15T10:00:00.000+0000", + "items": [ + { + "field": "Status", + "fromString": "To Do", + "toString": "In Progress", + }, + ], + }, + { + "created": "2026-01-20T16:30:00.000+0000", + "items": [ + { + "field": "Status", + "fromString": "In Progress", + "toString": "Done", + }, + ], + }, + ], + ) + + +@pytest.fixture +def issue_with_no_changelog() -> dict: + """Edge case: brand-new issue, never moved status. Pre-fix this caused + the cache miss → downstream HTTP call. Now must return [] safely.""" + return _jira_issue_with_changelog(issue_id="100300", histories=[]) + + +@pytest.fixture +def issue_with_mixed_history() -> dict: + """Realistic: changelog has Status changes mixed with non-Status events + (assignee, priority, summary). Only Status events become transitions.""" + return _jira_issue_with_changelog( + issue_id="100400", + histories=[ + { + "created": "2026-02-01T09:00:00.000+0000", + "items": [ + {"field": "Assignee", "fromString": "Alice", "toString": "Bob"}, + ], + }, + { + "created": "2026-02-02T11:00:00.000+0000", + "items": [ + {"field": "Status", "fromString": "To Do", "toString": "In Progress"}, + {"field": "Priority", "fromString": "Medium", "toString": "High"}, + ], + }, + { + "created": "2026-02-03T14:00:00.000+0000", + "items": [ + {"field": "Summary", "fromString": "Foo", "toString": "Foo bar"}, + ], + }, + ], + ) + + +@pytest.fixture +def issue_with_unsorted_history() -> dict: + """Defensive: Jira occasionally returns histories out of chronological + order. 
The extracted transitions must be sorted by created_date so + `build_status_transitions` (downstream) computes correct durations.""" + return _jira_issue_with_changelog( + issue_id="100500", + histories=[ + { + "created": "2026-03-15T12:00:00.000+0000", # later + "items": [ + {"field": "Status", "fromString": "B", "toString": "C"}, + ], + }, + { + "created": "2026-03-10T09:00:00.000+0000", # earlier + "items": [ + {"field": "Status", "fromString": "A", "toString": "B"}, + ], + }, + ], + ) + + +# --------------------------------------------------------------------------- +# Behavioral tests +# --------------------------------------------------------------------------- + +class TestExtractStatusTransitionsInline: + def test_extracts_two_status_transitions(self, issue_with_two_status_transitions): + result = extract_status_transitions_inline(issue_with_two_status_transitions) + assert len(result) == 2 + assert result[0]["from_status"] == "To Do" + assert result[0]["to_status"] == "In Progress" + assert result[1]["from_status"] == "In Progress" + assert result[1]["to_status"] == "Done" + + def test_each_transition_carries_issue_id(self, issue_with_two_status_transitions): + result = extract_status_transitions_inline(issue_with_two_status_transitions) + assert all(t["issue_id"] == "100200" for t in result) + + def test_each_transition_carries_created_date(self, issue_with_two_status_transitions): + result = extract_status_transitions_inline(issue_with_two_status_transitions) + assert result[0]["created_date"] == "2026-01-15T10:00:00.000+0000" + assert result[1]["created_date"] == "2026-01-20T16:30:00.000+0000" + + def test_empty_changelog_returns_empty_list(self, issue_with_no_changelog): + """REGRESSION GUARD: pre-fix, this case caused cache-miss + HTTP fallback. + Must always return a list, even if empty. 
Never None, never raise.""" + result = extract_status_transitions_inline(issue_with_no_changelog) + assert result == [] + assert isinstance(result, list) + + def test_only_status_field_events_are_extracted(self, issue_with_mixed_history): + """Assignee, Priority, Summary changes don't become transitions.""" + result = extract_status_transitions_inline(issue_with_mixed_history) + assert len(result) == 1 + assert result[0]["from_status"] == "To Do" + assert result[0]["to_status"] == "In Progress" + + def test_status_field_match_is_case_insensitive(self): + """Defensive: Jira sometimes returns 'status', sometimes 'Status'.""" + for field_name in ("Status", "status", "STATUS"): + issue = _jira_issue_with_changelog( + issue_id="999", + histories=[ + { + "created": "2026-01-01T00:00:00.000+0000", + "items": [ + {"field": field_name, "fromString": "X", "toString": "Y"}, + ], + }, + ], + ) + result = extract_status_transitions_inline(issue) + assert len(result) == 1, f"failed for field name {field_name!r}" + + def test_transitions_are_chronologically_sorted(self, issue_with_unsorted_history): + """Downstream metric calculations depend on ordered transitions.""" + result = extract_status_transitions_inline(issue_with_unsorted_history) + assert len(result) == 2 + assert result[0]["created_date"] == "2026-03-10T09:00:00.000+0000" + assert result[1]["created_date"] == "2026-03-15T12:00:00.000+0000" + + def test_returns_empty_for_issue_without_changelog_key(self): + """Defensive: issue from Jira API may lack `changelog` key entirely.""" + result = extract_status_transitions_inline( + {"id": "555", "key": "X-1", "fields": {}} + ) + assert result == [] + + def test_returns_empty_for_changelog_without_histories(self): + """Defensive: `changelog: {}` without `histories` key.""" + result = extract_status_transitions_inline( + {"id": "555", "key": "X-1", "changelog": {}} + ) + assert result == [] + + +# --------------------------------------------------------------------------- +# Anti-regression: the redundant HTTP call must NEVER come back +# --------------------------------------------------------------------------- + +class TestSyncIssuesDoesNotCallFetchIssueChangelogs: + """If a future refactor reintroduces the per-issue HTTP fallback in + `_sync_issues`, this test fails. The check is structural — it greps + the source — to keep the test independent of any DB or network setup. + + Note: `fetch_issue_changelogs` may STILL be called from sprint sync + (where issues come without `expand=changelog`). This test scopes its + assertion to `_sync_issues` only. + """ + + def test_sync_issues_does_not_call_fetch_issue_changelogs(self): + """Source-grep: `_sync_issues` body must not reference `fetch_issue_changelogs`. + + If you really need it back, remove this test AND amend FDD-OPS-013 + in ops-backlog.md AND benchmark the new approach against + Webmotors-scale dataset (376k issues). + """ + from pathlib import Path + + sync_file = Path(__file__).resolve().parents[2] / "src" / "workers" / "devlake_sync.py" + source = sync_file.read_text() + + # Find the _sync_issues body — from "async def _sync_issues" until + # the next "async def" or "def " at the same indentation. + start = source.find("async def _sync_issues(") + assert start != -1, "Could not find _sync_issues definition" + + # Find next method def at same indent (4 spaces, prefixed with newline). 
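+        # For a source file shaped like
+        #     async def _sync_issues(self): ...
+        #     async def _sync_deployments(self): ...
+        # the slice ends at the "\n    async def " that introduces the next
+        # method, so only the _sync_issues body is inspected.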
+        end = source.find("\n    async def ", start + 1)
+        if end == -1:
+            end = source.find("\n    def ", start + 1)
+        assert end != -1, "Could not find end of _sync_issues body"
+
+        sync_issues_body = source[start:end]
+
+        # Only flag actual function CALLS (`.fetch_issue_changelogs(` or
+        # `await fetch_issue_changelogs(`), not comments or docstrings that
+        # reference the name historically. The pattern matches a call
+        # expression, not free text.
+        import re
+        call_pattern = re.compile(r"(?:\.|await\s+)fetch_issue_changelogs\s*\(")
+        assert call_pattern.search(sync_issues_body) is None, (
+            "_sync_issues calls fetch_issue_changelogs again; FDD-OPS-013 "
+            "removed the per-issue HTTP round-trip. See this class docstring "
+            "before reintroducing it."
+        )
diff --git a/pulse/packages/pulse-data/tests/unit/test_sprint_status_normalization.py b/pulse/packages/pulse-data/tests/unit/test_sprint_status_normalization.py
new file mode 100644
--- /dev/null
+++ b/pulse/packages/pulse-data/tests/unit/test_sprint_status_normalization.py
+"""Regression tests for FDD-OPS-018: sprint `status` + `goal` normalization.
+
+THE BUG (2026-04-29): `normalize_sprint` emitted no `status` key and hardcoded
+`goal` to None, while `_upsert_sprints` omitted both fields from its ON CONFLICT
+`set_` block, so every row in eng_sprints kept an empty status forever and
+Velocity / Carryover could not tell which sprints were actually closed.
+
+These tests pin the normalizer contract; the structural guard on
+`_upsert_sprints` is at the bottom of this file.
+"""
+
+from __future__ import annotations
+
+from uuid import uuid4
+
+import pytest
+
+from src.contexts.engineering_data.normalizer import (
+    _normalize_sprint_status,
+    normalize_sprint,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper — connector-shaped sprint payload
+# ---------------------------------------------------------------------------
+
+def _connector_sprint(
+    sprint_id: str = "1001",
+    status: str | None = "ACTIVE",
+    goal: str | None = None,
+) -> dict:
+    """Mirror what `JiraConnector._map_sprint` returns (ACTIVE/CLOSED/FUTURE)."""
+    return {
+        "id": f"jira:JiraSprint:1:{sprint_id}",
+        "original_board_id": "42",
+        "name": "Sprint 99",
+        "url": "https://example.atlassian.net",
+        "status": status,
+        "goal": goal,
+        "started_date": "2026-04-01T00:00:00.000Z",
+        "ended_date": "2026-04-15T00:00:00.000Z",
+        "completed_date": None,
+        "total_issues": 0,
+    }
+
+
+# ---------------------------------------------------------------------------
+# 1. Normalize sprint emits the status field
+# ---------------------------------------------------------------------------
+
+class TestStatusFieldPresent:
+    """REGRESSION GUARD: pre-fix, `normalize_sprint` returned a dict without
+    a `status` key at all, so every sprint landed with NULL/empty status."""
+
+    def test_active_normalizes_to_lowercase(self):
+        result = normalize_sprint(_connector_sprint(status="ACTIVE"), uuid4())
+        assert "status" in result, (
+            "normalize_sprint dropped the `status` field — eng_sprints.status "
+            "would land empty for every sprint. This is the 2026-04-29 bug."
+        )
+        assert result["status"] == "active"
+
+    def test_closed_normalizes_to_lowercase(self):
+        result = normalize_sprint(_connector_sprint(status="CLOSED"), uuid4())
+        assert result["status"] == "closed"
+
+    def test_future_normalizes_to_lowercase(self):
+        result = normalize_sprint(_connector_sprint(status="FUTURE"), uuid4())
+        assert result["status"] == "future"
+
+    def test_already_lowercase_passthrough(self):
+        result = normalize_sprint(_connector_sprint(status="active"), uuid4())
+        assert result["status"] == "active"
+
+    def test_whitespace_is_stripped(self):
+        result = normalize_sprint(_connector_sprint(status=" CLOSED "), uuid4())
+        assert result["status"] == "closed"
+
+
+# ---------------------------------------------------------------------------
+# 2. Unknown / missing values
+# ---------------------------------------------------------------------------
+
+class TestUnknownStatusReturnsNone:
+    """We deliberately don't bucket unknown values — operators must see
+    NULLs in eng_sprints.status and investigate (e.g., new Jira state).
+    Silently mapping to one of the known states would corrupt Velocity /
+    Carryover logic that relies on knowing which sprints are ACTUALLY
+    closed."""
+
+    def test_empty_string_is_none(self):
+        result = normalize_sprint(_connector_sprint(status=""), uuid4())
+        assert result["status"] is None
+
+    def test_none_is_none(self):
+        result = normalize_sprint(_connector_sprint(status=None), uuid4())
+        assert result["status"] is None
+
+    def test_unknown_value_is_none(self):
+        result = normalize_sprint(_connector_sprint(status="some_new_state"), uuid4())
+        assert result["status"] is None
+
+    def test_non_string_is_none(self):
+        result = normalize_sprint(_connector_sprint(status=42), uuid4())  # type: ignore[arg-type]
+        assert result["status"] is None
+
+
+# ---------------------------------------------------------------------------
+# 3. 
Aliases — common Jira variants that should map cleanly +# --------------------------------------------------------------------------- + +class TestStatusAliases: + @pytest.mark.parametrize("raw,expected", [ + ("active", "active"), + ("ACTIVE", "active"), + ("open", "active"), # alias + ("in_progress", "active"), # alias + ("closed", "closed"), + ("CLOSED", "closed"), + ("completed", "closed"), # alias + ("complete", "closed"), # alias + ("ended", "closed"), # alias + ("future", "future"), + ("FUTURE", "future"), + ("planned", "future"), # alias + ("upcoming", "future"), # alias + ]) + def test_alias_maps_correctly(self, raw, expected): + assert _normalize_sprint_status(raw) == expected + + +# --------------------------------------------------------------------------- +# 4. Goal field passthrough (also was previously hardcoded to None) +# --------------------------------------------------------------------------- + +class TestGoalFieldPassthrough: + def test_goal_string_is_preserved(self): + result = normalize_sprint( + _connector_sprint(goal="Ship the auth flow this sprint"), uuid4(), + ) + assert result["goal"] == "Ship the auth flow this sprint" + + def test_none_goal_stays_none(self): + result = normalize_sprint(_connector_sprint(goal=None), uuid4()) + assert result["goal"] is None + + def test_null_byte_in_goal_is_stripped(self): + """Postgres `text` rejects 0x00. Same defensive strip we apply to + title/description/assignee on issues.""" + result = normalize_sprint( + _connector_sprint(goal="Goal with\x00null byte"), uuid4(), + ) + assert result["goal"] is not None + assert "\x00" not in result["goal"] + + +# --------------------------------------------------------------------------- +# 5. Anti-regression on _upsert_sprints — structural source check +# --------------------------------------------------------------------------- + +class TestUpsertSprintsIncludesStatus: + """REGRESSION GUARD: pre-fix, `_upsert_sprints.on_conflict_do_update.set_` + omitted `status` and `goal` — so existing sprints kept their stale empty + status forever even after the normalizer was fixed. + + If a future refactor removes them from the set_ block again, this test + fails. The check is structural (greps the source) so it doesn't depend + on a real DB or Jira client. + """ + + def test_upsert_sprints_set_includes_status_and_goal(self): + from pathlib import Path + + sync_file = ( + Path(__file__).resolve().parents[2] / "src" / "workers" / "devlake_sync.py" + ) + source = sync_file.read_text() + + start = source.find("async def _upsert_sprints(") + assert start != -1, "Could not find _upsert_sprints definition" + + # Find next method or top-level def + end = source.find("\n async def ", start + 1) + if end == -1: + end = source.find("\n def ", start + 1) + if end == -1: + end = len(source) + + body = source[start:end] + + for field in ("status", "goal"): + assert f'"{field}": sprint_data' in body or f'"{field}":sprint_data' in body, ( + f"_upsert_sprints set_ block must update {field!r} on conflict. " + "Without it, existing sprints never receive the corrected " + "value when the connector or normalizer changes." + ) diff --git a/pulse/packages/pulse-data/tests/unit/test_status_normalization.py b/pulse/packages/pulse-data/tests/unit/test_status_normalization.py new file mode 100644 index 0000000..b2f73d2 --- /dev/null +++ b/pulse/packages/pulse-data/tests/unit/test_status_normalization.py @@ -0,0 +1,216 @@ +"""Regression tests for FDD-OPS-017 — status normalization with statusCategory +fallback. 
+ +THE BUG (2026-04-28 audit): 311k issues showed normalized_status distribution +of 96.5% done, 0.2% in_progress, 3.3% todo. Investigation revealed: + + - Webmotors Jira has 104 distinct status names across workflows + - DEFAULT_STATUS_MAPPING covered ~50 → 50+ statuses fell to default 'todo' + - 2,881 issues with status='FECHADO EM PROD' landed in 'todo' (should be 'done') + - Various active work states ('Em Progresso', 'Em desenv') were classified + as 'todo' + - Result: every flow metric (Cycle Time, Throughput, WIP, CFD, Flow + Efficiency) was systematically corrupted across the whole tenant + +THE FIX: hybrid normalization + + 1. Textual DEFAULT_STATUS_MAPPING — preserves the in_progress vs in_review + distinction we curated for Cycle Time breakdown + 2. Jira's statusCategory.key fallback — authoritative for done/non-done, + covers the long tail of tenant-custom workflows automatically + 3. Final default 'todo' with WARN log + +If a future refactor reverts to the textual-only path, these tests fail +loudly with messages naming the broken classification. +""" + +from __future__ import annotations + +import pytest + +from src.contexts.engineering_data.normalizer import ( + DEFAULT_STATUS_MAPPING, + build_status_transitions, + normalize_status, +) + + +# --------------------------------------------------------------------------- +# 1. Textual mapping wins (preserves curated granularity) +# --------------------------------------------------------------------------- + +class TestTextualMappingTakesPriority: + def test_known_status_uses_textual_even_when_category_disagrees(self): + """Even if Jira's category says 'indeterminate', our explicit mapping + of 'em code review' → 'in_review' must win. The category-only fallback + loses the in_progress/in_review granularity that Cycle Time needs.""" + result = normalize_status( + "Em Code Review", + status_category="indeterminate", + ) + assert result == "in_review" + + def test_pt_br_done_status_classified_correctly(self): + """'Concluído' must always be done, regardless of category.""" + assert normalize_status("Concluído") == "done" + # Even if hypothetically the category was wrong: + assert normalize_status("Concluído", status_category="new") == "done" + + def test_custom_mapping_overrides_default(self): + custom = {"weird-state": "in_progress"} + assert normalize_status("weird-state", status_mapping=custom) == "in_progress" + + +# --------------------------------------------------------------------------- +# 2. 
statusCategory fallback — the actual fix +# --------------------------------------------------------------------------- + +class TestStatusCategoryFallback: + def test_unknown_status_with_done_category_returns_done(self): + """REGRESSION: pre-fix, this returned 'todo' and corrupted Throughput + + Cycle Time + Lead Time for every issue with a custom 'done' status.""" + result = normalize_status( + "FECHADO EM PROD UNKNOWN VARIANT", + status_category="done", + ) + assert result == "done" + + def test_unknown_status_with_indeterminate_returns_in_progress(self): + """Active work that isn't in our textual mapping defaults to + in_progress (not in_review) — operators must add explicit mapping + if the in_review distinction matters.""" + result = normalize_status( + "Some New Custom State", + status_category="indeterminate", + ) + assert result == "in_progress" + + def test_unknown_status_with_new_category_returns_todo(self): + result = normalize_status( + "Aguardando Terceiro Custom", + status_category="new", + ) + assert result == "todo" + + def test_unknown_status_without_category_defaults_to_todo(self): + """Legacy fallback when neither textual nor category matches.""" + result = normalize_status("Totally Unknown") + assert result == "todo" + + def test_invalid_category_falls_through_to_default(self): + """Defensive: garbage in `status_category` doesn't crash the pipeline.""" + result = normalize_status("Whatever", status_category="garbage") + assert result == "todo" + + def test_category_is_case_insensitive(self): + assert normalize_status("X", status_category="DONE") == "done" + assert normalize_status("X", status_category="Indeterminate") == "in_progress" + + +# --------------------------------------------------------------------------- +# 3. Real-world Webmotors statuses that broke the original normalizer +# --------------------------------------------------------------------------- + +class TestWebmotorsStatusRegression: + """Each parametrized case is a status string that, pre-fix, caused + visible metric corruption. They must classify correctly NOW. + """ + + @pytest.mark.parametrize("raw,expected", [ + ("FECHADO EM PROD", "done"), # 2,881 issues affected + ("FECHADO EM HML", "done"), # Jira's own category is "done" + ("Concluído", "done"), + ("Cancelado", "done"), + ("FECHADO", "done"), + ("Em Desenvolvimento", "in_progress"), + ("Em imersão", "in_progress"), + ("Em andamento", "in_progress"), + ("Em Progresso", "in_progress"), # was 'todo' pre-fix + ("Em Code Review", "in_review"), + ("Em Teste HML", "in_review"), + ("Homologação", "in_review"), # was 'todo' pre-fix + ("Em Verificação", "in_review"), # was 'todo' pre-fix + ("BACKLOG", "todo"), + ("A Fazer", "todo"), + ("Refinado", "todo"), + ("PAUSADO", "todo"), + ]) + def test_observed_status_classifies_correctly(self, raw, expected): + assert normalize_status(raw) == expected, ( + f"{raw!r} should be {expected!r}, but got {normalize_status(raw)!r}" + ) + + +# --------------------------------------------------------------------------- +# 4. build_status_transitions integrates the category map +# --------------------------------------------------------------------------- + +class TestBuildStatusTransitionsWithCategories: + def test_unknown_to_status_uses_categories_map(self): + """REGRESSION: a transition into a custom 'done'-category status + must be classified as done in the resulting transitions array, + not 'todo'. 
Cycle Time breakdown reads transitions to determine + time spent in each phase.""" + changelogs = [ + { + "from_status": "Em Desenvolvimento", + "to_status": "Some Custom Done State", + "created_date": "2026-04-01T10:00:00.000+0000", + }, + ] + cats_map = {"some custom done state": "done"} + result = build_status_transitions( + changelogs, status_categories_map=cats_map, + ) + assert len(result) == 1 + assert result[0]["status"] == "done" + + def test_textual_mapping_still_wins_in_transitions(self): + changelogs = [ + { + "from_status": "A", + "to_status": "Em Code Review", + "created_date": "2026-04-01T10:00:00.000+0000", + }, + ] + # Even with a misleading category in the map: + cats_map = {"em code review": "indeterminate"} + result = build_status_transitions( + changelogs, status_categories_map=cats_map, + ) + assert result[0]["status"] == "in_review" + + def test_transitions_without_categories_map_still_works(self): + """Backward compat: legacy callers don't pass status_categories_map.""" + changelogs = [ + { + "from_status": "A", + "to_status": "Done", + "created_date": "2026-04-01T10:00:00.000+0000", + }, + ] + result = build_status_transitions(changelogs) + assert result[0]["status"] == "done" + + +# --------------------------------------------------------------------------- +# 5. Anti-regression: textual mapping coverage +# --------------------------------------------------------------------------- + +class TestTextualMappingCompleteness: + """The DEFAULT_STATUS_MAPPING grew significantly during FDD-OPS-017 to + cover Webmotors PT-BR workflows. These tests guard against accidental + deletion. + """ + + @pytest.mark.parametrize("status", [ + "fechado em prod", "concluído", "cancelado", "fechado", + "em desenvolvimento", "em andamento", "em progresso", + "em code review", "em teste hml", "em verificação", "homologação", + "backlog", "a fazer", "refinado", + ]) + def test_critical_pt_br_status_is_mapped(self, status): + assert status in DEFAULT_STATUS_MAPPING, ( + f"{status!r} must remain in DEFAULT_STATUS_MAPPING — " + "removing it reverts FDD-OPS-017 and re-corrupts metrics." + ) diff --git a/pulse/packages/pulse-data/tests/unit/test_watermark_scope_keys.py b/pulse/packages/pulse-data/tests/unit/test_watermark_scope_keys.py new file mode 100644 index 0000000..81f67f7 --- /dev/null +++ b/pulse/packages/pulse-data/tests/unit/test_watermark_scope_keys.py @@ -0,0 +1,62 @@ +"""Unit tests for FDD-OPS-014 step 2.2 — per-scope watermark API. + +Validates that: +1. `make_scope_key()` produces canonical strings +2. Default scope_key='*' preserves legacy callers (backwards-compat) +3. New explicit scope_keys are independent rows +4. `_list_watermarks_by_scope` returns None for missing scopes (full backfill) + +Tests use a Postgres test container fixture (existing in conftest); the +DB-touching tests live under tests/integration/ — this file covers the +pure helpers that don't need DB. 
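+
+The scope keys minted here are what `_sync_issues` writes per Jira project
+("jira:project:<KEY>") and `_sync_deployments` writes per repo
+("jenkins:repo:<owner>/<name>") in devlake_sync.py.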
+""" + +from __future__ import annotations + +import pytest + +from src.workers.devlake_sync import GLOBAL_SCOPE, make_scope_key + + +class TestMakeScopeKey: + def test_jira_project_format(self): + assert make_scope_key("jira", "project", "BG") == "jira:project:BG" + + def test_github_repo_format(self): + assert make_scope_key("github", "repo", "foo/bar") == "github:repo:foo/bar" + + def test_jenkins_job_with_folders(self): + # Jenkins jobs can have folder/sub/job notation + assert ( + make_scope_key("jenkins", "job", "PI-Money/money-prd") + == "jenkins:job:PI-Money/money-prd" + ) + + def test_global_scope_constant(self): + # Sanity: the default value used everywhere matches what migration 010 + # set as DEFAULT in DDL. If this changes, the migration default and + # legacy reads break. + assert GLOBAL_SCOPE == "*" + + def test_separator_is_colon(self): + # Scope keys are routed by source prefix; helpers and consumers all + # split on ':'. Don't change the separator without a migration. + result = make_scope_key("source", "dim", "value") + assert result.count(":") == 2 + assert result.split(":") == ["source", "dim", "value"] + + @pytest.mark.parametrize( + "source,dim,value", + [ + ("jira", "project", "X"), + ("github", "repo", "a/b/c"), # repos can have slashes + ("jenkins", "job", "x.y.z"), # job names can have dots + ("future", "tenant", "id-with-dashes"), + ], + ) + def test_value_pass_through(self, source, dim, value): + # Helper does NOT escape or sanitize — values pass through. Callers + # are expected to use scope_key as opaque identifier; equality + # comparison is what matters. + result = make_scope_key(source, dim, value) + assert result == f"{source}:{dim}:{value}"