From 569256a1989fed80abcc221b51ff81e6545e6381 Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Tue, 17 Mar 2026 11:37:41 +0800 Subject: [PATCH 1/5] feat: render article inline images as markdown --- tests/test_client.py | 41 +++++++++++++++++++++ twitter_cli/parser.py | 83 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/tests/test_client.py b/tests/test_client.py index b26098c..f43edf6 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -513,6 +513,47 @@ def test_depth_limit(self, mock_ct_headers, mock_session): assert parse_tweet_result(self.SAMPLE_TWEET_RESULT, depth=3) is None + @patch("twitter_cli.client._get_cffi_session") + @patch("twitter_cli.client._gen_ct_headers", return_value={}) + def test_article_atomic_image_block_renders_markdown_image(self, mock_ct_headers, mock_session): + mock_session.return_value = MagicMock() + mock_session.return_value.get = MagicMock(side_effect=Exception("skip")) + + client = TwitterClient.__new__(TwitterClient) + client._ct_init_attempted = True + client._client_transaction = None + + result = copy.deepcopy(self.SAMPLE_TWEET_RESULT) + result["article"] = { + "article_results": { + "result": { + "title": "Article title", + "content_state": { + "blocks": [ + {"key": "a", "type": "unstyled", "text": "Intro", "entityRanges": []}, + {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 0}]}, + {"key": "c", "type": "unstyled", "text": "Outro", "entityRanges": []}, + ], + "entityMap": { + "0": { + "type": "IMAGE", + "mutability": "IMMUTABLE", + "data": { + "caption": "A cat", + "original_url": "https://pbs.twimg.com/media/cat.jpg", + }, + } + }, + }, + } + } + } + + tweet = parse_tweet_result(result) + assert tweet is not None + assert tweet.article_title == "Article title" + assert tweet.article_text == "Intro\n\n![A cat](https://pbs.twimg.com/media/cat.jpg)\n\nOutro" + # ── TwitterAPIError ────────────────────────────────────────────────────── diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py index bb964a8..bcb6d86 100644 --- a/twitter_cli/parser.py +++ b/twitter_cli/parser.py @@ -113,6 +113,85 @@ def _extract_author(user_data, user_legacy): # ── Article parsing ────────────────────────────────────────────────────── +def _find_article_image_url(value): + # type: (Any) -> Optional[str] + """Best-effort extraction of the original image URL from article entity data.""" + if isinstance(value, dict): + for key in ( + "original_url", + "originalUrl", + "media_url_https", + "mediaUrlHttps", + "media_url", + "mediaUrl", + "url", + "src", + "uri", + ): + candidate = value.get(key) + if isinstance(candidate, str) and candidate.strip(): + lowered = candidate.lower() + if ( + lowered.startswith("https://pbs.twimg.com/") + or lowered.endswith((".jpg", ".jpeg", ".png", ".gif", ".webp")) + or any(ext in lowered for ext in (".jpg?", ".jpeg?", ".png?", ".gif?", ".webp?")) + ): + return candidate.strip() + for nested in value.values(): + found = _find_article_image_url(nested) + if found: + return found + return None + if isinstance(value, list): + for item in value: + found = _find_article_image_url(item) + if found: + return found + return None + + +def _find_article_caption(value): + # type: (Any) -> Optional[str] + """Best-effort extraction of image caption/alt text from article entity data.""" + if isinstance(value, dict): + for key in ("caption", "alt", "alt_text", "altText", "title", "name"): + candidate = value.get(key) + if isinstance(candidate, str) and 
candidate.strip(): + return candidate.strip() + for nested in value.values(): + found = _find_article_caption(nested) + if found: + return found + return None + if isinstance(value, list): + for item in value: + found = _find_article_caption(item) + if found: + return found + return None + + +def _extract_article_images(block, entity_map): + # type: (Dict[str, Any], Dict[str, Any]) -> List[str] + """Convert atomic Draft.js image entities to Markdown image lines.""" + parts = [] # type: List[str] + for entity_range in block.get("entityRanges", []) or []: + if not isinstance(entity_range, dict): + continue + entity_key = entity_range.get("key") + entity = entity_map.get(entity_key) + if entity is None and entity_key is not None: + entity = entity_map.get(str(entity_key)) + if not isinstance(entity, dict): + continue + image_url = _find_article_image_url(entity) + if not image_url: + continue + caption = _find_article_caption(entity) or "" + parts.append("![%s](%s)" % (caption, image_url)) + return parts + + def _parse_article(tweet_data): # type: (Dict[str, Any]) -> Dict[str, Any] """Extract Twitter Article data (long-form content) from a tweet. @@ -130,12 +209,16 @@ def _parse_article(tweet_data): if not blocks: return {"article_title": title, "article_text": None} + entity_map = content_state.get("entityMap", {}) + # Convert draft.js blocks to Markdown parts = [] # type: List[str] ordered_counter = 0 for block in blocks: block_type = block.get("type", "unstyled") # type: str if block_type == "atomic": + parts.extend(_extract_article_images(block, entity_map)) + ordered_counter = 0 continue text = block.get("text", "") # type: str if not text: From 15abd718f0c09db5c01d5097a996762e743528c5 Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Tue, 17 Mar 2026 15:55:12 +0800 Subject: [PATCH 2/5] fix: support list-style article entity maps --- tests/test_client.py | 41 +++++++++++++++++++++++++ twitter_cli/parser.py | 69 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 103 insertions(+), 7 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index f43edf6..8cba813 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -554,6 +554,47 @@ def test_article_atomic_image_block_renders_markdown_image(self, mock_ct_headers assert tweet.article_title == "Article title" assert tweet.article_text == "Intro\n\n![A cat](https://pbs.twimg.com/media/cat.jpg)\n\nOutro" + @patch("twitter_cli.client._get_cffi_session") + @patch("twitter_cli.client._gen_ct_headers", return_value={}) + def test_article_atomic_image_block_supports_list_entity_map_and_media_entities(self, mock_ct_headers, mock_session): + mock_session.return_value = MagicMock() + mock_session.return_value.get = MagicMock(side_effect=Exception("skip")) + + client = TwitterClient.__new__(TwitterClient) + client._ct_init_attempted = True + client._client_transaction = None + + result = copy.deepcopy(self.SAMPLE_TWEET_RESULT) + result["article"] = { + "article_results": { + "result": { + "title": "Article title", + "content_state": { + "blocks": [ + {"key": "a", "type": "unstyled", "text": "Intro", "entityRanges": []}, + {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 2}]}, + {"key": "c", "type": "unstyled", "text": "Outro", "entityRanges": []}, + ], + "entityMap": [ + {"key": "2", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "2030504404391194624"}]}}} + ], + }, + "media_entities": [ + { + "media_id": "2030504404391194624", + "media_info": { + 
"original_img_url": "https://pbs.twimg.com/media/example.png" + }, + } + ], + } + } + } + + tweet = parse_tweet_result(result) + assert tweet is not None + assert tweet.article_text == "Intro\n\n![](https://pbs.twimg.com/media/example.png)\n\nOutro" + # ── TwitterAPIError ────────────────────────────────────────────────────── diff --git a/twitter_cli/parser.py b/twitter_cli/parser.py index bcb6d86..25d1fff 100644 --- a/twitter_cli/parser.py +++ b/twitter_cli/parser.py @@ -118,6 +118,8 @@ def _find_article_image_url(value): """Best-effort extraction of the original image URL from article entity data.""" if isinstance(value, dict): for key in ( + "original_img_url", + "originalImgUrl", "original_url", "originalUrl", "media_url_https", @@ -171,20 +173,72 @@ def _find_article_caption(value): return None -def _extract_article_images(block, entity_map): - # type: (Dict[str, Any], Dict[str, Any]) -> List[str] +def _normalize_article_entity_map(entity_map): + # type: (Any) -> Dict[str, Any] + """Normalize Draft.js entityMap that may arrive as dict or [{key, value}, ...].""" + if isinstance(entity_map, dict): + normalized = {} # type: Dict[str, Any] + for key, value in entity_map.items(): + normalized[str(key)] = value + return normalized + if isinstance(entity_map, list): + normalized = {} # type: Dict[str, Any] + for item in entity_map: + if not isinstance(item, dict): + continue + key = item.get("key") + value = item.get("value") + if key is None or value is None: + continue + normalized[str(key)] = value + return normalized + return {} + + +def _extract_article_media_url_map(article_results): + # type: (Dict[str, Any]) -> Dict[str, str] + """Map article media ids/keys to original image URLs when article entities reference IDs only.""" + media_url_map = {} # type: Dict[str, str] + media_candidates = [] # type: List[Any] + + cover_media = article_results.get("cover_media") + if cover_media: + media_candidates.append(cover_media) + media_candidates.extend(article_results.get("media_entities") or []) + + for media in media_candidates: + if not isinstance(media, dict): + continue + media_info = media.get("media_info") or {} + image_url = _find_article_image_url(media_info) or _find_article_image_url(media) + if not image_url: + continue + for key in ("media_id", "media_key", "id"): + candidate = media.get(key) + if isinstance(candidate, str) and candidate: + media_url_map[candidate] = image_url + return media_url_map + + +def _extract_article_images(block, entity_map, media_url_map): + # type: (Dict[str, Any], Dict[str, Any], Dict[str, str]) -> List[str] """Convert atomic Draft.js image entities to Markdown image lines.""" parts = [] # type: List[str] for entity_range in block.get("entityRanges", []) or []: if not isinstance(entity_range, dict): continue entity_key = entity_range.get("key") - entity = entity_map.get(entity_key) - if entity is None and entity_key is not None: - entity = entity_map.get(str(entity_key)) + entity = entity_map.get(str(entity_key)) if entity_key is not None else None if not isinstance(entity, dict): continue image_url = _find_article_image_url(entity) + if not image_url: + media_items = _deep_get(entity, "data", "mediaItems") or [] + for media_item in media_items: + media_id = media_item.get("mediaId") if isinstance(media_item, dict) else None + if isinstance(media_id, str) and media_id in media_url_map: + image_url = media_url_map[media_id] + break if not image_url: continue caption = _find_article_caption(entity) or "" @@ -209,7 +263,8 @@ def _parse_article(tweet_data): 
if not blocks: return {"article_title": title, "article_text": None} - entity_map = content_state.get("entityMap", {}) + entity_map = _normalize_article_entity_map(content_state.get("entityMap", {})) + media_url_map = _extract_article_media_url_map(article_results) # Convert draft.js blocks to Markdown parts = [] # type: List[str] @@ -217,7 +272,7 @@ def _parse_article(tweet_data): for block in blocks: block_type = block.get("type", "unstyled") # type: str if block_type == "atomic": - parts.extend(_extract_article_images(block, entity_map)) + parts.extend(_extract_article_images(block, entity_map, media_url_map)) ordered_counter = 0 continue text = block.get("text", "") # type: str From 320b8dbf153f04e1d8c118a9da55830875a4bd94 Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Tue, 17 Mar 2026 16:11:10 +0800 Subject: [PATCH 3/5] test: add real-world article image fixtures --- tests/test_client.py | 130 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/tests/test_client.py b/tests/test_client.py index 8cba813..a79e7d4 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -595,6 +595,136 @@ def test_article_atomic_image_block_supports_list_entity_map_and_media_entities( assert tweet is not None assert tweet.article_text == "Intro\n\n![](https://pbs.twimg.com/media/example.png)\n\nOutro" + @patch("twitter_cli.client._get_cffi_session") + @patch("twitter_cli.client._gen_ct_headers", return_value={}) + def test_article_real_shape_odysseus_like_payload_renders_two_images(self, mock_ct_headers, mock_session): + mock_session.return_value = MagicMock() + mock_session.return_value.get = MagicMock(side_effect=Exception("skip")) + + client = TwitterClient.__new__(TwitterClient) + client._ct_init_attempted = True + client._client_transaction = None + + result = copy.deepcopy(self.SAMPLE_TWEET_RESULT) + result["article"] = { + "article_results": { + "result": { + "title": "Harness Engineering Is Cybernetics", + "content_state": { + "blocks": [ + {"key": "a", "type": "unstyled", "text": "First paragraph", "entityRanges": []}, + {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 2}]}, + {"key": "c", "type": "unstyled", "text": "Middle paragraph", "entityRanges": []}, + {"key": "d", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 5}]}, + {"key": "e", "type": "unstyled", "text": "Last paragraph", "entityRanges": []}, + ], + "entityMap": [ + {"key": "5", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "2030414996266741760"}]}}}, + {"key": "2", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "2030504404391194624"}]}}}, + ], + }, + "media_entities": [ + { + "media_id": "2030504404391194624", + "media_info": { + "original_img_url": "https://pbs.twimg.com/media/HC3M_2qacAA7mej.png" + }, + }, + { + "media_id": "2030414996266741760", + "media_info": { + "original_img_url": "https://pbs.twimg.com/media/HC17rnca8AAQgjt.jpg" + }, + }, + ], + } + } + } + + tweet = parse_tweet_result(result) + assert tweet is not None + assert tweet.article_text == ( + "First paragraph\n\n" + "![](https://pbs.twimg.com/media/HC3M_2qacAA7mej.png)\n\n" + "Middle paragraph\n\n" + "![](https://pbs.twimg.com/media/HC17rnca8AAQgjt.jpg)\n\n" + "Last paragraph" + ) + + @patch("twitter_cli.client._get_cffi_session") + @patch("twitter_cli.client._gen_ct_headers", return_value={}) + def test_article_real_shape_elvissun_like_payload_renders_caption_and_three_images(self, mock_ct_headers, 
mock_session): + mock_session.return_value = MagicMock() + mock_session.return_value.get = MagicMock(side_effect=Exception("skip")) + + client = TwitterClient.__new__(TwitterClient) + client._ct_init_attempted = True + client._client_transaction = None + + result = copy.deepcopy(self.SAMPLE_TWEET_RESULT) + result["article"] = { + "article_results": { + "result": { + "title": "OpenClaw + Codex/ClaudeCode Agent Swarm", + "content_state": { + "blocks": [ + {"key": "a", "type": "unstyled", "text": "Intro", "entityRanges": []}, + {"key": "b", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 0}]}, + {"key": "c", "type": "unstyled", "text": "Diagram intro", "entityRanges": []}, + {"key": "d", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 1}]}, + {"key": "e", "type": "unstyled", "text": "Context comparison", "entityRanges": []}, + {"key": "f", "type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 2}]}, + ], + "entityMap": [ + { + "key": "0", + "value": { + "type": "MEDIA", + "data": { + "caption": "before Jan: CC/codex only | after Jan: Openclaw orchestrates CC/codex", + "mediaItems": [{"mediaId": "2025660629109895168"}], + }, + }, + }, + {"key": "1", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "2025790010293669888"}]}}}, + {"key": "2", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "2025780043406864384"}]}}}, + ], + }, + "media_entities": [ + { + "media_id": "2025660629109895168", + "media_info": { + "original_img_url": "https://pbs.twimg.com/media/HByXnBmW8AANOl9.jpg" + }, + }, + { + "media_id": "2025790010293669888", + "media_info": { + "original_img_url": "https://pbs.twimg.com/media/HB0NSAEW0AAYPOF.jpg" + }, + }, + { + "media_id": "2025780043406864384", + "media_info": { + "original_img_url": "https://pbs.twimg.com/media/HB0EN2hXcAAbGi9.png" + }, + }, + ], + } + } + } + + tweet = parse_tweet_result(result) + assert tweet is not None + assert tweet.article_text == ( + "Intro\n\n" + "![before Jan: CC/codex only | after Jan: Openclaw orchestrates CC/codex](https://pbs.twimg.com/media/HByXnBmW8AANOl9.jpg)\n\n" + "Diagram intro\n\n" + "![](https://pbs.twimg.com/media/HB0NSAEW0AAYPOF.jpg)\n\n" + "Context comparison\n\n" + "![](https://pbs.twimg.com/media/HB0EN2hXcAAbGi9.png)" + ) + # ── TwitterAPIError ────────────────────────────────────────────────────── From b10a780fc3ed8d322e717379cfbaf6fa67a1a3c5 Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Tue, 17 Mar 2026 16:24:56 +0800 Subject: [PATCH 4/5] test: add article markdown fixtures --- .../articles/elvissun-2025920521871716562.md | 238 ++++++++++++++++++ .../articles/odyseus0z-2030416758138634583.md | 58 +++++ 2 files changed, 296 insertions(+) create mode 100644 tests/fixtures/articles/elvissun-2025920521871716562.md create mode 100644 tests/fixtures/articles/odyseus0z-2030416758138634583.md diff --git a/tests/fixtures/articles/elvissun-2025920521871716562.md b/tests/fixtures/articles/elvissun-2025920521871716562.md new file mode 100644 index 0000000..301fe5e --- /dev/null +++ b/tests/fixtures/articles/elvissun-2025920521871716562.md @@ -0,0 +1,238 @@ +# OpenClaw + Codex/ClaudeCode Agent Swarm: The One-Person Dev Team [Full Setup] + +- Author: @elvissun (Elvis) +- Published: Mon Feb 23 13:07:46 +0000 2026 +- URL: https://x.com/elvissun/status/2025920521871716562 +- Likes: 12.3K +- Retweets: 1.6K +- Replies: 393 +- Bookmarks: 36.7K +- Views: 0 + +I don't use Codex or Claude Code directly anymore. 
+ +I use OpenClaw as my orchestration layer. My orchestrator, Zoe, spawns the agents, writes their prompts, picks the right model for each task, monitors progress, and pings me on Telegram when PRs are ready to merge. + +Proof points from the last 4 weeks: + +- 94 commits in one day. My most productive day: I had 3 client calls and didn't open my editor once. The average is around 50 commits a day. + +- 7 PRs in 30 minutes. Idea to production is blazing fast because coding and validation are mostly automated. + +- Commits → MRR: I use this for a real B2B SaaS I'm building — bundling it with founder-led sales to deliver most feature requests same-day. Speed converts leads into paying customers. + +![before Jan: CC/codex only | after Jan: Openclaw orchestrates CC/codex](https://pbs.twimg.com/media/HByXnBmW8AANOl9.jpg) + +My git history looks like I just hired a dev team. In reality it's just me going from managing claude code to managing an openclaw agent that manages a fleet of other claude code and codex agents. + +Success rate: The system one-shots almost all small to medium tasks without any intervention. + +Cost: ~$100/month for Claude and $90/month for Codex, but you can start with $20. + +Here's why this works better than using Codex or Claude Code directly: + +>Codex and Claude Code have very little context about your business. + +They see code. They don't see the full picture of your business. + +OpenClaw changes the equation. It acts as the orchestration layer between you and all agents — it holds all my business context (customer data, meeting notes, past decisions, what worked, what failed) inside my Obsidian vault, and translates historical context into precise prompts for each coding agent. The agents stay focused on code. The orchestrator stays at the high strategy level. + +Here's how the system works at a high level: + +![](https://pbs.twimg.com/media/HB0NSAEW0AAYPOF.jpg) + +Last week Stripe wrote about their background agent system called "Minions" — parallel coding agents backed by a centralized orchestration layer. I accidentally built the same thing but it runs locally on my Mac mini. + +Before I tell you how to set this up, you should know WHY you need an agent orchestrator. + +## Why One AI Can't Do Both + +Context windows are zero-sum. You have to choose what goes in. + +Fill it with code → no room for business context. Fill it with customer history → no room for the codebase. This is why the two-tier system works: each AI is loaded with exactly what it needs. + +OpenClaw and Codex have drastically different context: + +![](https://pbs.twimg.com/media/HB0EN2hXcAAbGi9.png) + +Specialization through context, not through different models. + +## The Full 8-step Workflow + +Let me walk through a real example from last week. + +Step 1: Customer Request → Scoping with Zoe + +I had a call with an agency customer. They wanted to reuse configurations they've already set up across the team. + +After the call, I talked through the request with Zoe. Because all my meeting notes sync automatically to my obsidian vault, zero explanation was needed on my end. We scoped out the feature together — and landed on a template system that lets them save and edit their existing configurations. + +Then Zoe does three things: + +1. Tops up credits to unblock the customer immediately — she has admin API access + +2.
Pulls customer config from the prod database — she has read-only prod DB access (my codex agents will never have this) to retrieve their existing setup, which gets included in the prompt + +3. Spawns a Codex agent — with a detailed prompt containing all the context + +Step 2: Spawn the Agent + +Each agent gets its own worktree (isolated branch) and tmux session: + +The agent runs in a tmux session with full terminal logging via a script. + +Here's how we launch agents: + +I used to use codex exec or claude -p, but switched to tmux recently: + +tmux is far better because mid-task redirection is powerful. Agent going the wrong direction? Don't kill it: + +The task gets tracked in .clawdbot/active-tasks.json: + +When complete, it updates with PR number and checks. (More on this in step 5) + +Step 3: Monitoring in a Loop + +A cron job runs every 10 minutes to babysit all agents. This pretty much functions as an improved Ralph Loop; more on that later. + +But it doesn't poll the agents directly — that would be expensive. Instead, it runs a script that reads the JSON registry and checks: + +The script is 100% deterministic and extremely token-efficient: + +- Checks if tmux sessions are alive +- Checks for open PRs on tracked branches +- Checks CI status via gh cli +- Auto-respawns failed agents (max 3 attempts) if CI fails or there is critical review feedback +- Only alerts if something needs human attention + +I'm not watching terminals. The system tells me when to look. + +Step 4: Agent Creates PR + +The agent commits, pushes, and opens a PR via `gh pr create --fill`. At this point I do NOT get notified — a PR alone isn't done. + +Definition of done (it's very important that your agent knows this): + +- PR created +- Branch synced to main (no merge conflicts) +- CI passing (lint, types, unit tests, E2E) +- Codex review passed +- Claude Code review passed +- Gemini review passed +- Screenshots included (if UI changes) + +Step 5: Automated Code Review + +Every PR gets reviewed by three AI models. They catch different things: + +- Codex Reviewer — Exceptional at edge cases. Does the most thorough review. Catches logic errors, missing error handling, race conditions. False positive rate is very low. + +- Gemini Code Assist Reviewer — Free and incredibly useful. Catches security issues, scalability problems other agents miss. And suggests specific fixes. A no-brainer to install. + +- Claude Code Reviewer — Mostly useless; tends to be overly cautious. Lots of "consider adding..." suggestions that are usually overengineering. I skip everything unless it's marked critical. It rarely finds critical issues on its own but validates what the other reviewers flag. + +All three post comments directly on the PR. + +Step 6: Automated Testing + +Our CI pipeline runs a heavy battery of automated tests: + +- Lint and TypeScript checks +- Unit tests +- E2E tests +- Playwright tests against a preview environment (identical to prod) + +I added a new rule last week: if the PR changes any UI, it must include a screenshot in the PR description. Otherwise CI fails. This dramatically shortens review time — I can see exactly what changed without clicking through the preview. + +Step 7: Human Review + +Now I get the Telegram notification: "PR #341 ready for review." + +By this point: + +- CI passed +- Three AI reviewers approved the code +- Screenshots show the UI changes +- All edge cases are documented in review comments + +My review takes 5-10 minutes. Many PRs I merge without reading the code — the screenshot shows me everything I need.
+ +Step 8: Merge + +PR merges. A daily cron job cleans up orphaned worktrees and the task registry JSON. + +## The Ralph Loop V2 + +This is essentially the Ralph Loop, but better. + +The Ralph Loop pulls context from memory, generates output, evaluates results, and saves learnings. But most implementations run the same prompt each cycle. The distilled learnings improve future retrievals, but the prompt itself stays static. + +Our system is different. When an agent fails, Zoe doesn't just respawn it with the same prompt. She looks at the failure with full business context and figures out how to unblock it: + +- Agent ran out of context? "Focus only on these three files." + +- Agent went the wrong direction? "Stop. The customer wanted X, not Y. Here's what they said in the meeting." + +- Agent needs clarification? "Here's the customer's email and what their company does." + +Zoe babysits agents through to completion. She has context the agents don't — customer history, meeting notes, what we tried before, why it failed. She uses that context to write better prompts on each retry. + +But she also doesn't wait for me to assign tasks. She finds work proactively: + +- Morning: Scans Sentry → finds 4 new errors → spawns 4 agents to investigate and fix + +- After meetings: Scans meeting notes → flags 3 feature requests customers mentioned → spawns 3 Codex agents + +- Evening: Scans git log → spawns Claude Code to update changelog and customer docs + +I take a walk after a customer call. Come back to Telegram: "7 PRs ready for review. 3 features, 4 bug fixes." + +When agents succeed, the pattern gets logged. "This prompt structure works for billing features." "Codex needs the type definitions upfront." "Always include the test file paths." + +The reward signals are: CI passing, all three code reviews passing, human merge. Any failure triggers the loop. Over time, Zoe writes better prompts because she remembers what shipped. + +## Choosing the Right Agent + +Not all coding agents are equal. Quick reference: + +Codex is my workhorse. Backend logic, complex bugs, multi-file refactors, anything that requires reasoning across the codebase. It's slower but thorough. I use it for 90% of tasks. + +Claude Code is faster and better at frontend work. It also has fewer permission issues, so it's great for git operations. (I used to use this more to drive day to day, but Codex 5.3 is simply better and faster now) + +Gemini has a different superpower — design sensibility. For beautiful UIs, I'll have Gemini generate an HTML/CSS spec first, then hand that to Claude Code to implement in our component system. Gemini designs, Claude builds. + +Zoe picks the right agent for each task and routes outputs between them. A billing system bug goes to Codex. A button style fix goes to Claude Code. A new dashboard design starts with Gemini. + +## How to Set This Up + +Copy this entire article into OpenClaw and tell it: "Implement this agent swarm setup for my codebase." + +It'll read the architecture, create the scripts, set up the directory structure, and configure cron monitoring. Done in 10 minutes. + +No course to sell you. + +## The Bottleneck Nobody Expects + +Here's the ceiling I'm hitting right now: RAM. + +Each agent needs its own worktree. Each worktree needs its own `node_modules`. Each agent runs builds, type checks, tests. Five agents running simultaneously means five parallel TypeScript compilers, five test runners, five sets of dependencies loaded into memory.
+ +My Mac Mini with 16GB tops out at 4-5 agents before it starts swapping — and I need to be lucky that they don't try to build at the same time. + +So I bought a Mac Studio M4 Max with 128GB RAM ($3,500) to power this system. It arrives end of March and I'll share if it's worth it. + +## Up Next: The One-Person Million-Dollar Company + +We're going to see a ton of one-person million-dollar companies starting in 2026. The leverage is massive for those who understand how to build recursively self-improving agents. + +This is what it looks like: an AI orchestrator as an extension of yourself (like what Zoe is to me), delegating work to specialized agents that handle different business functions. Engineering. Customer support. Ops. Marketing. Each agent focused on what it's good at. You maintain laser focus and full control. + +The next generation of entrepreneurs won't hire a team of 10 to do what one person with the right system can do. They'll build like this — staying small, moving fast, shipping daily. + +There's so much AI-generated slop right now. So much hype around agents and "mission controls" without building anything actually useful. Fancy demos with no real-world benefits. + +I'm trying to do the opposite: less hype, more documentation of building an actual business. Real customers, real revenue, real commits that ship to production, and real loss too. + +What am I building? Agentic PR — a one-person company taking on the enterprise PR incumbents. Agents that help startups get press coverage without a $10k/month retainer. + +If you want to see how far I take this, follow along. diff --git a/tests/fixtures/articles/odyseus0z-2030416758138634583.md b/tests/fixtures/articles/odyseus0z-2030416758138634583.md new file mode 100644 index 0000000..ed4f66a --- /dev/null +++ b/tests/fixtures/articles/odyseus0z-2030416758138634583.md @@ -0,0 +1,58 @@ +# Harness Engineering Is Cybernetics + +- Author: @odysseus0z (George) +- Published: Sat Mar 07 22:54:13 +0000 2026 +- URL: https://x.com/odysseus0z/status/2030416758138634583 +- Likes: 1.2K +- Retweets: 232 +- Replies: 20 +- Bookmarks: 2.4K +- Views: 0 + +Reading OpenAI's harness engineering post, I kept having a feeling I couldn't place. Then it clicked: I'd seen this pattern before. Not once — three times. + +The first was James Watt's centrifugal governor in the 1780s. Before it, a worker stood next to the steam engine adjusting the valve by hand. After it, a weighted flyball mechanism sensed rotational speed and adjusted the valve automatically. The worker didn't disappear. The job changed: from turning the valve to designing the governor. + +![](https://pbs.twimg.com/media/HC3M_2qacAA7mej.png) + +The second was Kubernetes. You declare desired state — three replicas, this image, these resource limits. A controller continuously observes actual state. When they diverge, the controller reconciles: restarts crashed pods, scales replicas, rolls back bad deployments. The engineer's job shifted from restarting services to writing the spec the system reconciles against. + +The third is now. OpenAI describes engineers who no longer write code. Instead they design environments, build feedback loops, and codify architectural constraints — then agents write the code. A million lines in five months, zero written by hand. They call it "harness engineering." + +Same pattern each time. Norbert Wiener named it in 1948: cybernetics, from the Greek κυβερνήτης — steersman. The same root that gave Kubernetes its name. You stop turning the valve. You steer.
+ +![](https://pbs.twimg.com/media/HC17rnca8AAQgjt.jpg) + +Each time the pattern appears, it's because someone built a sensor and actuator powerful enough to close the loop at that layer. + +## Why the codebase was the holdout + +The codebase had feedback loops, but only at the lower levels. Compilers close a loop on syntax. Test suites close a loop on behavior. Linters close a loop on style. These are real cybernetic controls — but they only operate on properties that can be checked mechanically. Does it compile? Does it pass? Does it follow the rules? + +Everything above that — does this change fit the system's architecture? is this the right approach? is this abstraction going to cause problems as the codebase grows? — had no sensor and no actuator. Only humans could operate at that level, on both sides: judging quality and writing the fix. + +LLMs changed both at once. They can sense at the level humans used to own — and act at the same level: restructure a module, redesign an inconsistent interface, rewrite a test suite around the contracts that actually matter. For the first time, the feedback loop can close where the important decisions are made. + +But closing the loop is necessary, not sufficient. Watt's governor needed to be tuned. Kubernetes controllers need the right spec. And an LLM working on your codebase needs something harder to provide. + +## Calibrating the sensor and actuator + +Getting the basic feedback loop working — tests that agents can run, CI that gives parseable output, error messages that point to the fix — is table stakes. Carlini demonstrated this when he had 16 parallel agents build a C compiler: embarrassingly simple prompts, but carefully designed test infrastructure. "Most of my effort went into designing the environment around Claude — the tests, the environment, the feedback." + +The harder problem is calibrating the sensor and actuator with knowledge specific to your system. This is where most people get stuck, and where they blame the agent. + +"It keeps doing the wrong thing. It doesn't understand our codebase." The diagnosis is almost always wrong. The agent isn't failing because it lacks capability. It's failing because the knowledge it needs — what "good" means for your system, which patterns your architecture rewards, which it avoids — is locked inside your head, and you haven't externalized it. Agents don't learn through osmosis. If you don't write it down, the agent makes the same mistakes on the hundredth run as the first. + +The work is making your judgment machine-readable. Architecture docs that describe actual layering and dependency direction. Custom linters with remediation instructions baked in. Golden principles that encode your team's taste. OpenAI found exactly this: they spent 20% of every Friday cleaning up "AI slop" — until they encoded their standards into the harness itself. + +## The only way forward + +The practices this demands — documentation, automated testing, codified architectural decisions, fast feedback loops — were always correct. Every engineering book written in the last thirty years recommends them. Most people skip them because the cost of skipping was slow and diffuse: gradual quality decline, painful onboarding, tech debt that compounds quietly. + +Agentic engineering makes the cost extreme. Skip the documentation and the agent ignores your conventions — not on one PR, but on every PR, at machine speed, around the clock. Skip the tests and the feedback loop can't close at all. 
Skip the architectural constraints and drift compounds faster than you can fix it. And here's the trap: you can't use agents to clean up the mess if the agents don't know what clean looks like. Without the calibration, the machines that created the problem can't solve it either. + +The practices haven't changed. The penalty for ignoring them has become unbearable. + +The generation-verification asymmetry — the intuition behind P vs NP, demonstrated empirically for LLMs by Cobbe et al. — points to where this goes. Generating a correct solution is harder than verifying one. You don't need to out-implement the machine. You need to out-evaluate it: specify what "correct" looks like, recognize when the output misses, judge whether the direction is right. + +The workers who designed Watt's governor didn't go back to turning valves. Not because they couldn't. Because it no longer made sense. From cd074c961ce873487eff634aa2989c63b41fb33d Mon Sep 17 00:00:00 2001 From: alextuan1024 Date: Tue, 17 Mar 2026 16:27:24 +0800 Subject: [PATCH 5/5] Revert "test: add article markdown fixtures" This reverts commit b10a780fc3ed8d322e717379cfbaf6fa67a1a3c5. --- .../articles/elvissun-2025920521871716562.md | 238 ------------------ .../articles/odyseus0z-2030416758138634583.md | 58 ----- 2 files changed, 296 deletions(-) delete mode 100644 tests/fixtures/articles/elvissun-2025920521871716562.md delete mode 100644 tests/fixtures/articles/odyseus0z-2030416758138634583.md diff --git a/tests/fixtures/articles/elvissun-2025920521871716562.md b/tests/fixtures/articles/elvissun-2025920521871716562.md deleted file mode 100644 index 301fe5e..0000000 --- a/tests/fixtures/articles/elvissun-2025920521871716562.md +++ /dev/null @@ -1,238 +0,0 @@ -# OpenClaw + Codex/ClaudeCode Agent Swarm: The One-Person Dev Team [Full Setup] - -- Author: @elvissun (Elvis) -- Published: Mon Feb 23 13:07:46 +0000 2026 -- URL: https://x.com/elvissun/status/2025920521871716562 -- Likes: 12.3K -- Retweets: 1.6K -- Replies: 393 -- Bookmarks: 36.7K -- Views: 0 - -I don't use Codex or Claude Code directly anymore. - -I use OpenClaw as my orchestration layer. My orchestrator, Zoe, spawns the agents, writes their prompts, picks the right model for each task, monitors progress, and pings me on Telegram when PRs are ready to merge. - -Proof points from the last 4 weeks: - -- 94 commits in one day. My most productive day: I had 3 client calls and didn't open my editor once. The average is around 50 commits a day. - -- 7 PRs in 30 minutes. Idea to production is blazing fast because coding and validation are mostly automated. - -- Commits → MRR: I use this for a real B2B SaaS I'm building — bundling it with founder-led sales to deliver most feature requests same-day. Speed converts leads into paying customers. - -![before Jan: CC/codex only | after Jan: Openclaw orchestrates CC/codex](https://pbs.twimg.com/media/HByXnBmW8AANOl9.jpg) - -My git history looks like I just hired a dev team. In reality it's just me going from managing claude code to managing an openclaw agent that manages a fleet of other claude code and codex agents. - -Success rate: The system one-shots almost all small to medium tasks without any intervention. - -Cost: ~$100/month for Claude and $90/month for Codex, but you can start with $20. - -Here's why this works better than using Codex or Claude Code directly: - ->Codex and Claude Code have very little context about your business. - -They see code. They don't see the full picture of your business. - -OpenClaw changes the equation.
It acts as the orchestration layer between you and all agents — it holds all my business context (customer data, meeting notes, past decisions, what worked, what failed) inside my Obsidian vault, and translates historical context into precise prompts for each coding agent. The agents stay focused on code. The orchestrator stays at the high strategy level. - -Here's how the system works at a high level: - -![](https://pbs.twimg.com/media/HB0NSAEW0AAYPOF.jpg) - -Last week Stripe wrote about their background agent system called "Minions" — parallel coding agents backed by a centralized orchestration layer. I accidentally built the same thing but it runs locally on my Mac mini. - -Before I tell you how to set this up, you should know WHY you need an agent orchestrator. - -## Why One AI Can't Do Both - -Context windows are zero-sum. You have to choose what goes in. - -Fill it with code → no room for business context. Fill it with customer history → no room for the codebase. This is why the two-tier system works: each AI is loaded with exactly what it needs. - -OpenClaw and Codex have drastically different context: - -![](https://pbs.twimg.com/media/HB0EN2hXcAAbGi9.png) - -Specialization through context, not through different models. - -## The Full 8-step Workflow - -Let me walk through a real example from last week. - -Step 1: Customer Request → Scoping with Zoe - -I had a call with an agency customer. They wanted to reuse configurations they've already set up across the team. - -After the call, I talked through the request with Zoe. Because all my meeting notes sync automatically to my obsidian vault, zero explanation was needed on my end. We scoped out the feature together — and landed on a template system that lets them save and edit their existing configurations. - -Then Zoe does three things: - -1. Tops up credits to unblock the customer immediately — she has admin API access - -2. Pulls customer config from the prod database — she has read-only prod DB access (my codex agents will never have this) to retrieve their existing setup, which gets included in the prompt - -3. Spawns a Codex agent — with a detailed prompt containing all the context - -Step 2: Spawn the Agent - -Each agent gets its own worktree (isolated branch) and tmux session: - -The agent runs in a tmux session with full terminal logging via a script. - -Here's how we launch agents: - -I used to use codex exec or claude -p, but switched to tmux recently: - -tmux is far better because mid-task redirection is powerful. Agent going the wrong direction? Don't kill it: - -The task gets tracked in .clawdbot/active-tasks.json: - -When complete, it updates with PR number and checks. (More on this in step 5) - -Step 3: Monitoring in a Loop - -A cron job runs every 10 minutes to babysit all agents. This pretty much functions as an improved Ralph Loop; more on that later. - -But it doesn't poll the agents directly — that would be expensive. Instead, it runs a script that reads the JSON registry and checks: - -The script is 100% deterministic and extremely token-efficient: - -- Checks if tmux sessions are alive -- Checks for open PRs on tracked branches -- Checks CI status via gh cli -- Auto-respawns failed agents (max 3 attempts) if CI fails or there is critical review feedback -- Only alerts if something needs human attention - -I'm not watching terminals. The system tells me when to look. - -Step 4: Agent Creates PR - -The agent commits, pushes, and opens a PR via `gh pr create --fill`. At this point I do NOT get notified — a PR alone isn't done.
- -Definition of done (it's very important that your agent knows this): - -- PR created -- Branch synced to main (no merge conflicts) -- CI passing (lint, types, unit tests, E2E) -- Codex review passed -- Claude Code review passed -- Gemini review passed -- Screenshots included (if UI changes) - -Step 5: Automated Code Review - -Every PR gets reviewed by three AI models. They catch different things: - -- Codex Reviewer — Exceptional at edge cases. Does the most thorough review. Catches logic errors, missing error handling, race conditions. False positive rate is very low. - -- Gemini Code Assist Reviewer — Free and incredibly useful. Catches security issues, scalability problems other agents miss. And suggests specific fixes. A no-brainer to install. - -- Claude Code Reviewer — Mostly useless; tends to be overly cautious. Lots of "consider adding..." suggestions that are usually overengineering. I skip everything unless it's marked critical. It rarely finds critical issues on its own but validates what the other reviewers flag. - -All three post comments directly on the PR. - -Step 6: Automated Testing - -Our CI pipeline runs a heavy battery of automated tests: - -- Lint and TypeScript checks -- Unit tests -- E2E tests -- Playwright tests against a preview environment (identical to prod) - -I added a new rule last week: if the PR changes any UI, it must include a screenshot in the PR description. Otherwise CI fails. This dramatically shortens review time — I can see exactly what changed without clicking through the preview. - -Step 7: Human Review - -Now I get the Telegram notification: "PR #341 ready for review." - -By this point: - -- CI passed -- Three AI reviewers approved the code -- Screenshots show the UI changes -- All edge cases are documented in review comments - -My review takes 5-10 minutes. Many PRs I merge without reading the code — the screenshot shows me everything I need. - -Step 8: Merge - -PR merges. A daily cron job cleans up orphaned worktrees and the task registry JSON. - -## The Ralph Loop V2 - -This is essentially the Ralph Loop, but better. - -The Ralph Loop pulls context from memory, generates output, evaluates results, and saves learnings. But most implementations run the same prompt each cycle. The distilled learnings improve future retrievals, but the prompt itself stays static. - -Our system is different. When an agent fails, Zoe doesn't just respawn it with the same prompt. She looks at the failure with full business context and figures out how to unblock it: - -- Agent ran out of context? "Focus only on these three files." - -- Agent went the wrong direction? "Stop. The customer wanted X, not Y. Here's what they said in the meeting." - -- Agent needs clarification? "Here's the customer's email and what their company does." - -Zoe babysits agents through to completion. She has context the agents don't — customer history, meeting notes, what we tried before, why it failed. She uses that context to write better prompts on each retry. - -But she also doesn't wait for me to assign tasks. She finds work proactively: - -- Morning: Scans Sentry → finds 4 new errors → spawns 4 agents to investigate and fix - -- After meetings: Scans meeting notes → flags 3 feature requests customers mentioned → spawns 3 Codex agents - -- Evening: Scans git log → spawns Claude Code to update changelog and customer docs - -I take a walk after a customer call. Come back to Telegram: "7 PRs ready for review. 3 features, 4 bug fixes." - -When agents succeed, the pattern gets logged.
"This prompt structure works for billing features." "Codex needs the type definitions upfront." "Always include the test file paths." - -The reward signals are: CI passing, all three code reviews passing, human merge. Any failure triggers the loop. Over time, Zoe writes better prompts because she remembers what shipped. - -## Choosing the Right Agent - -Not all coding agents are equal. Quick reference: - -Codex is my workhorse. Backend logic, complex bugs, multi-file refactors, anything that requires reasoning across the codebase. It's slower but thorough. I use it for 90% of tasks. - -Claude Code is faster and better at frontend work. It also has fewer permission issues, so it's great for git operations. (I used to use this more to drive day to day, but Codex 5.3 is simply better and faster now) - -Gemini has a different superpower — design sensibility. For beautiful UIs, I'll have Gemini generate an HTML/CSS spec first, then hand that to Claude Code to implement in our component system. Gemini designs, Claude builds. - -Zoe picks the right agent for each task and routes outputs between them. A billing system bug goes to Codex. A button style fix goes to Claude Code. A new dashboard design starts with Gemini. - -## How to Set This Up - -Copy this entire article into OpenClaw and tell it: "Implement this agent swarm setup for my codebase." - -It'll read the architecture, create the scripts, set up the directory structure, and configure cron monitoring. Done in 10 minutes. - -No course to sell you. - -## The Bottleneck Nobody Expects - -Here's the ceiling I'm hitting right now: RAM. - -Each agent needs its own worktree. Each worktree needs its own `node_modules`. Each agent runs builds, type checks, tests. Five agents running simultaneously means five parallel TypeScript compilers, five test runners, five sets of dependencies loaded into memory. - -My Mac Mini with 16GB tops out at 4-5 agents before it starts swapping — and I need to be lucky they don't try to build at the same time. - -So I bought a Mac Studio M4 max with 128GB RAM ($3,500) to power this system. It arrives end of March and I'll share if it's worth it. - -## Up Next: The One-Person Million-Dollar Company - -We're going to see a ton of one-person million-dollar companies starting in 2026. The leverage is massive for those who understand how to build recursively self-improving agents. - -This is what it looks like: an AI orchestrator as an extension of yourself (like what Zoe is to me), delegating work to specialized agents that handle different business functions. Engineering. Customer support. Ops. Marketing. Each agent focused on what it's good at. You maintain laser focus and full control. - -The next generation of entrepreneurs won't hire a team of 10 to do what one person with the right system can do. They'll build like this — staying small, moving fast, shipping daily. - -There's so much AI-generated slop right now. So much hype around agents and "mission controls" without building anything actually useful. Fancy demos with no real-world benefits. - -I'm trying to do the opposite: less hype, more documentation of building an actual business. Real customers, real revenue, real commits that ship to production, and real loss too. - -What am I building? Agentic PR — a one-person company taking on the enterprise PR incumbents. Agents that help startups get press coverage without a $10k/month retainer. - -If you want to see how far I take this, follow along. 
diff --git a/tests/fixtures/articles/odyseus0z-2030416758138634583.md b/tests/fixtures/articles/odyseus0z-2030416758138634583.md deleted file mode 100644 index ed4f66a..0000000 --- a/tests/fixtures/articles/odyseus0z-2030416758138634583.md +++ /dev/null @@ -1,58 +0,0 @@ -# Harness Engineering Is Cybernetics - -- Author: @odysseus0z (George) -- Published: Sat Mar 07 22:54:13 +0000 2026 -- URL: https://x.com/odysseus0z/status/2030416758138634583 -- Likes: 1.2K -- Retweets: 232 -- Replies: 20 -- Bookmarks: 2.4K -- Views: 0 - -Reading OpenAI's harness engineering post, I kept having a feeling I couldn't place. Then it clicked: I'd seen this pattern before. Not once — three times. - -The first was James Watt's centrifugal governor in the 1780s. Before it, a worker stood next to the steam engine adjusting the valve by hand. After it, a weighted flyball mechanism sensed rotational speed and adjusted the valve automatically. The worker didn't disappear. The job changed: from turning the valve to designing the governor. - -![](https://pbs.twimg.com/media/HC3M_2qacAA7mej.png) - -The second was Kubernetes. You declare desired state — three replicas, this image, these resource limits. A controller continuously observes actual state. When they diverge, the controller reconciles: restarts crashed pods, scales replicas, rolls back bad deployments. The engineer's job shifted from restarting services to writing the spec the system reconciles against. - -The third is now. OpenAI describes engineers who no longer write code. Instead they design environments, build feedback loops, and codify architectural constraints — then agents write the code. A million lines in five months, zero written by hand. They call it "harness engineering." - -Same pattern each time. Norbert Wiener named it in 1948: cybernetics, from the Greek κυβερνήτης — steersman. The same root that gave Kubernetes its name. You stop turning the valve. You steer. - -![](https://pbs.twimg.com/media/HC17rnca8AAQgjt.jpg) - -Each time the pattern appears, it's because someone built a sensor and actuator powerful enough to close the loop at that layer. - -## Why the codebase was the holdout - -The codebase had feedback loops, but only at the lower levels. Compilers close a loop on syntax. Test suites close a loop on behavior. Linters close a loop on style. These are real cybernetic controls — but they only operate on properties that can be checked mechanically. Does it compile? Does it pass? Does it follow the rules? - -Everything above that — does this change fit the system's architecture? is this the right approach? is this abstraction going to cause problems as the codebase grows? — had no sensor and no actuator. Only humans could operate at that level, on both sides: judging quality and writing the fix. - -LLMs changed both at once. They can sense at the level humans used to own — and act at the same level: restructure a module, redesign an inconsistent interface, rewrite a test suite around the contracts that actually matter. For the first time, the feedback loop can close where the important decisions are made. - -But closing the loop is necessary, not sufficient. Watt's governor needed to be tuned. Kubernetes controllers need the right spec. And an LLM working on your codebase needs something harder to provide. - -## Calibrating the sensor and actuator - -Getting the basic feedback loop working — tests that agents can run, CI that gives parseable output, error messages that point to the fix — is table stakes. 
Carlini demonstrated this when he had 16 parallel agents build a C compiler: embarrassingly simple prompts, but carefully designed test infrastructure. "Most of my effort went into designing the environment around Claude — the tests, the environment, the feedback." - -The harder problem is calibrating the sensor and actuator with knowledge specific to your system. This is where most people get stuck, and where they blame the agent. - -"It keeps doing the wrong thing. It doesn't understand our codebase." The diagnosis is almost always wrong. The agent isn't failing because it lacks capability. It's failing because the knowledge it needs — what "good" means for your system, which patterns your architecture rewards, which it avoids — is locked inside your head, and you haven't externalized it. Agents don't learn through osmosis. If you don't write it down, the agent makes the same mistakes on the hundredth run as the first. - -The work is making your judgment machine-readable. Architecture docs that describe actual layering and dependency direction. Custom linters with remediation instructions baked in. Golden principles that encode your team's taste. OpenAI found exactly this: they spent 20% of every Friday cleaning up "AI slop" — until they encoded their standards into the harness itself. - -## The only way forward - -The practices this demands — documentation, automated testing, codified architectural decisions, fast feedback loops — were always correct. Every engineering book written in the last thirty years recommends them. Most people skip them because the cost of skipping was slow and diffuse: gradual quality decline, painful onboarding, tech debt that compounds quietly. - -Agentic engineering makes the cost extreme. Skip the documentation and the agent ignores your conventions — not on one PR, but on every PR, at machine speed, around the clock. Skip the tests and the feedback loop can't close at all. Skip the architectural constraints and drift compounds faster than you can fix it. And here's the trap: you can't use agents to clean up the mess if the agents don't know what clean looks like. Without the calibration, the machines that created the problem can't solve it either. - -The practices haven't changed. The penalty for ignoring them has become unbearable. - -The generation-verification asymmetry — the intuition behind P vs NP, demonstrated empirically for LLMs by Cobbe et al. — points to where this goes. Generating a correct solution is harder than verifying one. You don't need to out-implement the machine. You need to out-evaluate it: specify what "correct" looks like, recognize when the output misses, judge whether the direction is right. - -The workers who designed Watt's governor didn't go back to turning valves. Not because they couldn't. Because it no longer made sense.
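Usage sketch (reviewer note, not part of any patch): how the helpers introduced in patches 1 and 2 compose end to end. The helper names and behavior match twitter_cli/parser.py as of patch 2; the payload below is a hypothetical minimal article, and the media ID and URL are made up for illustration.

from twitter_cli.parser import (
    _extract_article_images,
    _extract_article_media_url_map,
    _normalize_article_entity_map,
)

# Draft.js entityMap arrives either dict-keyed ({"2": {...}}) or as a list of
# {key, value} pairs; normalization folds both shapes into a str-keyed dict.
entity_map = _normalize_article_entity_map(
    [{"key": "2", "value": {"type": "MEDIA", "data": {"mediaItems": [{"mediaId": "42"}]}}}]
)

# MEDIA entities reference media IDs only; article media_entities supply the URLs.
media_url_map = _extract_article_media_url_map(
    {"media_entities": [{"media_id": "42", "media_info": {"original_img_url": "https://pbs.twimg.com/media/example.png"}}]}
)

# An atomic block whose entity range points at key 2 renders as a Markdown image.
block = {"type": "atomic", "text": " ", "entityRanges": [{"offset": 0, "length": 1, "key": 2}]}
print(_extract_article_images(block, entity_map, media_url_map))
# -> ['![](https://pbs.twimg.com/media/example.png)']

Normalizing every entityMap key to a string up front is what lets a single lookup path serve both payload shapes, keeping the dict-style case from patch 1 and the list-style case from patch 2 on the same code path.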