From 800896124e5a029e6fc24f769598afc2e427a14a Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Wed, 6 May 2026 09:44:17 -0400 Subject: [PATCH 1/4] Fix AI packet prompt projection --- inc/Cli/Commands/AICommand.php | 8 + inc/Core/Steps/AI/AIStep.php | 55 +---- inc/Engine/AI/DataPacketPromptProjector.php | 230 ++++++++++++++++++++ inc/Engine/AI/RequestInspector.php | 9 +- tests/Unit/Core/Steps/AI/AIStepTest.php | 97 +++++++++ tests/ai-packet-projection-smoke.php | 104 +++++++++ tests/ai-request-inspector-smoke.php | 2 + 7 files changed, 454 insertions(+), 51 deletions(-) create mode 100644 inc/Engine/AI/DataPacketPromptProjector.php create mode 100644 tests/ai-packet-projection-smoke.php diff --git a/inc/Cli/Commands/AICommand.php b/inc/Cli/Commands/AICommand.php index a0c709650..8ddd1ca69 100644 --- a/inc/Cli/Commands/AICommand.php +++ b/inc/Cli/Commands/AICommand.php @@ -104,6 +104,14 @@ private function renderTableSummary( array $result ): void { 'metric' => 'conversation_user_message_bytes', 'value' => (int) $result['conversation_user_message_bytes'], ), + array( + 'metric' => 'canonical_packet_json_bytes', + 'value' => (int) $result['canonical_packet_json_bytes'], + ), + array( + 'metric' => 'projected_packet_json_bytes', + 'value' => (int) $result['projected_packet_json_bytes'], + ), array( 'metric' => 'conversation_packet_json_bytes', 'value' => (int) $result['conversation_packet_json_bytes'], diff --git a/inc/Core/Steps/AI/AIStep.php b/inc/Core/Steps/AI/AIStep.php index 75adf1c57..8a1cef396 100644 --- a/inc/Core/Steps/AI/AIStep.php +++ b/inc/Core/Steps/AI/AIStep.php @@ -12,6 +12,7 @@ use DataMachine\Core\Steps\StepTypeRegistrationTrait; use DataMachine\Core\Steps\QueueableTrait; use DataMachine\Engine\AI\ConversationManager; +use DataMachine\Engine\AI\DataPacketPromptProjector; use DataMachine\Engine\AI\PipelineTranscriptPolicy; use DataMachine\Engine\AI\Tools\ToolExecutor; use DataMachine\Engine\AI\Tools\ToolPolicyResolver; @@ -191,7 +192,7 @@ protected function executeStep(): array { $messages = array(); if ( ! empty( $this->dataPackets ) ) { - $data_packet_content = wp_json_encode( array( 'data_packets' => self::sanitizeDataPacketsForAi( $this->dataPackets ) ), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE ); + $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $this->dataPackets ) ), JSON_UNESCAPED_UNICODE ); $messages[] = ConversationManager::buildConversationMessage( 'user', false === $data_packet_content ? '' : $data_packet_content @@ -431,58 +432,16 @@ protected function executeStep(): array { } /** - * Remove local-only file paths before serializing data packets to AI. + * Project data packets before serializing them to AI. * - * Fetch handlers may include file_info.file_path so downstream runtime steps - * can attach images or access files. That internal path should not be exposed - * in the AI-visible JSON payload because models can copy it into generated - * content. The original packets remain unchanged for runtime use. + * Kept as a compatibility wrapper for older tests/call sites. Canonical + * packets remain unchanged for runtime and storage use. * * @param array $data_packets Original data packets. - * @return array Sanitized copy safe for AI serialization. + * @return array Projected copy safe for AI serialization. */ public static function sanitizeDataPacketsForAi( array $data_packets ): array { - $sanitized_packets = array(); - - foreach ( $data_packets as $packet ) { - if ( ! is_array( $packet ) ) { - $sanitized_packets[] = $packet; - continue; - } - - $sanitized_packet = $packet; - - if ( isset( $sanitized_packet['data'] ) && is_array( $sanitized_packet['data'] ) ) { - $sanitized_packet['data'] = self::sanitizePacketDataForAi( $sanitized_packet['data'] ); - } - - $sanitized_packets[] = $sanitized_packet; - } - - return $sanitized_packets; - } - - /** - * Remove internal file path fields from packet data. - * - * @param array $packet_data Packet data array. - * @return array Sanitized packet data. - */ - private static function sanitizePacketDataForAi( array $packet_data ): array { - if ( ! isset( $packet_data['file_info'] ) || ! is_array( $packet_data['file_info'] ) ) { - return $packet_data; - } - - $sanitized_file_info = $packet_data['file_info']; - unset( $sanitized_file_info['file_path'] ); - - if ( empty( $sanitized_file_info ) ) { - unset( $packet_data['file_info'] ); - return $packet_data; - } - - $packet_data['file_info'] = $sanitized_file_info; - return $packet_data; + return DataPacketPromptProjector::project( $data_packets ); } /** diff --git a/inc/Engine/AI/DataPacketPromptProjector.php b/inc/Engine/AI/DataPacketPromptProjector.php new file mode 100644 index 000000000..abbde8139 --- /dev/null +++ b/inc/Engine/AI/DataPacketPromptProjector.php @@ -0,0 +1,230 @@ + self::firstString( $source, $data, $metadata, array( 'title', 'name', 'subject' ) ), + 'body' => self::firstString( $source, $data, array(), array( 'content', 'body', 'text', 'summary', 'description' ) ), + 'url' => self::firstString( $source, $metadata, $data, array( 'url', 'link', 'permalink', 'source_url', 'mcp_url' ) ), + 'date' => self::firstString( $source, $metadata, array(), array( 'date', 'created_at', 'updated_at', 'modified_at', 'mcp_date' ) ), + 'author' => self::firstString( $source, $metadata, array(), array( 'author', 'byline', 'user', 'mcp_author' ) ), + 'matching_content' => self::cleanSnippet( self::firstString( $source, $metadata, array(), array( 'matching_content', 'snippet', 'excerpt' ) ) ), + 'tags' => self::firstValue( $source, $metadata, array(), array( 'tags', 'mcp_tags' ) ), + 'source_id' => self::firstString( $source, $metadata, array(), array( 'id', 'guid', 'item_identifier', 'source_id' ) ), + ), + static fn( $value ) => null !== $value && '' !== $value && array() !== $value + ); + + $projected_data = self::sanitizePacketData( $projected_data ); + + $projected_metadata = array_filter( + array( + 'source_type' => $metadata['source_type'] ?? null, + 'source_url' => $metadata['source_url'] ?? ( $metadata['mcp_url'] ?? null ), + 'item_identifier' => $metadata['item_identifier'] ?? null, + 'source_label' => $metadata['source_label'] ?? ( $metadata['mcp_provider'] ?? null ), + ), + static fn( $value ) => null !== $value && '' !== $value && array() !== $value + ); + + $projected = array( + 'type' => $packet['type'] ?? 'fetch', + 'data' => $projected_data, + 'metadata' => $projected_metadata, + ); + + if ( array_key_exists( 'timestamp', $packet ) ) { + $projected['timestamp'] = $packet['timestamp']; + } + + return $projected; + } + + /** + * Remove internal fields from prompt-facing data. + * + * @param array $packet_data Packet data. + * @return array Sanitized packet data. + */ + private static function sanitizePacketData( array $packet_data ): array { + if ( ! isset( $packet_data['file_info'] ) || ! is_array( $packet_data['file_info'] ) ) { + return $packet_data; + } + + $sanitized_file_info = $packet_data['file_info']; + unset( $sanitized_file_info['file_path'] ); + + if ( empty( $sanitized_file_info ) ) { + unset( $packet_data['file_info'] ); + return $packet_data; + } + + $packet_data['file_info'] = $sanitized_file_info; + return $packet_data; + } + + /** + * Decode a JSON object from a packet body. + * + * @param mixed $value Candidate JSON string. + * @return array|null + */ + private static function decodeJsonObject( mixed $value ): ?array { + if ( ! is_string( $value ) || '' === trim( $value ) ) { + return null; + } + + $decoded = json_decode( $value, true ); + return is_array( $decoded ) ? $decoded : null; + } + + /** + * Return the first scalar value as a string. + * + * @param array $primary Primary source. + * @param array $secondary Secondary source. + * @param array $tertiary Tertiary source. + * @param array $keys Candidate keys. + * @return string|null + */ + private static function firstString( array $primary, array $secondary, array $tertiary, array $keys ): ?string { + foreach ( array( $primary, $secondary, $tertiary ) as $source ) { + foreach ( $keys as $key ) { + if ( ! array_key_exists( $key, $source ) || ! is_scalar( $source[ $key ] ) ) { + continue; + } + + $value = trim( (string) $source[ $key ] ); + if ( '' !== $value ) { + return $value; + } + } + } + + return null; + } + + /** + * Return the first available value for any candidate key. + * + * @param array $primary Primary source. + * @param array $secondary Secondary source. + * @param array $tertiary Tertiary source. + * @param array $keys Candidate keys. + * @return mixed|null + */ + private static function firstValue( array $primary, array $secondary, array $tertiary, array $keys ): mixed { + foreach ( array( $primary, $secondary, $tertiary ) as $source ) { + foreach ( $keys as $key ) { + if ( array_key_exists( $key, $source ) && null !== $source[ $key ] && '' !== $source[ $key ] ) { + return $source[ $key ]; + } + } + } + + return null; + } + + /** + * Remove search-highlight tags from snippets. + * + * @param string|null $snippet Snippet value. + * @return string|null + */ + private static function cleanSnippet( ?string $snippet ): ?string { + if ( null === $snippet ) { + return null; + } + + $snippet = preg_replace( '#]*>#i', '', $snippet ); + $snippet = is_string( $snippet ) ? trim( $snippet ) : ''; + + return '' === $snippet ? null : $snippet; + } +} diff --git a/inc/Engine/AI/RequestInspector.php b/inc/Engine/AI/RequestInspector.php index ea29a5c8d..802b61606 100644 --- a/inc/Engine/AI/RequestInspector.php +++ b/inc/Engine/AI/RequestInspector.php @@ -13,7 +13,6 @@ use DataMachine\Core\EngineData; use DataMachine\Core\FilesRepository\FileRetrieval; use DataMachine\Core\PluginSettings; -use DataMachine\Core\Steps\AI\AIStep; use DataMachine\Core\Steps\AI\ToolPolicy\PipelineToolPolicyArgs; use DataMachine\Core\Steps\FlowStepConfig; use DataMachine\Abilities\Flow\QueueAbility; @@ -191,7 +190,7 @@ private function buildInitialMessages( array $data_packets, EngineData $engine, $messages = array(); if ( ! empty( $data_packets ) ) { - $data_packet_content = wp_json_encode( array( 'data_packets' => AIStep::sanitizeDataPacketsForAi( $data_packets ) ), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE ); + $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $data_packets ) ), JSON_UNESCAPED_UNICODE ); $messages[] = ConversationManager::buildConversationMessage( 'user', false === $data_packet_content ? '' : $data_packet_content @@ -272,6 +271,8 @@ private function measure( array $assembled, array $data_packets, array $initial_ $messages = $request['messages'] ?? array(); $tools = $request['tools'] ?? array(); + $projected_packets = DataPacketPromptProjector::project( $data_packets ); + return array( 'message_count' => count( $messages ), 'initial_message_count' => count( $initial_messages ), @@ -279,7 +280,9 @@ private function measure( array $assembled, array $data_packets, array $initial_ 'messages_json_bytes' => self::jsonBytes( $messages ), 'tools_json_bytes' => self::jsonBytes( $tools ), 'conversation_user_message_bytes' => self::sumUserMessageBytes( $initial_messages ), - 'conversation_packet_json_bytes' => self::jsonBytes( AIStep::sanitizeDataPacketsForAi( $data_packets ) ), + 'canonical_packet_json_bytes' => self::jsonBytes( $data_packets ), + 'projected_packet_json_bytes' => self::jsonBytes( $projected_packets ), + 'conversation_packet_json_bytes' => self::jsonBytes( $projected_packets ), 'directives' => $assembled['directive_breakdown'], 'tool_count' => count( $structured_tools ), 'largest_tools' => $this->largestTools( $structured_tools ), diff --git a/tests/Unit/Core/Steps/AI/AIStepTest.php b/tests/Unit/Core/Steps/AI/AIStepTest.php index 943582dbc..3da794133 100644 --- a/tests/Unit/Core/Steps/AI/AIStepTest.php +++ b/tests/Unit/Core/Steps/AI/AIStepTest.php @@ -8,6 +8,7 @@ namespace DataMachine\Tests\Unit\Core\Steps\AI; use DataMachine\Core\Steps\AI\AIStep; +use DataMachine\Engine\AI\DataPacketPromptProjector; use DataMachine\Engine\AI\Tools\ToolResultFinder; use PHPUnit\Framework\TestCase; use ReflectionMethod; @@ -79,6 +80,102 @@ public function test_sanitize_data_packets_for_ai_leaves_packets_without_file_in $this->assertSame( $data_packets, AIStep::sanitizeDataPacketsForAi( $data_packets ) ); } + public function test_prompt_projection_flattens_mcp_packet_and_preserves_canonical_packet(): void { + $raw_item = array( + 'id' => 'mgs-123', + 'title' => 'Data Download, April 14, 2026', + 'url' => 'https://example.com/p2/post', + 'date' => '2026-04-14T12:00:00Z', + 'author' => 'Chris', + 'matching_content' => 'A highlighted source snippet.', + 'tags' => array( 'mgs', 'history' ), + ); + $canonical = array( + array( + 'type' => 'fetch', + 'timestamp' => 1770000000, + 'data' => array( + 'title' => 'Wrapped title', + 'body' => wp_json_encode( $raw_item, JSON_UNESCAPED_UNICODE ), + ), + 'metadata' => array( + 'source_type' => 'mcp', + 'pipeline_id' => 3, + 'flow_id' => 2, + 'handler' => 'mcp_fetch', + 'mcp_provider' => 'WordPress.com MGS', + 'mcp_server' => 'wordpress-com', + 'mcp_tool' => 'search', + 'mcp_url' => 'https://example.com/p2/post', + 'mcp_raw_item' => $raw_item, + 'item_identifier' => 'mgs-123', + ), + ), + ); + + $canonical_before = $canonical; + $projected = DataPacketPromptProjector::project( $canonical ); + + $this->assertSame( $canonical_before, $canonical, 'Projection must not mutate canonical packets.' ); + $this->assertSame( 'Data Download, April 14, 2026', $projected[0]['data']['title'] ); + $this->assertSame( 'https://example.com/p2/post', $projected[0]['data']['url'] ); + $this->assertSame( 'A highlighted source snippet.', $projected[0]['data']['matching_content'] ); + $this->assertSame( array( 'mgs', 'history' ), $projected[0]['data']['tags'] ); + $this->assertSame( 'mcp', $projected[0]['metadata']['source_type'] ); + $this->assertSame( 'mgs-123', $projected[0]['metadata']['item_identifier'] ); + $this->assertArrayNotHasKey( 'mcp_raw_item', $projected[0]['metadata'] ); + $this->assertArrayNotHasKey( 'pipeline_id', $projected[0]['metadata'] ); + + $canonical_bytes = strlen( wp_json_encode( $canonical, JSON_UNESCAPED_UNICODE ) ); + $projected_bytes = strlen( wp_json_encode( $projected, JSON_UNESCAPED_UNICODE ) ); + + $this->assertLessThan( $canonical_bytes, $projected_bytes ); + } + + public function test_prompt_projection_generic_fallback_preserves_unknown_packet_shape(): void { + $canonical = array( + array( + 'type' => 'fetch', + 'data' => array( + 'title' => 'RSS item', + 'body' => 'Keep body', + 'file_info' => array( + 'file_path' => '/tmp/runtime-only.jpg', + 'mime_type' => 'image/jpeg', + ), + ), + 'metadata' => array( + 'source_type' => 'rss', + 'custom_key' => 'custom value', + ), + ), + ); + + $projected = DataPacketPromptProjector::project( $canonical ); + + $this->assertSame( 'RSS item', $projected[0]['data']['title'] ); + $this->assertSame( 'Keep body', $projected[0]['data']['body'] ); + $this->assertSame( 'rss', $projected[0]['metadata']['source_type'] ); + $this->assertSame( 'custom value', $projected[0]['metadata']['custom_key'] ); + $this->assertArrayNotHasKey( 'file_path', $projected[0]['data']['file_info'] ); + $this->assertSame( '/tmp/runtime-only.jpg', $canonical[0]['data']['file_info']['file_path'] ); + } + + public function test_prompt_projection_does_not_flatten_unknown_json_body_packets(): void { + $canonical = array( + array( + 'type' => 'fetch', + 'data' => array( + 'title' => 'Unknown JSON packet', + 'body' => '{"title":"Nested title","custom":"important"}', + ), + 'metadata' => array( 'source_type' => 'custom_json_feed' ), + ), + ); + + $this->assertSame( $canonical, DataPacketPromptProjector::project( $canonical ) ); + } + /** * Test that processLoopResults does NOT carry forward input DataPackets. * diff --git a/tests/ai-packet-projection-smoke.php b/tests/ai-packet-projection-smoke.php new file mode 100644 index 000000000..e4694fc26 --- /dev/null +++ b/tests/ai-packet-projection-smoke.php @@ -0,0 +1,104 @@ + 'mgs-624', + 'title' => 'Data Download, April 14, 2026', + 'url' => 'https://example.com/a8c/post', + 'date' => '2026-04-14T12:00:00Z', + 'author' => 'Chris', + 'matching_content' => 'Useful highlight for the model.', + 'tags' => array( 'mgs', 'history' ), +); + +$canonical = array( + array( + 'type' => 'fetch', + 'timestamp' => 1770000000, + 'data' => array( + 'title' => 'Wrapped MGS item', + 'body' => wp_json_encode( $raw_item, JSON_UNESCAPED_UNICODE ), + ), + 'metadata' => array( + 'source_type' => 'mcp', + 'pipeline_id' => 3, + 'flow_id' => 2, + 'handler' => 'mcp_fetch', + 'mcp_provider' => 'WordPress.com MGS', + 'mcp_server' => 'wordpress-com', + 'mcp_tool' => 'search', + 'mcp_url' => 'https://example.com/a8c/post', + 'mcp_raw_item' => $raw_item, + 'item_identifier' => 'mgs-624', + ), + ), +); + +$canonical_before = $canonical; +$projected = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $canonical ); + +assert_projection( 'canonical packet unchanged after projection', $canonical_before === $canonical ); +assert_projection( 'MGS title flattened from source body', 'Data Download, April 14, 2026' === ( $projected[0]['data']['title'] ?? '' ) ); +assert_projection( 'snippet strips em highlight tags', 'Useful highlight for the model.' === ( $projected[0]['data']['matching_content'] ?? '' ) ); +assert_projection( 'mcp_raw_item omitted from prompt metadata', ! array_key_exists( 'mcp_raw_item', $projected[0]['metadata'] ?? array() ) ); +assert_projection( 'engine plumbing omitted from prompt metadata', ! array_key_exists( 'pipeline_id', $projected[0]['metadata'] ?? array() ) ); +assert_projection( 'stable item identifier preserved', 'mgs-624' === ( $projected[0]['metadata']['item_identifier'] ?? '' ) ); + +$canonical_bytes = strlen( wp_json_encode( $canonical, JSON_UNESCAPED_UNICODE ) ); +$projected_bytes = strlen( wp_json_encode( $projected, JSON_UNESCAPED_UNICODE ) ); + +assert_projection( 'projected packet JSON is smaller than canonical JSON', $projected_bytes < $canonical_bytes, "canonical=$canonical_bytes projected=$projected_bytes" ); + +$prompt_json = wp_json_encode( array( 'data_packets' => $projected ), JSON_UNESCAPED_UNICODE ); +assert_projection( 'prompt JSON is compact by default', ! str_contains( $prompt_json, "\n" ) ); + +$unknown_json_packet = array( + array( + 'type' => 'fetch', + 'data' => array( + 'title' => 'Unknown JSON packet', + 'body' => '{"title":"Nested title","custom":"important"}', + ), + 'metadata' => array( 'source_type' => 'custom_json_feed' ), + ), +); +$unknown_projected = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $unknown_json_packet ); +assert_projection( 'unknown JSON body packets use conservative fallback', $unknown_json_packet === $unknown_projected ); + +echo "\n$total assertions, $failed failures\n"; +if ( $failed > 0 ) { + exit( 1 ); +} diff --git a/tests/ai-request-inspector-smoke.php b/tests/ai-request-inspector-smoke.php index 843c0fbe4..87219a69f 100644 --- a/tests/ai-request-inspector-smoke.php +++ b/tests/ai-request-inspector-smoke.php @@ -202,6 +202,8 @@ function ( array $directives ): array { assert_test( '--job option documented', false !== strpos( $command, '--job=' ) ); assert_test( '--step option documented', false !== strpos( $command, '--step=' ) ); assert_test( 'json output path exists', false !== strpos( $command, "'json' === \$format" ) ); +assert_test( 'table output includes canonical packet bytes', false !== strpos( $command, 'canonical_packet_json_bytes' ) ); +assert_test( 'table output includes projected packet bytes', false !== strpos( $command, 'projected_packet_json_bytes' ) ); assert_test( 'table output includes directive section', false !== strpos( $command, "Directives" ) ); assert_test( 'table output includes largest tools section', false !== strpos( $command, "Largest tools" ) ); From 24c2278090f8c9ce7dd0dafe6ef61265e3bf510e Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Wed, 6 May 2026 10:06:20 -0400 Subject: [PATCH 2/4] Preserve MGS matching content snippets --- inc/Engine/AI/DataPacketPromptProjector.php | 37 ++++++++++++++++++++- tests/Unit/Core/Steps/AI/AIStepTest.php | 15 +++++++-- tests/ai-packet-projection-smoke.php | 15 +++++++-- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/inc/Engine/AI/DataPacketPromptProjector.php b/inc/Engine/AI/DataPacketPromptProjector.php index abbde8139..54f43b409 100644 --- a/inc/Engine/AI/DataPacketPromptProjector.php +++ b/inc/Engine/AI/DataPacketPromptProjector.php @@ -94,7 +94,7 @@ private static function projectMcpPacket( array $packet, array $data, array $met 'url' => self::firstString( $source, $metadata, $data, array( 'url', 'link', 'permalink', 'source_url', 'mcp_url' ) ), 'date' => self::firstString( $source, $metadata, array(), array( 'date', 'created_at', 'updated_at', 'modified_at', 'mcp_date' ) ), 'author' => self::firstString( $source, $metadata, array(), array( 'author', 'byline', 'user', 'mcp_author' ) ), - 'matching_content' => self::cleanSnippet( self::firstString( $source, $metadata, array(), array( 'matching_content', 'snippet', 'excerpt' ) ) ), + 'matching_content' => self::firstSnippetValue( $source, $metadata, array(), array( 'matching_content', 'snippet', 'excerpt' ) ), 'tags' => self::firstValue( $source, $metadata, array(), array( 'tags', 'mcp_tags' ) ), 'source_id' => self::firstString( $source, $metadata, array(), array( 'id', 'guid', 'item_identifier', 'source_id' ) ), ), @@ -211,6 +211,41 @@ private static function firstValue( array $primary, array $secondary, array $ter return null; } + /** + * Return the first snippet value, preserving real MGS snippet arrays. + * + * @param array $primary Primary source. + * @param array $secondary Secondary source. + * @param array $tertiary Tertiary source. + * @param array $keys Candidate keys. + * @return string|array|null + */ + private static function firstSnippetValue( array $primary, array $secondary, array $tertiary, array $keys ): string|array|null { + $value = self::firstValue( $primary, $secondary, $tertiary, $keys ); + + if ( is_array( $value ) ) { + $snippets = array(); + foreach ( $value as $snippet ) { + if ( ! is_scalar( $snippet ) ) { + continue; + } + + $cleaned = self::cleanSnippet( (string) $snippet ); + if ( null !== $cleaned ) { + $snippets[] = $cleaned; + } + } + + return empty( $snippets ) ? null : $snippets; + } + + if ( is_scalar( $value ) ) { + return self::cleanSnippet( (string) $value ); + } + + return null; + } + /** * Remove search-highlight tags from snippets. * diff --git a/tests/Unit/Core/Steps/AI/AIStepTest.php b/tests/Unit/Core/Steps/AI/AIStepTest.php index 3da794133..59e82559b 100644 --- a/tests/Unit/Core/Steps/AI/AIStepTest.php +++ b/tests/Unit/Core/Steps/AI/AIStepTest.php @@ -87,7 +87,11 @@ public function test_prompt_projection_flattens_mcp_packet_and_preserves_canonic 'url' => 'https://example.com/p2/post', 'date' => '2026-04-14T12:00:00Z', 'author' => 'Chris', - 'matching_content' => 'A highlighted source snippet.', + 'matching_content' => array( + 'Gutenlypso Rollout Plan...', + 'We are getting close to shipping Gutenlypso...', + 'Triaging/fixing Gutenberg bugs...', + ), 'tags' => array( 'mgs', 'history' ), ); $canonical = array( @@ -119,7 +123,14 @@ public function test_prompt_projection_flattens_mcp_packet_and_preserves_canonic $this->assertSame( $canonical_before, $canonical, 'Projection must not mutate canonical packets.' ); $this->assertSame( 'Data Download, April 14, 2026', $projected[0]['data']['title'] ); $this->assertSame( 'https://example.com/p2/post', $projected[0]['data']['url'] ); - $this->assertSame( 'A highlighted source snippet.', $projected[0]['data']['matching_content'] ); + $this->assertSame( + array( + 'Gutenlypso Rollout Plan...', + 'We are getting close to shipping Gutenlypso...', + 'Triaging/fixing Gutenberg bugs...', + ), + $projected[0]['data']['matching_content'] + ); $this->assertSame( array( 'mgs', 'history' ), $projected[0]['data']['tags'] ); $this->assertSame( 'mcp', $projected[0]['metadata']['source_type'] ); $this->assertSame( 'mgs-123', $projected[0]['metadata']['item_identifier'] ); diff --git a/tests/ai-packet-projection-smoke.php b/tests/ai-packet-projection-smoke.php index e4694fc26..ca1b97d4a 100644 --- a/tests/ai-packet-projection-smoke.php +++ b/tests/ai-packet-projection-smoke.php @@ -40,7 +40,11 @@ function assert_projection( string $name, bool $condition, string $detail = '' ) 'url' => 'https://example.com/a8c/post', 'date' => '2026-04-14T12:00:00Z', 'author' => 'Chris', - 'matching_content' => 'Useful highlight for the model.', + 'matching_content' => array( + 'Gutenlypso Rollout Plan...', + 'We are getting close to shipping Gutenlypso...', + 'Triaging/fixing Gutenberg bugs...', + ), 'tags' => array( 'mgs', 'history' ), ); @@ -72,7 +76,14 @@ function assert_projection( string $name, bool $condition, string $detail = '' ) assert_projection( 'canonical packet unchanged after projection', $canonical_before === $canonical ); assert_projection( 'MGS title flattened from source body', 'Data Download, April 14, 2026' === ( $projected[0]['data']['title'] ?? '' ) ); -assert_projection( 'snippet strips em highlight tags', 'Useful highlight for the model.' === ( $projected[0]['data']['matching_content'] ?? '' ) ); +assert_projection( + 'MGS snippet array strips em highlight tags per item', + array( + 'Gutenlypso Rollout Plan...', + 'We are getting close to shipping Gutenlypso...', + 'Triaging/fixing Gutenberg bugs...', + ) === ( $projected[0]['data']['matching_content'] ?? array() ) +); assert_projection( 'mcp_raw_item omitted from prompt metadata', ! array_key_exists( 'mcp_raw_item', $projected[0]['metadata'] ?? array() ) ); assert_projection( 'engine plumbing omitted from prompt metadata', ! array_key_exists( 'pipeline_id', $projected[0]['metadata'] ?? array() ) ); assert_projection( 'stable item identifier preserved', 'mgs-624' === ( $projected[0]['metadata']['item_identifier'] ?? '' ) ); From 70c37fa4e73c963ccefa38130e1d71579ce0c7c7 Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Wed, 6 May 2026 10:26:14 -0400 Subject: [PATCH 3/4] Keep packet projection source agnostic --- inc/Engine/AI/DataPacketPromptProjector.php | 202 ++------------------ tests/Unit/Core/Steps/AI/AIStepTest.php | 104 ++++------ tests/ai-packet-projection-smoke.php | 129 ++++++++----- 3 files changed, 130 insertions(+), 305 deletions(-) diff --git a/inc/Engine/AI/DataPacketPromptProjector.php b/inc/Engine/AI/DataPacketPromptProjector.php index 54f43b409..ba7f73b03 100644 --- a/inc/Engine/AI/DataPacketPromptProjector.php +++ b/inc/Engine/AI/DataPacketPromptProjector.php @@ -10,13 +10,18 @@ defined( 'ABSPATH' ) || exit; /** - * Builds compact, prompt-facing packet copies without changing canonical packets. + * Builds prompt-facing packet copies without changing canonical packets. */ class DataPacketPromptProjector { /** * Project canonical DataPackets for AI prompt serialization. * + * Data Machine's default projection is intentionally source-agnostic. Source + * integrations that understand handler-specific packet shapes can replace or + * compact the prompt-facing packet with the datamachine_ai_project_data_packet + * filter while canonical storage/engine packets remain unchanged. + * * @param array $data_packets Canonical packets from storage/engine state. * @return array Prompt-facing packet copies. */ @@ -36,91 +41,22 @@ public static function project( array $data_packets ): array { } /** - * Project one packet. + * Project one packet using the generic default and filter extension point. * * @param array $packet Canonical packet. * @return array Prompt-facing packet. */ private static function projectPacket( array $packet ): array { - $data = is_array( $packet['data'] ?? null ) ? $packet['data'] : array(); - $metadata = is_array( $packet['metadata'] ?? null ) ? $packet['metadata'] : array(); - - if ( self::isMcpPacket( $data, $metadata ) ) { - return self::projectMcpPacket( $packet, $data, $metadata ); - } - $projected = $packet; if ( isset( $projected['data'] ) && is_array( $projected['data'] ) ) { $projected['data'] = self::sanitizePacketData( $projected['data'] ); } - return $projected; - } - - /** - * Detect packets from MCP/MGS-style fetchers. - * - * @param array $data Packet data. - * @param array $metadata Packet metadata. - * @return bool - */ - private static function isMcpPacket( array $data, array $metadata ): bool { - return isset( $metadata['mcp_raw_item'] ) - || isset( $metadata['mcp_url'] ) - || isset( $metadata['mcp_tool'] ) - || isset( $metadata['mcp_provider'] ) - || 'mcp' === ( $metadata['source_type'] ?? '' ); - } - - /** - * Project an MCP/MGS packet by flattening useful source fields and removing duplicates. - * - * @param array $packet Canonical packet. - * @param array $data Packet data. - * @param array $metadata Packet metadata. - * @return array Prompt-facing packet. - */ - private static function projectMcpPacket( array $packet, array $data, array $metadata ): array { - $source = self::decodeJsonObject( $data['body'] ?? null ); - if ( null === $source && is_array( $metadata['mcp_raw_item'] ?? null ) ) { - $source = $metadata['mcp_raw_item']; - } - $source = is_array( $source ) ? $source : array(); - - $projected_data = array_filter( - array( - 'title' => self::firstString( $source, $data, $metadata, array( 'title', 'name', 'subject' ) ), - 'body' => self::firstString( $source, $data, array(), array( 'content', 'body', 'text', 'summary', 'description' ) ), - 'url' => self::firstString( $source, $metadata, $data, array( 'url', 'link', 'permalink', 'source_url', 'mcp_url' ) ), - 'date' => self::firstString( $source, $metadata, array(), array( 'date', 'created_at', 'updated_at', 'modified_at', 'mcp_date' ) ), - 'author' => self::firstString( $source, $metadata, array(), array( 'author', 'byline', 'user', 'mcp_author' ) ), - 'matching_content' => self::firstSnippetValue( $source, $metadata, array(), array( 'matching_content', 'snippet', 'excerpt' ) ), - 'tags' => self::firstValue( $source, $metadata, array(), array( 'tags', 'mcp_tags' ) ), - 'source_id' => self::firstString( $source, $metadata, array(), array( 'id', 'guid', 'item_identifier', 'source_id' ) ), - ), - static fn( $value ) => null !== $value && '' !== $value && array() !== $value - ); - - $projected_data = self::sanitizePacketData( $projected_data ); - - $projected_metadata = array_filter( - array( - 'source_type' => $metadata['source_type'] ?? null, - 'source_url' => $metadata['source_url'] ?? ( $metadata['mcp_url'] ?? null ), - 'item_identifier' => $metadata['item_identifier'] ?? null, - 'source_label' => $metadata['source_label'] ?? ( $metadata['mcp_provider'] ?? null ), - ), - static fn( $value ) => null !== $value && '' !== $value && array() !== $value - ); - - $projected = array( - 'type' => $packet['type'] ?? 'fetch', - 'data' => $projected_data, - 'metadata' => $projected_metadata, - ); - - if ( array_key_exists( 'timestamp', $packet ) ) { - $projected['timestamp'] = $packet['timestamp']; + if ( function_exists( 'apply_filters' ) ) { + $filtered = apply_filters( 'datamachine_ai_project_data_packet', $projected, $packet ); + if ( is_array( $filtered ) ) { + return $filtered; + } } return $projected; @@ -148,118 +84,4 @@ private static function sanitizePacketData( array $packet_data ): array { $packet_data['file_info'] = $sanitized_file_info; return $packet_data; } - - /** - * Decode a JSON object from a packet body. - * - * @param mixed $value Candidate JSON string. - * @return array|null - */ - private static function decodeJsonObject( mixed $value ): ?array { - if ( ! is_string( $value ) || '' === trim( $value ) ) { - return null; - } - - $decoded = json_decode( $value, true ); - return is_array( $decoded ) ? $decoded : null; - } - - /** - * Return the first scalar value as a string. - * - * @param array $primary Primary source. - * @param array $secondary Secondary source. - * @param array $tertiary Tertiary source. - * @param array $keys Candidate keys. - * @return string|null - */ - private static function firstString( array $primary, array $secondary, array $tertiary, array $keys ): ?string { - foreach ( array( $primary, $secondary, $tertiary ) as $source ) { - foreach ( $keys as $key ) { - if ( ! array_key_exists( $key, $source ) || ! is_scalar( $source[ $key ] ) ) { - continue; - } - - $value = trim( (string) $source[ $key ] ); - if ( '' !== $value ) { - return $value; - } - } - } - - return null; - } - - /** - * Return the first available value for any candidate key. - * - * @param array $primary Primary source. - * @param array $secondary Secondary source. - * @param array $tertiary Tertiary source. - * @param array $keys Candidate keys. - * @return mixed|null - */ - private static function firstValue( array $primary, array $secondary, array $tertiary, array $keys ): mixed { - foreach ( array( $primary, $secondary, $tertiary ) as $source ) { - foreach ( $keys as $key ) { - if ( array_key_exists( $key, $source ) && null !== $source[ $key ] && '' !== $source[ $key ] ) { - return $source[ $key ]; - } - } - } - - return null; - } - - /** - * Return the first snippet value, preserving real MGS snippet arrays. - * - * @param array $primary Primary source. - * @param array $secondary Secondary source. - * @param array $tertiary Tertiary source. - * @param array $keys Candidate keys. - * @return string|array|null - */ - private static function firstSnippetValue( array $primary, array $secondary, array $tertiary, array $keys ): string|array|null { - $value = self::firstValue( $primary, $secondary, $tertiary, $keys ); - - if ( is_array( $value ) ) { - $snippets = array(); - foreach ( $value as $snippet ) { - if ( ! is_scalar( $snippet ) ) { - continue; - } - - $cleaned = self::cleanSnippet( (string) $snippet ); - if ( null !== $cleaned ) { - $snippets[] = $cleaned; - } - } - - return empty( $snippets ) ? null : $snippets; - } - - if ( is_scalar( $value ) ) { - return self::cleanSnippet( (string) $value ); - } - - return null; - } - - /** - * Remove search-highlight tags from snippets. - * - * @param string|null $snippet Snippet value. - * @return string|null - */ - private static function cleanSnippet( ?string $snippet ): ?string { - if ( null === $snippet ) { - return null; - } - - $snippet = preg_replace( '#]*>#i', '', $snippet ); - $snippet = is_string( $snippet ) ? trim( $snippet ) : ''; - - return '' === $snippet ? null : $snippet; - } } diff --git a/tests/Unit/Core/Steps/AI/AIStepTest.php b/tests/Unit/Core/Steps/AI/AIStepTest.php index 59e82559b..c8f21b25a 100644 --- a/tests/Unit/Core/Steps/AI/AIStepTest.php +++ b/tests/Unit/Core/Steps/AI/AIStepTest.php @@ -80,69 +80,6 @@ public function test_sanitize_data_packets_for_ai_leaves_packets_without_file_in $this->assertSame( $data_packets, AIStep::sanitizeDataPacketsForAi( $data_packets ) ); } - public function test_prompt_projection_flattens_mcp_packet_and_preserves_canonical_packet(): void { - $raw_item = array( - 'id' => 'mgs-123', - 'title' => 'Data Download, April 14, 2026', - 'url' => 'https://example.com/p2/post', - 'date' => '2026-04-14T12:00:00Z', - 'author' => 'Chris', - 'matching_content' => array( - 'Gutenlypso Rollout Plan...', - 'We are getting close to shipping Gutenlypso...', - 'Triaging/fixing Gutenberg bugs...', - ), - 'tags' => array( 'mgs', 'history' ), - ); - $canonical = array( - array( - 'type' => 'fetch', - 'timestamp' => 1770000000, - 'data' => array( - 'title' => 'Wrapped title', - 'body' => wp_json_encode( $raw_item, JSON_UNESCAPED_UNICODE ), - ), - 'metadata' => array( - 'source_type' => 'mcp', - 'pipeline_id' => 3, - 'flow_id' => 2, - 'handler' => 'mcp_fetch', - 'mcp_provider' => 'WordPress.com MGS', - 'mcp_server' => 'wordpress-com', - 'mcp_tool' => 'search', - 'mcp_url' => 'https://example.com/p2/post', - 'mcp_raw_item' => $raw_item, - 'item_identifier' => 'mgs-123', - ), - ), - ); - - $canonical_before = $canonical; - $projected = DataPacketPromptProjector::project( $canonical ); - - $this->assertSame( $canonical_before, $canonical, 'Projection must not mutate canonical packets.' ); - $this->assertSame( 'Data Download, April 14, 2026', $projected[0]['data']['title'] ); - $this->assertSame( 'https://example.com/p2/post', $projected[0]['data']['url'] ); - $this->assertSame( - array( - 'Gutenlypso Rollout Plan...', - 'We are getting close to shipping Gutenlypso...', - 'Triaging/fixing Gutenberg bugs...', - ), - $projected[0]['data']['matching_content'] - ); - $this->assertSame( array( 'mgs', 'history' ), $projected[0]['data']['tags'] ); - $this->assertSame( 'mcp', $projected[0]['metadata']['source_type'] ); - $this->assertSame( 'mgs-123', $projected[0]['metadata']['item_identifier'] ); - $this->assertArrayNotHasKey( 'mcp_raw_item', $projected[0]['metadata'] ); - $this->assertArrayNotHasKey( 'pipeline_id', $projected[0]['metadata'] ); - - $canonical_bytes = strlen( wp_json_encode( $canonical, JSON_UNESCAPED_UNICODE ) ); - $projected_bytes = strlen( wp_json_encode( $projected, JSON_UNESCAPED_UNICODE ) ); - - $this->assertLessThan( $canonical_bytes, $projected_bytes ); - } - public function test_prompt_projection_generic_fallback_preserves_unknown_packet_shape(): void { $canonical = array( array( @@ -187,6 +124,47 @@ public function test_prompt_projection_does_not_flatten_unknown_json_body_packet $this->assertSame( $canonical, DataPacketPromptProjector::project( $canonical ) ); } + public function test_prompt_projection_filter_can_replace_prompt_packet_without_mutating_canonical(): void { + $canonical = array( + array( + 'type' => 'fetch', + 'data' => array( + 'title' => 'Verbose packet', + 'body' => 'Long source-specific body that an integration understands.', + ), + 'metadata' => array( + 'source_type' => 'integration_owned_source', + 'raw_payload' => array( 'duplicated' => true ), + ), + ), + ); + + add_filter( + 'datamachine_ai_project_data_packet', + static function ( array $projected, array $packet ): array { + if ( 'integration_owned_source' !== ( $packet['metadata']['source_type'] ?? '' ) ) { + return $projected; + } + + return array( + 'type' => $packet['type'], + 'data' => array( 'title' => $packet['data']['title'] ), + 'metadata' => array( 'source_type' => $packet['metadata']['source_type'] ), + ); + }, + 10, + 2 + ); + + $canonical_before = $canonical; + $projected = DataPacketPromptProjector::project( $canonical ); + + $this->assertSame( $canonical_before, $canonical ); + $this->assertSame( 'Verbose packet', $projected[0]['data']['title'] ); + $this->assertArrayNotHasKey( 'body', $projected[0]['data'] ); + $this->assertArrayNotHasKey( 'raw_payload', $projected[0]['metadata'] ); + } + /** * Test that processLoopResults does NOT carry forward input DataPackets. * diff --git a/tests/ai-packet-projection-smoke.php b/tests/ai-packet-projection-smoke.php index ca1b97d4a..bf420d88b 100644 --- a/tests/ai-packet-projection-smoke.php +++ b/tests/ai-packet-projection-smoke.php @@ -11,6 +11,29 @@ define( 'ABSPATH', __DIR__ . '/' ); } +$test_filters = array(); + +function add_filter( string $hook, callable $callback, int $priority = 10, int $_accepted_args = 1 ): void { + global $test_filters; + $test_filters[ $hook ][ $priority ][] = $callback; +} + +function apply_filters( string $hook, $value, ...$args ) { + global $test_filters; + if ( empty( $test_filters[ $hook ] ) ) { + return $value; + } + + ksort( $test_filters[ $hook ] ); + foreach ( $test_filters[ $hook ] as $callbacks ) { + foreach ( $callbacks as $callback ) { + $value = $callback( $value, ...$args ); + } + } + + return $value; +} + function wp_json_encode( $value, int $flags = 0 ) { return json_encode( $value, $flags ); } @@ -34,39 +57,21 @@ function assert_projection( string $name, bool $condition, string $detail = '' ) echo "AI DataPacket prompt projection smoke\n"; -$raw_item = array( - 'id' => 'mgs-624', - 'title' => 'Data Download, April 14, 2026', - 'url' => 'https://example.com/a8c/post', - 'date' => '2026-04-14T12:00:00Z', - 'author' => 'Chris', - 'matching_content' => array( - 'Gutenlypso Rollout Plan...', - 'We are getting close to shipping Gutenlypso...', - 'Triaging/fixing Gutenberg bugs...', - ), - 'tags' => array( 'mgs', 'history' ), -); - $canonical = array( array( 'type' => 'fetch', 'timestamp' => 1770000000, 'data' => array( - 'title' => 'Wrapped MGS item', - 'body' => wp_json_encode( $raw_item, JSON_UNESCAPED_UNICODE ), + 'title' => 'Generic packet', + 'body' => 'Plain source text', + 'file_info' => array( + 'file_path' => '/tmp/runtime-only.jpg', + 'mime_type' => 'image/jpeg', + ), ), 'metadata' => array( - 'source_type' => 'mcp', - 'pipeline_id' => 3, - 'flow_id' => 2, - 'handler' => 'mcp_fetch', - 'mcp_provider' => 'WordPress.com MGS', - 'mcp_server' => 'wordpress-com', - 'mcp_tool' => 'search', - 'mcp_url' => 'https://example.com/a8c/post', - 'mcp_raw_item' => $raw_item, - 'item_identifier' => 'mgs-624', + 'source_type' => 'generic_source', + 'custom_key' => 'custom value', ), ), ); @@ -74,40 +79,60 @@ function assert_projection( string $name, bool $condition, string $detail = '' ) $canonical_before = $canonical; $projected = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $canonical ); -assert_projection( 'canonical packet unchanged after projection', $canonical_before === $canonical ); -assert_projection( 'MGS title flattened from source body', 'Data Download, April 14, 2026' === ( $projected[0]['data']['title'] ?? '' ) ); -assert_projection( - 'MGS snippet array strips em highlight tags per item', - array( - 'Gutenlypso Rollout Plan...', - 'We are getting close to shipping Gutenlypso...', - 'Triaging/fixing Gutenberg bugs...', - ) === ( $projected[0]['data']['matching_content'] ?? array() ) +assert_projection( 'canonical packet unchanged after generic projection', $canonical_before === $canonical ); +assert_projection( 'generic title preserved', 'Generic packet' === ( $projected[0]['data']['title'] ?? '' ) ); +assert_projection( 'generic body preserved', 'Plain source text' === ( $projected[0]['data']['body'] ?? '' ) ); +assert_projection( 'generic metadata preserved', 'custom value' === ( $projected[0]['metadata']['custom_key'] ?? '' ) ); +assert_projection( 'runtime file_path stripped from prompt data', ! array_key_exists( 'file_path', $projected[0]['data']['file_info'] ?? array() ) ); + +add_filter( + 'datamachine_ai_project_data_packet', + static function ( array $projected_packet, array $canonical_packet ): array { + if ( 'integration_owned_source' !== ( $canonical_packet['metadata']['source_type'] ?? '' ) ) { + return $projected_packet; + } + + return array( + 'type' => $canonical_packet['type'], + 'data' => array( + 'title' => $canonical_packet['data']['title'], + 'snippet' => 'Source-specific compact projection', + ), + 'metadata' => array( 'source_type' => $canonical_packet['metadata']['source_type'] ), + ); + }, + 10, + 2 ); -assert_projection( 'mcp_raw_item omitted from prompt metadata', ! array_key_exists( 'mcp_raw_item', $projected[0]['metadata'] ?? array() ) ); -assert_projection( 'engine plumbing omitted from prompt metadata', ! array_key_exists( 'pipeline_id', $projected[0]['metadata'] ?? array() ) ); -assert_projection( 'stable item identifier preserved', 'mgs-624' === ( $projected[0]['metadata']['item_identifier'] ?? '' ) ); - -$canonical_bytes = strlen( wp_json_encode( $canonical, JSON_UNESCAPED_UNICODE ) ); -$projected_bytes = strlen( wp_json_encode( $projected, JSON_UNESCAPED_UNICODE ) ); - -assert_projection( 'projected packet JSON is smaller than canonical JSON', $projected_bytes < $canonical_bytes, "canonical=$canonical_bytes projected=$projected_bytes" ); -$prompt_json = wp_json_encode( array( 'data_packets' => $projected ), JSON_UNESCAPED_UNICODE ); -assert_projection( 'prompt JSON is compact by default', ! str_contains( $prompt_json, "\n" ) ); - -$unknown_json_packet = array( +$source_specific = array( array( 'type' => 'fetch', 'data' => array( - 'title' => 'Unknown JSON packet', - 'body' => '{"title":"Nested title","custom":"important"}', + 'title' => 'Verbose integration packet', + 'body' => str_repeat( 'Long duplicated source text. ', 20 ), + ), + 'metadata' => array( + 'source_type' => 'integration_owned_source', + 'raw_payload' => array( 'duplicated' => true ), ), - 'metadata' => array( 'source_type' => 'custom_json_feed' ), ), ); -$unknown_projected = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $unknown_json_packet ); -assert_projection( 'unknown JSON body packets use conservative fallback', $unknown_json_packet === $unknown_projected ); +$source_specific_before = $source_specific; +$compact = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $source_specific ); + +assert_projection( 'filter projection leaves canonical source packet unchanged', $source_specific_before === $source_specific ); +assert_projection( 'filter projection can remove verbose body', ! array_key_exists( 'body', $compact[0]['data'] ?? array() ) ); +assert_projection( 'filter projection can remove source-specific raw metadata', ! array_key_exists( 'raw_payload', $compact[0]['metadata'] ?? array() ) ); +assert_projection( 'filter projection keeps compact source text', 'Source-specific compact projection' === ( $compact[0]['data']['snippet'] ?? '' ) ); + +$canonical_bytes = strlen( wp_json_encode( $source_specific, JSON_UNESCAPED_UNICODE ) ); +$projected_bytes = strlen( wp_json_encode( $compact, JSON_UNESCAPED_UNICODE ) ); + +assert_projection( 'filter-projected packet JSON is smaller than canonical JSON', $projected_bytes < $canonical_bytes, "canonical=$canonical_bytes projected=$projected_bytes" ); + +$prompt_json = wp_json_encode( array( 'data_packets' => $compact ), JSON_UNESCAPED_UNICODE ); +assert_projection( 'prompt JSON is compact by default', ! str_contains( $prompt_json, "\n" ) ); echo "\n$total assertions, $failed failures\n"; if ( $failed > 0 ) { From 59be00feafd8e8bfbd57bd826770f2e51970c3b8 Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Wed, 6 May 2026 10:58:40 -0400 Subject: [PATCH 4/4] Pass context to packet projection filters --- inc/Core/Steps/AI/AIStep.php | 24 +++++++++----- inc/Engine/AI/DataPacketPromptProjector.php | 12 ++++--- inc/Engine/AI/RequestInspector.php | 29 ++++++++++++----- tests/Unit/Core/Steps/AI/AIStepTest.php | 35 +++++++++++++++++++++ tests/ai-packet-projection-smoke.php | 33 ++++++++++++++++--- 5 files changed, 108 insertions(+), 25 deletions(-) diff --git a/inc/Core/Steps/AI/AIStep.php b/inc/Core/Steps/AI/AIStep.php index 8a1cef396..0adafc5a0 100644 --- a/inc/Core/Steps/AI/AIStep.php +++ b/inc/Core/Steps/AI/AIStep.php @@ -189,10 +189,25 @@ protected function executeStep(): array { $mime_type = is_string( $file_info['type'] ) ? $file_info['type'] : ''; } + $pipeline_step_id = $this->flow_step_config['pipeline_step_id']; + + // Resolve user_id and agent_id from engine snapshot (set by RunFlowAbility). + $job_snapshot = $this->engine->get( 'job' ); + $agent_id = (int) ( $job_snapshot['agent_id'] ?? 0 ); + $user_id = (int) ( $job_snapshot['user_id'] ?? 0 ); + + $packet_projection_context = array( + 'job_id' => $this->job_id, + 'pipeline_id' => $job_snapshot['pipeline_id'] ?? null, + 'flow_id' => $job_snapshot['flow_id'] ?? null, + 'flow_step_id' => $this->flow_step_id, + 'pipeline_step_id' => $pipeline_step_id, + ); + $messages = array(); if ( ! empty( $this->dataPackets ) ) { - $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $this->dataPackets ) ), JSON_UNESCAPED_UNICODE ); + $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $this->dataPackets, $packet_projection_context ) ), JSON_UNESCAPED_UNICODE ); $messages[] = ConversationManager::buildConversationMessage( 'user', false === $data_packet_content ? '' : $data_packet_content @@ -216,17 +231,10 @@ protected function executeStep(): array { $messages[] = ConversationManager::buildConversationMessage( 'user', $user_message ); } - $pipeline_step_id = $this->flow_step_config['pipeline_step_id']; - $pipeline_step_config = $this->engine->getPipelineStepConfig( $pipeline_step_id ); $max_turns = PluginSettings::get( 'max_turns', PluginSettings::DEFAULT_MAX_TURNS ); - // Resolve user_id and agent_id from engine snapshot (set by RunFlowAbility). - $job_snapshot = $this->engine->get( 'job' ); - $agent_id = (int) ( $job_snapshot['agent_id'] ?? 0 ); - $user_id = (int) ( $job_snapshot['user_id'] ?? 0 ); - // Resolve transcript persistence policy once per AI step invocation. // Resolution order: flow > pipeline > site option (default false). // The boolean is threaded through $payload so the loop doesn't need diff --git a/inc/Engine/AI/DataPacketPromptProjector.php b/inc/Engine/AI/DataPacketPromptProjector.php index ba7f73b03..af16d6e68 100644 --- a/inc/Engine/AI/DataPacketPromptProjector.php +++ b/inc/Engine/AI/DataPacketPromptProjector.php @@ -23,9 +23,10 @@ class DataPacketPromptProjector { * filter while canonical storage/engine packets remain unchanged. * * @param array $data_packets Canonical packets from storage/engine state. + * @param array $context Source-agnostic runtime context for projection filters. * @return array Prompt-facing packet copies. */ - public static function project( array $data_packets ): array { + public static function project( array $data_packets, array $context = array() ): array { $projected_packets = array(); foreach ( $data_packets as $packet ) { @@ -34,7 +35,7 @@ public static function project( array $data_packets ): array { continue; } - $projected_packets[] = self::projectPacket( $packet ); + $projected_packets[] = self::projectPacket( $packet, $context ); } return $projected_packets; @@ -43,17 +44,18 @@ public static function project( array $data_packets ): array { /** * Project one packet using the generic default and filter extension point. * - * @param array $packet Canonical packet. + * @param array $packet Canonical packet. + * @param array $context Source-agnostic runtime context for projection filters. * @return array Prompt-facing packet. */ - private static function projectPacket( array $packet ): array { + private static function projectPacket( array $packet, array $context ): array { $projected = $packet; if ( isset( $projected['data'] ) && is_array( $projected['data'] ) ) { $projected['data'] = self::sanitizePacketData( $projected['data'] ); } if ( function_exists( 'apply_filters' ) ) { - $filtered = apply_filters( 'datamachine_ai_project_data_packet', $projected, $packet ); + $filtered = apply_filters( 'datamachine_ai_project_data_packet', $projected, $packet, $context ); if ( is_array( $filtered ) ) { return $filtered; } diff --git a/inc/Engine/AI/RequestInspector.php b/inc/Engine/AI/RequestInspector.php index 802b61606..07d831832 100644 --- a/inc/Engine/AI/RequestInspector.php +++ b/inc/Engine/AI/RequestInspector.php @@ -88,9 +88,10 @@ public function inspectPipelineJob( int $job_id, ?string $flow_step_id = null ): ); } - $data_packets = $this->retrieveDataPackets( $job_id, $engine ); - $messages = $this->buildInitialMessages( $data_packets, $engine, $flow_step_config ); - $payload = $this->buildPayload( $job_id, $flow_step_id, $pipeline_step_id, $data_packets, $engine, $job ); + $data_packets = $this->retrieveDataPackets( $job_id, $engine ); + $packet_projection_context = $this->buildProjectionContext( $job_id, $flow_step_id, $pipeline_step_id, $engine, $job ); + $messages = $this->buildInitialMessages( $data_packets, $engine, $flow_step_config, $packet_projection_context ); + $payload = $this->buildPayload( $job_id, $flow_step_id, $pipeline_step_id, $data_packets, $engine, $job ); $previous_step_config = $this->getAdjacentStepConfig( $engine, $flow_step_id, $payload, 'previous' ); $next_step_config = $this->getAdjacentStepConfig( $engine, $flow_step_id, $payload, 'next' ); @@ -160,7 +161,7 @@ public function inspectPipelineJob( int $job_id, ?string $flow_step_id = null ): 'model' => $model, 'mode' => ToolPolicyResolver::MODE_PIPELINE, ), - $this->measure( $assembled, $data_packets, $messages ) + $this->measure( $assembled, $data_packets, $messages, $packet_projection_context ) ); } @@ -186,11 +187,11 @@ private function retrieveDataPackets( int $job_id, EngineData $engine ): array { return ( new FileRetrieval() )->retrieve_data_by_job_id( $job_id, $context ); } - private function buildInitialMessages( array $data_packets, EngineData $engine, array $flow_step_config ): array { + private function buildInitialMessages( array $data_packets, EngineData $engine, array $flow_step_config, array $packet_projection_context ): array { $messages = array(); if ( ! empty( $data_packets ) ) { - $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $data_packets ) ), JSON_UNESCAPED_UNICODE ); + $data_packet_content = wp_json_encode( array( 'data_packets' => DataPacketPromptProjector::project( $data_packets, $packet_projection_context ) ), JSON_UNESCAPED_UNICODE ); $messages[] = ConversationManager::buildConversationMessage( 'user', false === $data_packet_content ? '' : $data_packet_content @@ -230,6 +231,18 @@ private function peekPromptQueueValue( array $flow_step_config ): string { return trim( (string) ( $queue[0]['prompt'] ?? '' ) ); } + private function buildProjectionContext( int $job_id, string $flow_step_id, string $pipeline_step_id, EngineData $engine, array $job ): array { + $job_snapshot = $engine->getJobContext(); + + return array( + 'job_id' => $job_id, + 'pipeline_id' => $job_snapshot['pipeline_id'] ?? ( $job['pipeline_id'] ?? null ), + 'flow_id' => $job_snapshot['flow_id'] ?? ( $job['flow_id'] ?? null ), + 'flow_step_id' => $flow_step_id, + 'pipeline_step_id' => $pipeline_step_id, + ); + } + private function buildPayload( int $job_id, string $flow_step_id, @@ -265,13 +278,13 @@ private function getAdjacentStepConfig( EngineData $engine, string $flow_step_id return $adjacent_id ? $engine->getFlowStepConfig( $adjacent_id ) : null; } - private function measure( array $assembled, array $data_packets, array $initial_messages ): array { + private function measure( array $assembled, array $data_packets, array $initial_messages, array $packet_projection_context ): array { $request = $assembled['request']; $structured_tools = $assembled['structured_tools']; $messages = $request['messages'] ?? array(); $tools = $request['tools'] ?? array(); - $projected_packets = DataPacketPromptProjector::project( $data_packets ); + $projected_packets = DataPacketPromptProjector::project( $data_packets, $packet_projection_context ); return array( 'message_count' => count( $messages ), diff --git a/tests/Unit/Core/Steps/AI/AIStepTest.php b/tests/Unit/Core/Steps/AI/AIStepTest.php index c8f21b25a..7552ebba4 100644 --- a/tests/Unit/Core/Steps/AI/AIStepTest.php +++ b/tests/Unit/Core/Steps/AI/AIStepTest.php @@ -165,6 +165,41 @@ static function ( array $projected, array $packet ): array { $this->assertArrayNotHasKey( 'raw_payload', $projected[0]['metadata'] ); } + public function test_prompt_projection_filter_receives_source_agnostic_context(): void { + $canonical = array( + array( + 'type' => 'fetch', + 'data' => array( 'title' => 'Context packet' ), + 'metadata' => array( 'source_type' => 'context_source' ), + ), + ); + $context = array( + 'job_id' => 1799, + 'pipeline_id' => 3, + 'flow_id' => 2, + 'flow_step_id' => 'flow_step_ai', + 'pipeline_step_id' => 'pipeline_step_ai', + ); + $received = array(); + + add_filter( + 'datamachine_ai_project_data_packet', + static function ( array $projected, array $packet, array $filter_context ) use ( &$received ): array { + if ( 'context_source' === ( $packet['metadata']['source_type'] ?? '' ) ) { + $received = $filter_context; + } + + return $projected; + }, + 10, + 3 + ); + + DataPacketPromptProjector::project( $canonical, $context ); + + $this->assertSame( $context, $received ); + } + /** * Test that processLoopResults does NOT carry forward input DataPackets. * diff --git a/tests/ai-packet-projection-smoke.php b/tests/ai-packet-projection-smoke.php index bf420d88b..96eee7f4d 100644 --- a/tests/ai-packet-projection-smoke.php +++ b/tests/ai-packet-projection-smoke.php @@ -15,7 +15,10 @@ function add_filter( string $hook, callable $callback, int $priority = 10, int $_accepted_args = 1 ): void { global $test_filters; - $test_filters[ $hook ][ $priority ][] = $callback; + $test_filters[ $hook ][ $priority ][] = array( + 'callback' => $callback, + 'accepted_args' => $_accepted_args, + ); } function apply_filters( string $hook, $value, ...$args ) { @@ -26,8 +29,10 @@ function apply_filters( string $hook, $value, ...$args ) { ksort( $test_filters[ $hook ] ); foreach ( $test_filters[ $hook ] as $callbacks ) { - foreach ( $callbacks as $callback ) { - $value = $callback( $value, ...$args ); + foreach ( $callbacks as $filter ) { + $accepted_args = max( 1, (int) $filter['accepted_args'] ); + $filter_args = array_slice( array_merge( array( $value ), $args ), 0, $accepted_args ); + $value = $filter['callback']( ...$filter_args ); } } @@ -119,12 +124,32 @@ static function ( array $projected_packet, array $canonical_packet ): array { ), ); $source_specific_before = $source_specific; -$compact = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $source_specific ); +$context = array( + 'job_id' => 1799, + 'pipeline_id' => 3, + 'flow_id' => 2, + 'flow_step_id' => 'flow_step_ai', + 'pipeline_step_id' => 'pipeline_step_ai', +); +$received_context = array(); + +add_filter( + 'datamachine_ai_project_data_packet', + static function ( array $projected_packet, array $_canonical_packet, array $filter_context ) use ( &$received_context ): array { + $received_context = $filter_context; + return $projected_packet; + }, + 20, + 3 +); + +$compact = \DataMachine\Engine\AI\DataPacketPromptProjector::project( $source_specific, $context ); assert_projection( 'filter projection leaves canonical source packet unchanged', $source_specific_before === $source_specific ); assert_projection( 'filter projection can remove verbose body', ! array_key_exists( 'body', $compact[0]['data'] ?? array() ) ); assert_projection( 'filter projection can remove source-specific raw metadata', ! array_key_exists( 'raw_payload', $compact[0]['metadata'] ?? array() ) ); assert_projection( 'filter projection keeps compact source text', 'Source-specific compact projection' === ( $compact[0]['data']['snippet'] ?? '' ) ); +assert_projection( 'three-argument filter receives projection context', $context === $received_context ); $canonical_bytes = strlen( wp_json_encode( $source_specific, JSON_UNESCAPED_UNICODE ) ); $projected_bytes = strlen( wp_json_encode( $compact, JSON_UNESCAPED_UNICODE ) );