diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e44bdf..2c1458a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ All notable changes to `@stackbilt/llm-providers` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). Versions use [Semantic Versioning](https://semver.org/). +## [1.5.1] — 2026-04-27 + +### Fixed +- **`analyzeImage()` silent empty response on Cloudflare** — `@cf/meta/llama-3.2-11b-vision-instruct` via the Workers AI binding requires a raw `{ image: number[], prompt, max_tokens }` input shape, not the OpenAI-compatible `messages/image_url` format. The chat path returns `choices[0].message.content === null` via the binding, causing `extractText()` to silently return `""`. The provider now detects this model on requests that carry images and dispatches to the raw binding format, mapping the result's `{ response: string }` back through the existing normalisation path. On this path a request with more than one image is rejected with a `ConfigurationError`, and only `systemPrompt` plus the text of the last user message are sent as the prompt. Other vision models (`@cf/google/gemma-4-26b-a4b-it`, `@cf/meta/llama-4-scout-17b-16e-instruct`) continue using the chat format unchanged. Fixes #53. + ## [1.5.0] — 2026-04-23 Bundles the unreleased 1.4.0 scope (model retirements, drift test) with envelope validation, env auto-discovery, and the declarative catalog into a single minor release. 1.4.0 was tagged in `package.json` but never published to npm; consumers upgrading from 1.3.0 receive all of the following. diff --git a/package.json b/package.json index 4b79742..6ffb2e1 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@stackbilt/llm-providers", - "version": "1.5.0", + "version": "1.5.1", "description": "Multi-LLM failover with circuit breakers, cost tracking, and intelligent retry. 
Cloudflare Workers native.", "author": "Stackbilt ", "license": "Apache-2.0", diff --git a/src/__tests__/cloudflare.test.ts b/src/__tests__/cloudflare.test.ts index be93b3a..df07a2c 100644 --- a/src/__tests__/cloudflare.test.ts +++ b/src/__tests__/cloudflare.test.ts @@ -348,11 +348,89 @@ describe('CloudflareProvider', () => { expect(content.filter((p: { type: string }) => p.type === 'image_url')).toHaveLength(2); }); - it('accepts pre-formed data: URLs via image.url', async () => { - mockAiRun.mockResolvedValueOnce({ - choices: [{ message: { content: 'ok' }, finish_reason: 'stop' }] + it('uses raw binding format for llama-3.2-11b-vision-instruct (fixes silent empty response)', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'A delicious pasta dish.' }); + + const result = await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'Describe this food image.' }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }], + maxTokens: 512 }); + const [modelArg, body] = mockAiRun.mock.calls[0]; + expect(modelArg).toBe('@cf/meta/llama-3.2-11b-vision-instruct'); + expect(Array.isArray(body.image)).toBe(true); + expect(body.image).toHaveLength(3); // QUJD = 3 bytes: [65, 66, 67] + expect(body.prompt).toBe('Describe this food image.'); + expect(body.max_tokens).toBe(512); + expect(body.messages).toBeUndefined(); + expect(result.content).toBe('A delicious pasta dish.'); + expect(result.message).toBe('A delicious pasta dish.'); + }); + + it('prepends system prompt to raw binding prompt for llama-3.2', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'Pasta.' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'What is this?' 
}], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }], + systemPrompt: 'You are a food critic.', + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.prompt).toBe('You are a food critic.\n\nWhat is this?'); + }); + + it('rejects multiple images on llama-3.2 raw binding with a clear error', async () => { + await expect( + provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'compare' }], + images: [ + { data: 'QUJD', mimeType: 'image/jpeg' }, + { data: 'REVG', mimeType: 'image/jpeg' } + ] + }) + ).rejects.toThrow(/supports exactly one image/); + }); + + it('extracts text from array-content user message for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'Spaghetti.' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ + role: 'user', + content: [ + { type: 'text', text: 'What food is this?' }, + { type: 'text', text: 'Be brief.' } + ] as unknown as string + }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }] + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.prompt).toBe('What food is this? 
Be brief.'); + }); + + it('defaults max_tokens to 512 when not specified for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'ok' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'x' }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }] + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.max_tokens).toBe(512); + }); + + it('accepts pre-formed data: URL for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'ok' }); + await provider.generateResponse({ model: '@cf/meta/llama-3.2-11b-vision-instruct', messages: [{ role: 'user', content: 'x' }], @@ -360,8 +438,27 @@ describe('CloudflareProvider', () => { }); const [, body] = mockAiRun.mock.calls[0]; - const imagePart = body.messages[0].content[1]; - expect(imagePart.image_url.url).toBe('data:image/webp;base64,ZEFUQQ=='); + expect(Array.isArray(body.image)).toBe(true); + expect(body.messages).toBeUndefined(); + }); + + it('other vision models (gemma-4, llama-4-scout) still use chat/image_url format', async () => { + mockAiRun.mockResolvedValueOnce({ + choices: [{ message: { content: 'A tomato.' }, finish_reason: 'stop' }] + }); + + await provider.generateResponse({ + model: '@cf/google/gemma-4-26b-a4b-it', + messages: [{ role: 'user', content: 'What is in this image?' 
}], + images: [{ data: 'QUJD', mimeType: 'image/png' }], + maxTokens: 256 + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.messages).toBeDefined(); + expect(body.image).toBeUndefined(); + const imagePart = body.messages[body.messages.length - 1].content[1]; + expect(imagePart.image_url.url).toBe('data:image/png;base64,QUJD'); }); it('rejects HTTP image URLs (requires base64 bytes)', async () => { diff --git a/src/providers/cloudflare.ts b/src/providers/cloudflare.ts index ed8d151..18053ac 100755 --- a/src/providers/cloudflare.ts +++ b/src/providers/cloudflare.ts @@ -87,6 +87,14 @@ interface WorkersAIResult { result?: WorkersAIResult; // wrapped responses } +// Models that require the raw { image, prompt } binding format rather than chat/image_url. +// Add any new CF vision models here if they exhibit the same null-content symptom via the binding. +// (The chat path returns choices[0].message.content === null through the Workers AI binding, +// silently producing "".) +const LLAMA_VISION_RAW_MODELS = new Set([ + '@cf/meta/llama-3.2-11b-vision-instruct' +]); + export class CloudflareProvider extends BaseProvider { name = 'cloudflare'; models = [ @@ -134,13 +142,19 @@ export class CloudflareProvider extends BaseProvider { try { const response = await this.executeWithResiliency(async () => { const model = request.model || this.getRecommendedModel(request); - const cloudflareRequest = this.formatRequest(request, model); - // Validate model is supported if (!this.models.includes(model)) { throw new ModelNotFoundError('cloudflare', model); } + // llama-3.2-11b vision requires the raw Workers AI binding format. + // The chat/image_url path returns null content via the binding. + if (LLAMA_VISION_RAW_MODELS.has(model) && (request.images?.length ?? 
0) > 0) { + const result = await this.runLlamaVisionRaw(request, model); + return this.formatResponse(result as WorkersAIResult, model, request, Date.now() - startTime); + } + + const cloudflareRequest = this.formatRequest(request, model); // Workers AI binding uses branded model names; cast at API boundary // eslint-disable-next-line @typescript-eslint/no-explicit-any -- Ai.run() requires branded model types const result = await (this.ai as { run(model: string, input: unknown): Promise }).run(model, cloudflareRequest); @@ -358,6 +372,54 @@ export class CloudflareProvider extends BaseProvider { }; } + private async runLlamaVisionRaw(request: LLMRequest, model: string): Promise { + if (request.images!.length > 1) { + throw new ConfigurationError( + this.name, + `${model} supports exactly one image via the raw binding format — ${request.images!.length} were provided.` + ); + } + + const image = request.images![0]; + + let imageBytes: number[]; + if (image.data) { + imageBytes = Array.from(Uint8Array.from(atob(image.data), c => c.charCodeAt(0))); + } else if (image.url?.startsWith('data:')) { + const b64 = image.url.split(',')[1] ?? ''; + imageBytes = Array.from(Uint8Array.from(atob(b64), c => c.charCodeAt(0))); + } else { + throw new ConfigurationError( + this.name, + `${model} requires base64 image data or a data: URL — HTTP URLs are not supported.` + ); + } + + const systemPrefix = request.systemPrompt ? `${request.systemPrompt}\n\n` : ''; + let lastUserText = ''; + for (let i = request.messages.length - 1; i >= 0; i--) { + if (request.messages[i].role === 'user') { + const raw = request.messages[i].content; + lastUserText = typeof raw === 'string' + ? raw + : Array.isArray(raw) + ? (raw as Array<{ type?: string; text?: string }>) + .filter(p => p.type === 'text') + .map(p => p.text ?? 
'') + .join(' ') + : ''; + break; + } + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any -- Ai.run() requires branded model types + return (this.ai as { run(model: string, input: unknown): Promise }).run(model, { + image: imageBytes, + prompt: `${systemPrefix}${lastUserText}`, + max_tokens: request.maxTokens ?? 512 + }) as Promise; + } + private formatRequest(request: LLMRequest, model: string): CloudflareRequest { const capabilities = this.getModelCapabilities()[model]; const usesTools =