From d2e5f7f7b74e539e6bd99ce16a5096a8001d8454 Mon Sep 17 00:00:00 2001 From: Stackbilt Date: Mon, 27 Apr 2026 06:57:18 -0500 Subject: [PATCH 1/2] fix(cloudflare): use raw binding format for llama-3.2-11b-vision-instruct (#53) Workers AI binding for this model requires { image: number[], prompt, max_tokens } instead of the OpenAI-compatible messages/image_url format. The chat path returns choices[0].message.content === null via the binding, causing extractText() to silently return "". Other vision models are unaffected. Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 5 +++ package.json | 2 +- src/__tests__/cloudflare.test.ts | 62 +++++++++++++++++++++++++++++--- src/providers/cloudflare.ts | 50 ++++++++++++++++++++++++-- 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e44bdf..2c1458a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,11 @@ All notable changes to `@stackbilt/llm-providers` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). Versions use [Semantic Versioning](https://semver.org/). +## [1.5.1] — 2026-04-27 + +### Fixed +- **`analyzeImage()` silent empty response on Cloudflare** — `@cf/meta/llama-3.2-11b-vision-instruct` via the Workers AI binding requires a raw `{ image: number[], prompt, max_tokens }` input shape, not the OpenAI-compatible `messages/image_url` format. The chat path returns `choices[0].message.content === null` via the binding, causing `extractText()` to silently return `""`. The provider now detects this model and dispatches to the raw binding format, mapping the result's `{ response: string }` back through the existing normalisation path. Other vision models (`@cf/google/gemma-4-26b-a4b-it`, `@cf/meta/llama-4-scout-17b-16e-instruct`) continue using the chat format unchanged. Fixes #53. + ## [1.5.0] — 2026-04-23 Bundles the unreleased 1.4.0 scope (model retirements, drift test) with envelope validation, env auto-discovery, and the declarative catalog into a single minor release. 1.4.0 was tagged in `package.json` but never published to npm; consumers upgrading from 1.3.0 receive all of the following. diff --git a/package.json b/package.json index 4b79742..6ffb2e1 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@stackbilt/llm-providers", - "version": "1.5.0", + "version": "1.5.1", "description": "Multi-LLM failover with circuit breakers, cost tracking, and intelligent retry. Cloudflare Workers native.", "author": "Stackbilt ", "license": "Apache-2.0", diff --git a/src/__tests__/cloudflare.test.ts b/src/__tests__/cloudflare.test.ts index be93b3a..e6e9792 100644 --- a/src/__tests__/cloudflare.test.ts +++ b/src/__tests__/cloudflare.test.ts @@ -348,11 +348,44 @@ describe('CloudflareProvider', () => { expect(content.filter((p: { type: string }) => p.type === 'image_url')).toHaveLength(2); }); - it('accepts pre-formed data: URLs via image.url', async () => { - mockAiRun.mockResolvedValueOnce({ - choices: [{ message: { content: 'ok' }, finish_reason: 'stop' }] + it('uses raw binding format for llama-3.2-11b-vision-instruct (fixes silent empty response)', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'A delicious pasta dish.' }); + + const result = await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'Describe this food image.' }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }], + maxTokens: 512 + }); + + const [modelArg, body] = mockAiRun.mock.calls[0]; + expect(modelArg).toBe('@cf/meta/llama-3.2-11b-vision-instruct'); + expect(Array.isArray(body.image)).toBe(true); + expect(body.image).toHaveLength(3); // QUJD = 3 bytes: [65, 66, 67] + expect(body.prompt).toBe('Describe this food image.'); + expect(body.max_tokens).toBe(512); + expect(body.messages).toBeUndefined(); + expect(result.content).toBe('A delicious pasta dish.'); + expect(result.message).toBe('A delicious pasta dish.'); + }); + + it('prepends system prompt to raw binding prompt for llama-3.2', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'Pasta.' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'What is this?' }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }], + systemPrompt: 'You are a food critic.', }); + const [, body] = mockAiRun.mock.calls[0]; + expect(body.prompt).toBe('You are a food critic.\n\nWhat is this?'); + }); + + it('accepts pre-formed data: URL for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'ok' }); + await provider.generateResponse({ model: '@cf/meta/llama-3.2-11b-vision-instruct', messages: [{ role: 'user', content: 'x' }], @@ -360,8 +393,27 @@ describe('CloudflareProvider', () => { }); const [, body] = mockAiRun.mock.calls[0]; - const imagePart = body.messages[0].content[1]; - expect(imagePart.image_url.url).toBe('data:image/webp;base64,ZEFUQQ=='); + expect(Array.isArray(body.image)).toBe(true); + expect(body.messages).toBeUndefined(); + }); + + it('other vision models (gemma-4, llama-4-scout) still use chat/image_url format', async () => { + mockAiRun.mockResolvedValueOnce({ + choices: [{ message: { content: 'A tomato.' }, finish_reason: 'stop' }] + }); + + await provider.generateResponse({ + model: '@cf/google/gemma-4-26b-a4b-it', + messages: [{ role: 'user', content: 'What is in this image?' }], + images: [{ data: 'QUJD', mimeType: 'image/png' }], + maxTokens: 256 + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.messages).toBeDefined(); + expect(body.image).toBeUndefined(); + const imagePart = body.messages[body.messages.length - 1].content[1]; + expect(imagePart.image_url.url).toBe('data:image/png;base64,QUJD'); }); it('rejects HTTP image URLs (requires base64 bytes)', async () => { diff --git a/src/providers/cloudflare.ts b/src/providers/cloudflare.ts index ed8d151..50cdad7 100755 --- a/src/providers/cloudflare.ts +++ b/src/providers/cloudflare.ts @@ -87,6 +87,13 @@ interface WorkersAIResult { result?: WorkersAIResult; // wrapped responses } +// Workers AI binding for this model requires a raw { image, prompt } input rather +// than the OpenAI-compatible messages/image_url format. The chat path returns +// choices[0].message.content === null via the binding, silently producing "". +const LLAMA_VISION_RAW_MODELS = new Set([ + '@cf/meta/llama-3.2-11b-vision-instruct' +]); + export class CloudflareProvider extends BaseProvider { name = 'cloudflare'; models = [ @@ -134,13 +141,19 @@ export class CloudflareProvider extends BaseProvider { try { const response = await this.executeWithResiliency(async () => { const model = request.model || this.getRecommendedModel(request); - const cloudflareRequest = this.formatRequest(request, model); - // Validate model is supported if (!this.models.includes(model)) { throw new ModelNotFoundError('cloudflare', model); } + // llama-3.2-11b vision requires the raw Workers AI binding format. + // The chat/image_url path returns null content via the binding. + if (LLAMA_VISION_RAW_MODELS.has(model) && (request.images?.length ?? 0) > 0) { + const result = await this.runLlamaVisionRaw(request, model); + return this.formatResponse(result as WorkersAIResult, model, request, Date.now() - startTime); + } + + const cloudflareRequest = this.formatRequest(request, model); // Workers AI binding uses branded model names; cast at API boundary // eslint-disable-next-line @typescript-eslint/no-explicit-any -- Ai.run() requires branded model types const result = await (this.ai as { run(model: string, input: unknown): Promise }).run(model, cloudflareRequest); @@ -358,6 +371,39 @@ export class CloudflareProvider extends BaseProvider { }; } + private async runLlamaVisionRaw(request: LLMRequest, model: string): Promise { + const image = request.images![0]; + + let imageBytes: number[]; + if (image.data) { + imageBytes = Array.from(Uint8Array.from(atob(image.data), c => c.charCodeAt(0))); + } else if (image.url?.startsWith('data:')) { + const b64 = image.url.split(',')[1] ?? ''; + imageBytes = Array.from(Uint8Array.from(atob(b64), c => c.charCodeAt(0))); + } else { + throw new ConfigurationError( + this.name, + `${model} requires base64 image data or a data: URL — HTTP URLs are not supported.` + ); + } + + const systemPrefix = request.systemPrompt ? `${request.systemPrompt}\n\n` : ''; + let lastUserText = ''; + for (let i = request.messages.length - 1; i >= 0; i--) { + if (request.messages[i].role === 'user') { + lastUserText = typeof request.messages[i].content === 'string' ? request.messages[i].content : ''; + break; + } + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any -- Ai.run() requires branded model types + return (this.ai as { run(model: string, input: unknown): Promise }).run(model, { + image: imageBytes, + prompt: `${systemPrefix}${lastUserText}`, + max_tokens: request.maxTokens + }) as Promise; + } + private formatRequest(request: LLMRequest, model: string): CloudflareRequest { const capabilities = this.getModelCapabilities()[model]; const usesTools = From 293ac6370a9f2aab0735258c0b6d1fbd090fd8b9 Mon Sep 17 00:00:00 2001 From: Stackbilt Date: Mon, 27 Apr 2026 07:01:41 -0500 Subject: [PATCH 2/2] fix: address review comments on #53 fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Guard against multiple images (silent truncation → explicit error) - Flatten array-content user messages into raw binding prompt string - Default max_tokens to 512 when not provided (avoids undefined) - Expand LLAMA_VISION_RAW_MODELS comment for future maintainers - Three new tests covering the above Co-Authored-By: Claude Sonnet 4.6 --- src/__tests__/cloudflare.test.ts | 45 ++++++++++++++++++++++++++++++++ src/providers/cloudflare.ts | 26 ++++++++++++++---- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/__tests__/cloudflare.test.ts b/src/__tests__/cloudflare.test.ts index e6e9792..df07a2c 100644 --- a/src/__tests__/cloudflare.test.ts +++ b/src/__tests__/cloudflare.test.ts @@ -383,6 +383,51 @@ describe('CloudflareProvider', () => { expect(body.prompt).toBe('You are a food critic.\n\nWhat is this?'); }); + it('rejects multiple images on llama-3.2 raw binding with a clear error', async () => { + await expect( + provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'compare' }], + images: [ + { data: 'QUJD', mimeType: 'image/jpeg' }, + { data: 'REVG', mimeType: 'image/jpeg' } + ] + }) + ).rejects.toThrow(/supports exactly one image/); + }); + + it('extracts text from array-content user message for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'Spaghetti.' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ + role: 'user', + content: [ + { type: 'text', text: 'What food is this?' }, + { type: 'text', text: 'Be brief.' } + ] as unknown as string + }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }] + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.prompt).toBe('What food is this? Be brief.'); + }); + + it('defaults max_tokens to 512 when not specified for llama-3.2 raw binding', async () => { + mockAiRun.mockResolvedValueOnce({ response: 'ok' }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'x' }], + images: [{ data: 'QUJD', mimeType: 'image/jpeg' }] + }); + + const [, body] = mockAiRun.mock.calls[0]; + expect(body.max_tokens).toBe(512); + }); + it('accepts pre-formed data: URL for llama-3.2 raw binding', async () => { mockAiRun.mockResolvedValueOnce({ response: 'ok' }); diff --git a/src/providers/cloudflare.ts b/src/providers/cloudflare.ts index 50cdad7..18053ac 100755 --- a/src/providers/cloudflare.ts +++ b/src/providers/cloudflare.ts @@ -87,9 +87,10 @@ interface WorkersAIResult { result?: WorkersAIResult; // wrapped responses } -// Workers AI binding for this model requires a raw { image, prompt } input rather -// than the OpenAI-compatible messages/image_url format. The chat path returns -// choices[0].message.content === null via the binding, silently producing "". +// Models that require the raw { image, prompt } binding format rather than chat/image_url. +// Add any new CF vision models here if they exhibit the same null-content symptom via the binding. +// (The chat path returns choices[0].message.content === null through the Workers AI binding, +// silently producing "".) const LLAMA_VISION_RAW_MODELS = new Set([ '@cf/meta/llama-3.2-11b-vision-instruct' ]); @@ -372,6 +373,13 @@ export class CloudflareProvider extends BaseProvider { } private async runLlamaVisionRaw(request: LLMRequest, model: string): Promise { + if (request.images!.length > 1) { + throw new ConfigurationError( + this.name, + `${model} supports exactly one image via the raw binding format — ${request.images!.length} were provided.` + ); + } + const image = request.images![0]; let imageBytes: number[]; @@ -391,7 +399,15 @@ export class CloudflareProvider extends BaseProvider { let lastUserText = ''; for (let i = request.messages.length - 1; i >= 0; i--) { if (request.messages[i].role === 'user') { - lastUserText = typeof request.messages[i].content === 'string' ? request.messages[i].content : ''; + const raw = request.messages[i].content; + lastUserText = typeof raw === 'string' + ? raw + : Array.isArray(raw) + ? (raw as Array<{ type?: string; text?: string }>) + .filter(p => p.type === 'text') + .map(p => p.text ?? '') + .join(' ') + : ''; break; } } @@ -400,7 +416,7 @@ export class CloudflareProvider extends BaseProvider { return (this.ai as { run(model: string, input: unknown): Promise }).run(model, { image: imageBytes, prompt: `${systemPrefix}${lastUserText}`, - max_tokens: request.maxTokens + max_tokens: request.maxTokens ?? 512 }) as Promise; }