From 76d0a3e2907b0ff7a6f7939c6cff9ea424f1a477 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Wed, 25 Feb 2026 11:43:45 +0800 Subject: [PATCH 1/4] fix(security): prevent data exfiltration via img src in chat output - Add sanitizeChatContent utility to restrict img src to same-origin, relative, or data: URLs only - Block external URLs in img tags (e.g. prompt injection embedding conversation history) - Apply sanitization to all chat/AI content rendering: markdown-content, floating-chat-widget, highlight-markdown, search views, chunk cards - Mitigates vulnerability where malicious instructions (from KB, custom prompt, agent, etc.) could exfiltrate user conversations to attacker-controlled servers Co-authored-by: Cursor --- .../floating-chat-widget-markdown.tsx | 7 ++- .../components/highlight-markdown/index.tsx | 3 +- web/src/components/markdown-content/index.tsx | 9 +-- .../next-markdown-content/index.tsx | 9 +-- .../components/chunk-card/index.tsx | 4 +- .../components/chunk-card/index.tsx | 4 +- .../next-search/markdown-content/index.tsx | 9 +-- web/src/pages/next-search/search-view.tsx | 4 +- web/src/pages/search/index.tsx | 4 +- web/src/utils/sanitize.ts | 57 +++++++++++++++++++ 10 files changed, 80 insertions(+), 30 deletions(-) create mode 100644 web/src/utils/sanitize.ts diff --git a/web/src/components/floating-chat-widget-markdown.tsx b/web/src/components/floating-chat-widget-markdown.tsx index b4a7db1f2..f7b316a39 100644 --- a/web/src/components/floating-chat-widget-markdown.tsx +++ b/web/src/components/floating-chat-widget-markdown.tsx @@ -14,10 +14,10 @@ import { showImage, } from '@/utils/chat'; import { getExtension } from '@/utils/document-util'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { InfoCircleOutlined } from '@ant-design/icons'; import { Button, Flex, Popover, Tooltip } from 'antd'; import classNames from 'classnames'; -import DOMPurify from 'dompurify'; import 'katex/dist/katex.min.css'; import { omit } from 'lodash'; import { pipe } from 'lodash/fp'; @@ -57,7 +57,8 @@ const FloatingChatWidgetMarkdown = ({ const isDarkTheme = useIsDarkTheme(); const contentWithCursor = useMemo(() => { - let text = content === '' ? t('chat.searching') : content; + let text = + content === '' ? t('chat.searching') : sanitizeChatContent(content); const nextText = replaceTextByOldReg(text); return pipe(replaceThinkToSection, preprocessLaTeX)(nextText); }, [content, t]); @@ -179,7 +180,7 @@ const FloatingChatWidgetMarkdown = ({
diff --git a/web/src/components/highlight-markdown/index.tsx b/web/src/components/highlight-markdown/index.tsx index 50b752680..0b1394c2f 100644 --- a/web/src/components/highlight-markdown/index.tsx +++ b/web/src/components/highlight-markdown/index.tsx @@ -13,6 +13,7 @@ import remarkMath from 'remark-math'; import 'katex/dist/katex.min.css'; // `rehype-katex` does not import the CSS for you import { preprocessLaTeX } from '@/utils/chat'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { useIsDarkTheme } from '../theme-provider'; import styles from './index.less'; @@ -51,7 +52,7 @@ const HighLightMarkdown = ({ } as any } > - {children ? preprocessLaTeX(children) : children} + {children ? preprocessLaTeX(sanitizeChatContent(children)) : children} ); }; diff --git a/web/src/components/markdown-content/index.tsx b/web/src/components/markdown-content/index.tsx index 6e93bf134..3d9bf161e 100644 --- a/web/src/components/markdown-content/index.tsx +++ b/web/src/components/markdown-content/index.tsx @@ -2,7 +2,7 @@ import Image from '@/components/image'; import SvgIcon from '@/components/svg-icon'; import { IReference, IReferenceChunk } from '@/interfaces/database/chat'; import { getExtension } from '@/utils/document-util'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { useCallback, useEffect, useMemo } from 'react'; import Markdown from 'react-markdown'; import SyntaxHighlighter from 'react-syntax-highlighter'; @@ -52,10 +52,7 @@ const MarkdownContent = ({ const { setDocumentIds, data: fileThumbnails } = useFetchDocumentThumbnailsByIds(); const contentWithCursor = useMemo(() => { - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { @@ -166,7 +163,7 @@ const MarkdownContent = ({
diff --git a/web/src/components/next-markdown-content/index.tsx b/web/src/components/next-markdown-content/index.tsx index 8a300b226..be4a0d3dd 100644 --- a/web/src/components/next-markdown-content/index.tsx +++ b/web/src/components/next-markdown-content/index.tsx @@ -2,7 +2,7 @@ import Image from '@/components/image'; import SvgIcon from '@/components/svg-icon'; import { IReferenceChunk, IReferenceObject } from '@/interfaces/database/chat'; import { getExtension } from '@/utils/document-util'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { memo, useCallback, useEffect, useMemo } from 'react'; import Markdown from 'react-markdown'; import SyntaxHighlighter from 'react-syntax-highlighter'; @@ -53,10 +53,7 @@ function MarkdownContent({ const { setDocumentIds, data: fileThumbnails } = useFetchDocumentThumbnailsByIds(); const contentWithCursor = useMemo(() => { - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { text = t('chat.searching'); @@ -168,7 +165,7 @@ function MarkdownContent({
diff --git a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx index 97a5af714..5c50ac666 100644 --- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx +++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx @@ -10,9 +10,9 @@ import { } from '@/components/ui/tooltip'; import type { ChunkDocType, IChunk } from '@/interfaces/database/knowledge'; import { cn } from '@/lib/utils'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { CheckedState } from '@radix-ui/react-checkbox'; import classNames from 'classnames'; -import DOMPurify from 'dompurify'; import { useEffect, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { ChunkTextMode } from '../../constant'; @@ -124,7 +124,7 @@ const ChunkCard = ({ >
{ - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { text = t('chat.searching'); @@ -168,7 +165,7 @@ const MarkdownContent = ({
diff --git a/web/src/pages/next-search/search-view.tsx b/web/src/pages/next-search/search-view.tsx index 890e28072..4b1068355 100644 --- a/web/src/pages/next-search/search-view.tsx +++ b/web/src/pages/next-search/search-view.tsx @@ -14,7 +14,7 @@ import { import { RAGFlowPagination } from '@/components/ui/ragflow-pagination'; import { IReference } from '@/interfaces/database/chat'; import { cn } from '@/lib/utils'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { isEmpty } from 'lodash'; import { BrainCircuit, Search, X } from 'lucide-react'; import { Dispatch, SetStateAction, useEffect, useState } from 'react'; @@ -208,7 +208,7 @@ export default function SearchingView({
{ >
{ + if (node.tagName === 'IMG') { + const src = node.getAttribute('src'); + if (src && !isAllowedImageUrl(src)) { + node.removeAttribute('src'); + } + } + }); + secureImageHookAdded = true; +} + +/** + * Sanitize content for safe rendering. Restricts img src to same-origin, + * relative, or data: URLs only. Use for all chat/AI output and user content. + */ +export function sanitizeChatContent(dirty: string, config?: Config): string { + ensureSecureImageHook(); + const result = DOMPurify.sanitize(dirty, { + ADD_TAGS: ['think', 'section'], + ADD_ATTR: ['class'], + ...config, + }); + return typeof result === 'string' ? result : ''; +} From edda10091e028bd527e1df5759d103c0be611b27 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Wed, 25 Feb 2026 11:46:24 +0800 Subject: [PATCH 2/4] chore: ignore web/.env for local debugging config Co-authored-by: Cursor --- web/.env | 2 -- web/.gitignore | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 web/.env diff --git a/web/.env b/web/.env deleted file mode 100644 index a6cbd9ccd..000000000 --- a/web/.env +++ /dev/null @@ -1,2 +0,0 @@ -PORT=9222 -DID_YOU_KNOW=none \ No newline at end of file diff --git a/web/.gitignore b/web/.gitignore index fe4e6bf38..b934a778d 100644 --- a/web/.gitignore +++ b/web/.gitignore @@ -1,4 +1,5 @@ /node_modules +/.env /.env.local /.umirc.local.ts /config/config.local.ts From a9815aa001aaf86ba965d860995be13bf8b82960 Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Wed, 25 Feb 2026 14:35:54 +0800 Subject: [PATCH 3/4] Use relative path so frontend treats it as same-origin --- powerrag/parser/mineru_parser.py | 14 +++----------- powerrag/parser/vllm_parser.py | 16 +++++----------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/powerrag/parser/mineru_parser.py b/powerrag/parser/mineru_parser.py index 1ad3765c1..81405bfde 100644 --- a/powerrag/parser/mineru_parser.py +++ b/powerrag/parser/mineru_parser.py @@ -248,18 +248,10 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s # Store image in OceanBase STORAGE_IMPL.put(output_dir, img_name, img_bytes) - # Generate URL for the image using PowerRAG image access endpoint - # Get PowerRAG server configuration - powerrag_config = get_base_config("powerrag", {}) or {} - server_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - - # Ensure server_url has protocol prefix - if not server_url.startswith("http://") and not server_url.startswith("https://"): - server_url = f"http://{server_url}" - - # Construct the image URL using PowerRAG chunk image endpoint + # Use relative path so frontend treats it as same-origin (proxied to backend). + # Avoids sanitizer blocking external img src while preventing data exfiltration. kb_id = output_dir.split('/')[0] if '/' in output_dir else output_dir - image_url = f"{server_url}/api/v1/powerrag/chunk/image/{kb_id}/{img_name}" + image_url = f"/api/v1/powerrag/chunk/image/{kb_id}/{img_name}" # Add to result list image_info.append((img_name, image_url)) diff --git a/powerrag/parser/vllm_parser.py b/powerrag/parser/vllm_parser.py index 30778d1b7..6ba1d9d4d 100644 --- a/powerrag/parser/vllm_parser.py +++ b/powerrag/parser/vllm_parser.py @@ -415,10 +415,8 @@ def layoutjson2md(self, image: Image.Image, cells: list, text_key: str = 'text', # Store image in storage (bucket) STORAGE_IMPL.put(output_dir, img_filename, img_bytes) - # Generate URL for the image - powerrag_config = get_base_config("powerrag", {}) or {} - api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_filename}" + # Use relative path so frontend treats it as same-origin (proxied to backend) + image_url = f"/v1/chunk/image/{output_dir}/{img_filename}" # Use HTML img tag with URL text_items.append(f'$$00$$') @@ -662,13 +660,9 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s # Store image in storage STORAGE_IMPL.put(output_dir, img_name, img_bytes) - # Generate URL for the image using RAGFlow image access endpoint - # Get RAGFlow server configuration - powerrag_config = get_base_config("powerrag", {}) or {} - api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - - # Construct the image URL using the auth_image endpoint - image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_name}" + # Use relative path so frontend treats it as same-origin (proxied to backend). + # Avoids sanitizer blocking external img src while preventing data exfiltration. + image_url = f"/v1/chunk/image/{output_dir}/{img_name}" # Add to result list image_info.append((img_name, image_url)) From e83f36f89115bbf4cec24c231dd346bb21bea50d Mon Sep 17 00:00:00 2001 From: "suiyu.zg" Date: Wed, 25 Feb 2026 15:31:23 +0800 Subject: [PATCH 4/4] solve copilot reviews --- web/src/utils/sanitize.ts | 46 +++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/web/src/utils/sanitize.ts b/web/src/utils/sanitize.ts index a1880d328..b2f1202a0 100644 --- a/web/src/utils/sanitize.ts +++ b/web/src/utils/sanitize.ts @@ -5,13 +5,16 @@ */ import DOMPurify, { Config } from 'dompurify'; -const ALLOWED_IMAGE_URL_PATTERN = /^(?:\/|data:image\/|\.\/|\.\.\/)/i; +// Single / only (not //), ./ , ../ , or data:image/ +const ALLOWED_IMAGE_URL_PATTERN = /^(?:\/(?!\/)|data:image\/|\.\/|\.\.\/)/i; function isAllowedImageUrl(url: string): boolean { if (!url || typeof url !== 'string') return false; const trimmed = url.trim(); if (!trimmed) return false; - // Allow relative URLs: /path, ./path, ../path + // Block protocol-relative URLs (//evil.com) and backslash variants (\\...) + if (trimmed.startsWith('//') || trimmed.startsWith('\\\\')) return false; + // Allow relative URLs: /path (single slash), ./path, ../path if (ALLOWED_IMAGE_URL_PATTERN.test(trimmed)) return true; // Allow data:image/* for inline images (no network request) if (trimmed.toLowerCase().startsWith('data:image/')) return true; @@ -27,24 +30,49 @@ function isAllowedImageUrl(url: string): boolean { return false; } +/** Parse srcset attribute into list of URLs (descriptors like 1x, 100w are stripped). */ +function getUrlsFromSrcset(srcset: string): string[] { + return srcset + .split(',') + .map((s) => s.trim().split(/\s+/)[0]) + .filter(Boolean); +} + +function isSrcsetAllowed(srcset: string): boolean { + const urls = getUrlsFromSrcset(srcset); + return urls.length > 0 && urls.every((url) => isAllowedImageUrl(url)); +} + +function sanitizeImageUrlAttributes(node: Element): void { + const tag = node.tagName; + if (tag !== 'IMG' && tag !== 'SOURCE') return; + + const src = node.getAttribute('src'); + if (src && !isAllowedImageUrl(src)) { + node.removeAttribute('src'); + } + + const srcset = node.getAttribute('srcset'); + if (srcset && !isSrcsetAllowed(srcset)) { + node.removeAttribute('srcset'); + } +} + let secureImageHookAdded = false; function ensureSecureImageHook(): void { if (secureImageHookAdded) return; DOMPurify.addHook('afterSanitizeAttributes', (node) => { - if (node.tagName === 'IMG') { - const src = node.getAttribute('src'); - if (src && !isAllowedImageUrl(src)) { - node.removeAttribute('src'); - } + if (node.nodeType === 1) { + sanitizeImageUrlAttributes(node as Element); } }); secureImageHookAdded = true; } /** - * Sanitize content for safe rendering. Restricts img src to same-origin, - * relative, or data: URLs only. Use for all chat/AI output and user content. + * Sanitize content for safe rendering. Restricts img/srcset (and source src/srcset) + * to same-origin, relative, or data: URLs only. Use for all chat/AI output and user content. */ export function sanitizeChatContent(dirty: string, config?: Config): string { ensureSecureImageHook();