diff --git a/powerrag/parser/mineru_parser.py b/powerrag/parser/mineru_parser.py index 1ad3765c1..81405bfde 100644 --- a/powerrag/parser/mineru_parser.py +++ b/powerrag/parser/mineru_parser.py @@ -248,18 +248,10 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s # Store image in OceanBase STORAGE_IMPL.put(output_dir, img_name, img_bytes) - # Generate URL for the image using PowerRAG image access endpoint - # Get PowerRAG server configuration - powerrag_config = get_base_config("powerrag", {}) or {} - server_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - - # Ensure server_url has protocol prefix - if not server_url.startswith("http://") and not server_url.startswith("https://"): - server_url = f"http://{server_url}" - - # Construct the image URL using PowerRAG chunk image endpoint + # Use relative path so frontend treats it as same-origin (proxied to backend). + # Avoids sanitizer blocking external img src while preventing data exfiltration. 
kb_id = output_dir.split('/')[0] if '/' in output_dir else output_dir - image_url = f"{server_url}/api/v1/powerrag/chunk/image/{kb_id}/{img_name}" + image_url = f"/api/v1/powerrag/chunk/image/{kb_id}/{img_name}" # Add to result list image_info.append((img_name, image_url)) diff --git a/powerrag/parser/vllm_parser.py b/powerrag/parser/vllm_parser.py index 30778d1b7..6ba1d9d4d 100644 --- a/powerrag/parser/vllm_parser.py +++ b/powerrag/parser/vllm_parser.py @@ -415,10 +415,8 @@ def layoutjson2md(self, image: Image.Image, cells: list, text_key: str = 'text', # Store image in storage (bucket) STORAGE_IMPL.put(output_dir, img_filename, img_bytes) - # Generate URL for the image - powerrag_config = get_base_config("powerrag", {}) or {} - api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_filename}" + # Use relative path so frontend treats it as same-origin (proxied to backend) + image_url = f"/v1/chunk/image/{output_dir}/{img_filename}" # Use HTML img tag with URL text_items.append(f'$$00$$') @@ -662,13 +660,9 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s # Store image in storage STORAGE_IMPL.put(output_dir, img_name, img_bytes) - # Generate URL for the image using RAGFlow image access endpoint - # Get RAGFlow server configuration - powerrag_config = get_base_config("powerrag", {}) or {} - api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000") - - # Construct the image URL using the auth_image endpoint - image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_name}" + # Use relative path so frontend treats it as same-origin (proxied to backend). + # Avoids sanitizer blocking external img src while preventing data exfiltration. 
+ image_url = f"/v1/chunk/image/{output_dir}/{img_name}" # Add to result list image_info.append((img_name, image_url)) diff --git a/web/.env b/web/.env deleted file mode 100644 index a6cbd9ccd..000000000 --- a/web/.env +++ /dev/null @@ -1,2 +0,0 @@ -PORT=9222 -DID_YOU_KNOW=none \ No newline at end of file diff --git a/web/.gitignore b/web/.gitignore index fe4e6bf38..b934a778d 100644 --- a/web/.gitignore +++ b/web/.gitignore @@ -1,4 +1,5 @@ /node_modules +/.env /.env.local /.umirc.local.ts /config/config.local.ts diff --git a/web/src/components/floating-chat-widget-markdown.tsx b/web/src/components/floating-chat-widget-markdown.tsx index b4a7db1f2..f7b316a39 100644 --- a/web/src/components/floating-chat-widget-markdown.tsx +++ b/web/src/components/floating-chat-widget-markdown.tsx @@ -14,10 +14,10 @@ import { showImage, } from '@/utils/chat'; import { getExtension } from '@/utils/document-util'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { InfoCircleOutlined } from '@ant-design/icons'; import { Button, Flex, Popover, Tooltip } from 'antd'; import classNames from 'classnames'; -import DOMPurify from 'dompurify'; import 'katex/dist/katex.min.css'; import { omit } from 'lodash'; import { pipe } from 'lodash/fp'; @@ -57,7 +57,8 @@ const FloatingChatWidgetMarkdown = ({ const isDarkTheme = useIsDarkTheme(); const contentWithCursor = useMemo(() => { - let text = content === '' ? t('chat.searching') : content; + let text = + content === '' ? t('chat.searching') : sanitizeChatContent(content); const nextText = replaceTextByOldReg(text); return pipe(replaceThinkToSection, preprocessLaTeX)(nextText); }, [content, t]); @@ -179,7 +180,7 @@ const FloatingChatWidgetMarkdown = ({
diff --git a/web/src/components/highlight-markdown/index.tsx b/web/src/components/highlight-markdown/index.tsx index 50b752680..0b1394c2f 100644 --- a/web/src/components/highlight-markdown/index.tsx +++ b/web/src/components/highlight-markdown/index.tsx @@ -13,6 +13,7 @@ import remarkMath from 'remark-math'; import 'katex/dist/katex.min.css'; // `rehype-katex` does not import the CSS for you import { preprocessLaTeX } from '@/utils/chat'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { useIsDarkTheme } from '../theme-provider'; import styles from './index.less'; @@ -51,7 +52,7 @@ const HighLightMarkdown = ({ } as any } > - {children ? preprocessLaTeX(children) : children} + {children ? preprocessLaTeX(sanitizeChatContent(children)) : children} ); }; diff --git a/web/src/components/markdown-content/index.tsx b/web/src/components/markdown-content/index.tsx index 6e93bf134..3d9bf161e 100644 --- a/web/src/components/markdown-content/index.tsx +++ b/web/src/components/markdown-content/index.tsx @@ -2,7 +2,7 @@ import Image from '@/components/image'; import SvgIcon from '@/components/svg-icon'; import { IReference, IReferenceChunk } from '@/interfaces/database/chat'; import { getExtension } from '@/utils/document-util'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { useCallback, useEffect, useMemo } from 'react'; import Markdown from 'react-markdown'; import SyntaxHighlighter from 'react-syntax-highlighter'; @@ -52,10 +52,7 @@ const MarkdownContent = ({ const { setDocumentIds, data: fileThumbnails } = useFetchDocumentThumbnailsByIds(); const contentWithCursor = useMemo(() => { - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { @@ -166,7 +163,7 @@ const MarkdownContent = ({
diff --git a/web/src/components/next-markdown-content/index.tsx b/web/src/components/next-markdown-content/index.tsx index 8a300b226..be4a0d3dd 100644 --- a/web/src/components/next-markdown-content/index.tsx +++ b/web/src/components/next-markdown-content/index.tsx @@ -2,7 +2,7 @@ import Image from '@/components/image'; import SvgIcon from '@/components/svg-icon'; import { IReferenceChunk, IReferenceObject } from '@/interfaces/database/chat'; import { getExtension } from '@/utils/document-util'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { memo, useCallback, useEffect, useMemo } from 'react'; import Markdown from 'react-markdown'; import SyntaxHighlighter from 'react-syntax-highlighter'; @@ -53,10 +53,7 @@ function MarkdownContent({ const { setDocumentIds, data: fileThumbnails } = useFetchDocumentThumbnailsByIds(); const contentWithCursor = useMemo(() => { - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { text = t('chat.searching'); @@ -168,7 +165,7 @@ function MarkdownContent({
diff --git a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx index 97a5af714..5c50ac666 100644 --- a/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx +++ b/web/src/pages/chunk/parsed-result/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx @@ -10,9 +10,9 @@ import { } from '@/components/ui/tooltip'; import type { ChunkDocType, IChunk } from '@/interfaces/database/knowledge'; import { cn } from '@/lib/utils'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { CheckedState } from '@radix-ui/react-checkbox'; import classNames from 'classnames'; -import DOMPurify from 'dompurify'; import { useEffect, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { ChunkTextMode } from '../../constant'; @@ -124,7 +124,7 @@ const ChunkCard = ({ >
{ - let text = DOMPurify.sanitize(content, { - ADD_TAGS: ['think', 'section'], - ADD_ATTR: ['class'], - }); + let text = sanitizeChatContent(content); // let text = content; if (text === '') { text = t('chat.searching'); @@ -168,7 +165,7 @@ const MarkdownContent = ({
diff --git a/web/src/pages/next-search/search-view.tsx b/web/src/pages/next-search/search-view.tsx index 890e28072..4b1068355 100644 --- a/web/src/pages/next-search/search-view.tsx +++ b/web/src/pages/next-search/search-view.tsx @@ -14,7 +14,7 @@ import { import { RAGFlowPagination } from '@/components/ui/ragflow-pagination'; import { IReference } from '@/interfaces/database/chat'; import { cn } from '@/lib/utils'; -import DOMPurify from 'dompurify'; +import { sanitizeChatContent } from '@/utils/sanitize'; import { isEmpty } from 'lodash'; import { BrainCircuit, Search, X } from 'lucide-react'; import { Dispatch, SetStateAction, useEffect, useState } from 'react'; @@ -208,7 +208,7 @@ export default function SearchingView({
{ >
s.trim().split(/\s+/)[0]) + .filter(Boolean); +} + +function isSrcsetAllowed(srcset: string): boolean { + const urls = getUrlsFromSrcset(srcset); + return urls.length > 0 && urls.every((url) => isAllowedImageUrl(url)); +} + +function sanitizeImageUrlAttributes(node: Element): void { + const tag = node.tagName; + if (tag !== 'IMG' && tag !== 'SOURCE') return; + + const src = node.getAttribute('src'); + if (src && !isAllowedImageUrl(src)) { + node.removeAttribute('src'); + } + + const srcset = node.getAttribute('srcset'); + if (srcset && !isSrcsetAllowed(srcset)) { + node.removeAttribute('srcset'); + } +} + +let secureImageHookAdded = false; + +function ensureSecureImageHook(): void { + if (secureImageHookAdded) return; + DOMPurify.addHook('afterSanitizeAttributes', (node) => { + if (node.nodeType === 1) { + sanitizeImageUrlAttributes(node as Element); + } + }); + secureImageHookAdded = true; +} + +/** + * Sanitize content for safe rendering. Restricts img src/srcset (and source src/srcset) + * to same-origin, relative, or data: URLs only. Use for all chat/AI output and user content. + */ +export function sanitizeChatContent(dirty: string, config?: Config): string { + ensureSecureImageHook(); + const result = DOMPurify.sanitize(dirty, { + ADD_TAGS: ['think', 'section'], + ADD_ATTR: ['class'], + ...config, + }); + return typeof result === 'string' ? result : ''; +}