Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 3 additions & 11 deletions powerrag/parser/mineru_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,10 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s
# Store image in OceanBase
STORAGE_IMPL.put(output_dir, img_name, img_bytes)

# Generate URL for the image using PowerRAG image access endpoint
# Get PowerRAG server configuration
powerrag_config = get_base_config("powerrag", {}) or {}
server_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000")

# Ensure server_url has protocol prefix
if not server_url.startswith("http://") and not server_url.startswith("https://"):
server_url = f"http://{server_url}"

# Construct the image URL using PowerRAG chunk image endpoint
# Use relative path so frontend treats it as same-origin (proxied to backend).
# Avoids sanitizer blocking external img src while preventing data exfiltration.
kb_id = output_dir.split('/')[0] if '/' in output_dir else output_dir
image_url = f"{server_url}/api/v1/powerrag/chunk/image/{kb_id}/{img_name}"
image_url = f"/api/v1/powerrag/chunk/image/{kb_id}/{img_name}"

# Add to result list
image_info.append((img_name, image_url))
Expand Down
16 changes: 5 additions & 11 deletions powerrag/parser/vllm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,8 @@ def layoutjson2md(self, image: Image.Image, cells: list, text_key: str = 'text',
# Store image in storage (bucket)
STORAGE_IMPL.put(output_dir, img_filename, img_bytes)

# Generate URL for the image
powerrag_config = get_base_config("powerrag", {}) or {}
api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000")
image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_filename}"
# Use relative path so frontend treats it as same-origin (proxied to backend)
image_url = f"/v1/chunk/image/{output_dir}/{img_filename}"

# Use HTML img tag with URL
text_items.append(f'<img src="{image_url}" alt="$$00$$" style="max-width: 60%; height: auto;">')
Expand Down Expand Up @@ -662,13 +660,9 @@ def store_images(self, md_content: str, images: ImageDict, output_dir: str) -> s
# Store image in storage
STORAGE_IMPL.put(output_dir, img_name, img_bytes)

# Generate URL for the image using RAGFlow image access endpoint
# Get RAGFlow server configuration
powerrag_config = get_base_config("powerrag", {}) or {}
api_url = os.environ.get("PUBLIC_SERVER_URL", "http://localhost:6000")

# Construct the image URL using the auth_image endpoint
image_url = f"http://{api_url}/v1/chunk/image/{output_dir}/{img_name}"
# Use relative path so frontend treats it as same-origin (proxied to backend).
# Avoids sanitizer blocking external img src while preventing data exfiltration.
image_url = f"/v1/chunk/image/{output_dir}/{img_name}"

# Add to result list
image_info.append((img_name, image_url))
Expand Down
2 changes: 0 additions & 2 deletions web/.env

This file was deleted.

1 change: 1 addition & 0 deletions web/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/node_modules
/.env
/.env.local
/.umirc.local.ts
/config/config.local.ts
Expand Down
7 changes: 4 additions & 3 deletions web/src/components/floating-chat-widget-markdown.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ import {
showImage,
} from '@/utils/chat';
import { getExtension } from '@/utils/document-util';
import { sanitizeChatContent } from '@/utils/sanitize';
import { InfoCircleOutlined } from '@ant-design/icons';
import { Button, Flex, Popover, Tooltip } from 'antd';
import classNames from 'classnames';
import DOMPurify from 'dompurify';
import 'katex/dist/katex.min.css';
import { omit } from 'lodash';
import { pipe } from 'lodash/fp';
Expand Down Expand Up @@ -57,7 +57,8 @@ const FloatingChatWidgetMarkdown = ({
const isDarkTheme = useIsDarkTheme();

const contentWithCursor = useMemo(() => {
let text = content === '' ? t('chat.searching') : content;
let text =
content === '' ? t('chat.searching') : sanitizeChatContent(content);
const nextText = replaceTextByOldReg(text);
return pipe(replaceThinkToSection, preprocessLaTeX)(nextText);
}, [content, t]);
Expand Down Expand Up @@ -179,7 +180,7 @@ const FloatingChatWidgetMarkdown = ({
<div className="space-y-2 flex-1 min-w-0">
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(chunkItem?.content ?? ''),
__html: sanitizeChatContent(chunkItem?.content ?? ''),
}}
className="max-h-[250px] overflow-y-auto text-xs leading-relaxed p-2 bg-gray-50 dark:bg-gray-800 rounded prose-sm"
></div>
Expand Down
3 changes: 2 additions & 1 deletion web/src/components/highlight-markdown/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import remarkMath from 'remark-math';
import 'katex/dist/katex.min.css'; // `rehype-katex` does not import the CSS for you

import { preprocessLaTeX } from '@/utils/chat';
import { sanitizeChatContent } from '@/utils/sanitize';
import { useIsDarkTheme } from '../theme-provider';
import styles from './index.less';

Expand Down Expand Up @@ -51,7 +52,7 @@ const HighLightMarkdown = ({
} as any
}
>
{children ? preprocessLaTeX(children) : children}
{children ? preprocessLaTeX(sanitizeChatContent(children)) : children}
</Markdown>
);
};
Expand Down
9 changes: 3 additions & 6 deletions web/src/components/markdown-content/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import Image from '@/components/image';
import SvgIcon from '@/components/svg-icon';
import { IReference, IReferenceChunk } from '@/interfaces/database/chat';
import { getExtension } from '@/utils/document-util';
import DOMPurify from 'dompurify';
import { sanitizeChatContent } from '@/utils/sanitize';
import { useCallback, useEffect, useMemo } from 'react';
import Markdown from 'react-markdown';
import SyntaxHighlighter from 'react-syntax-highlighter';
Expand Down Expand Up @@ -52,10 +52,7 @@ const MarkdownContent = ({
const { setDocumentIds, data: fileThumbnails } =
useFetchDocumentThumbnailsByIds();
const contentWithCursor = useMemo(() => {
let text = DOMPurify.sanitize(content, {
ADD_TAGS: ['think', 'section'],
ADD_ATTR: ['class'],
});
let text = sanitizeChatContent(content);

// let text = content;
if (text === '') {
Expand Down Expand Up @@ -166,7 +163,7 @@ const MarkdownContent = ({
<div className={'space-y-2 max-w-[40vw]'}>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(chunkItem?.content ?? ''),
__html: sanitizeChatContent(chunkItem?.content ?? ''),
}}
className={classNames(styles.chunkContentText)}
></div>
Expand Down
9 changes: 3 additions & 6 deletions web/src/components/next-markdown-content/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import Image from '@/components/image';
import SvgIcon from '@/components/svg-icon';
import { IReferenceChunk, IReferenceObject } from '@/interfaces/database/chat';
import { getExtension } from '@/utils/document-util';
import DOMPurify from 'dompurify';
import { sanitizeChatContent } from '@/utils/sanitize';
import { memo, useCallback, useEffect, useMemo } from 'react';
import Markdown from 'react-markdown';
import SyntaxHighlighter from 'react-syntax-highlighter';
Expand Down Expand Up @@ -53,10 +53,7 @@ function MarkdownContent({
const { setDocumentIds, data: fileThumbnails } =
useFetchDocumentThumbnailsByIds();
const contentWithCursor = useMemo(() => {
let text = DOMPurify.sanitize(content, {
ADD_TAGS: ['think', 'section'],
ADD_ATTR: ['class'],
});
let text = sanitizeChatContent(content);
// let text = content;
if (text === '') {
text = t('chat.searching');
Expand Down Expand Up @@ -168,7 +165,7 @@ function MarkdownContent({
<div className={'space-y-2 max-w-[40vw] w-full'}>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(chunkItem?.content ?? ''),
__html: sanitizeChatContent(chunkItem?.content ?? ''),
}}
className={classNames(styles.chunkContentText, 'w-full')}
></div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ import {
} from '@/components/ui/tooltip';
import type { ChunkDocType, IChunk } from '@/interfaces/database/knowledge';
import { cn } from '@/lib/utils';
import { sanitizeChatContent } from '@/utils/sanitize';
import { CheckedState } from '@radix-ui/react-checkbox';
import classNames from 'classnames';
import DOMPurify from 'dompurify';
import { useEffect, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { ChunkTextMode } from '../../constant';
Expand Down Expand Up @@ -124,7 +124,7 @@ const ChunkCard = ({
>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(item.content_with_weight),
__html: sanitizeChatContent(item.content_with_weight),
}}
className={classNames(styles.contentText, {
[styles.contentEllipsis]: textMode === ChunkTextMode.Ellipse,
Expand Down
4 changes: 2 additions & 2 deletions web/src/pages/dataflow-result/components/chunk-card/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import {
} from '@/components/ui/popover';
import { Switch } from '@/components/ui/switch';
import { IChunk } from '@/interfaces/database/knowledge';
import { sanitizeChatContent } from '@/utils/sanitize';
import { CheckedState } from '@radix-ui/react-checkbox';
import classNames from 'classnames';
import DOMPurify from 'dompurify';
import { useEffect, useState } from 'react';
import { ChunkTextMode } from '../../constant';
import styles from './index.less';
Expand Down Expand Up @@ -104,7 +104,7 @@ const ChunkCard = ({
>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(item.content_with_weight),
__html: sanitizeChatContent(item.content_with_weight),
}}
className={classNames(styles.contentText, {
[styles.contentEllipsis]: textMode === ChunkTextMode.Ellipse,
Expand Down
9 changes: 3 additions & 6 deletions web/src/pages/next-search/markdown-content/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import Image from '@/components/image';
import SvgIcon from '@/components/svg-icon';
import { IReference, IReferenceChunk } from '@/interfaces/database/chat';
import { getExtension } from '@/utils/document-util';
import DOMPurify from 'dompurify';
import { sanitizeChatContent } from '@/utils/sanitize';
import { memo, useCallback, useEffect, useMemo } from 'react';
import Markdown from 'react-markdown';
import SyntaxHighlighter from 'react-syntax-highlighter';
Expand Down Expand Up @@ -63,10 +63,7 @@ const MarkdownContent = ({
const { setDocumentIds, data: fileThumbnails } =
useFetchDocumentThumbnailsByIds();
const contentWithCursor = useMemo(() => {
let text = DOMPurify.sanitize(content, {
ADD_TAGS: ['think', 'section'],
ADD_ATTR: ['class'],
});
let text = sanitizeChatContent(content);
// let text = content;
if (text === '') {
text = t('chat.searching');
Expand Down Expand Up @@ -168,7 +165,7 @@ const MarkdownContent = ({
<div className={'space-y-2 max-w-[40vw]'}>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(chunkItem?.content ?? ''),
__html: sanitizeChatContent(chunkItem?.content ?? ''),
}}
className={classNames(styles.chunkContentText)}
></div>
Expand Down
4 changes: 2 additions & 2 deletions web/src/pages/next-search/search-view.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import {
import { RAGFlowPagination } from '@/components/ui/ragflow-pagination';
import { IReference } from '@/interfaces/database/chat';
import { cn } from '@/lib/utils';
import DOMPurify from 'dompurify';
import { sanitizeChatContent } from '@/utils/sanitize';
import { isEmpty } from 'lodash';
import { BrainCircuit, Search, X } from 'lucide-react';
import { Dispatch, SetStateAction, useEffect, useState } from 'react';
Expand Down Expand Up @@ -208,7 +208,7 @@ export default function SearchingView({
<PopoverTrigger asChild>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(
__html: sanitizeChatContent(
`${
chunk.highlight ??
chunk.content_with_weight ??
Expand Down
4 changes: 2 additions & 2 deletions web/src/pages/search/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
} from '@/hooks/knowledge-hooks';
import { useGetPaginationWithRouter } from '@/hooks/logic-hooks';
import { IReference } from '@/interfaces/database/chat';
import { sanitizeChatContent } from '@/utils/sanitize';
import {
Button,
Card,
Expand All @@ -30,7 +31,6 @@ import {
Tooltip,
} from 'antd';
import classNames from 'classnames';
import DOMPurify from 'dompurify';
import { isEmpty } from 'lodash';
import { CircleStop, SendHorizontal } from 'lucide-react';
import { useCallback, useMemo, useState } from 'react';
Expand Down Expand Up @@ -190,7 +190,7 @@ const SearchPage = () => {
>
<div
dangerouslySetInnerHTML={{
__html: DOMPurify.sanitize(
__html: sanitizeChatContent(
`${item.highlight}...`,
),
}}
Expand Down
85 changes: 85 additions & 0 deletions web/src/utils/sanitize.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
* Secure sanitization for chat/AI-generated content.
* Prevents data exfiltration via img src (e.g. prompt injection that embeds
* conversation history in external image URLs).
*/
import DOMPurify, { Config } from 'dompurify';

// Root-relative "/path" (but not "//host" or "/\host"), "./", "../", or "data:image/".
const ALLOWED_IMAGE_URL_PATTERN = /^(?:\/(?![\/\\])|data:image\/|\.\/|\.\.\/)/i;

/**
 * Decide whether an image URL may stay in sanitized content.
 *
 * Allowed: root-relative paths, `./` and `../` relative paths, `data:image/*`
 * URLs, and (only when `window` is available) any URL that resolves to the
 * current origin. Everything else is rejected.
 *
 * @param url - Raw attribute value taken from the sanitized markup.
 * @returns `true` when the URL is considered safe to keep.
 */
function isAllowedImageUrl(url: string): boolean {
  if (!url || typeof url !== 'string') return false;

  // The browser URL parser removes ASCII tab/newline characters anywhere in
  // the input before parsing, so "/\t/evil.com" is fetched as "//evil.com".
  // Strip them first so the checks below see what the browser will see.
  const normalized = url.replace(/[\t\n\r]/g, '').trim();
  if (!normalized) return false;

  // Block protocol-relative URLs. For http(s) documents "\" is treated like
  // "/", so "//host", "\\host", "/\host" and "\/host" all leave the origin.
  if (/^[\/\\]{2}/.test(normalized)) return false;

  // Relative paths and inline data:image/* payloads (matched case-insensitively).
  if (ALLOWED_IMAGE_URL_PATTERN.test(normalized)) return true;

  // Anything else must resolve to the current origin; without a window there
  // is no origin to compare against, so the URL is rejected.
  if (typeof window !== 'undefined') {
    try {
      const parsed = new URL(normalized, window.location.origin);
      return parsed.origin === window.location.origin;
    } catch {
      return false;
    }
  }
  return false;
}

/**
 * Extract the URL part of every comma-separated srcset candidate.
 * Each candidate is trimmed and only its first whitespace-delimited token is
 * kept (dropping descriptors such as `1x` or `100w`); empty candidates are skipped.
 */
function getUrlsFromSrcset(srcset: string): string[] {
  const urls: string[] = [];
  for (const candidate of srcset.split(',')) {
    const [firstToken] = candidate.trim().split(/\s+/);
    if (firstToken) {
      urls.push(firstToken);
    }
  }
  return urls;
}

/**
 * A srcset is acceptable only when it yields at least one URL and none of
 * those URLs is rejected by isAllowedImageUrl.
 */
function isSrcsetAllowed(srcset: string): boolean {
  const candidates = getUrlsFromSrcset(srcset);
  if (candidates.length === 0) {
    return false;
  }
  return !candidates.some((candidate) => !isAllowedImageUrl(candidate));
}

/**
 * Single-URL attributes that make the browser fetch an image, keyed by
 * upper-cased tag name. Besides <img>/<source>, this covers other elements
 * that can trigger an image request and would otherwise bypass the check:
 * <input type="image" src>, <video poster> and SVG <image href|xlink:href>.
 *
 * NOTE(review): <audio>/<video> `src` and CSS `url(...)` inside `style`
 * attributes are not handled here — confirm whether the sanitizer config
 * needs to restrict those separately.
 */
const IMAGE_URL_ATTRIBUTES: ReadonlyMap<string, readonly string[]> = new Map([
  ['IMG', ['src']],
  ['SOURCE', ['src']],
  ['INPUT', ['src']],
  ['VIDEO', ['poster']],
  ['IMAGE', ['href', 'xlink:href']],
]);

/**
 * Remove image-loading URL attributes that point outside the allowed set.
 *
 * For the tags listed in IMAGE_URL_ATTRIBUTES, every listed attribute whose
 * value fails isAllowedImageUrl is removed, and `srcset` is removed when
 * isSrcsetAllowed rejects it. Other elements are left untouched.
 *
 * @param node - Element to inspect; mutated in place.
 */
function sanitizeImageUrlAttributes(node: Element): void {
  // SVG elements keep their authored case (e.g. "image"), so normalize first.
  const tag = node.tagName.toUpperCase();
  const urlAttributes = IMAGE_URL_ATTRIBUTES.get(tag);
  if (!urlAttributes) return;

  for (const attributeName of urlAttributes) {
    const value = node.getAttribute(attributeName);
    if (value && !isAllowedImageUrl(value)) {
      node.removeAttribute(attributeName);
    }
  }

  const srcset = node.getAttribute('srcset');
  if (srcset && !isSrcsetAllowed(srcset)) {
    node.removeAttribute('srcset');
  }
}

// Module-level guard so the hook below is registered at most once.
let secureImageHookAdded = false;

/**
 * Register an `afterSanitizeAttributes` hook on DOMPurify that passes every
 * element node to sanitizeImageUrlAttributes. Calls after the first are no-ops.
 */
function ensureSecureImageHook(): void {
  if (!secureImageHookAdded) {
    DOMPurify.addHook('afterSanitizeAttributes', (currentNode) => {
      // nodeType 1 identifies element nodes; other node types are ignored.
      const isElement = currentNode.nodeType === 1;
      if (isElement) {
        sanitizeImageUrlAttributes(currentNode as Element);
      }
    });
    secureImageHookAdded = true;
  }
}

/**
 * Sanitize markup for rendering via DOMPurify, with the image-URL hook from
 * ensureSecureImageHook installed first.
 *
 * By default the custom `think` and `section` tags and the `class` attribute
 * are additionally allowed; entries in `config` are spread last and therefore
 * override these defaults.
 *
 * @param dirty - Untrusted content to sanitize.
 * @param config - Optional DOMPurify configuration merged over the defaults.
 * @returns The sanitized string, or `''` if DOMPurify returns a non-string.
 */
export function sanitizeChatContent(dirty: string, config?: Config): string {
  ensureSecureImageHook();

  const options: Config = {
    ADD_TAGS: ['think', 'section'],
    ADD_ATTR: ['class'],
    ...config,
  };
  const sanitized = DOMPurify.sanitize(dirty, options);

  if (typeof sanitized === 'string') {
    return sanitized;
  }
  return '';
}