From b22e8863d1ae2769a2ad6ffd0408d685d18cac06 Mon Sep 17 00:00:00 2001
From: archer <545436317@qq.com>
Date: Mon, 17 Nov 2025 11:58:58 +0800
Subject: [PATCH 1/6] fix: text split

---
 test/cases/global/common/string/textSplitter.test.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/cases/global/common/string/textSplitter.test.ts b/test/cases/global/common/string/textSplitter.test.ts
index 3d92243c94c7..b7fcd8b82a1c 100644
--- a/test/cases/global/common/string/textSplitter.test.ts
+++ b/test/cases/global/common/string/textSplitter.test.ts
@@ -583,7 +583,10 @@ FastGPT AI 相关参数配置说明
 
     const normalizedChunks = simpleChunks(chunks);
     const normalizedExpected = simpleChunks(mock.result);
-
+    fs.writeFileSync(
+      '/Volumes/code/fastgpt-pro/FastGPT/test/cases/global/common/string/test.md',
+      JSON.stringify(normalizedChunks, null, 2)
+    );
     expect(normalizedChunks).toEqual(normalizedExpected);
   });

From 8cae0ae87b906ad437dc552af2dc9b379e2e3895 Mon Sep 17 00:00:00 2001
From: archer <545436317@qq.com>
Date: Mon, 17 Nov 2025 12:12:18 +0800
Subject: [PATCH 2/6] remove test

---
 test/cases/global/common/string/textSplitter.test.ts | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/test/cases/global/common/string/textSplitter.test.ts b/test/cases/global/common/string/textSplitter.test.ts
index b7fcd8b82a1c..3d92243c94c7 100644
--- a/test/cases/global/common/string/textSplitter.test.ts
+++ b/test/cases/global/common/string/textSplitter.test.ts
@@ -583,10 +583,7 @@ FastGPT AI 相关参数配置说明
 
     const normalizedChunks = simpleChunks(chunks);
     const normalizedExpected = simpleChunks(mock.result);
-    fs.writeFileSync(
-      '/Volumes/code/fastgpt-pro/FastGPT/test/cases/global/common/string/test.md',
-      JSON.stringify(normalizedChunks, null, 2)
-    );
+
    expect(normalizedChunks).toEqual(normalizedExpected);
  });

From a858c1530e611941b68dfc0546447b3872bce867 Mon Sep 17 00:00:00 2001
From: xqvvu
Date: Tue, 4 Nov 2025 15:26:29 +0800
Subject: [PATCH 3/6] feat: integrate S3 for dataset with compatibility

---
 .../global/common/file/image/constants.ts     |   1 +
 packages/global/core/dataset/api.d.ts         |   1 +
 .../global/core/dataset/apiDataset/type.d.ts  |   1 +
 packages/global/core/dataset/controller.d.ts  |   1 +
 .../global/core/dataset/training/type.d.ts    |   1 -
 packages/global/core/dataset/type.d.ts        |   8 +-
 packages/global/core/dataset/v2/api.ts        |  34 +++
 .../common/buffer/rawText/controller.ts       |  10 +-
 .../service/common/buffer/rawText/schema.ts   |   4 +-
 .../service/common/file/gridfs/controller.ts  |  93 +-----
 .../service/common/file/image/controller.ts   |  21 +-
 packages/service/common/file/read/utils.ts    |  97 +++++--
 packages/service/common/s3/buckets/base.ts    |  77 +++--
 packages/service/common/s3/mq.ts              |  13 +-
 packages/service/common/s3/sources/avatar.ts  |  33 ++-
 .../service/common/s3/sources/chat/index.ts   |  40 ++-
 .../common/s3/sources/dataset/index.ts        | 269 ++++++++++++++++++
 .../service/common/s3/sources/dataset/type.ts |  61 ++++
 packages/service/common/s3/type.ts            |   8 +-
 packages/service/common/s3/utils.ts           |  56 ++++
 packages/service/core/ai/llm/request.ts       |   3 +-
 packages/service/core/ai/llm/utils.ts         |  53 +++-
 packages/service/core/app/controller.ts       |   2 +
 packages/service/core/chat/saveChat.ts        |  49 +++-
 .../core/dataset/apiDataset/custom/api.ts     |  19 +-
 .../core/dataset/collection/controller.ts     |  75 ++++-
 .../service/core/dataset/collection/schema.ts |   7 +-
 .../service/core/dataset/collection/utils.ts  |   6 +-
 packages/service/core/dataset/controller.ts   |  10 +
 .../service/core/dataset/data/controller.ts   |  21 +-
 packages/service/core/dataset/data/schema.ts  |   7 +
 .../service/core/dataset/image/controller.ts  |   8 +-
 packages/service/core/dataset/read.ts         | 161 ++++++++---
 .../core/dataset/training/controller.ts       |   1 +
 .../service/core/dataset/training/schema.ts   |   4 +
 .../service/core/workflow/dispatch/ai/chat.ts |  27 +-
 .../core/workflow/dispatch/ai/tool/index.ts   |  18 +-
 .../core/workflow/dispatch/tools/readFiles.ts |  35 ++-
 .../service/support/permission/auth/file.ts   |  35 ++-
 .../support/permission/dataset/auth.ts        |  21 +-
 packages/web/i18n/en/app.json                 |   2 +-
 packages/web/i18n/en/chat.json                |   1 +
 packages/web/i18n/zh-CN/app.json              |   2 +-
 packages/web/i18n/zh-CN/chat.json             |   1 +
 packages/web/i18n/zh-Hant/app.json            |   2 +-
 packages/web/i18n/zh-Hant/chat.json           |   1 +
 .../app/src/components/Markdown/img/Image.tsx |  51 +++-
 .../app/src/components/Markdown/index.tsx     |   6 +-
 .../detail/Import/diffSource/FileLocal.tsx    |  57 ++--
 .../dataset/detail/MetaDataCard.tsx           |   6 +-
 projects/app/src/pages/api/core/app/copy.ts   |   2 +-
 projects/app/src/pages/api/core/app/create.ts |  31 +-
 .../core/dataset/collection/create/backup.ts  |  12 +-
 .../core/dataset/collection/create/fileId.ts  |  35 +--
 .../core/dataset/collection/create/images.ts  |  22 +-
 .../dataset/collection/create/template.ts     |  12 +-
 .../core/dataset/collection/create/text.ts    |  19 +-
 .../api/core/dataset/collection/detail.ts     |  15 +-
 .../src/pages/api/core/dataset/data/delete.ts |   2 +
 .../pages/api/core/dataset/data/insertData.ts |   4 +-
 .../api/core/dataset/data/insertImages.ts     |  31 +-
 .../pages/api/core/dataset/data/v2/list.ts    |  48 ++--
 .../api/core/dataset/file/getPreviewChunks.ts |   3 +-
 .../core/dataset/presignDatasetFileGetUrl.ts  |  90 ++++++
 .../core/dataset/presignDatasetFilePostUrl.ts |  41 +++
 projects/app/src/pages/api/file/temp.ts       |  43 +++
 .../app/src/pages/api/system/file/[jwt].ts    |  70 +++++
 .../service/core/dataset/data/controller.ts   |  54 ++++
 .../core/dataset/queues/datasetParse.ts       | 136 +++++++--
 .../core/dataset/queues/generateVector.ts     |  11 +
 projects/app/src/web/common/file/api.ts       |   7 +
 projects/app/src/web/core/dataset/api.ts      |   4 +
 .../dataset/hooks/readCollectionSource.ts     |   4 +-
 73 files changed, 1795 insertions(+), 421 deletions(-)
 create mode 100644 packages/global/core/dataset/v2/api.ts
 create mode 100644 packages/service/common/s3/sources/dataset/index.ts
 create mode 100644 packages/service/common/s3/sources/dataset/type.ts
 create mode 100644 packages/service/common/s3/utils.ts
 create mode 100644 projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts
 create mode 100644 projects/app/src/pages/api/core/dataset/presignDatasetFilePostUrl.ts
 create mode 100644 projects/app/src/pages/api/file/temp.ts
 create mode 100644 projects/app/src/pages/api/system/file/[jwt].ts

diff --git a/packages/global/common/file/image/constants.ts b/packages/global/common/file/image/constants.ts
index 5e511e4b26f5..3aa07836cef7 100644
--- a/packages/global/common/file/image/constants.ts
+++ b/packages/global/common/file/image/constants.ts
@@ -4,3 +4,4 @@ export const FolderIcon = 'file/fill/folder';
 export const FolderImgUrl = '/imgs/files/folder.svg';
 export const HttpPluginImgUrl = '/imgs/app/httpPluginFill.svg';
 export const HttpImgUrl = '/imgs/workflow/http.png';
+export const TempFileURL = '/api/file/temp';
diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts
index 1a3935127018..6c308e511a86 100644
--- a/packages/global/core/dataset/api.d.ts
+++ b/packages/global/core/dataset/api.d.ts
@@ -139,6 +139,7 @@ export type PushDatasetDataChunkProps = {
   q?: string;
   a?: string;
   imageId?: string;
+  imageKeys?: string[];
   chunkIndex?: number;
   indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
 };
diff --git a/packages/global/core/dataset/apiDataset/type.d.ts b/packages/global/core/dataset/apiDataset/type.d.ts
index 1e8758411515..98546bce867a 100644
--- a/packages/global/core/dataset/apiDataset/type.d.ts
+++ b/packages/global/core/dataset/apiDataset/type.d.ts
@@ -40,6 +40,7 @@ export type ApiDatasetServerType = {
 export type ApiFileReadContentResponse = {
   title?: string;
   rawText: string;
+  imageKeys?: string[];
 };
 
 export type APIFileReadResponse = {
diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts
index ec724e8c63d2..5729ffa8f42b 100644
--- a/packages/global/core/dataset/controller.d.ts
+++ b/packages/global/core/dataset/controller.d.ts
@@ -9,6 +9,7 @@ export type CreateDatasetDataProps = {
   q: string;
   a?: string;
   imageId?: string;
+  imageKeys?: string[];
   indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
   indexPrefix?: string;
 };
diff --git a/packages/global/core/dataset/training/type.d.ts b/packages/global/core/dataset/training/type.d.ts
index 183bd0c1beb3..89071fc8ab34 100644
--- a/packages/global/core/dataset/training/type.d.ts
+++ b/packages/global/core/dataset/training/type.d.ts
@@ -9,7 +9,6 @@ export type PushDataToTrainingQueueProps = {
   data: PushDatasetDataChunkProps[];
 
   mode?: TrainingModeEnum;
-  data: PushDatasetDataChunkProps[];
 
   agentModel: string;
   vectorModel: string;
diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts
index d854bd274229..f0d08347ee1e 100644
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -118,6 +118,7 @@ export type DatasetCollectionSchemaType = ChunkSettingsType & {
 
   rawTextLength?: number;
   hashRawText?: string;
+
   metadata?: {
     webPageSelector?: string;
     relatedImgId?: string; // The id of the associated image collections
@@ -147,6 +148,7 @@ export type DatasetDataFieldType = {
   q: string; // large chunks or question
   a?: string; // answer or custom content
   imageId?: string;
+  imageKeys?: string[];
 };
 export type DatasetDataSchemaType = DatasetDataFieldType & {
   _id: string;
@@ -191,6 +193,7 @@ export type DatasetTrainingSchemaType = {
   q: string;
   a: string;
   imageId?: string;
+  imageKeys?: string[];
   imageDescMap?: Record<string, string>;
   chunkIndex: number;
   indexSize?: number;
@@ -250,7 +253,10 @@ export type TagUsageType = {
 export type DatasetCollectionItemType = CollectionWithDatasetType & {
   sourceName: string;
   sourceId?: string;
-  file?: DatasetFileSchema;
+  file?: {
+    filename?: string;
+    contentLength?: number;
+  };
   permission: DatasetPermission;
   indexAmount: number;
   errorCount?: number;
diff --git a/packages/global/core/dataset/v2/api.ts b/packages/global/core/dataset/v2/api.ts
new file mode 100644
index 000000000000..fdcd3540528c
--- /dev/null
+++ b/packages/global/core/dataset/v2/api.ts
@@ -0,0 +1,34 @@
+import { ObjectIdSchema } from '../../../common/type/mongo';
+import z from 'zod';
+
+export const PresignDatasetFileGetUrlSchema = z.union([
+  z.object({
+    key: z
+      .string()
+      .nonempty()
+      .refine((key) => key.startsWith('dataset/'), {
+        message: 'Invalid key format: must start with "dataset/"'
+      })
+      .transform((k) => decodeURIComponent(k)),
+    preview: z.boolean().optional()
+  }),
+  z.object({
+    collectionId: ObjectIdSchema
+    // datasetId: ObjectIdSchema
+  })
+]);
+export type PresignDatasetFileGetUrlParams = z.infer<typeof PresignDatasetFileGetUrlSchema>;
+
+export const PresignDatasetFilePostUrlSchema = z.object({
+  filename: z.string().min(1),
+  datasetId: ObjectIdSchema
+});
+export type PresignDatasetFilePostUrlParams = z.infer<typeof PresignDatasetFilePostUrlSchema>;
+
+export const ShortPreviewLinkSchema = z.object({
+  k: z
+    .string()
+    .nonempty()
+    .transform((k) => `chat:temp_file:${decodeURIComponent(k)}`)
+});
+export type ShortPreviewLinkParams = z.infer<typeof ShortPreviewLinkSchema>;
diff --git a/packages/service/common/buffer/rawText/controller.ts b/packages/service/common/buffer/rawText/controller.ts
index d16c9c59e185..25200d4ab01d 100644
--- a/packages/service/common/buffer/rawText/controller.ts
+++ b/packages/service/common/buffer/rawText/controller.ts
@@ -18,18 +18,21 @@ export const addRawTextBuffer = async ({
   sourceId,
   sourceName,
   text,
-  expiredTime
+  expiredTime,
+  imageKeys = []
 }: {
   sourceId: string;
   sourceName: string;
   text: string;
   expiredTime: Date;
+  imageKeys?: string[];
 }) => {
   const gridBucket = getGridBucket();
   const metadata = {
     sourceId,
     sourceName,
-    expiredTime
+    expiredTime,
+    imageKeys
   };
 
   const buffer = Buffer.from(text);
@@ -106,7 +109,8 @@ export const getRawTextBuffer = async (sourceId: string) => {
 
     return {
       text: rawText,
-      sourceName: bufferData.metadata?.sourceName || ''
+      sourceName: bufferData.metadata?.sourceName || '',
+      imageKeys: bufferData.metadata?.imageKeys || []
     };
   });
 };
diff --git a/packages/service/common/buffer/rawText/schema.ts b/packages/service/common/buffer/rawText/schema.ts
index f6e9ea580dbb..fe485da0e16f 100644
--- a/packages/service/common/buffer/rawText/schema.ts
+++ b/packages/service/common/buffer/rawText/schema.ts
@@ -6,7 +6,8 @@ const RawTextBufferSchema = new Schema({
   metadata: {
     sourceId: { type: String, required: true },
     sourceName: { type: String, required: true },
-    expiredTime: { type: Date, required: true }
+    expiredTime: { type: Date, required: true },
+    imageKeys: { type: [String], required: true }
   }
 });
 RawTextBufferSchema.index({ 'metadata.sourceId': 'hashed' });
@@ -18,5 +19,6 @@ export const MongoRawTextBufferSchema = getMongoModel<{
     sourceId: string;
     sourceName: string;
     expiredTime: Date;
+    imageKeys: string[];
   };
 }>(`${bucketName}.files`, RawTextBufferSchema);
diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts
index d1e707816c6c..734074d4dcaf 100644
--- a/packages/service/common/file/gridfs/controller.ts
+++ b/packages/service/common/file/gridfs/controller.ts
@@ -4,16 +4,12 @@ import fsp from 'fs/promises';
 import fs from 'fs';
 import { type DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
 import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
-import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
-import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
-import { readRawContentByFileBuffer } from '../read/utils';
-import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils';
+import { detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
+import { computeGridFsChunSize, stream2Encoding } from './utils';
 import { addLog } from '../../system/log';
-import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
 import { Readable } from 'stream';
-import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller';
-import { addMinutes } from 'date-fns';
 import { retryFn } from '@fastgpt/global/common/system/utils';
+import { getS3DatasetSource } from '../../s3/sources/dataset';
 
 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
   MongoDatasetFileSchema;
@@ -162,11 +158,17 @@ export async function delFileByFileIdList({
   fileIdList: string[];
 }): Promise<void> {
   return retryFn(async () => {
+    const s3DatasetSource = getS3DatasetSource();
+
     const bucket = getGridBucket(bucketName);
 
     for await (const fileId of fileIdList) {
       try {
-        await bucket.delete(new Types.ObjectId(String(fileId)));
+        if (s3DatasetSource.isDatasetObjectKey(fileId)) {
+          await s3DatasetSource.deleteDatasetFileByKey(fileId);
+        } else {
+          await bucket.delete(new Types.ObjectId(String(fileId)));
+        }
       } catch (error: any) {
         if (typeof error?.message === 'string' && error.message.includes('File not found')) {
           addLog.warn('File not found', { fileId });
@@ -189,78 +191,3 @@ export async function getDownloadStream({
 
   return bucket.openDownloadStream(new Types.ObjectId(fileId));
 }
-
-export const readFileContentFromMongo = async ({
-  teamId,
-  tmbId,
-  bucketName,
-  fileId,
-  customPdfParse = false,
-  getFormatText,
-  usageId
-}: {
-  teamId: string;
-  tmbId: string;
-  bucketName: `${BucketNameEnum}`;
-  fileId: string;
-  customPdfParse?: boolean;
-  getFormatText?: boolean; // 数据类型都尽可能转化成 markdown 格式
-  usageId?: string;
-}): Promise<{
-  rawText: string;
-  filename: string;
-}> => {
-  const bufferId = `${String(fileId)}-${customPdfParse}`;
-  // read buffer
-  const fileBuffer = await getRawTextBuffer(bufferId);
-  if (fileBuffer) {
-    return {
-      rawText: fileBuffer.text,
-      filename: fileBuffer?.sourceName
-    };
-  }
-
-  const [file, fileStream] = await Promise.all([
-    getFileById({ bucketName, fileId }),
-    getDownloadStream({ bucketName, fileId })
-  ]);
-  if (!file) {
-    return Promise.reject(CommonErrEnum.fileNotFound);
-  }
-
-  const extension = parseFileExtensionFromUrl(file?.filename);
-
-  const start = Date.now();
-  const fileBuffers = await gridFsStream2Buffer(fileStream);
-  addLog.debug('get file buffer', { time: Date.now() - start });
-
-  const encoding = file?.metadata?.encoding || detectFileEncoding(fileBuffers);
-
-  // Get raw text
-  const { rawText } = await readRawContentByFileBuffer({
-    customPdfParse,
-    usageId,
-    getFormatText,
-    extension,
-    teamId,
-    tmbId,
-    buffer: fileBuffers,
-    encoding,
-    metadata: {
-      relatedId: fileId
-    }
-  });
-
-  // Add buffer
-  addRawTextBuffer({
-    sourceId: bufferId,
-    sourceName: file.filename,
-    text: rawText,
-    expiredTime: addMinutes(new Date(), 20)
-  });
-
-  return {
-    rawText,
-    filename: file.filename
-  };
-};
diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts
index 700d41631f89..64489615b6c1 100644
--- a/packages/service/common/file/image/controller.ts
+++ b/packages/service/common/file/image/controller.ts
@@ -64,23 +64,28 @@ export async function uploadMongoImg({
 export const copyAvatarImage = async ({
   teamId,
   imageUrl,
-  ttl,
+  temporary,
   session
 }: {
   teamId: string;
   imageUrl: string;
-  ttl: boolean;
+  temporary: boolean;
   session?: ClientSession;
 }) => {
   if (!imageUrl) return;
 
-  // S3
-  if (imageUrl.startsWith(`${imageBaseUrl}/${S3Sources.avatar}`)) {
-    const extendName = path.extname(imageUrl);
+  const avatarSource = getS3AvatarSource();
+  if (avatarSource.isAvatarKey(imageUrl)) {
+    const filename = (() => {
+      const last = imageUrl.split('/').pop()?.split('-')[1];
+      if (!last) return getNanoid(6).concat(path.extname(imageUrl));
+      return `${getNanoid(6)}-${last}`;
+    })();
     const key = await getS3AvatarSource().copyAvatar({
-      sourceKey: imageUrl.slice(imageBaseUrl.length),
-      targetKey: `${S3Sources.avatar}/${teamId}/${getNanoid(6)}${extendName}`,
-      ttl
+      key: imageUrl,
+      teamId,
+      filename,
+      temporary
     });
     return key;
   }
diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts
index 3cc8e26e8cab..a07b64bc1a9c 100644
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -9,6 +9,12 @@ import { matchMdImg } from '@fastgpt/global/common/string/markdown';
 import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
 import { useDoc2xServer } from '../../../thirdProvider/doc2x';
 import { readRawContentFromBuffer } from '../../../worker/function';
+import { getS3DatasetSource } from '../../s3/sources/dataset';
+import type { ParsedFileContentS3KeyParams } from '../../s3/sources/dataset/type';
+import { getNanoid } from '@fastgpt/global/common/string/tools';
+import path from 'path';
+import { S3Sources } from '../../s3/type';
+import { randomUUID } from 'crypto';
 
 export type readRawTextByLocalFileParams = {
   teamId: string;
@@ -17,6 +23,7 @@ export type readRawTextByLocalFileParams = {
   encoding: string;
   customPdfParse?: boolean;
   getFormatText?: boolean;
+  uploadKey: string;
   metadata?: Record<string, any>;
 };
 export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
@@ -26,7 +33,7 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
 
   const buffer = await fs.promises.readFile(path);
 
-  return readRawContentByFileBuffer({
+  return readS3FileContentByBuffer({
     extension,
     customPdfParse: params.customPdfParse,
     getFormatText: params.getFormatText,
@@ -34,21 +41,21 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
     tmbId: params.tmbId,
     encoding: params.encoding,
     buffer,
-    metadata: params.metadata
+    uploadKeyPrefix: params.uploadKey
   });
 };
 
-export const readRawContentByFileBuffer = async ({
+export const readS3FileContentByBuffer = async ({
   teamId,
   tmbId,
   extension,
   buffer,
   encoding,
-  metadata,
   customPdfParse = false,
   usageId,
-  getFormatText = true
+  getFormatText = true,
+  uploadKeyPrefix
 }: {
   teamId: string;
   tmbId: string;
@@ -56,13 +63,14 @@ export const readRawContentByFileBuffer = async ({
   extension: string;
   buffer: Buffer;
   encoding: string;
-  metadata?: Record<string, any>;
   customPdfParse?: boolean;
   usageId?: string;
   getFormatText?: boolean;
+  uploadKeyPrefix: string;
 }): Promise<{
   rawText: string;
+  imageKeys?: string[];
 }> => {
   const systemParse = () =>
     readRawContentFromBuffer({
@@ -158,21 +166,25 @@ export const readRawContentByFileBuffer = async ({
   addLog.debug(`Parse file success, time: ${Date.now() - start}ms.`);
 
   // markdown data format
-  if (imageList) {
+  const uploadedImageKeys: string[] = [];
+  if (imageList && imageList.length > 0) {
+    addLog.debug(`Processing ${imageList.length} images from parsed document`);
+
     await batchRun(imageList, async (item) => {
       const src = await (async () => {
         try {
-          return await uploadMongoImg({
+          const ext = item.mime.split('/')[1].replace('x-', '');
+          const imageKey = await getS3DatasetSource().uploadDatasetImage({
             base64Img: `data:${item.mime};base64,${item.base64}`,
-            teamId,
-            metadata: {
-              ...metadata,
-              mime: item.mime
-            }
+            mimetype: item.mime,
+            filename: `${item.uuid}.${ext}`,
+            uploadKey: `${uploadKeyPrefix}/${item.uuid}.${ext}`
           });
+          uploadedImageKeys.push(imageKey);
+          return imageKey;
         } catch (error) {
-          addLog.warn('Upload file image error', { error });
-          return 'Upload load image error';
+          // Don't add to uploadedImageKeys if the upload failed, but keep processing
+          addLog.warn('Upload dataset image error', { error });
+          return `[Image Upload Failed: ${item.uuid}]`;
         }
       })();
       rawText = rawText.replace(item.uuid, src);
       if (formatText) {
         formatText = formatText.replace(item.uuid, src);
       }
     });
+
+    // Log summary of image processing
+    addLog.info(`Image processing completed`, {
+      total: imageList.length,
+      successful: uploadedImageKeys.length,
+      failed: imageList.length - uploadedImageKeys.length
+    });
   }
 
-  addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);
+  addLog.debug(`Upload file to S3 success, time: ${Date.now() - start}ms`, {
+    uploadedImageKeysCount: uploadedImageKeys.length,
+    uploadedImageKeys
+  });
+
+  return {
+    rawText: getFormatText ? formatText || rawText : rawText,
+    imageKeys: uploadedImageKeys
+  };
+};
+
+export const parsedFileContentS3Key = {
+  temp: (appId: string) => `chat/${appId}/temp/parsed/${randomUUID()}`,
+
+  chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) =>
+    `chat/${appId}/${uId}/${chatId}/parsed`,
+
+  dataset: (params: ParsedFileContentS3KeyParams) => {
+    const { datasetId, mimetype, filename, parentFileKey } = params;
+
+    const extension = mimetype;
+    const image = (() => {
+      if (filename) {
+        return Boolean(path.extname(filename))
+          ? `${getNanoid(6)}-${filename}`
+          : `${getNanoid(6)}-${filename}.${extension}`;
+      }
+      return `${getNanoid(6)}.${extension}`;
+    })();
+
+    const parentFilename = parentFileKey?.slice().split('/').at(-1);
+    const parsedParentFilename = parentFilename
+      ? `parsed-${path.basename(parentFilename, path.extname(parentFilename))}`
+      : '';
+    const parsedParentFileKey = parentFileKey
+      ?.split('/')
+      .slice(0, -1)
+      .concat(parsedParentFilename)
+      .join('/');
 
-  return { rawText: getFormatText ? formatText || rawText : rawText };
+    return {
+      key: parsedParentFileKey
+        ? `${parsedParentFileKey}/${image}`
+        : [S3Sources.dataset, datasetId, image].join('/'),
+      filename: image
+    };
+  }
+};
diff --git a/packages/service/common/s3/buckets/base.ts b/packages/service/common/s3/buckets/base.ts
index a17c3b3b8dd9..9264cf59448f 100644
--- a/packages/service/common/s3/buckets/base.ts
+++ b/packages/service/common/s3/buckets/base.ts
@@ -1,4 +1,10 @@
-import { Client, type RemoveOptions, type CopyConditions, InvalidObjectNameError } from 'minio';
+import {
+  Client,
+  type RemoveOptions,
+  type CopyConditions,
+  InvalidObjectNameError,
+  S3Error
+} from 'minio';
 import {
   type CreatePostPresignedUrlOptions,
   type CreatePostPresignedUrlParams,
@@ -11,9 +17,10 @@ import { defaultS3Options, getSystemMaxFileSize, Mimes } from '../constants';
 import path from 'node:path';
 import { MongoS3TTL } from '../schema';
 import { getNanoid } from '@fastgpt/global/common/string/tools';
-import { addHours } from 'date-fns';
+import { addHours, addMinutes } from 'date-fns';
 import { addLog } from '../../system/log';
 import { addS3DelJob } from '../mq';
+import { type Readable } from 'node:stream';
 
 export class S3BaseBucket {
   private _client: Client;
@@ -80,24 +87,26 @@ export class S3BaseBucket {
   }
 
   async copy({
-    src,
-    dst,
-    ttl,
+    from,
+    to,
     options
   }: {
-    src: string;
-    dst: string;
-    ttl: boolean;
-    options?: CopyConditions;
+    from: string;
+    to: string;
+    options?: {
+      temporary?: boolean;
+      copyConditions?: CopyConditions;
+    };
   }): ReturnType<Client['copyObject']> {
+    const bucket = this.name;
-    if (ttl) {
+    if (options?.temporary) {
       await MongoS3TTL.create({
-        minioKey: dst,
+        minioKey: to,
         bucketName: this.name,
         expiredTime: addHours(new Date(), 24)
       });
     }
-    return this.client.copyObject(this.name, src, dst, options);
+    return this.client.copyObject(bucket, to, `${bucket}/${from}`, options?.copyConditions);
   }
 
   exist(): Promise<boolean> {
     return this.client.bucketExists(this.name);
   }
 
   async delete(objectKey: string, options?: RemoveOptions): Promise<void> {
     try {
       if (!objectKey) return Promise.resolve();
       return await this.client.removeObject(this.name, objectKey, options);
     } catch (error) {
-      if (error instanceof InvalidObjectNameError) {
-        addLog.warn(`${this.name} delete object not found: ${objectKey}`, error);
-        return Promise.resolve();
+      if (error instanceof S3Error) {
+        if (error.code === 'InvalidObjectName') {
+          addLog.warn(`${this.name} delete object not found: ${objectKey}`, error);
+          return Promise.resolve();
+        }
       }
       return Promise.reject(error);
     }
   }
 
-  addDeleteJob({ prefix, key }: { prefix?: string; key?: string }): Promise<void> {
-    return addS3DelJob({ prefix, key, bucketName: this.name });
-  }
-
   listObjectsV2(
     ...params: Parameters<Client['listObjectsV2']> extends [string, ...infer R] ? R : never
   ) {
     return this.client.listObjectsV2(this.name, ...params);
   }
 
+  putObject(...params: Parameters<Client['putObject']> extends [string, ...infer R] ? R : never) {
+    return this.client.putObject(this.name, ...params);
+  }
+
+  getObject(...params: Parameters<Client['getObject']> extends [string, ...infer R] ? R : never) {
+    return this.client.getObject(this.name, ...params);
+  }
+
+  statObject(...params: Parameters<Client['statObject']> extends [string, ...infer R] ? R : never) {
+    return this.client.statObject(this.name, ...params);
+  }
+
+  async fileStreamToBuffer(stream: Readable): Promise<Buffer> {
+    const chunks: Buffer[] = [];
+    for await (const chunk of stream) {
+      chunks.push(chunk);
+    }
+    return Buffer.concat(chunks);
+  }
+
+  addDeleteJob(params: Omit<Parameters<typeof addS3DelJob>[0], 'bucketName'>) {
+    return addS3DelJob({ ...params, bucketName: this.name });
+  }
+
   async createPostPresignedUrl(
     params: CreatePostPresignedUrlParams,
     options: CreatePostPresignedUrlOptions = {}
@@ -140,8 +171,7 @@ export class S3BaseBucket {
 
     const key = (() => {
       if ('rawKey' in params) return params.rawKey;
-
-      return `${params.source}/${params.teamId}/${getNanoid(6)}-${filename}`;
+      return [params.source, params.teamId, `${getNanoid(6)}-${filename}`].join('/');
     })();
 
     const policy = this.externalClient.newPostPolicy();
@@ -151,11 +181,12 @@ export class S3BaseBucket {
     if (formatMaxFileSize) {
       policy.setContentLengthRange(1, formatMaxFileSize);
     }
-    policy.setExpires(new Date(Date.now() + 10 * 60 * 1000));
+    policy.setExpires(addMinutes(new Date(), 10));
     policy.setUserMetaData({
       'content-disposition': `attachment; filename="${encodeURIComponent(filename)}"`,
       'origin-filename': encodeURIComponent(filename),
-      'upload-time': new Date().toISOString()
+      'upload-time': new Date().toISOString(),
+      ...params.metadata
     });
 
     const { formData, postURL } = await this.externalClient.presignedPostPolicy(policy);
diff --git a/packages/service/common/s3/mq.ts b/packages/service/common/s3/mq.ts
index 9c6ac5a5ab84..1deaea7b657e 100644
--- a/packages/service/common/s3/mq.ts
+++ b/packages/service/common/s3/mq.ts
@@ -4,6 +4,7 @@ import { retryFn } from '@fastgpt/global/common/system/utils';
 
 export type S3MQJobData = {
   key?: string;
+  keys?: string[];
   prefix?: string;
   bucketName: string;
 };
@@ -29,9 +30,8 @@ export const startS3DelWorker = async () => {
   return getWorker(
     QueueNames.s3FileDelete,
     async (job) => {
-      const { prefix, bucketName, key } = job.data;
+      const { prefix, bucketName, key, keys } = job.data;
       const limit = pLimit(10);
-      const tasks: Promise<void>[] = [];
       const bucket = s3BucketMap[bucketName];
       if (!bucket) {
         return Promise.reject(`Bucket not found: ${bucketName}`);
@@ -40,7 +40,16 @@ export const startS3DelWorker = async () => {
       if (key) {
         await bucket.delete(key);
       }
+      if (keys) {
+        const tasks: Promise<void>[] = [];
+        for (const key of keys) {
+          const p = limit(() => retryFn(() => bucket.delete(key)));
+          tasks.push(p);
+        }
+        await Promise.all(tasks);
+      }
       if (prefix) {
+        const tasks: Promise<void>[] = [];
         return new Promise(async (resolve, reject) => {
           const stream = bucket.listObjectsV2(prefix, true);
           stream.on('data', async (file) => {
diff --git a/packages/service/common/s3/sources/avatar.ts b/packages/service/common/s3/sources/avatar.ts
index 0fbcd738dcce..8f3bbe7a8454 100644
--- a/packages/service/common/s3/sources/avatar.ts
+++ b/packages/service/common/s3/sources/avatar.ts
@@ -68,20 +68,29 @@ class S3AvatarSource {
   }
 
   async copyAvatar({
-    sourceKey,
-    targetKey,
-    ttl
+    key,
+    teamId,
+    filename,
+    temporary = false
   }: {
-    sourceKey: string;
-    targetKey: string;
-    ttl: boolean;
+    key: string;
+    teamId: string;
+    filename: string;
+    temporary: boolean;
   }) {
-    await this.bucket.copy({
-      src: sourceKey,
-      dst: targetKey,
-      ttl
-    });
-    return targetKey;
+    const from = key.slice(this.prefix.length);
+    const to = `${S3Sources.avatar}/${teamId}/${filename}`;
+    await this.bucket.copy({ from, to, options: { temporary } });
+    return this.prefix.concat(to);
+  }
+
+  isAvatarKey(
+    key?: string,
+    options?: { prefix?: string }
+  ): key is `${typeof S3Sources.avatar}/${string}` {
+    const { prefix = this.prefix } = options ?? {};
+    const objectKey = prefix ? key?.slice(prefix.length) : key;
+    return objectKey?.startsWith(`${S3Sources.avatar}/`) ?? false;
   }
 }
diff --git a/packages/service/common/s3/sources/chat/index.ts b/packages/service/common/s3/sources/chat/index.ts
index 0aaad652609c..a98ca1a8e95f 100644
--- a/packages/service/common/s3/sources/chat/index.ts
+++ b/packages/service/common/s3/sources/chat/index.ts
@@ -1,4 +1,4 @@
-import { getNanoid } from '@fastgpt/global/common/string/tools';
+import { getNanoid, parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
 import { S3PrivateBucket } from '../../buckets/private';
 import { S3Sources } from '../../type';
 import {
@@ -7,8 +7,6 @@ import {
   ChatFileUploadSchema,
   DelChatFileByPrefixSchema
 } from './type';
-import { MongoS3TTL } from '../../schema';
-import { addHours } from 'date-fns';
 
 class S3ChatSource {
   private bucket: S3PrivateBucket;
@@ -22,10 +20,37 @@ class S3ChatSource {
     return (this.instance ??= new S3ChatSource());
   }
 
-  static isChatFileKey(key?: string): key is `${typeof S3Sources.chat}/${string}` {
+  isChatFileKey(key?: string): key is `${typeof S3Sources.chat}/${string}` {
     return key?.startsWith(`${S3Sources.chat}/`) ?? false;
   }
 
+  // Get file stream
+  getChatFileStream(key: string) {
+    return this.bucket.getObject(key);
+  }
+
+  // Get file stat
+  getChatFileStat(key: string) {
+    return this.bucket.statObject(key);
+  }
+
+  // Get file metadata
+  async getFileMetadata(key: string) {
+    const stat = await this.getChatFileStat(key);
+    if (!stat) return { filename: '', extension: '', contentLength: 0, contentType: '' };
+
+    const contentLength = stat.size;
+    const filename: string = decodeURIComponent(stat.metaData['origin-filename']);
+    const extension = parseFileExtensionFromUrl(filename);
+    const contentType: string = stat.metaData['content-type'];
+    return {
+      filename,
+      extension,
+      contentType,
+      contentLength
+    };
+  }
+
   async createGetChatFileURL(params: { key: string; expiredHours?: number; external: boolean }) {
     const { key, expiredHours = 1, external = false } = params; // 默认一个小时
 
@@ -38,12 +63,7 @@ class S3ChatSource {
   async createUploadChatFileURL(params: CheckChatFileKeys) {
     const { appId, chatId, uId, filename } = ChatFileUploadSchema.parse(params);
     const rawKey = [S3Sources.chat, appId, uId, chatId, `${getNanoid(6)}-${filename}`].join('/');
-    await MongoS3TTL.create({
-      minioKey: rawKey,
-      bucketName: this.bucket.name,
-      expiredTime: addHours(new Date(), 24)
-    });
-    return await this.bucket.createPostPresignedUrl({ rawKey, filename });
+    return await this.bucket.createPostPresignedUrl({ rawKey, filename }, { expiredHours: 24 });
   }
 
   deleteChatFilesByPrefix(params: DelChatFileByPrefixParams) {
diff --git a/packages/service/common/s3/sources/dataset/index.ts b/packages/service/common/s3/sources/dataset/index.ts
new file mode 100644
index 000000000000..685852ecb818
--- /dev/null
+++ b/packages/service/common/s3/sources/dataset/index.ts
@@ -0,0 +1,269 @@
+import { S3Sources } from '../../type';
+import { S3PrivateBucket } from '../../buckets/private';
+import { getNanoid, parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
+import {
+  type CreateGetDatasetFileURLParams,
+  CreateGetDatasetFileURLParamsSchema,
+  type CreateUploadDatasetFileParams,
+  CreateUploadDatasetFileParamsSchema,
+  type DeleteDatasetFilesByPrefixParams,
+  DeleteDatasetFilesByPrefixParamsSchema,
+  type GetDatasetFileContentParams,
+  GetDatasetFileContentParamsSchema,
+  type UploadDatasetFileByBufferParams,
+  UploadDatasetFileByBufferParamsSchema,
+  type UploadDatasetImageParams,
+  UploadDatasetImageParamsSchema
+} from './type';
+import { MongoS3TTL } from '../../schema';
+import {
+  addDays,
+  addHours,
+  addMinutes,
+  differenceInDays,
+  differenceInMilliseconds
+} from 'date-fns';
+import { addLog } from '../../../system/log';
+import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
+import { readS3FileContentByBuffer } from '../../../file/read/utils';
+import { addRawTextBuffer, getRawTextBuffer } from '../../../buffer/rawText/controller';
+import type { ClientSession } from '../../../mongo';
+import { MongoDatasetData } from '../../../../core/dataset/data/schema';
+import path from 'node:path';
+import { Mimes } from '../../constants';
+import jwt from 'jsonwebtoken';
+import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
+
+type DatasetObjectKey = `${typeof S3Sources.dataset}/${string}`;
+
+class S3DatasetSource {
+  private bucket: S3PrivateBucket;
+  private static instance: S3DatasetSource;
+
+  constructor() {
+    this.bucket = new S3PrivateBucket();
+  }
+
+  static getInstance() {
+    return (this.instance ??= new S3DatasetSource());
+  }
+
+  // Presigned download URL
+  async createGetDatasetFileURL(params: CreateGetDatasetFileURLParams) {
+    const { key, expiredHours, external } = CreateGetDatasetFileURLParamsSchema.parse(params);
+
+    if (external) {
+      return await this.bucket.createExtenalUrl({ key, expiredHours });
+    }
+    return await this.bucket.createPreviewlUrl({ key, expiredHours });
+  }
+
+  // Presigned upload URL
+  async createUploadDatasetFileURL(params: CreateUploadDatasetFileParams) {
+    const { filename, datasetId } = CreateUploadDatasetFileParamsSchema.parse(params);
+    const rawKey = [S3Sources.dataset, datasetId, `${getNanoid(6)}-${filename}`].join('/');
+    return await this.bucket.createPostPresignedUrl({ rawKey, filename }, { expiredHours: 3 });
+  }
+
+  // Delete by prefix
+  deleteDatasetFilesByPrefix(params: DeleteDatasetFilesByPrefixParams) {
+    const { datasetId } = DeleteDatasetFilesByPrefixParamsSchema.parse(params);
+    const prefix = [S3Sources.dataset, datasetId].filter(Boolean).join('/');
+    return this.bucket.addDeleteJob({ prefix });
+  }
+
+  // Delete a single key
+  deleteDatasetFileByKey(key?: string) {
+    return this.bucket.addDeleteJob({ key });
+  }
+
+  // Delete multiple keys
+  deleteDatasetFilesByKeys(keys: string[]) {
+    return this.bucket.addDeleteJob({ keys });
+  }
+
+  // Get file stream
+  getDatasetFileStream(key: string) {
+    return this.bucket.getObject(key);
+  }
+
+  // Get file stat
+  getDatasetFileStat(key: string) {
+    return this.bucket.statObject(key);
+  }
+
+  // Get file metadata
+  async getFileMetadata(key: string) {
+    const stat = await this.getDatasetFileStat(key);
+    if (!stat) return { filename: '', extension: '', contentLength: 0, contentType: '' };
+
+    const contentLength = stat.size;
+    const filename: string = decodeURIComponent(stat.metaData['origin-filename']);
+    const extension = parseFileExtensionFromUrl(filename);
+    const contentType: string = stat.metaData['content-type'];
+    return {
+      filename,
+      extension,
+      contentType,
+      contentLength
+    };
+  }
+
+  isDatasetObjectKey(key?: string): key is DatasetObjectKey {
+    return typeof key === 'string' && key.startsWith(`${S3Sources.dataset}/`);
+  }
+
+  async getDatasetBase64Image(key: string): Promise<string> {
+    const [stream, metadata] = await Promise.all([
+      this.getDatasetFileStream(key),
+      this.getFileMetadata(key)
+    ]);
+    const buffer = await this.bucket.fileStreamToBuffer(stream);
+    const base64 = buffer.toString('base64');
+    return `data:${metadata.contentType || 'image/jpeg'};base64,${base64}`;
+  }
+
+  async getDatasetFileRawText(params: GetDatasetFileContentParams) {
+    const { fileId, teamId, tmbId, customPdfParse, getFormatText, usageId, datasetId } =
+      GetDatasetFileContentParamsSchema.parse(params);
+
+    const bufferId = `${fileId}-${customPdfParse}`;
+    const fileBuffer = await getRawTextBuffer(bufferId);
+    if (fileBuffer) {
+      return {
+        rawText: fileBuffer.text,
+        filename: fileBuffer.sourceName,
+        imageKeys: fileBuffer.imageKeys
+      };
+    }
+
+    const [metadata, stream] = await Promise.all([
+      this.getFileMetadata(fileId),
+      this.getDatasetFileStream(fileId)
+    ]);
+
+    const extension = metadata.extension;
+    const filename: string = decodeURIComponent(metadata.filename);
+
+    const start = Date.now();
+    const buffer = await this.bucket.fileStreamToBuffer(stream);
+    addLog.debug('get dataset file buffer', { time: Date.now() - start });
+
+    const encoding = detectFileEncoding(buffer);
+    const prefix = `${path.dirname(fileId)}/${path.basename(fileId, path.extname(fileId))}-parsed`;
+    const { rawText, imageKeys } = await readS3FileContentByBuffer({
+      teamId,
+      tmbId,
+      uploadKeyPrefix: prefix,
+      extension,
+      buffer,
+      encoding,
+      customPdfParse,
+      usageId,
+      getFormatText
+    });
+
+    addRawTextBuffer({
+      sourceId: bufferId,
+      sourceName: filename,
+      text: rawText,
+      expiredTime: addMinutes(new Date(), 20),
+      imageKeys
+    });
+
+    return {
+      rawText,
+      filename,
+      imageKeys
+    };
+  }
+
+  // Upload an image
+  async uploadDatasetImage(params: UploadDatasetImageParams): Promise<string> {
+    const { uploadKey, base64Img, mimetype, filename } =
+      UploadDatasetImageParamsSchema.parse(params);
+
+    const base64Data = base64Img.split(',')[1] || base64Img;
+    const buffer = Buffer.from(base64Data, 'base64');
+
+    await this.bucket.putObject(uploadKey, buffer, buffer.length, {
+      'content-type': mimetype,
+      'upload-time': new Date().toISOString(),
+      'origin-filename': encodeURIComponent(filename)
+    });
+
+    await MongoS3TTL.create({
+      minioKey: uploadKey,
+      bucketName: this.bucket.name,
+      expiredTime: addDays(new Date(), 7)
+    });
+
+    return uploadKey;
+  }
+
+  // Upload a file from a Buffer
+  async uploadDatasetFileByBuffer(params: UploadDatasetFileByBufferParams): Promise<string> {
+    const { datasetId, buffer, filename } = UploadDatasetFileByBufferParamsSchema.parse(params);
+
+    const key = [S3Sources.dataset, datasetId, `${getNanoid(6)}-${filename}`].join('/');
+    await this.bucket.putObject(key, buffer, buffer.length, {
+      'content-type': Mimes[path.extname(filename) as keyof typeof Mimes],
+      'upload-time': new Date().toISOString(),
+      'origin-filename': encodeURIComponent(filename)
+    });
+    await MongoS3TTL.create({
+      minioKey: key,
+      bucketName: this.bucket.name,
+      expiredTime: addHours(new Date(), 3)
+    });
+    return key;
+  }
+
+  // Remove the TTL record for a single file
+  async removeDatasetFileTTL(fileKey: string, session?: ClientSession): Promise<void> {
+    await MongoS3TTL.deleteOne(
+      {
+        minioKey: fileKey,
+        bucketName: this.bucket.name
+      },
+      { session }
+    );
+
+    addLog.debug('Removed TTL for dataset file', { fileKey });
+  }
+
+  // Remove the TTL records for multiple images
+  async removeDatasetImagesTTL(imageKeys: string[], session?: ClientSession): Promise<void> {
+    if (imageKeys.length === 0) return;
+
+    const result = await MongoS3TTL.deleteMany(
+      {
+        minioKey: { $in: imageKeys },
+        bucketName: this.bucket.name
+      },
+      { session }
+    );
+
+    addLog.debug('Removed TTL for dataset images', {
+      imageKeysCount: imageKeys.length,
+      deletedCount: result.deletedCount
+    });
+  }
+
+  async getFileDatasetInfo(key: string): Promise<{
+    _id: string;
+    datasetId: string;
+    collectionId: string;
+  } | null> {
+    return await MongoDatasetData.findOne(
+      { $or: [{ imageKeys: { $in: [key] } }, { imageId: key }] },
+      'datasetId collectionId'
+    )
+      .lean()
+      .exec();
+  }
+}
+
+export function getS3DatasetSource() {
+  return S3DatasetSource.getInstance();
+}
diff --git a/packages/service/common/s3/sources/dataset/type.ts b/packages/service/common/s3/sources/dataset/type.ts
new file mode 100644
index 000000000000..ba82f4a4dfcc
--- /dev/null
+++ b/packages/service/common/s3/sources/dataset/type.ts
@@ -0,0 +1,61 @@
+import { ObjectIdSchema } from '@fastgpt/global/common/type/mongo';
+import { z } from 'zod';
+
+export const CreateUploadDatasetFileParamsSchema = z.object({
+  filename: z.string().nonempty(),
+  datasetId: ObjectIdSchema
+});
+export type CreateUploadDatasetFileParams = z.infer<typeof CreateUploadDatasetFileParamsSchema>;
+
+export const CreateGetDatasetFileURLParamsSchema = z.object({
+  key: z.string().nonempty(),
+  expiredHours: z.number().positive().optional(),
+  external: z.boolean().optional()
+});
+export type CreateGetDatasetFileURLParams = z.infer<typeof CreateGetDatasetFileURLParamsSchema>;
+
+export const DeleteDatasetFilesByPrefixParamsSchema = z.object({
+  datasetId: ObjectIdSchema
+});
+export type DeleteDatasetFilesByPrefixParams = z.infer<
+  typeof DeleteDatasetFilesByPrefixParamsSchema
+>;
+
+export const GetDatasetFileContentParamsSchema = z.object({
+  teamId: ObjectIdSchema,
+  tmbId: ObjectIdSchema,
+  fileId: z.string().nonempty(), // This is the S3 object key
+  customPdfParse: z.boolean().optional(),
+  getFormatText: z.boolean().optional(), // Convert parsed content to markdown where possible
+  datasetId: ObjectIdSchema,
+  usageId: ObjectIdSchema.optional()
+});
+export type GetDatasetFileContentParams = z.infer<typeof GetDatasetFileContentParamsSchema>;
+
+export const UploadParsedDatasetImagesParamsSchema = z.object({
+  key: z.string().nonempty()
+});
+export type UploadParsedDatasetImagesParams = z.infer<typeof UploadParsedDatasetImagesParamsSchema>;
+
+export const UploadDatasetImageParamsSchema = z.object({
+  base64Img: z.string().nonempty(),
+  uploadKey: z.string().nonempty(),
+  mimetype: z.string().nonempty(),
+  filename: z.string().nonempty()
+});
+export type UploadDatasetImageParams = z.infer<typeof UploadDatasetImageParamsSchema>;
+
+export const ParsedFileContentS3KeyParamsSchema = z.object({
+  datasetId: ObjectIdSchema,
+  mimetype: z.string().nonempty(),
+  filename: z.string().optional(),
+  parentFileKey: z.string().optional() // Full key of the parsed file; used as the parent directory for extracted images
+});
+export type ParsedFileContentS3KeyParams = z.infer<typeof ParsedFileContentS3KeyParamsSchema>;
+
+export const UploadDatasetFileByBufferParamsSchema = z.object({
+  datasetId: ObjectIdSchema,
+  buffer: z.instanceof(Buffer),
+  filename: z.string().nonempty()
+});
+export type UploadDatasetFileByBufferParams = z.infer<typeof UploadDatasetFileByBufferParamsSchema>;
diff --git a/packages/service/common/s3/type.ts b/packages/service/common/s3/type.ts
index 885a0081742b..50881d4d2907 100644
--- a/packages/service/common/s3/type.ts
+++ b/packages/service/common/s3/type.ts
@@ -17,7 +17,7 @@ export type ExtensionType = keyof typeof Mimes;
 
 export type S3OptionsType = typeof defaultS3Options;
 
-export const S3SourcesSchema = z.enum(['avatar', 'chat']);
+export const S3SourcesSchema = z.enum(['avatar', 'chat', 'dataset']);
 export const S3Sources = S3SourcesSchema.enum;
 export type S3SourceType = z.infer<typeof S3SourcesSchema>;
 
 export const CreatePostPresignedUrlParamsSchema = z.union([
   // Option 1: Only rawKey
   z.object({
     filename: z.string().min(1),
-    rawKey: z.string().min(1)
+    rawKey: z.string().min(1),
+    metadata: z.record(z.string(), z.string()).optional()
   }),
   // Option 2: filename with optional source and teamId
   z.object({
     filename: z.string().min(1),
     source: S3SourcesSchema.optional(),
-    teamId: z.string().length(16).optional()
+    teamId: z.string().length(16).optional(),
+    metadata: z.record(z.string(), z.string()).optional()
   })
 ]);
 export type CreatePostPresignedUrlParams = z.infer<typeof CreatePostPresignedUrlParamsSchema>;
diff --git a/packages/service/common/s3/utils.ts b/packages/service/common/s3/utils.ts
new file mode 100644
index 000000000000..1cebd7b39af2
--- /dev/null
+++ b/packages/service/common/s3/utils.ts
@@ -0,0 +1,56 @@
+import jwt from 'jsonwebtoken';
+import { differenceInSeconds, addDays } from 'date-fns';
+import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
+import { S3Sources } from './type';
+import { getS3ChatSource } from './sources/chat';
+import { getS3DatasetSource } from './sources/dataset';
+import { EndpointUrl } from '@fastgpt/global/common/file/constants';
+
+export function jwtSignS3ObjectKey(objectKey: string) {
+  const secret = process.env.FILE_TOKEN_KEY as string;
+  const now = new Date();
+  // jsonwebtoken interprets a numeric expiresIn as seconds
+  const expiresIn = differenceInSeconds(addDays(now, 90), now);
+  const token = jwt.sign({ objectKey }, secret, { expiresIn });
+
+  return token;
+}
+
+export function jwtVerifyS3ObjectKey(token: string) {
+  const secret = process.env.FILE_TOKEN_KEY as string;
+  return new Promise<{ objectKey: string }>((resolve, reject) => {
+    jwt.verify(token, secret, (err, payload) => {
+      if (err || !payload || !(payload as jwt.JwtPayload).objectKey) {
+        return reject(ERROR_ENUM.unAuthFile);
+      }
+
+      resolve(payload as { objectKey: string });
+    });
+  });
+}
+
+export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) {
+  if (!datasetQuoteText || typeof datasetQuoteText !== 'string') return datasetQuoteText as string;
+
+  const prefixPattern = Object.values(S3Sources)
+    .map((pattern) => `${pattern}\\/[^\\s)]+`)
+    .join('|');
+  const regex = new RegExp(String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g');
+  const s3DatasetSource = getS3DatasetSource();
+  const s3ChatSource = getS3ChatSource();
+
+  const matches = Array.from(datasetQuoteText.matchAll(regex));
+  let content = datasetQuoteText;
+
+  for (const match of matches.slice().reverse()) {
+    const [full, bang, alt, objectKey] = match;
+
+    if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) {
+      const url = `${EndpointUrl}/api/system/file/${jwtSignS3ObjectKey(objectKey)}`;
+      const replacement = `${bang}[${alt}](${url})`;
+      content =
+        content.slice(0, match.index) + replacement + content.slice(match.index + full.length);
+    }
+  }
+
+  return content;
+}
diff --git a/packages/service/core/ai/llm/request.ts b/packages/service/core/ai/llm/request.ts
index 0b2bfd369252..65a85d28bd37 100644
--- a/packages/service/core/ai/llm/request.ts
+++ b/packages/service/core/ai/llm/request.ts
@@ -86,7 +86,8 @@ export const createLLMResponse = async (
     messages: rewriteMessages
   });
 
-  // console.dir(requestBody, { depth: null });
+  // console.dir(requestBody, { depth: null });
+  // console.log(JSON.stringify(requestBody, null, 2));
 
   const { response, isStreamResponse, getEmptyResponseTip } = await createChatCompletion({
     body: requestBody,
     userKey,
diff --git a/packages/service/core/ai/llm/utils.ts b/packages/service/core/ai/llm/utils.ts
index a1988215f064..7a50289f0b72 100644
--- a/packages/service/core/ai/llm/utils.ts
+++ b/packages/service/core/ai/llm/utils.ts
@@ -14,6 +14,12 @@ import { addLog } from '../../../common/system/log';
 import { getImageBase64 } from '../../../common/file/image/utils';
 import { getS3ChatSource } from '../../../common/s3/sources/chat';
 import { isInternalAddress } from '../../../common/system/utils';
+import { S3Sources } from '../../../common/s3/type';
+import { getS3DatasetSource } from '../../../common/s3/sources/dataset';
+import { getGlobalRedisConnection } from '../../../common/redis';
+import { randomUUID } from 'node:crypto';
+import { TempFileURL } from '@fastgpt/global/common/file/image/constants';
+import { EndpointUrl } from '@fastgpt/global/common/file/constants';
 
 export const filterGPTMessageByMaxContext = async ({
   messages = [],
@@ -127,7 +133,7 @@ export const loadRequestMessages = async ({
       const result: ChatCompletionContentPart[] = [];
 
       // 提取所有HTTPS图片URL并添加到result开头
-      const httpsImages = [...new Set(Array.from(input.matchAll(imageRegex), (m) => m[0]))];
+      const httpsImages = Array.from(new Set(Array.from(input.matchAll(imageRegex), (m) => m[0])));
       httpsImages.forEach((url) => {
         result.push({
           type: 'image_url',
@@ -276,6 +282,46 @@ export const loadRequestMessages = async ({
     return result.map((item) => item.text).join('\n');
   };
 
+  // const redis = getGlobalRedisConnection();
+  // const prefixPattern = Object.values(S3Sources)
+  //   .map((pattern) => `${pattern}\\/[^\\s)]+`)
+  //   .join('|');
+  // const regex = new RegExp(String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g');
+
+  // TODO: remove this transformS3PreviewKey once the JWT migration is complete
+  // const transformS3PreviewKey = async (
+  //   origin: string | ChatCompletionContentPartText[] | undefined
+  // ) => {
+  //   if (!origin || typeof origin !== 'string') return origin as string;
+
+  //   const matches = Array.from(origin.matchAll(regex));
+  //   let content = origin;
+
+  //   for (const match of matches.slice().reverse()) {
+  //     const [full, bang, alt, objectKey] = match;
+
+  //     const filename = objectKey.split('/').pop()?.split('-')[1];
+  //     const name = `${randomUUID()}:${filename}`;
+
+  //     const redisKey = `chat:temp_file:${name}`;
+  //     try {
+  //       await redis.set(redisKey, objectKey);
+  //       await redis.expire(redisKey, 3600);
+  //     } catch {
+  //       continue;
+  //     }
+
+  //     const k = new URLSearchParams({ k: name });
+  //     const link = `${EndpointUrl}${TempFileURL}?${k}`;
+
+  //     const replacement = `${bang}[${alt}](${link})`;
+  //     content =
+  //       content.slice(0, match.index) + replacement + content.slice(match.index + full.length);
+  //   }
+
+  //   return content;
+  // };
+
   if (messages.length === 0) {
     return Promise.reject(i18nT('common:core.chat.error.Messages empty'));
   }
@@ -379,7 +425,10 @@ export const loadRequestMessages = async ({
 
         return {
           ...item,
-          content: formatContent
+          content:
+            typeof formatContent === 'string'
+              ? formatContent
+              : (formatContent as ChatCompletionContentPartText[])
         };
       } else if (item.role === ChatCompletionRequestMessageRoleEnum.Assistant) {
         if (item.tool_calls || item.function_call) {
diff --git a/packages/service/core/app/controller.ts b/packages/service/core/app/controller.ts
index bd8ec22386cd..5bd91b2331c7 100644
--- a/packages/service/core/app/controller.ts
+++ b/packages/service/core/app/controller.ts
@@ -28,6 +28,7 @@ import { mongoSessionRun } from '../../common/mongo/sessionRun';
 import { MongoAppLogKeys } from './logs/logkeysSchema';
 import { MongoChatItemResponse } from '../chat/chatItemResponseSchema';
 import { getS3ChatSource } from '../../common/s3/sources/chat';
+import { getS3AvatarSource } from '../../common/s3/sources/avatar';
 
 export const beforeUpdateAppFormat = ({ nodes }: { nodes?: StoreNodeItemType[] }) => {
   if (!nodes) return;
@@ -219,6 +220,7 @@ export const onDelOneApp = async ({
 
     // Delete avatar
     await removeImageByPath(app.avatar, session);
+    await getS3AvatarSource().deleteAvatar(app.avatar, session);
   };
 
   // Delete chats
diff --git a/packages/service/core/chat/saveChat.ts b/packages/service/core/chat/saveChat.ts
index ad5f35ca0081..21013640adf6 100644
--- a/packages/service/core/chat/saveChat.ts
+++ b/packages/service/core/chat/saveChat.ts
@@ -1,4 +1,8 @@
-import type { AIChatItemType, UserChatItemType } from '@fastgpt/global/core/chat/type.d';
+import type {
+  AIChatItemType,
+  AIChatItemValueItemType,
+  UserChatItemType
+} from '@fastgpt/global/core/chat/type.d';
 import { MongoApp } from '../app/schema';
 import type { ChatSourceEnum } from '@fastgpt/global/core/chat/constants';
 import { ChatItemValueTypeEnum, ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
@@ -19,6 +23,7 @@ import { MongoChatItemResponse } from './chatItemResponseSchema';
 import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt';
 import { MongoS3TTL } from '../../common/s3/schema';
 import type { ClientSession } from '../../common/mongo';
+import { getGlobalRedisConnection } from '../../common/redis';
 
 type Props = {
   chatId: string;
@@ -41,6 +46,44 @@ type Props = {
   errorMsg?: string;
 };
 
+// TODO: remove this transformAiResponse once the JWT migration is complete
+// const transformAiResponse = async (value: AIChatItemValueItemType[]) => {
+//   const redis = getGlobalRedisConnection();
+//   const regex = /(!?)\[([^\]]+)\]\((https?:\/\/[^\s)]+\/api\/file\/temp[^\s)]*)\)/g;
+
+//   return Promise.all(
+//     value.map(async (item) => {
+//       if (item.type !== ChatItemValueTypeEnum.text || !item.text) return item;
+//       let content = item.text.content;
+//       const matches = Array.from(content.matchAll(regex));
+
+//       for (const match of matches.slice().reverse()) {
+//         const [full, bang, alt, link] = match;
+//         if (typeof match.index !== 'number') continue;
+
+//         try {
+//           const url = new URL(link); // URL parsing may throw here
+//           const k = url.searchParams.get('k');
+//           if (!k) continue;
+
+//           const redisKey = `chat:temp_file:${decodeURIComponent(k)}`;
+//           const objectKey = await redis.get(redisKey);
+//           if (!objectKey) continue;
+
+//           const replacement = `${bang}[${alt}](${objectKey})`;
+//           content =
+//             content.slice(0, match.index) + replacement + content.slice(match.index + full.length);
+//         } catch {
+//           continue;
+//         }
+//       }
+
+//       item.text.content = content;
+//       return item;
+//     })
+//   );
+// };
+
 const beforProcess = (props: Props) => {
   // Remove url
   props.userContent.value.forEach((item) => {
@@ -109,6 +152,10 @@ const formatAiContent = ({
     return responseItem;
   });
 
+  // aiResponse.value = await transformAiResponse(aiResponse.value);
+  // console.log('aiResponse ========================');
+  // console.dir(aiResponse, { depth: null });
+
   return {
     aiResponse: {
       ...aiResponse,
diff --git a/packages/service/core/dataset/apiDataset/custom/api.ts b/packages/service/core/dataset/apiDataset/custom/api.ts
index 277ca4eb93b9..6f96f5ea93ad 100644
--- a/packages/service/core/dataset/apiDataset/custom/api.ts
+++ b/packages/service/core/dataset/apiDataset/custom/api.ts
@@ -126,12 +126,14 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
     teamId,
     tmbId,
     apiFileId,
-    customPdfParse
+    customPdfParse,
+    datasetId
   }: {
     teamId: string;
     tmbId: string;
     apiFileId: string;
     customPdfParse?: boolean;
+    datasetId: string;
   }): Promise<ApiFileReadContentResponse> => {
     const data = await request<
       {
@@ -148,7 +150,8 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
     if (content) {
       return {
         title,
-        rawText: content
+        rawText: content,
+        imageKeys: []
       };
     }
     if (previewUrl) {
@@ -157,15 +160,17 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
       if (buffer) {
         return {
           title,
-          rawText: buffer.text
+          rawText: buffer.text,
+          imageKeys: buffer.imageKeys || []
         };
       }
 
-      const rawText = await readFileRawTextByUrl({
+      const { rawText, imageKeys } = await readFileRawTextByUrl({
         teamId,
         tmbId,
         url: previewUrl,
         relatedId: apiFileId,
+        datasetId,
         customPdfParse,
         getFormatText: true
       });
@@ -174,12 +179,14 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
         sourceId: previewUrl,
         sourceName: title || '',
         text: rawText,
-        expiredTime: addMinutes(new Date(), 30)
+        expiredTime: addMinutes(new Date(), 30),
+        imageKeys
       });
 
       return {
         title,
-        rawText
+        rawText,
+        imageKeys
       };
     }
     return Promise.reject('Invalid content type: content or previewUrl is required');
diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts
index 2ff9673099f1..0e18bcf4c179 100644
--- a/packages/service/core/dataset/collection/controller.ts
+++ b/packages/service/core/dataset/collection/controller.ts
@@ -1,4 +1,7 @@
-import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  DatasetCollectionDataProcessModeEnum,
+  DatasetCollectionTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 import type {
@@ -31,11 +34,14 @@ import {
 } from '@fastgpt/global/core/dataset/training/utils';
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils';
+import { getS3DatasetSource } from '../../../common/s3/sources/dataset';
+import { addLog } from '../../../common/system/log';
 
 export const createCollectionAndInsertData = async ({
   dataset,
   rawText,
   imageIds,
+  imageKeys,
   createCollectionParams,
   backupParse = false,
   billId,
@@ -44,6 +50,7 @@ export const createCollectionAndInsertData = async ({
   dataset: DatasetSchemaType;
   rawText?: string;
   imageIds?: string[];
+  imageKeys?: string[];
   createCollectionParams: CreateOneCollectionParams;
 
   backupParse?: boolean;
@@ -133,7 +140,8 @@ export const createCollectionAndInsertData = async ({
     customReg: formatCreateCollectionParams.chunkSplitter
[formatCreateCollectionParams.chunkSplitter] : [], - backupParse + backupParse, + imageKeys }); return { chunks, @@ -232,12 +240,14 @@ export const createCollectionAndInsertData = async ({ } })(); - // 6. Remove images ttl index - await removeDatasetImageExpiredTime({ - ids: imageIds, - collectionId, - session - }); + // Remove S3 image TTLs for imageKeys + if (imageKeys && imageKeys.length > 0) { + await getS3DatasetSource().removeDatasetImagesTTL(imageKeys, session); + } + // Remove S3 image TTLs for imageIds + if (imageIds && imageIds.length > 0) { + await getS3DatasetSource().removeDatasetImagesTTL(imageIds, session); + } return { collectionId: String(collectionId), @@ -363,9 +373,40 @@ export async function delCollection({ if (!teamId) return Promise.reject('teamId is not exist'); + const s3DatasetSource = getS3DatasetSource(); const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId)))); const collectionIds = collections.map((item) => String(item._id)); + const allImageKeys = await (async () => { + const datas = await MongoDatasetData.find( + { + teamId, + datasetId: { $in: datasetIds }, + collectionId: { $in: collectionIds } + }, + { imageKeys: 1 } + ).lean(); + + const imageKeys = datas.flatMap((data) => data.imageKeys || []); + return [...new Set(imageKeys)].filter((key) => s3DatasetSource.isDatasetObjectKey(key)); + })(); + + const allImageIds = await (async () => { + const datas = await MongoDatasetData.find( + { + teamId, + datasetId: { $in: datasetIds }, + collectionId: { $in: collectionIds } + }, + { imageId: 1 } + ).lean(); + return [ + ...new Set( + datas.map((data) => data.imageId).filter((key) => s3DatasetSource.isDatasetObjectKey(key)) + ) + ]; + })(); + await retryFn(async () => { await Promise.all([ // Delete training data @@ -420,5 +461,23 @@ export async function delCollection({ }, { session } ); + + // delete s3 images which are parsed from docs + if (allImageKeys.length > 0) { + try { + await s3DatasetSource.deleteDatasetFilesByKeys(allImageKeys); + } catch (error) { + addLog.error('Failed to cleanup S3 images', error); + } + } + + // delete s3 images + if (allImageIds.length > 0) { + try { + await s3DatasetSource.deleteDatasetFilesByKeys(allImageIds); + } catch (error) { + addLog.error('Failed to cleanup S3 images', error); + } + } }); } diff --git a/packages/service/core/dataset/collection/schema.ts b/packages/service/core/dataset/collection/schema.ts index 61ddc48a099b..f2281a6f217e 100644 --- a/packages/service/core/dataset/collection/schema.ts +++ b/packages/service/core/dataset/collection/schema.ts @@ -58,10 +58,8 @@ const DatasetCollectionSchema = new Schema({ // Metadata // local file collection - fileId: { - type: Schema.Types.ObjectId, - ref: 'dataset.files' - }, + // Support both GridFS ObjectId (string) and S3 key (string) + fileId: String, // web link collection rawLink: String, // Api collection @@ -72,6 +70,7 @@ const DatasetCollectionSchema = new Schema({ rawTextLength: Number, hashRawText: String, + metadata: { type: Object, default: {} diff --git a/packages/service/core/dataset/collection/utils.ts b/packages/service/core/dataset/collection/utils.ts index e2334ef0b3a9..152732eaf0da 100644 --- a/packages/service/core/dataset/collection/utils.ts +++ b/packages/service/core/dataset/collection/utils.ts @@ -161,9 +161,10 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { }; })(); - const { title, rawText } = await readDatasetSourceRawText({ + const { title, rawText, imageKeys } = await 
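/*
 * Editor's note (sketch only, not in this patch): the two lookups added to
 * delCollection above run the same MongoDatasetData.find twice, once projecting
 * `imageKeys` and once `imageId`. Assuming both fields live on the same
 * documents, a single query with a joint projection would do:
 *
 *   const datas = await MongoDatasetData.find(
 *     { teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
 *     { imageKeys: 1, imageId: 1 }
 *   ).lean();
 *   const keys = new Set<string>();
 *   for (const data of datas) {
 *     (data.imageKeys || []).forEach((key) => keys.add(key));
 *     if (data.imageId) keys.add(data.imageId);
 *   }
 *   const allKeys = [...keys].filter((key) => s3DatasetSource.isDatasetObjectKey(key));
 */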
readDatasetSourceRawText({ teamId: collection.teamId, tmbId: collection.tmbId, + datasetId: collection.datasetId, ...sourceReadType }); @@ -188,6 +189,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { session, dataset, rawText: rawText, + imageKeys, createCollectionParams: { ...collection, name: title || collection.name, @@ -208,7 +210,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { return DatasetCollectionSyncResultEnum.sameRaw; }; -/* +/* QA: 独立进程 Chunk: Image Index -> Auto index -> chunk index */ diff --git a/packages/service/core/dataset/controller.ts b/packages/service/core/dataset/controller.ts index dade64db9214..89577a8c7fde 100644 --- a/packages/service/core/dataset/controller.ts +++ b/packages/service/core/dataset/controller.ts @@ -15,6 +15,8 @@ import { removeDatasetSyncJobScheduler } from './datasetSync'; import { mongoSessionRun } from '../../common/mongo/sessionRun'; import { removeImageByPath } from '../../common/file/image/controller'; import { UserError } from '@fastgpt/global/common/error/utils'; +import { getS3DatasetSource } from '../../common/s3/sources/dataset'; +import { getS3AvatarSource } from '../../common/s3/sources/avatar'; /* ============= dataset ========== */ /* find all datasetId by top datasetId */ @@ -122,6 +124,10 @@ export async function delDatasetRelevantData({ teamId, datasetId: { $in: datasetIds } }).session(session); + + for (const datasetId of datasetIds) { + await getS3DatasetSource().deleteDatasetFilesByPrefix({ datasetId }); + } } export const deleteDatasets = async ({ @@ -162,5 +168,9 @@ export const deleteDatasets = async ({ for await (const dataset of datasets) { await removeImageByPath(dataset.avatar, session); } + + for await (const dataset of datasets) { + await getS3AvatarSource().deleteAvatar(dataset.avatar, session); + } }); }; diff --git a/packages/service/core/dataset/data/controller.ts b/packages/service/core/dataset/data/controller.ts index 50c70500797d..38dd004c0d2a 100644 --- a/packages/service/core/dataset/data/controller.ts +++ b/packages/service/core/dataset/data/controller.ts @@ -1,6 +1,8 @@ +import { getS3DatasetSource } from '../../../common/s3/sources/dataset'; import { addEndpointToImageUrl } from '../../../common/file/image/utils'; import { getDatasetImagePreviewUrl } from '../image/utils'; -import type { DatasetCiteItemType, DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type'; +import type { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type'; +import { getS3ChatSource } from '../../../common/s3/sources/chat'; export const formatDatasetDataValue = ({ teamId, @@ -56,12 +58,15 @@ export const formatDatasetDataValue = ({ }; } - const previewUrl = getDatasetImagePreviewUrl({ - imageId, - teamId, - datasetId, - expiredMinutes: 60 * 24 * 7 // 7 days - }); + const previewUrl = + getS3DatasetSource().isDatasetObjectKey(imageId) || getS3ChatSource().isChatFileKey(imageId) + ? 
imageId + : getDatasetImagePreviewUrl({ + imageId, + teamId, + datasetId, + expiredMinutes: 60 * 24 * 7 // 7 days + }); return { q: `![${q.replaceAll('\n', '')}](${previewUrl})`, @@ -71,7 +76,7 @@ export const formatDatasetDataValue = ({ }; export const getFormatDatasetCiteList = (list: DatasetDataSchemaType[]) => { - return list.map((item) => ({ + return list.map((item) => ({ _id: item._id, ...formatDatasetDataValue({ teamId: item.teamId, diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 8d20378e62a5..2d1cfefbc87f 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -40,6 +40,10 @@ const DatasetDataSchema = new Schema({ type: String }, imageId: String, + imageKeys: { + type: [String], + default: [] + }, imageDescMap: Object, history: { type: [ @@ -105,6 +109,9 @@ try { // rebuild data DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }); + // Query images by collection efficiently + DatasetDataSchema.index({ collectionId: 1, imageKeys: 1 }); + // 为查询 initJieba 字段不存在的数据添加索引 DatasetDataSchema.index({ initJieba: 1, updateTime: 1 }); diff --git a/packages/service/core/dataset/image/controller.ts b/packages/service/core/dataset/image/controller.ts index ca597eae44b5..d8cd7882d1e8 100644 --- a/packages/service/core/dataset/image/controller.ts +++ b/packages/service/core/dataset/image/controller.ts @@ -10,6 +10,7 @@ import { checkTimerLock } from '../../../common/system/timerLock/utils'; import { TimerIdEnum } from '../../../common/system/timerLock/constants'; import { addLog } from '../../../common/system/log'; import { UserError } from '@fastgpt/global/common/error/utils'; +import { getS3DatasetSource } from '../../../common/s3/sources/dataset'; const getGridBucket = () => { return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, { @@ -114,9 +115,14 @@ export const getDatasetImageBase64 = async (imageId: string) => { export const deleteDatasetImage = async (imageId: string) => { const gridBucket = getGridBucket(); + const s3DatasetSource = getS3DatasetSource(); try { - await gridBucket.delete(new Types.ObjectId(imageId)); + if (s3DatasetSource.isDatasetObjectKey(imageId)) { + await s3DatasetSource.deleteDatasetFileByKey(imageId); + } else { + await gridBucket.delete(new Types.ObjectId(imageId)); + } } catch (error: any) { const msg = error?.message; if (msg.includes('File not found')) { diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index b17efe205c74..beac3c5149e1 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -1,13 +1,11 @@ -import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; import { ChunkTriggerConfigTypeEnum, DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; -import { readFileContentFromMongo } from '../../common/file/gridfs/controller'; import { urlsFetch } from '../../common/string/cheerio'; import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter'; import axios from 'axios'; -import { readRawContentByFileBuffer } from '../../common/file/read/utils'; +import { parsedFileContentS3Key, readS3FileContentByBuffer } from '../../common/file/read/utils'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; import { getApiDatasetRequest } from './apiDataset'; import Papa from 'papaparse'; @@ -17,6 +15,9 @@ import { addLog } from 
'../../common/system/log'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { getFileMaxSize } from '../../common/file/utils'; import { UserError } from '@fastgpt/global/common/error/utils'; +import { getS3DatasetSource } from '../../common/s3/sources/dataset'; +import { Mimes } from '../../common/s3/constants'; +import path from 'node:path'; export const readFileRawTextByUrl = async ({ teamId, @@ -25,6 +26,7 @@ export const readFileRawTextByUrl = async ({ customPdfParse, getFormatText, relatedId, + datasetId, maxFileSize = getFileMaxSize() }: { teamId: string; @@ -33,6 +35,7 @@ export const readFileRawTextByUrl = async ({ customPdfParse?: boolean; getFormatText?: boolean; relatedId: string; // externalFileId / apiFileId + datasetId: string; maxFileSize?: number; }) => { const extension = parseFileExtensionFromUrl(url); @@ -64,7 +67,7 @@ export const readFileRawTextByUrl = async ({ const chunks: Buffer[] = []; let totalLength = 0; - return new Promise((resolve, reject) => { + return new Promise<{ rawText: string; imageKeys: string[] }>((resolve, reject) => { let isAborted = false; const cleanup = () => { @@ -107,8 +110,14 @@ export const readFileRawTextByUrl = async ({ // 立即清理 chunks 数组释放内存 chunks.length = 0; - const { rawText } = await retryFn(() => - readRawContentByFileBuffer({ + const { rawText, imageKeys } = await retryFn(() => { + const key = parsedFileContentS3Key.dataset({ + datasetId, + mimetype: Mimes[extension as keyof typeof Mimes], + filename: 'file' + }).key; + const prefix = `${path.dirname(key)}/${path.basename(key, path.extname(key))}-parsed`; + return readS3FileContentByBuffer({ customPdfParse, getFormatText, extension, @@ -116,13 +125,11 @@ export const readFileRawTextByUrl = async ({ tmbId, buffer, encoding: 'utf-8', - metadata: { - relatedId - } - }) - ); + uploadKeyPrefix: prefix + }); + }); - resolve(rawText); + resolve({ rawText, imageKeys: imageKeys || [] }); } catch (error) { cleanup(); reject(error); @@ -142,7 +149,7 @@ export const readFileRawTextByUrl = async ({ }); }; -/* +/* fileId - local file, read from mongo link - request externalFile/apiFile = request read @@ -157,7 +164,8 @@ export const readDatasetSourceRawText = async ({ apiDatasetServer, customPdfParse, getFormatText, - usageId + usageId, + datasetId }: { teamId: string; tmbId: string; @@ -170,23 +178,31 @@ export const readDatasetSourceRawText = async ({ externalFileId?: string; // external file dataset apiDatasetServer?: ApiDatasetServerType; // api dataset usageId?: string; + datasetId: string; // For S3 image upload }): Promise<{ title?: string; rawText: string; + imageKeys?: string[]; }> => { if (type === DatasetSourceReadTypeEnum.fileLocal) { - const { filename, rawText } = await readFileContentFromMongo({ + if (!datasetId || !getS3DatasetSource().isDatasetObjectKey(sourceId)) { + return Promise.reject('datasetId is required for S3 files'); + } + + const { filename, rawText, imageKeys } = await getS3DatasetSource().getDatasetFileRawText({ teamId, tmbId, - bucketName: BucketNameEnum.dataset, fileId: sourceId, getFormatText, customPdfParse, - usageId + usageId, + datasetId }); + return { title: filename, - rawText + rawText, + imageKeys }; } else if (type === DatasetSourceReadTypeEnum.link) { const result = await urlsFetch({ @@ -201,19 +217,22 @@ export const readDatasetSourceRawText = async ({ return { title, - rawText: content + rawText: content, + imageKeys: [] // Link sources don't have images, return empty array }; } else if (type === 
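/*
 * Worked example for the "-parsed" prefix computed above (the key is invented
 * for illustration): if parsedFileContentS3Key.dataset() yields
 *   dataset/665f0a.../k9Xz2-file.pdf
 * then path.dirname/path.basename produce the sibling prefix
 *   dataset/665f0a.../k9Xz2-file-parsed
 * and images extracted while parsing are uploaded under that prefix, next to
 * the source object.
 */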
DatasetSourceReadTypeEnum.externalFile) { if (!externalFileId) return Promise.reject(new UserError('FileId not found')); - const rawText = await readFileRawTextByUrl({ + const { rawText, imageKeys } = await readFileRawTextByUrl({ teamId, tmbId, url: sourceId, relatedId: externalFileId, + datasetId, customPdfParse }); return { - rawText + rawText, + imageKeys }; } else if (type === DatasetSourceReadTypeEnum.apiFile) { const { title, rawText } = await readApiServerFileContent({ @@ -221,16 +240,19 @@ export const readDatasetSourceRawText = async ({ apiFileId: sourceId, teamId, tmbId, - customPdfParse + customPdfParse, + datasetId }); return { title, - rawText + rawText, + imageKeys: [] // API files don't have imageKeys in current implementation }; } return { title: '', - rawText: '' + rawText: '', + imageKeys: [] }; }; @@ -239,13 +261,15 @@ export const readApiServerFileContent = async ({ apiFileId, teamId, tmbId, - customPdfParse + customPdfParse, + datasetId }: { apiDatasetServer?: ApiDatasetServerType; apiFileId: string; teamId: string; tmbId: string; customPdfParse?: boolean; + datasetId: string; }): Promise<{ title?: string; rawText: string; @@ -254,7 +278,8 @@ export const readApiServerFileContent = async ({ teamId, tmbId, apiFileId, - customPdfParse + customPdfParse, + datasetId }); }; @@ -265,10 +290,12 @@ export const rawText2Chunks = async ({ backupParse, chunkSize = 512, imageIdList, + imageKeys, ...splitProps }: { rawText: string; imageIdList?: string[]; + imageKeys?: string[]; chunkTriggerType?: ChunkTriggerConfigTypeEnum; chunkTriggerMinSize?: number; // maxSize from agent model, not store @@ -281,6 +308,7 @@ export const rawText2Chunks = async ({ a: string; indexes?: string[]; imageIdList?: string[]; + imageKeys?: string[]; }[] > => { const parseDatasetBackup2Chunks = (rawText: string) => { @@ -288,12 +316,40 @@ export const rawText2Chunks = async ({ const chunks = csvArr .slice(1) - .map((item) => ({ - q: item[0] || '', - a: item[1] || '', - indexes: item.slice(2).filter((item) => item.trim()), - imageIdList - })) + .map((item) => { + const q = item[0] || ''; + const a = item[1] || ''; + const fullText = q + '\n' + a; + + // Extract image keys that are actually referenced in this chunk + const chunkImageKeys = []; + + if (imageKeys && imageKeys.length > 0) { + // Find all markdown image references in the chunk + const imageRefRegex = /!\[[^\]]*\]\(([^)]+)\)/g; + const referencedUrls = new Set(); + let match; + + while ((match = imageRefRegex.exec(fullText)) !== null) { + referencedUrls.add(match[1]); + } + + // Filter imageKeys to only include those referenced in this chunk + for (const imageKey of imageKeys) { + if (referencedUrls.has(imageKey)) { + chunkImageKeys.push(imageKey); + } + } + } + + return { + q, + a, + indexes: item.slice(2).filter((item) => item.trim()), + imageIdList, + imageKeys: chunkImageKeys + }; + }) .filter((item) => item.q || item.a); return { @@ -315,7 +371,8 @@ export const rawText2Chunks = async ({ { q: rawText, a: '', - imageIdList + imageIdList, + imageKeys: imageKeys || [] } ]; } @@ -324,7 +381,7 @@ export const rawText2Chunks = async ({ if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) { const textLength = rawText.trim().length; if (textLength < chunkTriggerMinSize) { - return [{ q: rawText, a: '', imageIdList }]; + return [{ q: rawText, a: '', imageIdList, imageKeys: imageKeys || [] }]; } } @@ -334,10 +391,34 @@ export const rawText2Chunks = async ({ ...splitProps }); - return chunks.map((item) => ({ - q: item, - a: '', - indexes: 
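/*
 * Example of the per-chunk image filtering used above and repeated below
 * (values invented): for a chunk whose text contains
 *   See ![fig](dataset/665f0a.../img-1.png) and ![ext](https://example.com/x.png)
 * the regex /!\[[^\]]*\]\(([^)]+)\)/g captures both link targets, and only the
 * targets that also appear in `imageKeys` (here the dataset/... key) are kept,
 * so each chunk records exactly the S3 images it references.
 */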
[], - imageIdList - })); + return chunks.map((item) => { + // Extract image keys that are actually referenced in this chunk + const chunkImageKeys = []; + + if (imageKeys && imageKeys.length > 0) { + // Find all markdown image references in the chunk + const imageRefRegex = /!\[[^\]]*\]\(([^)]+)\)/g; + const referencedUrls = new Set(); + let match; + + while ((match = imageRefRegex.exec(item)) !== null) { + referencedUrls.add(match[1]); // match[1] is the URL part + } + + // Filter imageKeys to only include those referenced in this chunk + for (const imageKey of imageKeys) { + if (referencedUrls.has(imageKey)) { + chunkImageKeys.push(imageKey); + } + } + } + + return { + q: item, + a: '', + indexes: [], + imageIdList, + imageKeys: chunkImageKeys + }; + }); }; diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index fec4f1bc886e..8bd35d32cc48 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -122,6 +122,7 @@ export async function pushDataListToTrainingQueue({ ...(item.q && { q: item.q }), ...(item.a && { a: item.a }), ...(item.imageId && { imageId: item.imageId }), + imageKeys: item.imageKeys || [], chunkIndex: item.chunkIndex ?? 0, indexSize, weight: weight ?? 0, diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts index a8f723a37d41..a637e9e898fe 100644 --- a/packages/service/core/dataset/training/schema.ts +++ b/packages/service/core/dataset/training/schema.ts @@ -64,6 +64,10 @@ const TrainingDataSchema = new Schema({ default: '' }, imageId: String, + imageKeys: { + type: [String], + default: [] + }, imageDescMap: Object, chunkIndex: { type: Number, diff --git a/packages/service/core/workflow/dispatch/ai/chat.ts b/packages/service/core/workflow/dispatch/ai/chat.ts index c6cf2d37f767..3d37de813e31 100644 --- a/packages/service/core/workflow/dispatch/ai/chat.ts +++ b/packages/service/core/workflow/dispatch/ai/chat.ts @@ -41,6 +41,11 @@ import { i18nT } from '../../../../../web/i18n/utils'; import { postTextCensor } from '../../../chat/postTextCensor'; import { createLLMResponse } from '../../../ai/llm/request'; import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'; +import { S3Sources } from '../../../../common/s3/type'; +import { getS3DatasetSource } from '../../../../common/s3/sources/dataset'; +import { getS3ChatSource } from '../../../../common/s3/sources/chat'; +import { jwtSignS3ObjectKey, replaceDatasetQuoteTextWithJWT } from '../../../../common/s3/utils'; +import { EndpointUrl } from '@fastgpt/global/common/file/constants'; export type ChatProps = ModuleDispatchProps< AIChatNodeProps & { @@ -98,6 +103,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise if (stringQuoteText) { @@ -366,7 +382,10 @@ async function getMultiInput({ teamId: runningUserInfo.teamId, tmbId: runningUserInfo.tmbId, customPdfParse, - usageId + usageId, + appId, + chatId, + uId }); return { diff --git a/packages/service/core/workflow/dispatch/ai/tool/index.ts b/packages/service/core/workflow/dispatch/ai/tool/index.ts index 4a79ba0bde10..a6ddda52839a 100644 --- a/packages/service/core/workflow/dispatch/ai/tool/index.ts +++ b/packages/service/core/workflow/dispatch/ai/tool/index.ts @@ -119,7 +119,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise< fileLinks, inputFiles: globalFiles, hasReadFilesTool, - usageId + usageId, + 
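/*
 * Editor's sketch (assumption, not part of this patch): the image-key filtering
 * above duplicates the block in parseDatasetBackup2Chunks. A small shared
 * helper would keep the two code paths in sync:
 *
 *   const extractChunkImageKeys = (text: string, imageKeys?: string[]) => {
 *     if (!imageKeys?.length) return [];
 *     const referenced = new Set<string>();
 *     for (const m of text.matchAll(/!\[[^\]]*\]\(([^)]+)\)/g)) referenced.add(m[1]);
 *     return imageKeys.filter((key) => referenced.has(key));
 *   };
 */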
appId: props.runningAppInfo.id, + chatId: props.chatId, + uId: props.uid }); const concatenateSystemPrompt = [ @@ -277,7 +280,10 @@ const getMultiInput = async ({ customPdfParse, inputFiles, hasReadFilesTool, - usageId + usageId, + appId, + chatId, + uId }: { runningUserInfo: ChatDispatchProps['runningUserInfo']; histories: ChatItemType[]; @@ -288,6 +294,9 @@ const getMultiInput = async ({ inputFiles: UserChatItemValueItemType['file'][]; hasReadFilesTool: boolean; usageId?: string; + appId: string; + chatId?: string; + uId: string; }) => { // Not file quote if (!fileLinks || hasReadFilesTool) { @@ -316,7 +325,10 @@ const getMultiInput = async ({ customPdfParse, usageId, teamId: runningUserInfo.teamId, - tmbId: runningUserInfo.tmbId + tmbId: runningUserInfo.tmbId, + appId, + chatId, + uId }); return { diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts b/packages/service/core/workflow/dispatch/tools/readFiles.ts index 4726b7b045bf..a8592642d1a2 100644 --- a/packages/service/core/workflow/dispatch/tools/readFiles.ts +++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts @@ -7,7 +7,10 @@ import axios from 'axios'; import { serverRequestBaseUrl } from '../../../../common/api/serverRequest'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { detectFileEncoding, parseUrlToFileType } from '@fastgpt/global/common/file/tools'; -import { readRawContentByFileBuffer } from '../../../../common/file/read/utils'; +import { + parsedFileContentS3Key, + readS3FileContentByBuffer +} from '../../../../common/file/read/utils'; import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants'; import { type ChatItemType, type UserChatItemValueItemType } from '@fastgpt/global/core/chat/type'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; @@ -16,6 +19,11 @@ import { addRawTextBuffer, getRawTextBuffer } from '../../../../common/buffer/ra import { addMinutes } from 'date-fns'; import { getNodeErrResponse } from '../utils'; import { isInternalAddress } from '../../../../common/system/utils'; +import { S3Sources } from '../../../../common/s3/type'; +import { getS3DatasetSource } from '../../../../common/s3/sources/dataset'; +import { getS3ChatSource } from '../../../../common/s3/sources/chat'; +import { jwtSignS3ObjectKey, replaceDatasetQuoteTextWithJWT } from '../../../../common/s3/utils'; +import { EndpointUrl } from '@fastgpt/global/common/file/constants'; type Props = ModuleDispatchProps<{ [NodeInputKeyEnum.fileUrlList]: string[]; @@ -71,7 +79,10 @@ export const dispatchReadFiles = async (props: Props): Promise => { teamId, tmbId, customPdfParse, - usageId + usageId, + appId: props.runningAppInfo.id, + chatId: props.chatId, + uId: props.uid }); return { @@ -124,7 +135,10 @@ export const getFileContentFromLinks = async ({ teamId, tmbId, customPdfParse, - usageId + usageId, + appId, + chatId, + uId }: { urls: string[]; requestOrigin?: string; @@ -133,6 +147,9 @@ export const getFileContentFromLinks = async ({ tmbId: string; customPdfParse?: boolean; usageId?: string; + appId: string; + chatId?: string; + uId: string; }) => { const parseUrlList = urls // Remove invalid urls @@ -224,7 +241,7 @@ export const getFileContentFromLinks = async ({ })(); // Read file - const { rawText } = await readRawContentByFileBuffer({ + const { rawText, imageKeys } = await readS3FileContentByBuffer({ extension, teamId, tmbId, @@ -232,18 +249,22 @@ export const getFileContentFromLinks = async ({ encoding, customPdfParse, getFormatText: true, + 
uploadKeyPrefix: parsedFileContentS3Key.chat({ appId, chatId: chatId!, uId }), usageId }); + const replacedText = await replaceDatasetQuoteTextWithJWT(rawText); + // Add to buffer addRawTextBuffer({ sourceId: url, sourceName: filename, - text: rawText, - expiredTime: addMinutes(new Date(), 20) + text: replacedText, + expiredTime: addMinutes(new Date(), 20), + imageKeys }); - return formatResponseObject({ filename, url, content: rawText }); + return formatResponseObject({ filename, url, content: replacedText }); } catch (error) { return formatResponseObject({ filename: '', diff --git a/packages/service/support/permission/auth/file.ts b/packages/service/support/permission/auth/file.ts index bc4d868072a4..3f507dd4f79a 100644 --- a/packages/service/support/permission/auth/file.ts +++ b/packages/service/support/permission/auth/file.ts @@ -10,6 +10,8 @@ import { addMinutes } from 'date-fns'; import { parseHeaderCert } from './common'; import jwt from 'jsonwebtoken'; import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode'; +import { S3Sources } from '../../../common/s3/type'; +import { getS3DatasetSource } from '../../../common/s3/sources/dataset'; export const authCollectionFile = async ({ fileId, @@ -17,28 +19,24 @@ export const authCollectionFile = async ({ ...props }: AuthModeType & { fileId: string; -}): Promise< - AuthResponseType & { - file: DatasetFileSchema; - } -> => { +}): Promise => { const authRes = await parseHeaderCert(props); const { teamId, tmbId } = authRes; - const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId }); - - if (!file) { - return Promise.reject(CommonErrEnum.fileNotFound); - } - - if (file.metadata?.teamId !== teamId) { - return Promise.reject(CommonErrEnum.unAuthFile); + if (fileId.startsWith(S3Sources.dataset)) { + const stat = await getS3DatasetSource().getDatasetFileStat(fileId); + if (!stat) return Promise.reject(CommonErrEnum.fileNotFound); + } else { + const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId }); + if (!file) { + return Promise.reject(CommonErrEnum.fileNotFound); + } + if (file.metadata?.teamId !== teamId) { + return Promise.reject(CommonErrEnum.unAuthFile); + } } - const permission = new Permission({ - role: ReadRoleVal, - isOwner: file.metadata?.uid === tmbId || file.metadata?.tmbId === tmbId - }); + const permission = new Permission({ role: ReadRoleVal, isOwner: true }); if (!permission.checkPer(per)) { return Promise.reject(CommonErrEnum.unAuthFile); @@ -46,8 +44,7 @@ export const authCollectionFile = async ({ return { ...authRes, - permission, - file + permission }; }; diff --git a/packages/service/support/permission/dataset/auth.ts b/packages/service/support/permission/dataset/auth.ts index faa53dd13683..1deff53a5b1a 100644 --- a/packages/service/support/permission/dataset/auth.ts +++ b/packages/service/support/permission/dataset/auth.ts @@ -19,11 +19,11 @@ import { MongoDatasetData } from '../../../core/dataset/data/schema'; import { type AuthModeType, type AuthResponseType } from '../type'; import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { type ParentIdType } from '@fastgpt/global/common/parentFolder/type'; -import { DataSetDefaultRoleVal } from '@fastgpt/global/support/permission/dataset/constant'; import { getDatasetImagePreviewUrl } from '../../../core/dataset/image/utils'; import { i18nT } from '../../../../web/i18n/utils'; import { parseHeaderCert } from '../auth/common'; import { sumPer } from '@fastgpt/global/support/permission/utils'; +import 
{ getS3DatasetSource } from '../../../common/s3/sources/dataset'; export const authDatasetByTmbId = async ({ tmbId, @@ -242,6 +242,7 @@ export async function authDatasetData({ collectionId: datasetData.collectionId }); + const s3DatasetSource = getS3DatasetSource(); const data: DatasetDataItemType = { id: String(datasetData._id), teamId: datasetData.teamId, @@ -249,13 +250,19 @@ export async function authDatasetData({ q: datasetData.q, a: datasetData.a, imageId: datasetData.imageId, + imageKeys: datasetData.imageKeys, imagePreivewUrl: datasetData.imageId - ? getDatasetImagePreviewUrl({ - imageId: datasetData.imageId, - teamId: datasetData.teamId, - datasetId: datasetData.datasetId, - expiredMinutes: 30 - }) + ? s3DatasetSource.isDatasetObjectKey(datasetData.imageId) + ? await s3DatasetSource.createGetDatasetFileURL({ + key: datasetData.imageId, + expiredHours: 24 + }) + : getDatasetImagePreviewUrl({ + imageId: datasetData.imageId, + teamId: datasetData.teamId, + datasetId: datasetData.datasetId, + expiredMinutes: 30 + }) : undefined, chunkIndex: datasetData.chunkIndex, indexes: datasetData.indexes, diff --git a/packages/web/i18n/en/app.json b/packages/web/i18n/en/app.json index 81aab5a5388b..aa51a6086523 100644 --- a/packages/web/i18n/en/app.json +++ b/packages/web/i18n/en/app.json @@ -153,7 +153,7 @@ "file_recover": "File will overwrite current content", "file_types": "Optional file types", "file_upload": "File Upload", - "file_upload_tip": "Once enabled, documents/images can be uploaded. Documents are retained for 7 days, images for 15 days. Using this feature may incur additional costs. To ensure a good experience, please choose an AI model with a larger context length when using this feature.", + "file_upload_tip": "Once enabled, you can configure the types of files that users can upload. Files are saved along with the conversation; deleting the conversation or deleting the application will clear the files. 
To ensure a good user experience, please select an AI model with a longer context length when using it.",
   "find_more_tools": "Explore more",
   "go_to_chat": "Go to Conversation",
   "go_to_run": "Go to Execution",
diff --git a/packages/web/i18n/en/chat.json b/packages/web/i18n/en/chat.json
index 357f99659bcb..4cbc5b08a3b0 100644
--- a/packages/web/i18n/en/chat.json
+++ b/packages/web/i18n/en/chat.json
@@ -75,6 +75,7 @@
   "query_extension_result": "Problem optimization results",
   "question_tip": "From top to bottom, the response order of each module",
   "read_raw_source": "Open the original text",
+  "images_collection_not_supported": "Image collections do not support opening the original file",
   "reasoning_text": "Thinking process",
   "release_cancel": "Release Cancel",
   "release_send": "Release send, slide up to cancel",
diff --git a/packages/web/i18n/zh-CN/app.json b/packages/web/i18n/zh-CN/app.json
index bc6a354fcfcf..abaecff82ee6 100644
--- a/packages/web/i18n/zh-CN/app.json
+++ b/packages/web/i18n/zh-CN/app.json
@@ -157,7 +157,7 @@
   "file_recover": "文件将覆盖当前内容",
   "file_types": "可选文件类型",
   "file_upload": "文件上传",
-  "file_upload_tip": "开启后,可以上传文档/图片。文档保留7天,图片保留15天。使用该功能可能产生较多额外费用。为保证使用体验,使用该功能时,请选择上下文长度较大的AI模型。",
+  "file_upload_tip": "开启后,可以配置用户可上传的文件类型。文件跟随对话保存,删除对话或删除应用均会清理文件。为保证使用体验,使用时请选择上下文长度较大的AI模型。",
   "find_more_tools": "探索更多",
   "go_to_chat": "去对话",
   "go_to_run": "去运行",
diff --git a/packages/web/i18n/zh-CN/chat.json b/packages/web/i18n/zh-CN/chat.json
index 3117883f7da9..23ed352f7c60 100644
--- a/packages/web/i18n/zh-CN/chat.json
+++ b/packages/web/i18n/zh-CN/chat.json
@@ -75,6 +75,7 @@
   "query_extension_result": "问题优化结果",
   "question_tip": "从上到下,为各个模块的响应顺序",
   "read_raw_source": "打开原文",
+  "images_collection_not_supported": "图片数据集不支持打开原文",
   "reasoning_text": "思考过程",
   "release_cancel": "松开取消",
   "release_send": "松开发送,上滑取消",
diff --git a/packages/web/i18n/zh-Hant/app.json b/packages/web/i18n/zh-Hant/app.json
index b2fc705c5909..8b0fa393a98f 100644
--- a/packages/web/i18n/zh-Hant/app.json
+++ b/packages/web/i18n/zh-Hant/app.json
@@ -152,7 +152,7 @@
   "file_recover": "檔案將會覆蓋目前內容",
   "file_types": "可選文件類型",
   "file_upload": "檔案上傳",
-  "file_upload_tip": "開啟後,可以上傳文件/圖片。文件保留 7 天,圖片保留 15 天。使用這個功能可能產生較多額外費用。為了確保使用體驗,使用這個功能時,請選擇上下文長度較大的 AI 模型。",
+  "file_upload_tip": "開啟後,可以設定使用者可上傳的檔案類型。檔案跟隨對話儲存,刪除對話或刪除應用程式均會清理檔案。為保證使用體驗,使用時請選擇上下文長度較大的AI模型。",
   "find_more_tools": "探索更多",
   "go_to_chat": "前往對話",
   "go_to_run": "前往執行",
diff --git a/packages/web/i18n/zh-Hant/chat.json b/packages/web/i18n/zh-Hant/chat.json
index 950ce7bee80b..5abb70e67565 100644
--- a/packages/web/i18n/zh-Hant/chat.json
+++ b/packages/web/i18n/zh-Hant/chat.json
@@ -75,6 +75,7 @@
   "query_extension_result": "問題優化結果",
   "question_tip": "由上至下,各個模組的回應順序",
   "read_raw_source": "開啟原文",
+  "images_collection_not_supported": "圖片資料集不支持開啟原文",
   "reasoning_text": "思考過程",
   "release_cancel": "鬆開取消",
   "release_send": "鬆開傳送,上滑取消",
diff --git a/projects/app/src/components/Markdown/img/Image.tsx b/projects/app/src/components/Markdown/img/Image.tsx
index e4939cf732fc..a42e234af740 100644
--- a/projects/app/src/components/Markdown/img/Image.tsx
+++ b/projects/app/src/components/Markdown/img/Image.tsx
@@ -1,14 +1,57 @@
-import React, { useState } from 'react';
+import React, { useState, useEffect } from 'react';
 import { Box, type ImageProps, Skeleton } from '@chakra-ui/react';
 import MyPhotoView from '@fastgpt/web/components/common/Image/PhotoView';
 import { useBoolean } from 'ahooks';
 import { useTranslation } from 'next-i18next';
+import { getPresignedDatasetFileGetUrl } from '@/web/core/dataset/api';
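/*
 * Usage sketch for the helper imported above (shape assumed from its use in
 * this file): MdImage treats a `dataset/...` src as an S3 object key and swaps
 * it for a short-lived presigned URL before rendering, e.g.
 *
 *   const url = await getPresignedDatasetFileGetUrl({ key: 'dataset/<datasetId>/img.png' });
 *   // `url` is a time-limited HTTPS link that the <img> element can load directly
 */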
+import { getPresignedChatFileGetUrl } from '@/web/common/file/api'; +import type { AProps } from '../A'; -const MdImage = ({ src, ...props }: { src?: string } & ImageProps) => { +const MdImage = ({ + src, + ...props +}: { src?: string } & ImageProps & { chatAuthData?: AProps['chatAuthData'] }) => { const { t } = useTranslation(); const [isLoaded, { setTrue }] = useBoolean(false); - const [renderSrc, setRenderSrc] = useState(src); + const [isLoading, setIsLoading] = useState(false); + + // TODO: 在我迁移完到 JWT 后移除这个 useEffect + useEffect(() => { + if (!src || (!src.startsWith('dataset/') && !src.startsWith('chat/'))) { + setRenderSrc(src); + return; + } + + const loadS3Image = async () => { + try { + setIsLoading(true); + if (src.startsWith('dataset/')) { + const url = await getPresignedDatasetFileGetUrl({ key: src }); + setRenderSrc(url); + } else if (src.startsWith('chat/')) { + const url = await getPresignedChatFileGetUrl({ + key: src, + appId: props.chatAuthData?.appId || '', + outLinkAuthData: { + shareId: props.chatAuthData?.shareId, + outLinkUid: props.chatAuthData?.outLinkUid, + teamId: props.chatAuthData?.teamId, + teamToken: props.chatAuthData?.teamToken + } + }); + setRenderSrc(url); + } + } catch (error) { + console.error('Failed to sign S3 image:', error); + setRenderSrc('/imgs/errImg.png'); + } finally { + setIsLoading(false); + } + }; + + loadS3Image(); + }, [src, props.chatAuthData]); if (src?.includes('base64') && !src.startsWith('data:image')) { return Invalid base64 image; @@ -19,7 +62,7 @@ const MdImage = ({ src, ...props }: { src?: string } & ImageProps) => { } return ( - + { const components = useCreation(() => { return { - img: Image, + img: (props: any) => {props.alt}, pre: RewritePre, code: Code, a: (props: any) => ( @@ -145,8 +145,8 @@ function Code(e: any) { return Component; } -function Image({ src }: { src?: string }) { - return ; +function Image({ src, chatAuthData }: { src?: string; chatAuthData?: AProps['chatAuthData'] }) { + return ; } function RewritePre({ children }: any) { diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx index 18c60f0c0e8e..3358845a351e 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx @@ -15,6 +15,9 @@ import { getErrText } from '@fastgpt/global/common/error/utils'; import { formatFileSize } from '@fastgpt/global/common/file/tools'; import { getFileIcon } from '@fastgpt/global/common/file/icon'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import { getUploadDatasetFilePresignedUrl } from '@/web/common/file/api'; +import { POST } from '@/web/common/api/request'; +import { parseS3UploadError } from '@fastgpt/global/common/error/s3'; const DataProcess = dynamic(() => import('../commonProgress/DataProcess')); const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); @@ -67,39 +70,51 @@ const SelectFile = React.memo(function SelectFile() { await Promise.all( files.map(async ({ fileId, file }) => { try { - const { fileId: uploadFileId } = await uploadFile2DB({ - file, - bucketName: BucketNameEnum.dataset, - data: { - datasetId + const { url, fields, maxSize } = await getUploadDatasetFilePresignedUrl({ + filename: file.name, + datasetId + }); + + // Upload File to S3 + const formData = new FormData(); + Object.entries(fields).forEach(([k, v]) 
=> formData.set(k, v)); + formData.set('file', file); + await POST(url, formData, { + headers: { + 'Content-Type': 'multipart/form-data; charset=utf-8' }, - percentListen: (e) => { + onUploadProgress: (e) => { + if (!e.total) return; + const percent = Math.round((e.loaded / e.total) * 100); setSelectFiles((state) => state.map((item) => item.id === fileId ? { ...item, uploadedFileRate: item.uploadedFileRate - ? Math.max(e, item.uploadedFileRate) - : e + ? Math.max(percent, item.uploadedFileRate) + : percent } : item ) ); } - }); - setSelectFiles((state) => - state.map((item) => - item.id === fileId - ? { - ...item, - dbFileId: uploadFileId, - isUploading: false, - uploadedFileRate: 100 - } - : item - ) - ); + }) + .then(() => { + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? { + ...item, + dbFileId: fields.key, + isUploading: false, + uploadedFileRate: 100 + } + : item + ) + ); + }) + .catch((error) => Promise.reject(parseS3UploadError({ t, error, maxSize }))); } catch (error) { setSelectFiles((state) => state.map((item) => diff --git a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx index 9aa951eb2c7b..c94afa8410e0 100644 --- a/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx +++ b/projects/app/src/pageComponents/dataset/detail/MetaDataCard.tsx @@ -56,13 +56,15 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => { }, { label: t('dataset:collection_name'), - value: collection.file?.filename || collection?.rawLink || collection?.name + value: decodeURIComponent( + collection.file?.filename || collection?.rawLink || collection?.name + ) }, ...(collection.file ? [ { label: t('common:core.dataset.collection.metadata.source size'), - value: formatFileSize(collection.file.length) + value: formatFileSize(collection.file.contentLength || 0) } ] : []), diff --git a/projects/app/src/pages/api/core/app/copy.ts b/projects/app/src/pages/api/core/app/copy.ts index 01cc744e28cc..978925b49229 100644 --- a/projects/app/src/pages/api/core/app/copy.ts +++ b/projects/app/src/pages/api/core/app/copy.ts @@ -40,7 +40,7 @@ async function handler( const avatar = await copyAvatarImage({ teamId, imageUrl: app.avatar, - ttl: true, + temporary: true, session }); diff --git a/projects/app/src/pages/api/core/app/create.ts b/projects/app/src/pages/api/core/app/create.ts index 66aa5bd9b106..43a6b68fc1c8 100644 --- a/projects/app/src/pages/api/core/app/create.ts +++ b/projects/app/src/pages/api/core/app/create.ts @@ -29,6 +29,9 @@ import { MongoResourcePermission } from '@fastgpt/service/support/permission/sch import { getMyModels } from '@fastgpt/service/support/permission/model/controller'; import { removeUnauthModels } from '@fastgpt/global/core/workflow/utils'; import { getS3AvatarSource } from '@fastgpt/service/common/s3/sources/avatar'; +import { MongoAppTemplate } from '@fastgpt/service/core/app/templates/templateSchema'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import path from 'node:path'; export type CreateAppBody = { parentId?: ParentIdType; @@ -157,11 +160,35 @@ export const onCreateApp = async ({ } const create = async (session: ClientSession) => { + const _avatar = await (async () => { + if (!templateId) return avatar; + + const template = await MongoAppTemplate.findOne({ templateId }, 'avatar').lean(); + if (!template?.avatar) return avatar; + + const s3AvatarSource = getS3AvatarSource(); + if (!s3AvatarSource.isAvatarKey(template.avatar)) return 
template.avatar; + + const filename = (() => { + const last = template.avatar.split('/').pop()?.split('-')[1]; + if (!last) return getNanoid(6).concat(path.extname(template.avatar)); + return `${getNanoid(6)}-${last}`; + })(); + + const copiedAvatar = await s3AvatarSource.copyAvatar({ + key: template.avatar, + teamId, + filename, + temporary: true + }); + return copiedAvatar; + })(); + const [app] = await MongoApp.create( [ { ...parseParentIdInMongo(parentId), - avatar, + avatar: _avatar, name, intro, teamId, @@ -207,7 +234,7 @@ export const onCreateApp = async ({ resourceType: PerResourceTypeEnum.app }); - await getS3AvatarSource().refreshAvatar(avatar, undefined, session); + await getS3AvatarSource().refreshAvatar(_avatar, undefined, session); (async () => { addAuditLog({ diff --git a/projects/app/src/pages/api/core/dataset/collection/create/backup.ts b/projects/app/src/pages/api/core/dataset/collection/create/backup.ts index 3e67104f6d13..467bfe916f4b 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/backup.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/backup.ts @@ -3,7 +3,10 @@ import { NextAPI } from '@/service/middleware/entry'; import { getUploadModel } from '@fastgpt/service/common/file/multer'; import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { addLog } from '@fastgpt/service/common/system/log'; -import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils'; +import { + parsedFileContentS3Key, + readRawTextByLocalFile +} from '@fastgpt/service/common/file/read/utils'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; @@ -48,7 +51,12 @@ async function handler(req: ApiRequestProps, res: ApiRe tmbId, path: file.path, encoding: file.encoding, - getFormatText: false + getFormatText: false, + uploadKey: parsedFileContentS3Key.dataset({ + datasetId: dataset._id, + mimetype: file.mimetype, + filename: file.originalname + }).key }); if (!rawText.trim().startsWith('q,a,indexes')) { return Promise.reject(i18nT('dataset:backup_template_invalid')); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts b/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts index ce9d20924789..707e9a8602a3 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/fileId.ts @@ -8,8 +8,9 @@ import { NextAPI } from '@/service/middleware/entry'; import { type ApiRequestProps } from '@fastgpt/service/type/next'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { type CreateCollectionResponse } from '@/global/core/dataset/api'; -import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; +import { S3Sources } from '@fastgpt/service/common/s3/type'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; async function handler( req: ApiRequestProps @@ -24,17 +25,23 @@ async function handler( datasetId: body.datasetId }); - // 1. 
read file - const file = await getFileById({ - bucketName: BucketNameEnum.dataset, - fileId - }); + const filename = await (async () => { + if (fileId.startsWith(S3Sources.dataset)) { + const metadata = await getS3DatasetSource().getFileMetadata(fileId); + if (!metadata) return Promise.reject(CommonErrEnum.fileNotFound); + return metadata.filename; + } - if (!file) { - return Promise.reject(CommonErrEnum.fileNotFound); - } + const file = await getFileById({ + bucketName: BucketNameEnum.dataset, + fileId + }); + if (!file) { + return Promise.reject(CommonErrEnum.fileNotFound); + } - const filename = file.filename; + return file.filename; + })(); const { collectionId, insertResults } = await createCollectionAndInsertData({ dataset, @@ -44,17 +51,11 @@ async function handler( tmbId, type: DatasetCollectionTypeEnum.file, name: filename, - fileId, - metadata: { - relatedImgId: fileId - }, + fileId, // ObjectId -> ObjectKey customPdfParse } }); - // remove buffer - await deleteRawTextBuffer(fileId); - return { collectionId, results: insertResults diff --git a/projects/app/src/pages/api/core/dataset/collection/create/images.ts b/projects/app/src/pages/api/core/dataset/collection/create/images.ts index 32fd193dfa87..122071987586 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/images.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/images.ts @@ -16,6 +16,11 @@ import { i18nT } from '@fastgpt/web/i18n/utils'; import { authFrequencyLimit } from '@/service/common/frequencyLimit/api'; import { addSeconds } from 'date-fns'; import { createDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; +import { S3Sources } from '@fastgpt/service/common/s3/type'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import fsp from 'node:fs/promises'; +import path from 'node:path'; const authUploadLimit = (tmbId: string, num: number) => { if (!global.feConfigs.uploadFileMaxAmount) return; @@ -56,16 +61,17 @@ async function handler( return Promise.reject(i18nT('file:Image_dataset_requires_VLM_model_to_be_configured')); } - // 1. Save image to db + // 1. 
Save image to S3 const imageIds = await Promise.all( files.map(async (file) => { - return ( - await createDatasetImage({ - teamId, - datasetId, - file - }) - ).imageId; + const filename = path.basename(file.filename); + const uploadKey = [S3Sources.dataset, datasetId, `${getNanoid(6)}-${filename}`].join('/'); + return getS3DatasetSource().uploadDatasetImage({ + uploadKey, + mimetype: file.mimetype, + filename, + base64Img: (await fsp.readFile(file.path)).toString('base64') + }); }) ); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/template.ts b/projects/app/src/pages/api/core/dataset/collection/create/template.ts index 52dedb5eb940..0ad876379928 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/template.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/template.ts @@ -3,7 +3,10 @@ import { NextAPI } from '@/service/middleware/entry'; import { getUploadModel } from '@fastgpt/service/common/file/multer'; import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { addLog } from '@fastgpt/service/common/system/log'; -import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils'; +import { + parsedFileContentS3Key, + readRawTextByLocalFile +} from '@fastgpt/service/common/file/read/utils'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; @@ -51,7 +54,12 @@ async function handler( tmbId, path: file.path, encoding: file.encoding, - getFormatText: false + getFormatText: false, + uploadKey: parsedFileContentS3Key.dataset({ + datasetId: dataset._id, + mimetype: file.mimetype, + filename: file.originalname + }).key }); if (!rawText.trim().startsWith('q,a,indexes')) { return Promise.reject(i18nT('dataset:template_file_invalid')); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/text.ts b/projects/app/src/pages/api/core/dataset/collection/create/text.ts index 66cb32513937..6100be5b79d7 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/text.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/text.ts @@ -6,7 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant import { NextAPI } from '@/service/middleware/entry'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { type CreateCollectionResponse } from '@/global/core/dataset/api'; -import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; async function handler(req: NextApiRequest): CreateCollectionResponse { const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams; @@ -21,14 +21,11 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { // 1. 
Create file from text const filename = `${name}.txt`; - const { fileId } = await createFileFromText({ - bucket: 'dataset', - filename, - text, - metadata: { - teamId, - uid: tmbId - } + const s3DatasetSource = getS3DatasetSource(); + const key = await s3DatasetSource.uploadDatasetFileByBuffer({ + datasetId: String(dataset._id), + buffer: Buffer.from(text), + filename }); const { collectionId, insertResults } = await createCollectionAndInsertData({ @@ -38,11 +35,13 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { teamId, tmbId, type: DatasetCollectionTypeEnum.file, - fileId, + fileId: key, name: filename } }); + await s3DatasetSource.removeDatasetFileTTL(key); + return { collectionId, results: insertResults diff --git a/projects/app/src/pages/api/core/dataset/collection/detail.ts b/projects/app/src/pages/api/core/dataset/collection/detail.ts index 4982b796d246..ba3492b460d5 100644 --- a/projects/app/src/pages/api/core/dataset/collection/detail.ts +++ b/projects/app/src/pages/api/core/dataset/collection/detail.ts @@ -1,4 +1,4 @@ -/* +/* Get one dataset collection detail */ import type { NextApiRequest } from 'next'; @@ -14,6 +14,8 @@ import { collectionTagsToTagLabel } from '@fastgpt/service/core/dataset/collecti import { getVectorCountByCollectionId } from '@fastgpt/service/common/vectorDB/controller'; import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; +import { S3Sources } from '@fastgpt/service/common/s3/type'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; async function handler(req: NextApiRequest): Promise { const { id } = req.query as { id: string }; @@ -31,10 +33,15 @@ async function handler(req: NextApiRequest): Promise per: ReadPermissionVal }); - // get file + const fileId = collection?.fileId; const [file, indexAmount, errorCount] = await Promise.all([ - collection?.fileId - ? await getFileById({ bucketName: BucketNameEnum.dataset, fileId: collection.fileId }) + fileId + ? fileId.startsWith(S3Sources.dataset) + ? 
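/*
 * Editor's note on the TTL flow in text.ts above (lifecycle inferred from this
 * patch, not stated by it): uploadDatasetFileByBuffer appears to register a TTL
 * for the new object, so uploads that never become collections are
 * garbage-collected; removeDatasetFileTTL(key) runs only after the collection
 * is created, making the object permanent:
 *
 *   const key = await s3DatasetSource.uploadDatasetFileByBuffer({ datasetId, buffer, filename });
 *   const { collectionId } = await createCollectionAndInsertData({ ... fileId: key ... });
 *   await s3DatasetSource.removeDatasetFileTTL(key); // persist only on success
 */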
getS3DatasetSource().getFileMetadata(fileId) + : (async () => { + const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId }); + return { filename: file?.filename, contentLength: file?.length }; + })() : undefined, getVectorCountByCollectionId(collection.teamId, collection.datasetId, collection._id), MongoDatasetTraining.countDocuments( diff --git a/projects/app/src/pages/api/core/dataset/data/delete.ts b/projects/app/src/pages/api/core/dataset/data/delete.ts index cd9b5ee47617..bf664a74a421 100644 --- a/projects/app/src/pages/api/core/dataset/data/delete.ts +++ b/projects/app/src/pages/api/core/dataset/data/delete.ts @@ -7,6 +7,7 @@ import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { addAuditLog } from '@fastgpt/service/support/user/audit/util'; import { AuditEventEnum } from '@fastgpt/global/support/user/audit/constants'; import { getI18nDatasetType } from '@fastgpt/service/support/user/audit/util'; + async function handler(req: NextApiRequest) { const { id: dataId } = req.query as { id: string; @@ -26,6 +27,7 @@ async function handler(req: NextApiRequest) { }); await deleteDatasetData(datasetData); + (async () => { addAuditLog({ tmbId, diff --git a/projects/app/src/pages/api/core/dataset/data/insertData.ts b/projects/app/src/pages/api/core/dataset/data/insertData.ts index 5a2d7015780f..945d822ea44d 100644 --- a/projects/app/src/pages/api/core/dataset/data/insertData.ts +++ b/projects/app/src/pages/api/core/dataset/data/insertData.ts @@ -1,4 +1,4 @@ -/* +/* insert one data to dataset (immediately insert) manual input or mark data */ @@ -60,9 +60,7 @@ async function handler(req: NextApiRequest) { text: simpleText(item.text) })); - const token = await countPromptTokens(formatQ + formatA, ''); const vectorModelData = getEmbeddingModel(vectorModel); - const llmModelData = getLLMModel(agentModel); await hasSameValue({ teamId, diff --git a/projects/app/src/pages/api/core/dataset/data/insertImages.ts b/projects/app/src/pages/api/core/dataset/data/insertImages.ts index 164863d57054..61c94bd0468c 100644 --- a/projects/app/src/pages/api/core/dataset/data/insertImages.ts +++ b/projects/app/src/pages/api/core/dataset/data/insertImages.ts @@ -14,6 +14,10 @@ import { getEmbeddingModel, getLLMModel, getVlmModel } from '@fastgpt/service/co import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { removeDatasetImageExpiredTime } from '@fastgpt/service/core/dataset/image/utils'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; +import path from 'node:path'; +import fsp from 'node:fs/promises'; +import { parsedFileContentS3Key } from '@fastgpt/service/common/file/read/utils'; export type insertImagesQuery = {}; @@ -60,17 +64,20 @@ async function handler( await authUploadLimit(tmbId, files.length); - // 1. Upload images to db + // 1. Upload images to S3 const imageIds = await Promise.all( - files.map(async (file) => { - return ( - await createDatasetImage({ - teamId, + files.map(async (file) => + getS3DatasetSource().uploadDatasetImage({ + uploadKey: parsedFileContentS3Key.dataset({ datasetId: dataset._id, - file - }) - ).imageId; - }) + mimetype: file.mimetype, + filename: path.basename(file.filename) + }).key, + mimetype: file.mimetype, + filename: path.basename(file.filename), + base64Img: (await fsp.readFile(file.path)).toString('base64') + }) + ) ); // 2. 
Insert images to training queue @@ -106,11 +113,7 @@ async function handler( }); // 3. Clear ttl - await removeDatasetImageExpiredTime({ - ids: imageIds, - collectionId, - session - }); + await getS3DatasetSource().removeDatasetImagesTTL(imageIds, session); }); return {}; diff --git a/projects/app/src/pages/api/core/dataset/data/v2/list.ts b/projects/app/src/pages/api/core/dataset/data/v2/list.ts index bc2578b44cf8..dc7262b1d723 100644 --- a/projects/app/src/pages/api/core/dataset/data/v2/list.ts +++ b/projects/app/src/pages/api/core/dataset/data/v2/list.ts @@ -10,6 +10,7 @@ import { parsePaginationRequest } from '@fastgpt/service/common/api/pagination'; import { MongoDatasetImageSchema } from '@fastgpt/service/core/dataset/image/schema'; import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; import { getDatasetImagePreviewUrl } from '@fastgpt/service/core/dataset/image/utils'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; export type GetDatasetDataListProps = PaginationProps & { searchText?: string; @@ -56,10 +57,11 @@ async function handler( const imageIds = list.map((item) => item.imageId!).filter(Boolean); const imageSizeMap = new Map(); + const s3DatasetSource = getS3DatasetSource(); if (imageIds.length > 0) { const imageInfos = await MongoDatasetImageSchema.find( - { _id: { $in: imageIds } }, + { _id: { $in: imageIds.filter((id) => !s3DatasetSource.isDatasetObjectKey(id)) } }, '_id length', { ...readFromSecondary @@ -69,26 +71,38 @@ async function handler( imageInfos.forEach((item) => { imageSizeMap.set(String(item._id), item.length); }); + + const s3ImageIds = imageIds.filter((id) => s3DatasetSource.isDatasetObjectKey(id)); + for (const id of s3ImageIds) { + imageSizeMap.set(id, (await s3DatasetSource.getFileMetadata(id)).contentLength); + } } return { - list: list.map((item) => { - const imageSize = item.imageId ? imageSizeMap.get(String(item.imageId)) : undefined; - const imagePreviewUrl = item.imageId - ? getDatasetImagePreviewUrl({ - imageId: item.imageId, - teamId, - datasetId: collection.datasetId, - expiredMinutes: 30 - }) - : undefined; + list: await Promise.all( + list.map(async (item) => { + const imageSize = item.imageId ? imageSizeMap.get(String(item.imageId)) : undefined; + const imagePreviewUrl = item.imageId + ? s3DatasetSource.isDatasetObjectKey(item.imageId) + ? 
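/*
 * Editor's sketch (not in this patch): the for-loop above awaits one
 * getFileMetadata call per S3 image id; for large pages the lookups could be
 * issued concurrently instead:
 *
 *   const metas = await Promise.all(
 *     s3ImageIds.map((id) => s3DatasetSource.getFileMetadata(id))
 *   );
 *   metas.forEach((meta, i) => imageSizeMap.set(s3ImageIds[i], meta.contentLength));
 */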
await getS3DatasetSource().createGetDatasetFileURL({
+                  key: item.imageId,
+                  expiredHours: 24
+                })
+              : getDatasetImagePreviewUrl({
+                  imageId: item.imageId,
+                  teamId,
+                  datasetId: collection.datasetId,
+                  expiredMinutes: 30
+                })
+            : undefined;
 
-      return {
-        ...item,
-        imageSize,
-        imagePreviewUrl
-      };
-    }),
+          return {
+            ...item,
+            imageSize,
+            imagePreviewUrl
+          };
+        })
+      ),
     total
   };
 }
diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
index b6a8c1dfbc31..c50d9cc53190 100644
--- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
+++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
@@ -95,7 +95,8 @@ async function handler(
     selector,
     externalFileId,
     customPdfParse,
-    apiDatasetServer: dataset.apiDatasetServer
+    apiDatasetServer: dataset.apiDatasetServer,
+    datasetId
   });
 
   const chunks = await rawText2Chunks({
diff --git a/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts
new file mode 100644
index 000000000000..84b9385ffc17
--- /dev/null
+++ b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts
@@ -0,0 +1,90 @@
+import { NextAPI } from '@/service/middleware/entry';
+import { type ApiRequestProps } from '@fastgpt/service/type/next';
+import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
+import {
+  PresignDatasetFileGetUrlSchema,
+  type PresignDatasetFileGetUrlParams
+} from '@fastgpt/global/core/dataset/v2/api';
+import {
+  authDataset,
+  authDatasetCollection
+} from '@fastgpt/service/support/permission/dataset/auth';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
+import { createFileToken } from '@fastgpt/service/support/permission/auth/file';
+import { BucketNameEnum, ReadFileBaseUrl } from '@fastgpt/global/common/file/constants';
+import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { UserError } from '@fastgpt/global/common/error/utils';
+
+async function handler(req: ApiRequestProps<PresignDatasetFileGetUrlParams>) {
+  const parsed = PresignDatasetFileGetUrlSchema.parse(req.body);
+  const s3DatasetSource = getS3DatasetSource();
+
+  // Fetch an image that was parsed out of a document
+  if ('key' in parsed) {
+    const { key } = parsed;
+
+    const dataset = await s3DatasetSource.getFileDatasetInfo(key);
+    if (!dataset) {
+      // No record found in `dataset_datas`, so this should be an image preview request;
+      // verifying permission on the datasetId is enough
+      const datasetId = key.split('/')[1] || '';
+      await authDataset({
+        datasetId,
+        per: ReadPermissionVal,
+        req,
+        authToken: true,
+        authApiKey: true
+      });
+    } else {
+      await authDatasetCollection({
+        req,
+        authToken: true,
+        authApiKey: true,
+        per: ReadPermissionVal,
+        collectionId: dataset.collectionId
+      });
+    }
+
+    return await s3DatasetSource.createGetDatasetFileURL({ key, expiredHours: 24 });
+  }
+
+  // Other files
+  const { collectionId } = parsed;
+  const {
+    collection,
+    teamId: userTeamId,
+    tmbId: uid,
+    authType
+  } = await authDatasetCollection({
+    req,
+    collectionId,
+    authToken: true,
+    authApiKey: true,
+    per: ReadPermissionVal
+  });
+
+  if (collection.type === DatasetCollectionTypeEnum.images) {
+    return Promise.reject(new UserError('chat:images_collection_not_supported'));
+  }
+
+  const key = collection.fileId;
+  if (!key) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
+  }
+
+  if (s3DatasetSource.isDatasetObjectKey(key)) {
+    return await s3DatasetSource.createGetDatasetFileURL({ key, expiredHours: 24 });
+  } else {
+    const token = await createFileToken({
+      uid,
+      fileId: key,
+      teamId: userTeamId,
+      bucketName: BucketNameEnum.dataset,
+      customExpireMinutes: authType === 'outLink' ? 5 : undefined
+    });
+
+    return `${ReadFileBaseUrl}/${collection.name}?token=${token}`;
+  }
+}
+
+export default NextAPI(handler);
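For context, the intended client-side round trip over the two presign endpoints looks roughly like the sketch below. The `url`/`formData` fields on the POST presign result are assumptions for illustration (the real shape is `CreatePostPresignedUrlResult` in `packages/service/common/s3/type.ts`); the two wrapper functions are the ones added to the web layer later in this patch.

```typescript
// A minimal sketch, not the actual client code.
import { getUploadDatasetFilePresignedUrl } from '@/web/common/file/api';
import { getPresignedDatasetFileGetUrl } from '@/web/core/dataset/api';

async function uploadAndPreview(file: File, datasetId: string, collectionId: string) {
  // 1. Ask the server for a presigned S3 POST policy (assumed { url, formData } shape).
  const presigned = await getUploadDatasetFilePresignedUrl({ filename: file.name, datasetId });

  // 2. Upload straight to S3 so the file bytes never stream through the Next.js server.
  const form = new FormData();
  Object.entries(presigned.formData ?? {}).forEach(([k, v]) => form.append(k, String(v)));
  form.append('file', file);
  await fetch(presigned.url, { method: 'POST', body: form });

  // 3. Later, resolve a short-lived GET URL (24h server-side expiry) for the source file.
  return await getPresignedDatasetFileGetUrl({ collectionId });
}
```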
diff --git a/projects/app/src/pages/api/core/dataset/presignDatasetFilePostUrl.ts b/projects/app/src/pages/api/core/dataset/presignDatasetFilePostUrl.ts
new file mode 100644
index 000000000000..e53ecfe4feae
--- /dev/null
+++ b/projects/app/src/pages/api/core/dataset/presignDatasetFilePostUrl.ts
@@ -0,0 +1,41 @@
+import type { ApiRequestProps } from '@fastgpt/service/type/next';
+import { NextAPI } from '@/service/middleware/entry';
+import { type CreatePostPresignedUrlResult } from '@fastgpt/service/common/s3/type';
+import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
+import { authFrequencyLimit } from '@/service/common/frequencyLimit/api';
+import { addSeconds } from 'date-fns';
+import type { PresignDatasetFilePostUrlParams } from '@fastgpt/global/core/dataset/v2/api';
+import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
+import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
+
+const authUploadLimit = (tmbId: string) => {
+  if (!global.feConfigs.uploadFileMaxAmount) return;
+  return authFrequencyLimit({
+    eventId: `${tmbId}-uploadfile`,
+    maxAmount: global.feConfigs.uploadFileMaxAmount * 2,
+    expiredTime: addSeconds(new Date(), 30) // 30s
+  });
+};
+
+async function handler(
+  req: ApiRequestProps<PresignDatasetFilePostUrlParams>
+): Promise<CreatePostPresignedUrlResult> {
+  const { filename, datasetId } = req.body;
+
+  const { userId } = await authDataset({
+    datasetId,
+    per: WritePermissionVal,
+    req,
+    authToken: true,
+    authApiKey: true
+  });
+
+  await authUploadLimit(userId);
+
+  return await getS3DatasetSource().createUploadDatasetFileURL({
+    datasetId,
+    filename
+  });
+}
+
+export default NextAPI(handler);
diff --git a/projects/app/src/pages/api/file/temp.ts b/projects/app/src/pages/api/file/temp.ts
new file mode 100644
index 000000000000..bb5b393e526b
--- /dev/null
+++ b/projects/app/src/pages/api/file/temp.ts
@@ -0,0 +1,45 @@
+import { getGlobalRedisConnection } from '@fastgpt/service/common/redis';
+import { type ApiRequestProps } from '@fastgpt/service/type/next';
+import {
+  ShortPreviewLinkSchema,
+  type ShortPreviewLinkParams
+} from '@fastgpt/global/core/dataset/v2/api';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { NextAPI } from '@/service/middleware/entry';
+import { getS3ChatSource } from '@fastgpt/service/common/s3/sources/chat';
+import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
+import type { NextApiResponse } from 'next';
+
+// Short Preview Link
+async function handler(req: ApiRequestProps<{}, ShortPreviewLinkParams>, res: NextApiResponse) {
+  const parsed = ShortPreviewLinkSchema.parse(req.query);
+  const { k: redisKey } = parsed;
+
+  await authCert({ req, authToken: true });
+
+  const redis = getGlobalRedisConnection();
+  const objectKey = await redis.get(redisKey);
+  if (!objectKey) {
+    res.status(404).end();
+    return;
+  }
+
+  const s3ChatSource = getS3ChatSource();
+  const s3DatasetSource = getS3DatasetSource();
+
+  if (s3ChatSource.isChatFileKey(objectKey)) {
+    res.redirect(302, await s3ChatSource.createGetChatFileURL({ key: objectKey, external: true }));
+    return;
+  } else if (s3DatasetSource.isDatasetObjectKey(objectKey)) {
+    res.redirect(
+      302,
+      await s3DatasetSource.createGetDatasetFileURL({ key: objectKey, external: true })
+    );
+    return;
+  }
+
+  res.status(404).end();
+}
+
+export default NextAPI(handler);
diff --git a/projects/app/src/pages/api/system/file/[jwt].ts b/projects/app/src/pages/api/system/file/[jwt].ts
new file mode 100644
index 000000000000..b4104758638b
--- /dev/null
+++ b/projects/app/src/pages/api/system/file/[jwt].ts
@@ -0,0 +1,70 @@
+import type { NextApiRequest, NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
+import { addLog } from '@fastgpt/service/common/system/log';
+import { jwtVerifyS3ObjectKey } from '@fastgpt/service/common/s3/utils';
+import { getS3ChatSource } from '@fastgpt/service/common/s3/sources/chat';
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse) {
+  try {
+    const { jwt } = req.query as { jwt: string };
+
+    const s3DatasetSource = getS3DatasetSource();
+    const s3ChatSource = getS3ChatSource();
+
+    const { objectKey } = await jwtVerifyS3ObjectKey(jwt);
+
+    if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) {
+      try {
+        const [stream, metadata] = await Promise.all(
+          (() => {
+            if (s3DatasetSource.isDatasetObjectKey(objectKey)) {
+              return [
+                s3DatasetSource.getDatasetFileStream(objectKey),
+                s3DatasetSource.getFileMetadata(objectKey)
+              ];
+            } else {
+              return [
+                s3ChatSource.getChatFileStream(objectKey),
+                s3ChatSource.getFileMetadata(objectKey)
+              ];
+            }
+          })()
+        );
+
+        res.setHeader('Content-Type', metadata.contentType);
+        res.setHeader('Cache-Control', 'public, max-age=31536000');
+        res.setHeader('Content-Length', metadata.contentLength);
+
+        stream.pipe(res);
+
+        stream.on('error', (error) => {
+          addLog.error('Error reading file stream', { error });
+          if (!res.headersSent) {
+            res.status(500).end();
+          }
+        });
+
+        stream.on('end', () => {
+          res.end();
+        });
+        return;
+      } catch (error) {
+        return jsonRes(res, {
+          code: 500,
+          error
+        });
+      }
+    }
+
+    jsonRes(res, {
+      code: 404,
+      error: 'File not found'
+    });
+  } catch (error) {
+    jsonRes(res, {
+      code: 500,
+      error
+    });
+  }
+}
diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts
index 07b11f7905c0..dd8e85b5511e 100644
--- a/projects/app/src/service/core/dataset/data/controller.ts
+++ b/projects/app/src/service/core/dataset/data/controller.ts
@@ -18,7 +18,12 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
 import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller';
+import { addLog } from '@fastgpt/service/common/system/log';
 import { text2Chunks } from '@fastgpt/service/worker/function';
+import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
+import { getGlobalRedisConnection } from '@fastgpt/service/common/redis';
+import { S3Sources } from '@fastgpt/service/common/s3/type';
+import _ from 'lodash';
 
 const formatIndexes = async ({
   indexes = [],
@@ -168,6 +173,7 @@ export async function
insertData2Dataset({ })); // 2. Create mongo data + addLog.debug('[insertData2Dataset] Creating mongo data', { + qPreview: q?.substring(0, 100), + imageKeysCount: imageKeys?.length || 0, + imageKeys, + chunkIndex + }); + const [{ _id }] = await MongoDatasetData.create( [ { @@ -226,6 +239,7 @@ export async function insertData2Dataset({ q, a, imageId, + imageKeys, imageDescMap, chunkIndex, indexes: results @@ -406,6 +420,42 @@ export async function updateData2Dataset({ idList: deleteVectorIdList }); } + + // Check if there are any images need to be deleted + const retrieveS3PreviewKeys = async (q: string) => { + const redis = getGlobalRedisConnection(); + const prefixPattern = Object.values(S3Sources) + .map((pattern) => `${pattern}\\/[^\\s)]+`) + .join('|'); + const regex = new RegExp( + String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, + 'g' + ); + + const matches = Array.from(q.matchAll(regex)); + const objectKeys = []; + + for (const match of matches.slice().reverse()) { + const [, , , objectKey] = match; + + if (getS3DatasetSource().isDatasetObjectKey(objectKey)) { + objectKeys.push(objectKey); + } + } + + return objectKeys; + }; + + const objectKeys = await retrieveS3PreviewKeys(q); + const differenceKeys = _.difference(mongoData.imageKeys || [], objectKeys); + if (differenceKeys.length > 0) { + await getS3DatasetSource().deleteDatasetFilesByKeys(differenceKeys); + } + await MongoDatasetData.updateOne( + { _id: mongoData._id }, + { $set: { imageKeys: objectKeys } }, + { session } + ); }); return { @@ -424,6 +474,10 @@ export const deleteDatasetData = async (data: DatasetDataItemType) => { await deleteDatasetImage(data.imageId); } + if (data.imageKeys && data.imageKeys.length > 0) { + await getS3DatasetSource().deleteDatasetFilesByKeys(data.imageKeys); + } + // 3. Delete vector data await deleteDatasetDataVector({ teamId: data.teamId, diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index 5017377d9745..a9445f2180a8 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -32,6 +32,7 @@ import { POST } from '@fastgpt/service/common/api/plusRequest'; import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; import { MongoImage } from '@fastgpt/service/common/file/image/schema'; import { UsageItemTypeEnum } from '@fastgpt/global/support/wallet/usage/constants'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; const requestLLMPargraph = async ({ rawText, @@ -176,7 +177,12 @@ export const datasetParseQueue = async (): Promise => { continue; } - addLog.info(`[Parse Queue] Start`); + addLog.info(`[Parse Queue] ========== START PROCESSING ==========`, { + collectionId: collection._id, + datasetId: dataset._id, + fileId: collection.fileId, + type: collection.type + }); try { const trainingMode = getTrainingModeByCollection({ @@ -231,14 +237,28 @@ export const datasetParseQueue = async (): Promise => { } // 2. 
Read source - const { title, rawText } = await readDatasetSourceRawText({ + addLog.info('[Parse Queue] === START PARSING ===', { + collectionId: collection._id, + fileId: collection.fileId, + type: collection.type + }); + + let { title, rawText, imageKeys } = await readDatasetSourceRawText({ teamId: data.teamId, tmbId: data.tmbId, customPdfParse: collection.customPdfParse, usageId: data.billId, + datasetId: data.datasetId, ...sourceReadType }); + addLog.info('[Parse Queue] Read source result', { + title, + rawTextLength: rawText.length, + imageKeysCount: imageKeys?.length || 0, + imageKeys + }); + // 3. LLM Pargraph const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({ rawText, @@ -268,7 +288,20 @@ export const datasetParseQueue = async (): Promise => { overlapRatio: collection.trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, customReg: collection.chunkSplitter ? [collection.chunkSplitter] : [], - backupParse: collection.trainingType === DatasetCollectionDataProcessModeEnum.backup + backupParse: collection.trainingType === DatasetCollectionDataProcessModeEnum.backup, + imageKeys + }); + + addLog.debug('[Parse Queue] After chunk split', { + chunksCount: chunks.length, + firstChunkImageKeys: chunks[0]?.imageKeys, + allChunksHaveImageKeys: chunks.every((chunk) => chunk.imageKeys), + detailedChunks: chunks.map((chunk, idx) => ({ + index: idx, + qPreview: chunk.q?.substring(0, 100), + imageKeysCount: chunk.imageKeys?.length || 0, + imageKeys: chunk.imageKeys + })) }); // Check dataset limit @@ -303,6 +336,27 @@ export const datasetParseQueue = async (): Promise => { ); // 6. Push to chunk queue + const trainingData = chunks.map((item, index) => ({ + ...item, + indexes: item.indexes?.map((text) => ({ + type: DatasetDataIndexTypeEnum.custom, + text + })), + chunkIndex: index + })); + + addLog.debug('[Parse Queue] Before push to training queue', { + trainingDataCount: trainingData.length, + firstItemImageKeys: trainingData[0]?.imageKeys, + hasImageKeys: trainingData.some((item) => item.imageKeys && item.imageKeys.length > 0), + detailedTrainingData: trainingData.map((item, idx) => ({ + index: idx, + qPreview: item.q?.substring(0, 100), + imageKeysCount: item.imageKeys?.length || 0, + imageKeys: item.imageKeys + })) + }); + await pushDataListToTrainingQueue({ teamId: data.teamId, tmbId: data.tmbId, @@ -314,14 +368,7 @@ export const datasetParseQueue = async (): Promise => { indexSize: collection.indexSize, mode: trainingMode, billId: data.billId, - data: chunks.map((item, index) => ({ - ...item, - indexes: item.indexes?.map((text) => ({ - type: DatasetDataIndexTypeEnum.custom, - text - })), - chunkIndex: index - })), + data: trainingData, session }); @@ -335,24 +382,57 @@ export const datasetParseQueue = async (): Promise => { } ); - // 8. Remove image ttl - const relatedImgId = collection.metadata?.relatedImgId; - if (relatedImgId) { - await MongoImage.updateMany( - { - teamId: collection.teamId, - 'metadata.relatedId': relatedImgId - }, - { - // Remove expiredTime to avoid ttl expiration - $unset: { - expiredTime: 1 + // 8. 
Remove TTLs (file, images) + const s3DatasetSource = getS3DatasetSource(); + + addLog.info('[Parse Queue] Before removing TTLs', { + hasFileId: !!collection.fileId, + isS3File: collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId), + imageKeysCount: imageKeys?.length || 0, + imageKeys: imageKeys + }); + + // 8.1 For S3 files, remove file TTL and image TTLs + if (collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId)) { + // Remove file TTL + await s3DatasetSource.removeDatasetFileTTL(collection.fileId, session); + addLog.info('[Parse Queue] Removed file TTL', { fileId: collection.fileId }); + + // Remove image TTLs + if (imageKeys && imageKeys.length > 0) { + await s3DatasetSource.removeDatasetImagesTTL(imageKeys, session); + addLog.info('[Parse Queue] Removed image TTLs', { + imageKeysCount: imageKeys.length, + imageKeys + }); + } else { + addLog.warn('[Parse Queue] No imageKeys to remove TTL', { + imageKeysIsUndefined: imageKeys === undefined, + imageKeysIsNull: imageKeys === null, + imageKeysLength: imageKeys?.length + }); + } + } + // 8.2 For GridFS files (legacy), remove MongoDB image TTL + else { + const relatedImgId = collection.metadata?.relatedImgId; + if (relatedImgId) { + await MongoImage.updateMany( + { + teamId: collection.teamId, + 'metadata.relatedId': relatedImgId + }, + { + // Remove expiredTime to avoid ttl expiration + $unset: { + expiredTime: 1 + } + }, + { + session } - }, - { - session - } - ); + ); + } } }); diff --git a/projects/app/src/service/core/dataset/queues/generateVector.ts b/projects/app/src/service/core/dataset/queues/generateVector.ts index 60fbfc4552bf..437be2eee5d2 100644 --- a/projects/app/src/service/core/dataset/queues/generateVector.ts +++ b/projects/app/src/service/core/dataset/queues/generateVector.ts @@ -78,6 +78,9 @@ export async function generateVector(): Promise { select: '_id indexes' } ]) + .select( + 'teamId tmbId datasetId collectionId q a imageId imageKeys imageDescMap chunkIndex indexSize billId mode retryCount lockTime indexes' + ) .lean(); // task preemption @@ -257,6 +260,13 @@ const rebuildData = async ({ trainingData }: { trainingData: TrainingDataType }) const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) => { return mongoSessionRun(async (session) => { + addLog.debug('[Vector Queue] insertData - before insert', { + trainingDataId: trainingData._id, + qPreview: trainingData.q?.substring(0, 100), + imageKeysCount: trainingData.imageKeys?.length || 0, + imageKeys: trainingData.imageKeys + }); + // insert new data to dataset const { tokens } = await insertData2Dataset({ teamId: trainingData.teamId, @@ -266,6 +276,7 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) q: trainingData.q, a: trainingData.a, imageId: trainingData.imageId, + imageKeys: trainingData.imageKeys, imageDescMap: trainingData.imageDescMap, chunkIndex: trainingData.chunkIndex, indexSize: diff --git a/projects/app/src/web/common/file/api.ts b/projects/app/src/web/common/file/api.ts index 7322463bb69d..43f52c3e0ed9 100644 --- a/projects/app/src/web/common/file/api.ts +++ b/projects/app/src/web/common/file/api.ts @@ -54,3 +54,10 @@ export const getPresignedChatFileGetUrl = (params: { }) => { return POST('/core/chat/presignChatFileGetUrl', params); }; + +export const getUploadDatasetFilePresignedUrl = (params: { + filename: string; + datasetId: string; +}) => { + return POST('/core/dataset/presignDatasetFilePostUrl', params); +}; diff --git 
a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts
index 333a1b4015ca..4e909b6e4bcb 100644
--- a/projects/app/src/web/core/dataset/api.ts
+++ b/projects/app/src/web/core/dataset/api.ts
@@ -83,6 +83,7 @@ import type {
   DatasetCreateWithFilesBody,
   DatasetCreateWithFilesResponse
 } from '@/pages/api/core/dataset/createWithFiles';
+import type { PresignDatasetFileGetUrlParams } from '@fastgpt/global/core/dataset/v2/api';
 
 /* ======================== dataset ======================= */
 export const getDatasets = (data: GetDatasetListBody) =>
@@ -322,3 +323,6 @@ export const getApiDatasetCatalog = (data: GetApiDatasetCataLogProps) =>
 
 export const getApiDatasetPaths = (data: GetApiDatasetPathBody) =>
   POST('/core/dataset/apiDataset/getPathNames', data);
+
+export const getPresignedDatasetFileGetUrl = (data: PresignDatasetFileGetUrlParams) =>
+  POST('/core/dataset/presignDatasetFileGetUrl', data);
diff --git a/projects/app/src/web/core/dataset/hooks/readCollectionSource.ts b/projects/app/src/web/core/dataset/hooks/readCollectionSource.ts
index 65b636e56cef..c2b11000442b 100644
--- a/projects/app/src/web/core/dataset/hooks/readCollectionSource.ts
+++ b/projects/app/src/web/core/dataset/hooks/readCollectionSource.ts
@@ -1,5 +1,5 @@
 import { useSystemStore } from '@/web/common/system/useSystemStore';
-import { getCollectionSource } from '@/web/core/dataset/api';
+import { getPresignedDatasetFileGetUrl } from '@/web/core/dataset/api';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useToast } from '@fastgpt/web/hooks/useToast';
 import { useTranslation } from 'next-i18next';
@@ -16,7 +16,7 @@ export function getCollectionSourceAndOpen(
     try {
       setLoading(true);
 
-      const { value: url } = await getCollectionSource(props);
+      const url = await getPresignedDatasetFileGetUrl({ collectionId: props.collectionId });
 
       if (!url) {
         throw new Error('No file found');

From b6da886ed0bd0d92a6f8621232705f622e2757b4 Mon Sep 17 00:00:00 2001
From: xqvvu
Date: Tue, 18 Nov 2025 18:06:34 +0800
Subject: [PATCH 4/6] fix: delay S3 file deletion timing

---
 .../service/common/file/image/controller.ts   |  8 ++-
 packages/service/common/file/read/utils.ts    | 18 ++++--
 .../common/s3/sources/dataset/index.ts        | 44 ++++++------
 .../service/common/s3/sources/dataset/type.ts |  6 +-
 packages/service/common/s3/utils.ts           | 31 ---------
 packages/service/core/ai/llm/request.ts       |  1 -
 packages/service/core/ai/llm/utils.ts         | 40 ------------
 packages/service/core/app/controller.ts       |  1 -
 packages/service/core/chat/saveChat.ts        | 50 +-------------
 .../core/dataset/collection/controller.ts     | 63 +++++-------------
 packages/service/core/dataset/controller.ts   |  4 --
 packages/service/core/dataset/read.ts         |  4 +-
 packages/service/core/dataset/utils.ts        | 61 ++++++++++++++++++
 .../service/core/workflow/dispatch/ai/chat.ts |  6 +-
 .../core/workflow/dispatch/tools/readFiles.ts | 11 ++--
 .../api/core/dataset/data/insertImages.ts     |  4 --
 .../core/dataset/presignDatasetFileGetUrl.ts  |  3 +-
 .../service/core/dataset/data/controller.ts   |  8 ---
 .../core/dataset/queues/datasetParse.ts       | 26 ++------
 .../core/dataset/queues/generateVector.ts     | 16 +++++
 20 files changed, 156 insertions(+), 249 deletions(-)
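The gist of this patch: S3 image TTL records are no longer cleared at parse time; they are removed only after a chunk actually lands in `dataset_datas` (see the `generateVector` hunk below), so a failed vector run leaves the TTL guard in place for retry or cleanup. A minimal sketch of the intended lifecycle, with an illustrative helper name (`ingestDatasetImage` does not exist in the codebase):

```typescript
// Sketch of the delayed TTL-removal flow this patch moves to.
import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';

async function ingestDatasetImage(uploadKey: string, base64Img: string) {
  const s3 = getS3DatasetSource();

  // 1. Upload with the default 7-day TTL guard (a MongoS3TTL record),
  //    so objects orphaned by a failed import are reaped automatically.
  await s3.uploadDatasetImage({
    uploadKey,
    base64Img,
    mimetype: 'image/png', // illustrative
    filename: 'image.png', // illustrative
    hasTTL: true
  });

  // 2. Parsing and vector generation run in the training queues...

  // 3. Only once the data is inserted into dataset_datas is the TTL removed;
  //    until then the object stays protected-but-reapable.
  await s3.removeDatasetImagesTTL([uploadKey]);
}
```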
diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts
index 64489615b6c1..0fe6ebc9a3cb 100644
--- a/packages/service/common/file/image/controller.ts
+++ b/packages/service/common/file/image/controller.ts
@@ -135,9 +135,13 @@ export const removeImageByPath = (path?: string, session?: ClientSession) => {
   if (!name) return;
 
   const id = name.split('.')[0];
-  if (!id || !Types.ObjectId.isValid(id)) return;
+  if (!id) return;
 
-  return MongoImage.deleteOne({ _id: id }, { session });
+  if (Types.ObjectId.isValid(id)) {
+    return MongoImage.deleteOne({ _id: id }, { session });
+  } else if (getS3AvatarSource().isAvatarKey(path)) {
+    return getS3AvatarSource().deleteAvatar(path, session);
+  }
 };
 
 export async function readMongoImg({ id }: { id: string }) {
diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts
index a07b64bc1a9c..be5dc779896a 100644
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -41,7 +41,9 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
     tmbId: params.tmbId,
     encoding: params.encoding,
     buffer,
-    uploadKeyPrefix: params.uploadKey
+    imageKeyOptions: {
+      prefix: params.uploadKey
+    }
   });
 };
 
@@ -55,7 +57,7 @@ export const readS3FileContentByBuffer = async ({
   customPdfParse = false,
   usageId,
   getFormatText = true,
-  uploadKeyPrefix
+  imageKeyOptions
 }: {
@@ -67,7 +69,10 @@
   customPdfParse?: boolean;
   usageId?: string;
   getFormatText?: boolean;
-  uploadKeyPrefix: string;
+  imageKeyOptions: {
+    prefix: string;
+    hasTTL?: boolean;
+  };
 }): Promise<{
   rawText: string;
   imageKeys?: string[];
@@ -173,12 +178,14 @@
   await batchRun(imageList, async (item) => {
     const src = await (async () => {
       try {
+        const { prefix, hasTTL } = imageKeyOptions;
         const ext = item.mime.split('/')[1].replace('x-', '');
         const imageKey = await getS3DatasetSource().uploadDatasetImage({
           base64Img: `data:${item.mime};base64,${item.base64}`,
           mimetype: `${ext}`,
           filename: `${item.uuid}.${ext}`,
-          uploadKey: `${uploadKeyPrefix}/${item.uuid}.${ext}`
+          uploadKey: `${prefix}/${item.uuid}.${ext}`,
+          hasTTL
         });
         uploadedImageKeys.push(imageKey);
         return imageKey;
       } catch (error) {
@@ -213,11 +220,14 @@
 };
 
 export const parsedFileContentS3Key = {
+  // Temporary file path (e.g. for evaluation)
   temp: (appId: string) => `chat/${appId}/temp/parsed/${randomUUID()}`,
 
+  // Key for images parsed out of files uploaded in a chat
   chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) =>
     `chat/${appId}/${uId}/${chatId}/parsed`,
 
+  // Key for images parsed out of files uploaded to a dataset
   dataset: (params: ParsedFileContentS3KeyParams) => {
     const { datasetId, mimetype, filename, parentFileKey } = params;
diff --git a/packages/service/common/s3/sources/dataset/index.ts b/packages/service/common/s3/sources/dataset/index.ts
index 685852ecb818..35579047281b 100644
--- a/packages/service/common/s3/sources/dataset/index.ts
+++ b/packages/service/common/s3/sources/dataset/index.ts
@@ -67,8 +67,8 @@ class S3DatasetSource {
   // Delete by prefix
   deleteDatasetFilesByPrefix(params: DeleteDatasetFilesByPrefixParams) {
-    const { datasetId } = DeleteDatasetFilesByPrefixParamsSchema.parse(params);
-    const prefix = [S3Sources.dataset, datasetId].filter(Boolean).join('/');
+    const { datasetId, rawPrefix } = DeleteDatasetFilesByPrefixParamsSchema.parse(params);
+    const prefix = rawPrefix || [S3Sources.dataset, datasetId].filter(Boolean).join('/');
     return this.bucket.addDeleteJob({ prefix });
   }
 
@@ -154,13 +154,15 @@
     const { rawText, imageKeys } = await readS3FileContentByBuffer({
       teamId,
       tmbId,
-      uploadKeyPrefix: prefix,
       extension,
       buffer,
       encoding,
       customPdfParse,
       usageId,
-      getFormatText
+      getFormatText,
+      imageKeyOptions: {
+        prefix: prefix
+      }
     });
 
     addRawTextBuffer({
@@ -180,8 +182,13 @@
 
   // Upload an image
   async uploadDatasetImage(params: UploadDatasetImageParams): Promise<string> {
-    const { uploadKey, base64Img, mimetype, filename } =
-      UploadDatasetImageParamsSchema.parse(params);
+    const {
+      uploadKey,
+      base64Img,
+      mimetype,
+      filename,
+      hasTTL = true
+    } = UploadDatasetImageParamsSchema.parse(params);
 
     const base64Data = base64Img.split(',')[1] || base64Img;
     const buffer = Buffer.from(base64Data, 'base64');
@@ -192,11 +199,13 @@
       'origin-filename': encodeURIComponent(filename)
     });
 
-    await MongoS3TTL.create({
-      minioKey: uploadKey,
-      bucketName: this.bucket.name,
-      expiredTime: addDays(new Date(), 7)
-    });
+    if (hasTTL) {
+      await MongoS3TTL.create({
+        minioKey: uploadKey,
+        bucketName: this.bucket.name,
+        expiredTime: addDays(new Date(), 7)
+      });
+    }
 
     return uploadKey;
   }
@@ -249,19 +258,6 @@
       deletedCount: result.deletedCount
     });
   }
-
-  async getFileDatasetInfo(key: string): Promise<{
-    _id: string;
-    datasetId: string;
-    collectionId: string;
-  } | null> {
-    return await MongoDatasetData.findOne(
-      { $or: [{ imageKeys: { $in: [key] } }, { imageId: key }] },
-      'datasetId collectionId'
-    )
-      .lean()
-      .exec();
-  }
 }
 
 export function getS3DatasetSource() {
diff --git a/packages/service/common/s3/sources/dataset/type.ts b/packages/service/common/s3/sources/dataset/type.ts
index ba82f4a4dfcc..9b8355189b28 100644
--- a/packages/service/common/s3/sources/dataset/type.ts
+++ b/packages/service/common/s3/sources/dataset/type.ts
@@ -15,7 +15,8 @@ export const CreateGetDatasetFileURLParamsSchema = z.object({
 export type CreateGetDatasetFileURLParams = z.infer<typeof CreateGetDatasetFileURLParamsSchema>;
 
 export const DeleteDatasetFilesByPrefixParamsSchema = z.object({
-  datasetId: ObjectIdSchema
+  datasetId: ObjectIdSchema.optional(),
+  rawPrefix: z.string().nonempty().optional()
 });
 export type DeleteDatasetFilesByPrefixParams = z.infer<
   typeof DeleteDatasetFilesByPrefixParamsSchema
 >;
@@ -41,7 +42,8 @@ export const UploadDatasetImageParamsSchema = z.object({
   base64Img: z.string().nonempty(),
   uploadKey: z.string().nonempty(),
   mimetype: z.string().nonempty(),
-  filename: z.string().nonempty()
+  filename: z.string().nonempty(),
+  hasTTL: z.boolean().optional()
 });
 export type UploadDatasetImageParams = z.infer<typeof UploadDatasetImageParamsSchema>;
diff --git a/packages/service/common/s3/utils.ts b/packages/service/common/s3/utils.ts
index 1cebd7b39af2..77d8e5d02a22 100644
--- a/packages/service/common/s3/utils.ts
+++ b/packages/service/common/s3/utils.ts
@@ -1,10 +1,6 @@
 import jwt from 'jsonwebtoken';
 import { differenceInMilliseconds, addDays } from 'date-fns';
 import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
-import { S3Sources } from './type';
-import { getS3ChatSource } from './sources/chat';
-import { getS3DatasetSource } from './sources/dataset';
-import { EndpointUrl } from '@fastgpt/global/common/file/constants';
 
 export function jwtSignS3ObjectKey(objectKey: string) {
   const secret = process.env.FILE_TOKEN_KEY as string;
@@ -27,30 +23,3 @@ export function jwtVerifyS3ObjectKey(token: string) {
     });
   });
 }
-
-export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) {
-  if (!datasetQuoteText || typeof datasetQuoteText !== 'string') return datasetQuoteText as string;
-
-  const prefixPattern = Object.values(S3Sources)
-    .map((pattern) => `${pattern}\\/[^\\s)]+`)
-    .join('|');
-  const regex = new
RegExp(String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g'); - const s3DatasetSource = getS3DatasetSource(); - const s3ChatSource = getS3ChatSource(); - - const matches = Array.from(datasetQuoteText.matchAll(regex)); - let content = datasetQuoteText; - - for (const match of matches.slice().reverse()) { - const [full, bang, alt, objectKey] = match; - - if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) { - const url = `${EndpointUrl}/api/system/file/${jwtSignS3ObjectKey(objectKey)}`; - const replacement = `${bang}[${alt}](${url})`; - content = - content.slice(0, match.index) + replacement + content.slice(match.index + full.length); - } - } - - return content; -} diff --git a/packages/service/core/ai/llm/request.ts b/packages/service/core/ai/llm/request.ts index 65a85d28bd37..74297520ee46 100644 --- a/packages/service/core/ai/llm/request.ts +++ b/packages/service/core/ai/llm/request.ts @@ -86,7 +86,6 @@ export const createLLMResponse = async ( messages: rewriteMessages }); - console.dir(requestBody, { depth: null }); // console.log(JSON.stringify(requestBody, null, 2)); const { response, isStreamResponse, getEmptyResponseTip } = await createChatCompletion({ body: requestBody, diff --git a/packages/service/core/ai/llm/utils.ts b/packages/service/core/ai/llm/utils.ts index 7a50289f0b72..7e9f645e808d 100644 --- a/packages/service/core/ai/llm/utils.ts +++ b/packages/service/core/ai/llm/utils.ts @@ -282,46 +282,6 @@ export const loadRequestMessages = async ({ return result.map((item) => item.text).join('\n'); }; - // const redis = getGlobalRedisConnection(); - // const prefixPattern = Object.values(S3Sources) - // .map((pattern) => `${pattern}\\/[^\\s)]+`) - // .join('|'); - // const regex = new RegExp(String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g'); - - // TODO: 在我迁移完到 JWT 后移除这个 transformS3PreviewKey - // const transformS3PreviewKey = async ( - // origin: string | ChatCompletionContentPartText[] | undefined - // ) => { - // if (!origin || typeof origin !== 'string') return origin as string; - - // const matches = Array.from(origin.matchAll(regex)); - // let content = origin; - - // for (const match of matches.slice().reverse()) { - // const [full, bang, alt, objectKey] = match; - - // const filename = objectKey.split('/').pop()?.split('-')[1]; - // const name = `${randomUUID()}:${filename}`; - - // const redisKey = `chat:temp_file:${name}`; - // try { - // await redis.set(redisKey, objectKey); - // await redis.expire(redisKey, 3600); - // } catch { - // continue; - // } - - // const k = new URLSearchParams({ k: name }); - // const link = `${EndpointUrl}${TempFileURL}?${k}`; - - // const replacement = `${bang}[${alt}](${link})`; - // content = - // content.slice(0, match.index) + replacement + content.slice(match.index + full.length); - // } - - // return content; - // }; - if (messages.length === 0) { return Promise.reject(i18nT('common:core.chat.error.Messages empty')); } diff --git a/packages/service/core/app/controller.ts b/packages/service/core/app/controller.ts index 5bd91b2331c7..bd085bc62e92 100644 --- a/packages/service/core/app/controller.ts +++ b/packages/service/core/app/controller.ts @@ -220,7 +220,6 @@ export const onDelOneApp = async ({ // Delete avatar await removeImageByPath(app.avatar, session); - await getS3AvatarSource().deleteAvatar(app.avatar, session); }; // Delete chats diff --git a/packages/service/core/chat/saveChat.ts b/packages/service/core/chat/saveChat.ts index 
21013640adf6..5684966e7a07 100644 --- a/packages/service/core/chat/saveChat.ts +++ b/packages/service/core/chat/saveChat.ts @@ -1,8 +1,4 @@ -import type { - AIChatItemType, - AIChatItemValueItemType, - UserChatItemType -} from '@fastgpt/global/core/chat/type.d'; +import type { AIChatItemType, UserChatItemType } from '@fastgpt/global/core/chat/type.d'; import { MongoApp } from '../app/schema'; import type { ChatSourceEnum } from '@fastgpt/global/core/chat/constants'; import { ChatItemValueTypeEnum, ChatRoleEnum } from '@fastgpt/global/core/chat/constants'; @@ -23,7 +19,6 @@ import { MongoChatItemResponse } from './chatItemResponseSchema'; import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt'; import { MongoS3TTL } from '../../common/s3/schema'; import type { ClientSession } from '../../common/mongo'; -import { getGlobalRedisConnection } from '../../common/redis'; type Props = { chatId: string; @@ -46,44 +41,6 @@ type Props = { errorMsg?: string; }; -// TODO: 在我迁移完到 JWT 后移除这个 transformAiResponse -// const transformAiResponse = async (value: AIChatItemValueItemType[]) => { -// const redis = getGlobalRedisConnection(); -// const regex = /(!?)\[([^\]]+)\]\((https?:\/\/[^\s)]+\/api\/file\/temp[^\s)]*)\)/g; - -// return Promise.all( -// value.map(async (item) => { -// if (item.type !== ChatItemValueTypeEnum.text || !item.text) return item; -// let content = item.text.content; -// const matches = Array.from(content.matchAll(regex)); - -// for (const match of matches.slice().reverse()) { -// const [full, bang, alt, link] = match; -// if (typeof match.index !== 'number') continue; - -// try { -// const url = new URL(link); // 可能会发生解析错误 -// const k = url.searchParams.get('k'); -// if (!k) continue; - -// const redisKey = `chat:temp_file:${decodeURIComponent(k)}`; -// const objectKey = await redis.get(redisKey); -// if (!objectKey) continue; - -// const replacement = `${bang}[${alt}](${objectKey})`; -// content = -// content.slice(0, match.index) + replacement + content.slice(match.index + full.length); -// } catch { -// continue; -// } -// } - -// item.text.content = content; -// return item; -// }) -// ); -// }; - const beforProcess = (props: Props) => { // Remove url props.userContent.value.forEach((item) => { @@ -113,6 +70,7 @@ const afterProcess = async ({ }) .flat() .filter(Boolean) as string[]; + if (fileKeys.length > 0) { await MongoS3TTL.deleteMany({ minioKey: { $in: fileKeys } }, { session }); } @@ -152,10 +110,6 @@ const formatAiContent = ({ return responseItem; }); - // aiResponse.value = await transformAiResponse(aiResponse.value); - // console.log('aiResponse ========================'); - // console.dir(aiResponse, { depth: null }); - return { aiResponse: { ...aiResponse, diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 0e18bcf4c179..8ec2419c4351 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -33,9 +33,9 @@ import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; -import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils'; +import { clearCollectionImages } from '../image/utils'; import { getS3DatasetSource } from '../../../common/s3/sources/dataset'; -import { addLog } from '../../../common/system/log'; +import path from 'node:path'; export const 
createCollectionAndInsertData = async ({ dataset, @@ -240,14 +240,8 @@ export const createCollectionAndInsertData = async ({ } })(); - // Remove S3 image TTLs for imageKeys - if (imageKeys && imageKeys.length > 0) { - await getS3DatasetSource().removeDatasetImagesTTL(imageKeys, session); - } - // Remove S3 image TTLs for imageIds - if (imageIds && imageIds.length > 0) { - await getS3DatasetSource().removeDatasetImagesTTL(imageIds, session); - } + // Note: Image TTLs will be removed in generateVector queue after successful insertion to dataset_datas + // This improves fault tolerance - if vector generation fails, images remain protected by TTL for retry return { collectionId: String(collectionId), @@ -377,34 +371,21 @@ export async function delCollection({ const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId)))); const collectionIds = collections.map((item) => String(item._id)); - const allImageKeys = await (async () => { + const allS3Keys = await (async () => { const datas = await MongoDatasetData.find( { teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } }, - { imageKeys: 1 } + { imageKeys: 1, imageId: 1 } ).lean(); - const imageKeys = datas.flatMap((data) => data.imageKeys || []); - return [...new Set(imageKeys)].filter((key) => s3DatasetSource.isDatasetObjectKey(key)); - })(); - - const allImageIds = await (async () => { - const datas = await MongoDatasetData.find( - { - teamId, - datasetId: { $in: datasetIds }, - collectionId: { $in: collectionIds } - }, - { imageId: 1 } - ).lean(); - return [ - ...new Set( - datas.map((data) => data.imageId).filter((key) => s3DatasetSource.isDatasetObjectKey(key)) + return datas.flatMap((data) => + Array.from(new Set([...(data.imageKeys || []), data.imageId || ''])).filter((key) => + s3DatasetSource.isDatasetObjectKey(key) ) - ]; + ); })(); await retryFn(async () => { @@ -460,24 +441,14 @@ export async function delCollection({ _id: { $in: collectionIds } }, { session } - ); + ).lean(); // delete s3 images which are parsed from docs - if (allImageKeys.length > 0) { - try { - await s3DatasetSource.deleteDatasetFilesByKeys(allImageKeys); - } catch (error) { - addLog.error('Failed to cleanup S3 images', error); - } - } - - // delete s3 images - if (allImageIds.length > 0) { - try { - await s3DatasetSource.deleteDatasetFilesByKeys(allImageIds); - } catch (error) { - addLog.error('Failed to cleanup S3 images', error); - } - } + // collections + // .map((item) => item.fileId) + // .filter((fileId) => s3DatasetSource.isDatasetObjectKey(fileId)) + // .map((key) => `${path.dirname(key)}/${path.basename(key, path.extname(key))}-parsed`) + // .forEach((prefix) => s3DatasetSource.deleteDatasetFilesByPrefix({ rawPrefix: prefix })); + await s3DatasetSource.deleteDatasetFilesByKeys(allS3Keys); }); } diff --git a/packages/service/core/dataset/controller.ts b/packages/service/core/dataset/controller.ts index 89577a8c7fde..1f52fe2daa8f 100644 --- a/packages/service/core/dataset/controller.ts +++ b/packages/service/core/dataset/controller.ts @@ -168,9 +168,5 @@ export const deleteDatasets = async ({ for await (const dataset of datasets) { await removeImageByPath(dataset.avatar, session); } - - for await (const dataset of datasets) { - await getS3AvatarSource().deleteAvatar(dataset.avatar, session); - } }); }; diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index beac3c5149e1..569e78b977a0 100644 --- a/packages/service/core/dataset/read.ts +++ 
b/packages/service/core/dataset/read.ts
@@ -125,7 +125,9 @@ export const readFileRawTextByUrl = async ({
     tmbId,
     buffer,
     encoding: 'utf-8',
-    uploadKeyPrefix: prefix
+    imageKeyOptions: {
+      prefix: prefix
+    }
   });
 });
diff --git a/packages/service/core/dataset/utils.ts b/packages/service/core/dataset/utils.ts
index 1bbcd5168b78..6d4b34988fee 100644
--- a/packages/service/core/dataset/utils.ts
+++ b/packages/service/core/dataset/utils.ts
@@ -1,5 +1,11 @@
+import { MongoDatasetData } from './data/schema';
 import { authDatasetByTmbId } from '../../support/permission/dataset/auth';
 import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
+import { S3Sources } from '../../common/s3/type';
+import { getS3DatasetSource } from '../../common/s3/sources/dataset';
+import { getS3ChatSource } from '../../common/s3/sources/chat';
+import { EndpointUrl } from '@fastgpt/global/common/file/constants';
+import { jwtSignS3ObjectKey } from '../../common/s3/utils';
 
 // TODO: optimize to fetch permissions in batch
 export const filterDatasetsByTmbId = async ({
@@ -28,3 +34,58 @@
   // Then filter datasetIds based on permissions
   return datasetIds.filter((_, index) => permissions[index]);
 };
+
+export async function getFileDatasetInfo(key: string): Promise<{
+  _id: string;
+  datasetId: string;
+  collectionId: string;
+} | null> {
+  return await MongoDatasetData.findOne(
+    { $or: [{ imageKeys: { $in: [key] } }, { imageId: key }] },
+    'datasetId collectionId'
+  )
+    .lean()
+    .exec();
+}
+
+/**
+ * Replace link-style S3 object keys in dataset quote markdown with JWT-signed URLs
+ *
+ * @param datasetQuoteText the dataset quote text
+ * @returns the text with object keys replaced
+ *
+ * @example
+ *
+ * ```typescript
+ * const datasetQuoteText = '![image.png](dataset/68fee42e1d416bb5ddc85b19/6901c3071ba2bea567e8d8db/aZos7D-214afce5-4d42-4356-9e05-8164d51c59ae.png)';
+ * const replacedText = await replaceDatasetQuoteTextWithJWT(datasetQuoteText)
+ * console.log(replacedText)
+ * // '![image.png](http://localhost:3000/api/system/file/eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvYmplY3RLZXkiOiJjaGF0LzY5MWFlMjlkNDA0ZDA0Njg3MTdkZDc0Ny82OGFkODVhNzQ2MzAwNmM5NjM3OTlhMDcvalhmWHk4eWZHQUZzOVdKcGNXUmJBaFYyL3BhcnNlZC85YTBmNGZlZC00ZWRmLTQ2MTMtYThkNi01MzNhZjVhZTUxZGMucG5nIiwiaWF0IjoxNzYzMzcwOTYwLCJleHAiOjk1MzkzNzA5NjB9.tMDWg0-ZWRnWPNp9Hakd0w1hhaO8jj2oD98SU0wAQYQ)'
+ * ```
+ */
+export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) {
+  if (!datasetQuoteText || typeof datasetQuoteText !== 'string') return datasetQuoteText as string;
+
+  const prefixPattern = Object.values(S3Sources)
+    .map((pattern) => `${pattern}\\/[^\\s)]+`)
+    .join('|');
+  const regex = new RegExp(String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`, 'g');
+  const s3DatasetSource = getS3DatasetSource();
+  const s3ChatSource = getS3ChatSource();
+
+  const matches = Array.from(datasetQuoteText.matchAll(regex));
+  let content = datasetQuoteText;
+
+  for (const match of matches.slice().reverse()) {
+    const [full, bang, alt, objectKey] = match;
+
+    if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) {
+      const url = `${EndpointUrl}/api/system/file/${jwtSignS3ObjectKey(objectKey)}`;
+      const replacement = `${bang}[${alt}](${url})`;
+      content =
+        content.slice(0, match.index) + replacement + content.slice(match.index + full.length);
+    }
+  }
+
+  return content;
+}
diff --git a/packages/service/core/workflow/dispatch/ai/chat.ts b/packages/service/core/workflow/dispatch/ai/chat.ts
index 3d37de813e31..370a6a8a65a3 100644
---
a/packages/service/core/workflow/dispatch/ai/chat.ts +++ b/packages/service/core/workflow/dispatch/ai/chat.ts @@ -41,11 +41,7 @@ import { i18nT } from '../../../../../web/i18n/utils'; import { postTextCensor } from '../../../chat/postTextCensor'; import { createLLMResponse } from '../../../ai/llm/request'; import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'; -import { S3Sources } from '../../../../common/s3/type'; -import { getS3DatasetSource } from '../../../../common/s3/sources/dataset'; -import { getS3ChatSource } from '../../../../common/s3/sources/chat'; -import { jwtSignS3ObjectKey, replaceDatasetQuoteTextWithJWT } from '../../../../common/s3/utils'; -import { EndpointUrl } from '@fastgpt/global/common/file/constants'; +import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils'; export type ChatProps = ModuleDispatchProps< AIChatNodeProps & { diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts b/packages/service/core/workflow/dispatch/tools/readFiles.ts index a8592642d1a2..2b8c8f6dc8af 100644 --- a/packages/service/core/workflow/dispatch/tools/readFiles.ts +++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts @@ -19,11 +19,7 @@ import { addRawTextBuffer, getRawTextBuffer } from '../../../../common/buffer/ra import { addMinutes } from 'date-fns'; import { getNodeErrResponse } from '../utils'; import { isInternalAddress } from '../../../../common/system/utils'; -import { S3Sources } from '../../../../common/s3/type'; -import { getS3DatasetSource } from '../../../../common/s3/sources/dataset'; -import { getS3ChatSource } from '../../../../common/s3/sources/chat'; -import { jwtSignS3ObjectKey, replaceDatasetQuoteTextWithJWT } from '../../../../common/s3/utils'; -import { EndpointUrl } from '@fastgpt/global/common/file/constants'; +import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils'; type Props = ModuleDispatchProps<{ [NodeInputKeyEnum.fileUrlList]: string[]; @@ -249,7 +245,10 @@ export const getFileContentFromLinks = async ({ encoding, customPdfParse, getFormatText: true, - uploadKeyPrefix: parsedFileContentS3Key.chat({ appId, chatId: chatId!, uId }), + imageKeyOptions: { + prefix: parsedFileContentS3Key.chat({ appId, chatId: chatId!, uId }), + hasTTL: false + }, usageId }); diff --git a/projects/app/src/pages/api/core/dataset/data/insertImages.ts b/projects/app/src/pages/api/core/dataset/data/insertImages.ts index 61c94bd0468c..87f83c0a4344 100644 --- a/projects/app/src/pages/api/core/dataset/data/insertImages.ts +++ b/projects/app/src/pages/api/core/dataset/data/insertImages.ts @@ -13,7 +13,6 @@ import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants' import { getEmbeddingModel, getLLMModel, getVlmModel } from '@fastgpt/service/core/ai/model'; import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; -import { removeDatasetImageExpiredTime } from '@fastgpt/service/core/dataset/image/utils'; import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; import path from 'node:path'; import fsp from 'node:fs/promises'; @@ -111,9 +110,6 @@ async function handler( })), session }); - - // 3. 
Clear ttl - await getS3DatasetSource().removeDatasetImagesTTL(imageIds, session); }); return {}; diff --git a/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts index 84b9385ffc17..03833b5cfc19 100644 --- a/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts +++ b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts @@ -15,6 +15,7 @@ import { createFileToken } from '@fastgpt/service/support/permission/auth/file'; import { BucketNameEnum, ReadFileBaseUrl } from '@fastgpt/global/common/file/constants'; import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { UserError } from '@fastgpt/global/common/error/utils'; +import { getFileDatasetInfo } from '@fastgpt/service/core/dataset/utils'; async function handler(req: ApiRequestProps) { const parsed = PresignDatasetFileGetUrlSchema.parse(req.body); @@ -24,7 +25,7 @@ async function handler(req: ApiRequestProps) { if ('key' in parsed) { const { key } = parsed; - const dataset = await s3DatasetSource.getFileDatasetInfo(key); + const dataset = await getFileDatasetInfo(key); if (!dataset) { // 如果 `dataset_datas` 中没有找到记录,则这次的请求应该是图片的预览请求,验证 datasetId 的权限即可 const datasetId = key.split('/')[1] || ''; diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index dd8e85b5511e..69f405fc9d4f 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -221,14 +221,6 @@ export async function insertData2Dataset({ dataId: insertIds[index] })); - // 2. Create mongo data - addLog.debug('[insertData2Dataset] Creating mongo data', { - qPreview: q?.substring(0, 100), - imageKeysCount: imageKeys?.length || 0, - imageKeys, - chunkIndex - }); - const [{ _id }] = await MongoDatasetData.create( [ { diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index a9445f2180a8..b5e6a92da754 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -382,36 +382,20 @@ export const datasetParseQueue = async (): Promise => { } ); - // 8. Remove TTLs (file, images) + // 8. 
Remove file TTL (images TTL will be removed after successful insertion to dataset_datas) const s3DatasetSource = getS3DatasetSource(); - addLog.info('[Parse Queue] Before removing TTLs', { + addLog.info('[Parse Queue] Before removing file TTL', { hasFileId: !!collection.fileId, - isS3File: collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId), - imageKeysCount: imageKeys?.length || 0, - imageKeys: imageKeys + isS3File: collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId) }); - // 8.1 For S3 files, remove file TTL and image TTLs + // 8.1 For S3 files, remove file TTL only if (collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId)) { // Remove file TTL await s3DatasetSource.removeDatasetFileTTL(collection.fileId, session); addLog.info('[Parse Queue] Removed file TTL', { fileId: collection.fileId }); - - // Remove image TTLs - if (imageKeys && imageKeys.length > 0) { - await s3DatasetSource.removeDatasetImagesTTL(imageKeys, session); - addLog.info('[Parse Queue] Removed image TTLs', { - imageKeysCount: imageKeys.length, - imageKeys - }); - } else { - addLog.warn('[Parse Queue] No imageKeys to remove TTL', { - imageKeysIsUndefined: imageKeys === undefined, - imageKeysIsNull: imageKeys === null, - imageKeysLength: imageKeys?.length - }); - } + // Note: Image TTLs will be removed in generateVector queue after successful insertion } // 8.2 For GridFS files (legacy), remove MongoDB image TTL else { diff --git a/projects/app/src/service/core/dataset/queues/generateVector.ts b/projects/app/src/service/core/dataset/queues/generateVector.ts index 437be2eee5d2..be52b312ab61 100644 --- a/projects/app/src/service/core/dataset/queues/generateVector.ts +++ b/projects/app/src/service/core/dataset/queues/generateVector.ts @@ -20,6 +20,8 @@ import type { } from '@fastgpt/global/core/dataset/type'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { delay } from '@fastgpt/service/common/bullmq'; +import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; +import { getS3ChatSource } from '@fastgpt/service/common/s3/sources/chat'; const reduceQueue = () => { global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0; @@ -289,6 +291,20 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) embeddingModel: trainingData.dataset.vectorModel, session }); + + await (async () => { + const s3DatasetSource = getS3DatasetSource(); + const keys = Array.from( + new Set([...(trainingData.imageKeys ?? []), trainingData.imageId ?? 
'']) + ) + .flat() + .filter((key) => s3DatasetSource.isDatasetObjectKey(key)); + + if (keys.length <= 0) return; + + await s3DatasetSource.removeDatasetImagesTTL(keys, session); + })(); + // delete data from training await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session }); From 8e8d492667d83c9918969f5ee838ab5e15055e37 Mon Sep 17 00:00:00 2001 From: xqvvu Date: Wed, 19 Nov 2025 18:00:28 +0800 Subject: [PATCH 5/6] fix: remove imageKeys --- packages/global/core/dataset/api.d.ts | 1 - .../global/core/dataset/apiDataset/type.d.ts | 1 - packages/global/core/dataset/controller.d.ts | 1 - packages/global/core/dataset/type.d.ts | 2 - .../common/buffer/rawText/controller.ts | 10 +- .../service/common/buffer/rawText/schema.ts | 4 +- packages/service/common/file/read/utils.ts | 89 ++----------- .../common/s3/sources/dataset/index.ts | 84 ++---------- .../service/common/s3/sources/dataset/type.ts | 11 +- packages/service/common/s3/type.ts | 11 +- packages/service/common/s3/utils.ts | 118 ++++++++++++++++- packages/service/core/chat/saveChat.ts | 5 +- .../core/dataset/apiDataset/custom/api.ts | 14 +- .../core/dataset/collection/controller.ts | 58 ++++----- .../service/core/dataset/collection/utils.ts | 3 +- packages/service/core/dataset/data/schema.ts | 7 - packages/service/core/dataset/read.ts | 122 +++++------------- .../core/dataset/training/controller.ts | 1 - .../service/core/dataset/training/schema.ts | 4 - packages/service/core/dataset/utils.ts | 18 +-- .../service/core/workflow/dispatch/ai/chat.ts | 22 ++-- .../core/workflow/dispatch/tools/readFiles.ts | 31 ++--- .../support/permission/dataset/auth.ts | 3 +- .../app/src/components/Markdown/img/Image.tsx | 1 - projects/app/src/pages/api/core/app/create.ts | 3 +- .../core/dataset/collection/create/backup.ts | 8 +- .../core/dataset/collection/create/images.ts | 8 +- .../dataset/collection/create/template.ts | 8 +- .../core/dataset/collection/create/text.ts | 3 +- .../api/core/dataset/collection/delete.ts | 2 +- .../api/core/dataset/data/insertImages.ts | 12 +- .../core/dataset/presignDatasetFileGetUrl.ts | 29 ++--- projects/app/src/pages/api/file/temp.ts | 43 ------ .../service/core/dataset/data/controller.ts | 52 +------- .../core/dataset/queues/datasetParse.ts | 57 +------- .../core/dataset/queues/generateVector.ts | 25 +--- 36 files changed, 289 insertions(+), 582 deletions(-) delete mode 100644 projects/app/src/pages/api/file/temp.ts diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 6c308e511a86..1a3935127018 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -139,7 +139,6 @@ export type PushDatasetDataChunkProps = { q?: string; a?: string; imageId?: string; - imageKeys?: string[]; chunkIndex?: number; indexes?: Omit[]; }; diff --git a/packages/global/core/dataset/apiDataset/type.d.ts b/packages/global/core/dataset/apiDataset/type.d.ts index 98546bce867a..1e8758411515 100644 --- a/packages/global/core/dataset/apiDataset/type.d.ts +++ b/packages/global/core/dataset/apiDataset/type.d.ts @@ -40,7 +40,6 @@ export type ApiDatasetServerType = { export type ApiFileReadContentResponse = { title?: string; rawText: string; - imageKeys?: string[]; }; export type APIFileReadResponse = { diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts index 5729ffa8f42b..ec724e8c63d2 100644 --- a/packages/global/core/dataset/controller.d.ts +++ b/packages/global/core/dataset/controller.d.ts @@ -9,7 +9,6 
@@ export type CreateDatasetDataProps = { q: string; a?: string; imageId?: string; - imageKeys?: string[]; indexes?: Omit[]; indexPrefix?: string; }; diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index f0d08347ee1e..e3fee578158e 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -148,7 +148,6 @@ export type DatasetDataFieldType = { q: string; // large chunks or question a?: string; // answer or custom content imageId?: string; - imageKeys?: string[]; }; export type DatasetDataSchemaType = DatasetDataFieldType & { _id: string; @@ -193,7 +192,6 @@ export type DatasetTrainingSchemaType = { q: string; a: string; imageId?: string; - imageKeys?: string[]; imageDescMap?: Record; chunkIndex: number; indexSize?: number; diff --git a/packages/service/common/buffer/rawText/controller.ts b/packages/service/common/buffer/rawText/controller.ts index 25200d4ab01d..d16c9c59e185 100644 --- a/packages/service/common/buffer/rawText/controller.ts +++ b/packages/service/common/buffer/rawText/controller.ts @@ -18,21 +18,18 @@ export const addRawTextBuffer = async ({ sourceId, sourceName, text, - expiredTime, - imageKeys = [] + expiredTime }: { sourceId: string; sourceName: string; text: string; expiredTime: Date; - imageKeys?: string[]; }) => { const gridBucket = getGridBucket(); const metadata = { sourceId, sourceName, - expiredTime, - imageKeys + expiredTime }; const buffer = Buffer.from(text); @@ -109,8 +106,7 @@ export const getRawTextBuffer = async (sourceId: string) => { return { text: rawText, - sourceName: bufferData.metadata?.sourceName || '', - imageKeys: bufferData.metadata?.imageKeys || [] + sourceName: bufferData.metadata?.sourceName || '' }; }); }; diff --git a/packages/service/common/buffer/rawText/schema.ts b/packages/service/common/buffer/rawText/schema.ts index fe485da0e16f..f6e9ea580dbb 100644 --- a/packages/service/common/buffer/rawText/schema.ts +++ b/packages/service/common/buffer/rawText/schema.ts @@ -6,8 +6,7 @@ const RawTextBufferSchema = new Schema({ metadata: { sourceId: { type: String, required: true }, sourceName: { type: String, required: true }, - expiredTime: { type: Date, required: true }, - imageKeys: { type: [String], required: true } + expiredTime: { type: Date, required: true } } }); RawTextBufferSchema.index({ 'metadata.sourceId': 'hashed' }); @@ -19,6 +18,5 @@ export const MongoRawTextBufferSchema = getMongoModel<{ sourceId: string; sourceName: string; expiredTime: Date; - imageKeys: string[]; }; }>(`${bucketName}.files`, RawTextBufferSchema); diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index be5dc779896a..1049ac20f742 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -1,4 +1,3 @@ -import { uploadMongoImg } from '../image/controller'; import FormData from 'form-data'; import fs from 'fs'; import type { ReadFileResponse } from '../../../worker/readFile/type'; @@ -9,12 +8,9 @@ import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; import { useDoc2xServer } from '../../../thirdProvider/doc2x'; import { readRawContentFromBuffer } from '../../../worker/function'; -import { getS3DatasetSource } from '../../s3/sources/dataset'; -import type { ParsedFileContentS3KeyParams } from '../../s3/sources/dataset/type'; -import { getNanoid } from '@fastgpt/global/common/string/tools'; 
-import path from 'path'; -import { S3Sources } from '../../s3/type'; -import { randomUUID } from 'crypto'; +import { uploadImage2S3Bucket } from '../../s3/utils'; +import { Mimes } from '../../s3/constants'; +import { addDays } from 'date-fns'; export type readRawTextByLocalFileParams = { teamId: string; @@ -42,7 +38,8 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam encoding: params.encoding, buffer, imageKeyOptions: { - prefix: params.uploadKey + prefix: params.uploadKey, + expiredTime: addDays(new Date(), 1) } }); }; @@ -71,11 +68,10 @@ export const readS3FileContentByBuffer = async ({ getFormatText?: boolean; imageKeyOptions: { prefix: string; - hasTTL?: boolean; + expiredTime?: Date; }; }): Promise<{ rawText: string; - imageKeys?: string[]; }> => { const systemParse = () => readRawContentFromBuffer({ @@ -171,26 +167,23 @@ export const readS3FileContentByBuffer = async ({ addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `); // markdown data format - const uploadedImageKeys: string[] = []; if (imageList && imageList.length > 0) { addLog.debug(`Processing ${imageList.length} images from parsed document`); await batchRun(imageList, async (item) => { const src = await (async () => { try { - const { prefix, hasTTL } = imageKeyOptions; - const ext = item.mime.split('/')[1].replace('x-', ''); - const imageKey = await getS3DatasetSource().uploadDatasetImage({ + const { prefix, expiredTime } = imageKeyOptions; + const ext = `.${item.mime.split('/')[1].replace('x-', '')}`; + + return await uploadImage2S3Bucket('private', { base64Img: `data:${item.mime};base64,${item.base64}`, - mimetype: `${ext}`, - filename: `${item.uuid}.${ext}`, uploadKey: `${prefix}/${item.uuid}.${ext}`, - hasTTL + mimetype: Mimes[ext as keyof typeof Mimes], + filename: `${item.uuid}.${ext}`, + expiredTime }); - uploadedImageKeys.push(imageKey); - return imageKey; } catch (error) { - // Don't add to uploadedImageKeys if upload failed, but still continue processing return `[Image Upload Failed: ${item.uuid}]`; } })(); @@ -199,63 +192,9 @@ export const readS3FileContentByBuffer = async ({ formatText = formatText.replace(item.uuid, src); } }); - - // Log summary of image processing - addLog.info(`Image processing completed`, { - total: imageList.length, - successful: uploadedImageKeys.length, - failed: imageList.length - uploadedImageKeys.length - }); } - addLog.debug(`Upload file to S3 success, time: ${Date.now() - start}ms`, { - uploadedImageKeysCount: uploadedImageKeys.length, - uploadedImageKeys - }); - return { - rawText: getFormatText ? formatText || rawText : rawText, - imageKeys: uploadedImageKeys + rawText: getFormatText ? formatText || rawText : rawText }; }; - -export const parsedFileContentS3Key = { - // 临时的文件路径(比如 evaluation) - temp: (appId: string) => `chat/${appId}/temp/parsed/${randomUUID()}`, - - // 对话中上传的文件的解析结果的图片的 Key - chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) => - `chat/${appId}/${uId}/${chatId}/parsed`, - - // 上传数据集的文件的解析结果的图片的 Key - dataset: (params: ParsedFileContentS3KeyParams) => { - const { datasetId, mimetype, filename, parentFileKey } = params; - - const extension = mimetype; - const image = (() => { - if (filename) { - return Boolean(path.extname(filename)) - ? `${getNanoid(6)}-${filename}` - : `${getNanoid(6)}-${filename}.${extension}`; - } - return `${getNanoid(6)}.${extension}`; - })(); - - const parentFilename = parentFileKey?.slice().split('/').at(-1); - const parsedParentFilename = parentFilename - ? 
`parsed-${path.basename(parentFilename, path.extname(parentFilename))}`
-      : '';
-    const parsedParentFileKey = parentFileKey
-      ?.split('/')
-      .slice(0, -1)
-      .concat(parsedParentFilename)
-      .join('/');
-
-    return {
-      key: parsedParentFileKey
-        ? `${parsedParentFileKey}/${image}`
-        : [S3Sources.dataset, datasetId, image].join('/'),
-      filename: image
-    };
-  }
-};
diff --git a/packages/service/common/s3/sources/dataset/index.ts b/packages/service/common/s3/sources/dataset/index.ts
index 35579047281b..a6db3714df29 100644
--- a/packages/service/common/s3/sources/dataset/index.ts
+++ b/packages/service/common/s3/sources/dataset/index.ts
@@ -11,9 +11,7 @@ import {
   type GetDatasetFileContentParams,
   GetDatasetFileContentParamsSchema,
   type UploadDatasetFileByBufferParams,
-  UploadDatasetFileByBufferParamsSchema,
-  type UploadDatasetImageParams,
-  UploadDatasetImageParamsSchema
+  UploadDatasetFileByBufferParamsSchema
 } from './type';
 import { MongoS3TTL } from '../../schema';
 import {
@@ -37,7 +35,7 @@ import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
 
 type DatasetObjectKey = `${typeof S3Sources.dataset}/${string}`;
 
 class S3DatasetSource {
-  private bucket: S3PrivateBucket;
+  public bucket: S3PrivateBucket;
   private static instance: S3DatasetSource;
 
   constructor() {
@@ -65,7 +63,11 @@ class S3DatasetSource {
     return await this.bucket.createPostPresignedUrl({ rawKey, filename }, { expiredHours: 3 });
   }
 
-  // Delete by prefix
+  /**
+   * Delete files either by datasetId or by an explicit prefix.
+   * When rawPrefix is provided it takes precedence; otherwise the prefix is built from datasetId.
+   * Used, for example, to delete the images parsed out of a document via the parsed-document prefix.
+   **/
   deleteDatasetFilesByPrefix(params: DeleteDatasetFilesByPrefixParams) {
     const { datasetId, rawPrefix } = DeleteDatasetFilesByPrefixParamsSchema.parse(params);
     const prefix = rawPrefix || [S3Sources.dataset, datasetId].filter(Boolean).join('/');
@@ -132,8 +134,7 @@ class S3DatasetSource {
     if (fileBuffer) {
       return {
         rawText: fileBuffer.text,
-        filename: fileBuffer.sourceName,
-        imageKeys: fileBuffer.imageKeys
+        filename: fileBuffer.sourceName
       };
     }
 
@@ -151,7 +152,7 @@ class S3DatasetSource {
     const encoding = detectFileEncoding(buffer);
     const prefix = `${path.dirname(fileId)}/${path.basename(fileId, path.extname(fileId))}-parsed`;
 
-    const { rawText, imageKeys } = await readS3FileContentByBuffer({
+    const { rawText } = await readS3FileContentByBuffer({
       teamId,
       tmbId,
       extension,
@@ -169,47 +170,15 @@ class S3DatasetSource {
       sourceId: bufferId,
       sourceName: filename,
       text: rawText,
-      expiredTime: addMinutes(new Date(), 20),
-      imageKeys
+      expiredTime: addMinutes(new Date(), 20)
     });
 
     return {
       rawText,
-      filename,
-      imageKeys
+      filename
     };
   }
 
-  // Upload an image
-  async uploadDatasetImage(params: UploadDatasetImageParams): Promise<string> {
-    const {
-      uploadKey,
-      base64Img,
-      mimetype,
-      filename,
-      hasTTL = true
-    } = UploadDatasetImageParamsSchema.parse(params);
-
-    const base64Data = base64Img.split(',')[1] || base64Img;
-    const buffer = Buffer.from(base64Data, 'base64');
-
-    await this.bucket.putObject(uploadKey, buffer, buffer.length, {
-      'content-type': mimetype,
-      'upload-time': new Date().toISOString(),
-      'origin-filename': encodeURIComponent(filename)
-    });
-
-    if (hasTTL) {
-      await MongoS3TTL.create({
-        minioKey: uploadKey,
-        bucketName: this.bucket.name,
-        expiredTime: addDays(new Date(), 7)
-      });
-    }
-
-    return uploadKey;
-  }
-
   // Upload a file from a Buffer
   async uploadDatasetFileByBuffer(params: UploadDatasetFileByBufferParams): Promise<string> {
     const { datasetId, buffer, filename } = UploadDatasetFileByBufferParamsSchema.parse(params);
@@ -227,37 +196,6 @@ class S3DatasetSource {
     });
     return key;
   }
-
-  // Remove the TTL record for a single file
-  async removeDatasetFileTTL(fileKey: string, session?: ClientSession): Promise<void> {
-    await MongoS3TTL.deleteOne(
-      {
-        minioKey: fileKey,
-        bucketName: this.bucket.name
-      },
-      { session }
-    );
-
-    addLog.debug('Removed TTL for dataset file', { fileKey });
-  }
-
-  // Remove TTL records for multiple images
-  async removeDatasetImagesTTL(imageKeys: string[], session?: ClientSession): Promise<void> {
-    if (imageKeys.length === 0) return;
-
-    const result = await MongoS3TTL.deleteMany(
-      {
-        minioKey: { $in: imageKeys },
-        bucketName: this.bucket.name
-      },
-      { session }
-    );
-
-    addLog.debug('Removed TTL for dataset images', {
-      imageKeysCount: imageKeys.length,
-      deletedCount: result.deletedCount
-    });
-  }
 }
 
 export function getS3DatasetSource() {
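For reference, the three methods deleted above are superseded by the shared helpers uploadImage2S3Bucket and removeS3TTL added to packages/service/common/s3/utils.ts later in this patch. A rough sketch of the equivalent calls, assuming the signatures shown in this patch (the wrapper function and its variable names are hypothetical, not repository code):

import { addDays } from 'date-fns';
import { removeS3TTL, uploadImage2S3Bucket } from '../../utils';

// What a former uploadDatasetImage({ ..., hasTTL: true }) call now expands to.
async function uploadDatasetImageExample(datasetId: string, filename: string, base64Img: string) {
  const key = await uploadImage2S3Bucket('private', {
    base64Img,
    uploadKey: `dataset/${datasetId}/${filename}`,
    mimetype: 'image/png', // assumed; pass the real mimetype of base64Img
    filename,
    expiredTime: addDays(new Date(), 7) // replaces the old boolean hasTTL flag
  });

  // removeDatasetFileTTL(key) and removeDatasetImagesTTL(keys) both collapse into
  // removeS3TTL, which accepts either a single key or an array of keys.
  await removeS3TTL({ key, bucketName: 'private' });

  return key;
}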
diff --git a/packages/service/common/s3/sources/dataset/type.ts b/packages/service/common/s3/sources/dataset/type.ts
index 9b8355189b28..81fbefa0b4b7 100644
--- a/packages/service/common/s3/sources/dataset/type.ts
+++ b/packages/service/common/s3/sources/dataset/type.ts
@@ -38,20 +38,11 @@ export const UploadParsedDatasetImagesParamsSchema = z.object({
 });
 export type UploadParsedDatasetImagesParams = z.infer<typeof UploadParsedDatasetImagesParamsSchema>;
 
-export const UploadDatasetImageParamsSchema = z.object({
-  base64Img: z.string().nonempty(),
-  uploadKey: z.string().nonempty(),
-  mimetype: z.string().nonempty(),
-  filename: z.string().nonempty(),
-  hasTTL: z.boolean().optional()
-});
-export type UploadDatasetImageParams = z.infer<typeof UploadDatasetImageParamsSchema>;
-
 export const ParsedFileContentS3KeyParamsSchema = z.object({
   datasetId: ObjectIdSchema,
   mimetype: z.string().nonempty(),
   filename: z.string().optional(),
-  parentFileKey: z.string().optional() // full key of the file being parsed, used as the images' parent directory
+  parsedFileKey: z.string().optional() // full key of the file being parsed, used as the images' parent directory
 });
 export type ParsedFileContentS3KeyParams = z.infer<typeof ParsedFileContentS3KeyParamsSchema>;
 
diff --git a/packages/service/common/s3/type.ts b/packages/service/common/s3/type.ts
index 50881d4d2907..48417d3895fc 100644
--- a/packages/service/common/s3/type.ts
+++ b/packages/service/common/s3/type.ts
@@ -17,7 +17,7 @@ export type ExtensionType = keyof typeof Mimes;
 
 export type S3OptionsType = typeof defaultS3Options;
 
-export const S3SourcesSchema = z.enum(['avatar', 'chat', 'dataset']);
+export const S3SourcesSchema = z.enum(['avatar', 'chat', 'dataset', 'tmp']);
 export const S3Sources = S3SourcesSchema.enum;
 export type S3SourceType = z.infer<typeof S3SourcesSchema>;
 
@@ -57,6 +57,15 @@ export const CreateGetPresignedUrlParamsSchema = z.object({
 });
 export type createPreviewUrlParams = z.infer<typeof CreateGetPresignedUrlParamsSchema>;
 
+export const UploadImage2S3BucketParamsSchema = z.object({
+  base64Img: z.string().nonempty(),
+  uploadKey: z.string().nonempty(),
+  mimetype: z.string().nonempty(),
+  filename: z.string().nonempty(),
+  expiredTime: z.date().optional()
+});
+export type UploadImage2S3BucketParams = z.infer<typeof UploadImage2S3BucketParamsSchema>;
+
 declare global {
   var s3BucketMap: {
     [key: string]: S3BaseBucket;
diff --git a/packages/service/common/s3/utils.ts b/packages/service/common/s3/utils.ts
index 77d8e5d02a22..ede3d9c8078c 100644
--- a/packages/service/common/s3/utils.ts
+++ b/packages/service/common/s3/utils.ts
@@ -1,11 +1,21 @@
 import jwt from 'jsonwebtoken';
-import { differenceInMilliseconds, addDays } from 'date-fns';
+import { addDays, isAfter, differenceInSeconds } from 'date-fns';
 import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
+import type { ClientSession } from 'mongoose';
+import { MongoS3TTL } from './schema';
+import { S3Buckets } from './constants';
+import { S3PrivateBucket } from './buckets/private';
+import { S3Sources, type UploadImage2S3BucketParams } from './type';
+import { S3PublicBucket } from './buckets/public';
+import { getNanoid } from '@fastgpt/global/common/string/tools';
+import path from 'node:path';
+import { randomUUID } from 'node:crypto';
+import type { ParsedFileContentS3KeyParams } from './sources/dataset/type';
 
 export function jwtSignS3ObjectKey(objectKey: string) {
   const secret = process.env.FILE_TOKEN_KEY as string;
   const now = new Date();
-  const expiresIn = differenceInMilliseconds(addDays(now, 90), now);
+  const expiresIn = differenceInSeconds(addDays(now, 90), now);
   const token = jwt.sign({ objectKey }, secret, { expiresIn });
   return token;
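The date-fns swap above is the functional fix in this hunk: jsonwebtoken reads a numeric expiresIn as seconds, so the previous millisecond value (7,776,000,000 for 90 days) yielded tokens valid for roughly 246 years. A small illustration, not repository code (secret and payload are placeholders):

import jwt from 'jsonwebtoken';
import { addDays, differenceInMilliseconds, differenceInSeconds } from 'date-fns';

const now = new Date();
const target = addDays(now, 90);

differenceInMilliseconds(target, now); // 7_776_000_000 -> interpreted as seconds: ~246 years
differenceInSeconds(target, now); // 7_776_000 -> the intended 90 days

const token = jwt.sign({ objectKey: 'dataset/demo-key' }, 'demo-secret', {
  expiresIn: differenceInSeconds(target, now)
});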
@@ -23,3 +33,107 @@ export function jwtVerifyS3ObjectKey(token: string) {
     });
   });
 }
+
+export function removeS3TTL({
+  key,
+  bucketName,
+  session
+}: {
+  key: string[] | string;
+  bucketName: keyof typeof S3Buckets;
+  session?: ClientSession;
+}) {
+  if (!key) return;
+
+  if (Array.isArray(key)) {
+    return MongoS3TTL.deleteMany(
+      {
+        minioKey: { $in: key },
+        bucketName: S3Buckets[bucketName]
+      },
+      { session }
+    );
+  }
+
+  if (typeof key === 'string') {
+    return MongoS3TTL.deleteOne(
+      {
+        minioKey: key,
+        bucketName: S3Buckets[bucketName]
+      },
+      { session }
+    );
+  }
+}
+
+export async function uploadImage2S3Bucket(
+  bucketName: keyof typeof S3Buckets,
+  params: UploadImage2S3BucketParams
+) {
+  const { base64Img, filename, mimetype, uploadKey, expiredTime } = params;
+
+  const bucket = bucketName === 'private' ? new S3PrivateBucket() : new S3PublicBucket();
+
+  const base64Data = base64Img.split(',')[1] || base64Img;
+  const buffer = Buffer.from(base64Data, 'base64');
+
+  await bucket.putObject(uploadKey, buffer, buffer.length, {
+    'content-type': mimetype,
+    'upload-time': new Date().toISOString(),
+    'origin-filename': encodeURIComponent(filename)
+  });
+
+  const now = new Date();
+  if (expiredTime && isAfter(expiredTime, now)) {
+    await MongoS3TTL.create({
+      minioKey: uploadKey,
+      bucketName: bucket.name,
+      expiredTime: expiredTime
+    });
+  }
+
+  return uploadKey;
+}
+
+export const ParsedFileContentS3Key = {
+  // Temporary file paths (e.g. for evaluation)
+  temp: (appId: string) => {
+    return `${S3Sources.chat}/${appId}/temp/${randomUUID()}`;
+  },
+
+  // Key prefix for images parsed out of files uploaded in a chat
+  chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) => {
+    return `${S3Sources.chat}/${appId}/${uId}/${chatId}/parsed`;
+  },
+
+  // Key for images parsed out of files uploaded to a dataset
+  dataset: (params: ParsedFileContentS3KeyParams) => {
+    const { datasetId, mimetype: ext, filename, parsedFileKey } = params;
+
+    const imageName = (() => {
+      const id = getNanoid(6);
+      if (!filename) return `${id}.${ext}`;
+      return !!path.extname(filename) ? `${id}-${filename}` : `${id}-${filename}.${ext}`;
+    })();
+
+    if (!parsedFileKey) {
+      return {
+        key: [S3Sources.dataset, datasetId, imageName].join('/'),
+        filename: imageName
+      };
+    }
+
+    const parsedFileName = parsedFileKey.split('/').at(-1)!;
+    const parsedContentPrefix = `parsed-${path.basename(parsedFileName, path.extname(parsedFileName))}`;
+    const parsedContentKey = parsedFileKey
+      .split('/')
+      .slice(0, -1)
+      .concat(parsedContentPrefix)
+      .join('/');
+
+    return {
+      key: parsedContentKey,
+      filename: imageName
+    };
+  }
+};
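Sample keys produced by the new ParsedFileContentS3Key helpers (all IDs and file names below are invented; nanoid/UUID output is abbreviated). Note that when parsedFileKey is set, dataset() returns the parsed-content directory rather than a per-image key; callers such as readS3FileContentByBuffer append the image name under that prefix themselves:

import { ParsedFileContentS3Key } from '@fastgpt/service/common/s3/utils';

ParsedFileContentS3Key.temp('app1');
// -> 'chat/app1/temp/<uuid>'

ParsedFileContentS3Key.chat({ appId: 'app1', uId: 'user1', chatId: 'chat1' });
// -> 'chat/app1/user1/chat1/parsed'

ParsedFileContentS3Key.dataset({ datasetId: '<objectId>', mimetype: 'png', filename: 'guide.pdf' });
// -> { key: 'dataset/<objectId>/aB3xYz-guide.pdf', filename: 'aB3xYz-guide.pdf' }

ParsedFileContentS3Key.dataset({ datasetId: '<objectId>', mimetype: 'png', parsedFileKey: 'dataset/<objectId>/guide.pdf' });
// -> { key: 'dataset/<objectId>/parsed-guide', filename: 'aB3xYz.png' }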
diff --git a/packages/service/core/chat/saveChat.ts b/packages/service/core/chat/saveChat.ts
index 5684966e7a07..5c73f4ec12b8 100644
--- a/packages/service/core/chat/saveChat.ts
+++ b/packages/service/core/chat/saveChat.ts
@@ -17,8 +17,8 @@ import { MongoAppChatLog } from '../app/logs/chatLogsSchema';
 import { writePrimary } from '../../common/mongo/utils';
 import { MongoChatItemResponse } from './chatItemResponseSchema';
 import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt';
-import { MongoS3TTL } from '../../common/s3/schema';
 import type { ClientSession } from '../../common/mongo';
+import { removeS3TTL } from '../../common/s3/utils';
 
 type Props = {
   chatId: string;
@@ -56,7 +56,6 @@ const afterProcess = async ({
   contents: (UserChatItemType | AIChatItemType)[];
   session: ClientSession;
 }) => {
-  // Remove ttl
   const fileKeys = contents
     .map((item) => {
       if (item.value && Array.isArray(item.value)) {
@@ -72,7 +71,7 @@ const afterProcess = async ({
     .filter(Boolean) as string[];
 
   if (fileKeys.length > 0) {
-    await MongoS3TTL.deleteMany({ minioKey: { $in: fileKeys } }, { session });
+    await removeS3TTL({ key: fileKeys, bucketName: 'private', session });
   }
 };
 
diff --git a/packages/service/core/dataset/apiDataset/custom/api.ts b/packages/service/core/dataset/apiDataset/custom/api.ts
index 6f96f5ea93ad..1af2ffd8134d 100644
--- a/packages/service/core/dataset/apiDataset/custom/api.ts
+++ b/packages/service/core/dataset/apiDataset/custom/api.ts
@@ -150,8 +150,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
     if (content) {
       return {
         title,
-        rawText: content,
-        imageKeys: []
+        rawText: content
       };
     }
     if (previewUrl) {
@@ -160,12 +159,11 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
       if (buffer) {
         return {
           title,
-          rawText: buffer.text,
-          imageKeys: buffer.imageKeys || []
+          rawText: buffer.text
         };
       }
 
-      const { rawText, imageKeys } = await readFileRawTextByUrl({
+      const { rawText } = await readFileRawTextByUrl({
         teamId,
         tmbId,
         url: previewUrl,
@@ -179,14 +177,12 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
         sourceId: previewUrl,
         sourceName: title || '',
         text: rawText,
-        expiredTime: addMinutes(new Date(), 30),
-        imageKeys
+        expiredTime: addMinutes(new Date(), 30)
       });
 
       return {
         title,
-        rawText,
-        imageKeys
+        rawText
      };
     }
     return Promise.reject('Invalid content type: content or previewUrl is required');
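Context for the saveChat change: once a chat is persisted, the S3 keys referenced by its messages lose their TTL records, making those objects permanent. A condensed sketch of that flow with the message shape simplified (the real extraction walks item.value as shown in the hunk above; the type and function here are illustrative only):

import type { ClientSession } from 'mongoose';
import { removeS3TTL } from '../../common/s3/utils';

type SimplifiedChatValue = { file?: { key?: string } };

async function keepChatFilesAlive(values: SimplifiedChatValue[], session?: ClientSession) {
  const fileKeys = values.map((v) => v.file?.key).filter(Boolean) as string[];
  if (fileKeys.length > 0) {
    await removeS3TTL({ key: fileKeys, bucketName: 'private', session });
  }
}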
diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts
index 8ec2419c4351..ea8c24ea39ca 100644
--- a/packages/service/core/dataset/collection/controller.ts
+++ b/packages/service/core/dataset/collection/controller.ts
@@ -36,12 +36,12 @@ import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/cons
 import { clearCollectionImages } from '../image/utils';
 import { getS3DatasetSource } from '../../../common/s3/sources/dataset';
 import path from 'node:path';
+import { removeS3TTL } from '../../../common/s3/utils';
 
 export const createCollectionAndInsertData = async ({
   dataset,
   rawText,
   imageIds,
-  imageKeys,
   createCollectionParams,
   backupParse = false,
   billId,
@@ -50,7 +50,6 @@
   dataset: DatasetSchemaType;
   rawText?: string;
   imageIds?: string[];
-  imageKeys?: string[];
   createCollectionParams: CreateOneCollectionParams;
 
   backupParse?: boolean;
@@ -140,8 +139,7 @@
     customReg: formatCreateCollectionParams.chunkSplitter
       ? [formatCreateCollectionParams.chunkSplitter]
       : [],
-    backupParse,
-    imageKeys
+    backupParse
   });
   return {
     chunks,
@@ -240,9 +238,6 @@
     }
   })();
 
-  // Note: Image TTLs will be removed in generateVector queue after successful insertion to dataset_datas
-  // This improves fault tolerance - if vector generation fails, images remain protected by TTL for retry
-
   return {
     collectionId: String(collectionId),
     insertResults
@@ -304,6 +299,10 @@ export async function createOneCollection({ session, ...props }: CreateOneCollec
     { session, ordered: true }
   );
 
+  if (getS3DatasetSource().isDatasetObjectKey(fileId)) {
+    await removeS3TTL({ key: fileId, bucketName: 'private', session });
+  }
+
   return collection;
 }
 
@@ -371,22 +370,20 @@ export async function delCollection({
   const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
   const collectionIds = collections.map((item) => String(item._id));
 
-  const allS3Keys = await (async () => {
-    const datas = await MongoDatasetData.find(
-      {
-        teamId,
-        datasetId: { $in: datasetIds },
-        collectionId: { $in: collectionIds }
-      },
-      { imageKeys: 1, imageId: 1 }
-    ).lean();
-
-    return datas.flatMap((data) =>
-      Array.from(new Set([...(data.imageKeys || []), data.imageId || ''])).filter((key) =>
-        s3DatasetSource.isDatasetObjectKey(key)
-      )
-    );
-  })();
+  const imageCollectionIds = collections
+    .filter((item) => item.type === DatasetCollectionTypeEnum.images)
+    .map((item) => String(item._id));
+  const imageDatas = await MongoDatasetData.find(
+    {
+      teamId,
+      datasetId: { $in: datasetIds },
+      collectionId: { $in: imageCollectionIds }
+    },
+    { imageId: 1 }
+  ).lean();
+  const imageIds = imageDatas
+    .map((item) => item.imageId)
+    .filter((key) => s3DatasetSource.isDatasetObjectKey(key));
 
   await retryFn(async () => {
     await Promise.all([
@@ -444,11 +441,14 @@
     ).lean();
 
     // delete s3 images which are parsed from docs
-    // collections
-    //   .map((item) => item.fileId)
-    //   .filter((fileId) => s3DatasetSource.isDatasetObjectKey(fileId))
-    //   .map((key) => `${path.dirname(key)}/${path.basename(key, path.extname(key))}-parsed`)
-    //   .forEach((prefix) => s3DatasetSource.deleteDatasetFilesByPrefix({ rawPrefix: prefix }));
-    await s3DatasetSource.deleteDatasetFilesByKeys(allS3Keys);
+    // Delete all images parsed from the document by prefix
+    collections
+      .map((item) => item.fileId)
+      .filter((fileId): fileId is string => !!fileId && s3DatasetSource.isDatasetObjectKey(fileId))
+      .map((key) => `${path.dirname(key)}/${path.basename(key, path.extname(key))}-parsed`)
+      .forEach((prefix) => s3DatasetSource.deleteDatasetFilesByPrefix({ rawPrefix: prefix }));
+
+    // delete s3 images which are uploaded by users
+    await s3DatasetSource.deleteDatasetFilesByKeys(imageIds);
   });
 }
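The prefix derivation used here (and again by the TTL cron in patch 6/6) reads as: strip the extension from the stored file key and append -parsed. A minimal sketch of just that expression (the example key is invented):

import path from 'node:path';

// 'dataset/<datasetId>/aB3xYz-guide.pdf' -> 'dataset/<datasetId>/aB3xYz-guide-parsed'
function parsedImagesPrefix(fileKey: string): string {
  return `${path.dirname(fileKey)}/${path.basename(fileKey, path.extname(fileKey))}-parsed`;
}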
100644
--- a/packages/service/core/dataset/collection/utils.ts
+++ b/packages/service/core/dataset/collection/utils.ts
@@ -161,7 +161,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
     };
   })();
 
-  const { title, rawText, imageKeys } = await readDatasetSourceRawText({
+  const { title, rawText } = await readDatasetSourceRawText({
     teamId: collection.teamId,
     tmbId: collection.tmbId,
     datasetId: collection.datasetId,
@@ -189,7 +189,6 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
       session,
       dataset,
      rawText: rawText,
-      imageKeys,
       createCollectionParams: {
         ...collection,
         name: title || collection.name,
diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts
index 2d1cfefbc87f..8d20378e62a5 100644
--- a/packages/service/core/dataset/data/schema.ts
+++ b/packages/service/core/dataset/data/schema.ts
@@ -40,10 +40,6 @@ const DatasetDataSchema = new Schema({
     type: String
   },
   imageId: String,
-  imageKeys: {
-    type: [String],
-    default: []
-  },
   imageDescMap: Object,
   history: {
     type: [
@@ -109,9 +105,6 @@
   // rebuild data
   DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
 
-  // Query images by collection efficiently
-  DatasetDataSchema.index({ collectionId: 1, imageKeys: 1 });
-
   // Index for querying data whose initJieba field is missing
   DatasetDataSchema.index({ initJieba: 1, updateTime: 1 });
 
diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts
index 569e78b977a0..99dbf3c8aeb6 100644
--- a/packages/service/core/dataset/read.ts
+++ b/packages/service/core/dataset/read.ts
@@ -5,7 +5,7 @@ import {
 import { urlsFetch } from '../../common/string/cheerio';
 import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
 import axios from 'axios';
-import { parsedFileContentS3Key, readS3FileContentByBuffer } from '../../common/file/read/utils';
+import { readS3FileContentByBuffer } from '../../common/file/read/utils';
 import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
 import { getApiDatasetRequest } from './apiDataset';
 import Papa from 'papaparse';
@@ -18,6 +18,7 @@ import { UserError } from '@fastgpt/global/common/error/utils';
 import { getS3DatasetSource } from '../../common/s3/sources/dataset';
 import { Mimes } from '../../common/s3/constants';
 import path from 'node:path';
+import { ParsedFileContentS3Key } from '../../common/s3/utils';
 
 export const readFileRawTextByUrl = async ({
   teamId,
@@ -67,7 +68,7 @@
   const chunks: Buffer[] = [];
   let totalLength = 0;
 
-  return new Promise<{ rawText: string; imageKeys: string[] }>((resolve, reject) => {
+  return new Promise<{ rawText: string }>((resolve, reject) => {
     let isAborted = false;
 
     const cleanup = () => {
@@ -110,12 +111,12 @@
         // Clear the chunks array immediately to free memory
         chunks.length = 0;
 
-        const { rawText, imageKeys } = await retryFn(() => {
-          const key = parsedFileContentS3Key.dataset({
+        const { rawText } = await retryFn(() => {
+          const { key } = ParsedFileContentS3Key.dataset({
             datasetId,
-            mimetype: Mimes[extension as keyof typeof Mimes],
-            filename: 'file'
-          }).key;
+            filename: 'file',
+            mimetype: Mimes[extension as keyof typeof Mimes]
+          });
           const prefix = `${path.dirname(key)}/${path.basename(key, path.extname(key))}-parsed`;
           return readS3FileContentByBuffer({
             customPdfParse,
@@ -131,7 +132,7 @@
           });
         });
 
-        resolve({ rawText, imageKeys: imageKeys || [] 
}); + resolve({ rawText }); } catch (error) { cleanup(); reject(error); @@ -184,14 +185,13 @@ export const readDatasetSourceRawText = async ({ }): Promise<{ title?: string; rawText: string; - imageKeys?: string[]; }> => { if (type === DatasetSourceReadTypeEnum.fileLocal) { if (!datasetId || !getS3DatasetSource().isDatasetObjectKey(sourceId)) { return Promise.reject('datasetId is required for S3 files'); } - const { filename, rawText, imageKeys } = await getS3DatasetSource().getDatasetFileRawText({ + const { filename, rawText } = await getS3DatasetSource().getDatasetFileRawText({ teamId, tmbId, fileId: sourceId, @@ -203,8 +203,7 @@ export const readDatasetSourceRawText = async ({ return { title: filename, - rawText, - imageKeys + rawText }; } else if (type === DatasetSourceReadTypeEnum.link) { const result = await urlsFetch({ @@ -219,12 +218,11 @@ export const readDatasetSourceRawText = async ({ return { title, - rawText: content, - imageKeys: [] // Link sources don't have images, return empty array + rawText: content }; } else if (type === DatasetSourceReadTypeEnum.externalFile) { if (!externalFileId) return Promise.reject(new UserError('FileId not found')); - const { rawText, imageKeys } = await readFileRawTextByUrl({ + const { rawText } = await readFileRawTextByUrl({ teamId, tmbId, url: sourceId, @@ -233,8 +231,7 @@ export const readDatasetSourceRawText = async ({ customPdfParse }); return { - rawText, - imageKeys + rawText }; } else if (type === DatasetSourceReadTypeEnum.apiFile) { const { title, rawText } = await readApiServerFileContent({ @@ -247,14 +244,12 @@ export const readDatasetSourceRawText = async ({ }); return { title, - rawText, - imageKeys: [] // API files don't have imageKeys in current implementation + rawText }; } return { title: '', - rawText: '', - imageKeys: [] + rawText: '' }; }; @@ -292,12 +287,10 @@ export const rawText2Chunks = async ({ backupParse, chunkSize = 512, imageIdList, - imageKeys, ...splitProps }: { rawText: string; imageIdList?: string[]; - imageKeys?: string[]; chunkTriggerType?: ChunkTriggerConfigTypeEnum; chunkTriggerMinSize?: number; // maxSize from agent model, not store @@ -310,48 +303,18 @@ export const rawText2Chunks = async ({ a: string; indexes?: string[]; imageIdList?: string[]; - imageKeys?: string[]; }[] > => { const parseDatasetBackup2Chunks = (rawText: string) => { const csvArr = Papa.parse(rawText).data as string[][]; - const chunks = csvArr .slice(1) - .map((item) => { - const q = item[0] || ''; - const a = item[1] || ''; - const fullText = q + '\n' + a; - - // Extract image keys that are actually referenced in this chunk - const chunkImageKeys = []; - - if (imageKeys && imageKeys.length > 0) { - // Find all markdown image references in the chunk - const imageRefRegex = /!\[[^\]]*\]\(([^)]+)\)/g; - const referencedUrls = new Set(); - let match; - - while ((match = imageRefRegex.exec(fullText)) !== null) { - referencedUrls.add(match[1]); - } - - // Filter imageKeys to only include those referenced in this chunk - for (const imageKey of imageKeys) { - if (referencedUrls.has(imageKey)) { - chunkImageKeys.push(imageKey); - } - } - } - - return { - q, - a, - indexes: item.slice(2).filter((item) => item.trim()), - imageIdList, - imageKeys: chunkImageKeys - }; - }) + .map((item) => ({ + q: item[0] || '', + a: item[1] || '', + indexes: item.slice(2).filter((item) => item.trim()), + imageIdList + })) .filter((item) => item.q || item.a); return { @@ -373,8 +336,7 @@ export const rawText2Chunks = async ({ { q: rawText, a: '', - imageIdList, - 
imageKeys: imageKeys || []
+        imageIdList
       }
     ];
   }
 
@@ -383,7 +345,7 @@
   if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
     const textLength = rawText.trim().length;
     if (textLength < chunkTriggerMinSize) {
-      return [{ q: rawText, a: '', imageIdList, imageKeys: imageKeys || [] }];
+      return [{ q: rawText, a: '', imageIdList }];
     }
   }
 
@@ -393,34 +355,10 @@
     ...splitProps
   });
 
-  return chunks.map((item) => {
-    // Extract image keys that are actually referenced in this chunk
-    const chunkImageKeys = [];
-
-    if (imageKeys && imageKeys.length > 0) {
-      // Find all markdown image references in the chunk
-      const imageRefRegex = /!\[[^\]]*\]\(([^)]+)\)/g;
-      const referencedUrls = new Set();
-      let match;
-
-      while ((match = imageRefRegex.exec(item)) !== null) {
-        referencedUrls.add(match[1]); // match[1] is the URL part
-      }
-
-      // Filter imageKeys to only include those referenced in this chunk
-      for (const imageKey of imageKeys) {
-        if (referencedUrls.has(imageKey)) {
-          chunkImageKeys.push(imageKey);
-        }
-      }
-    }
-
-    return {
-      q: item,
-      a: '',
-      indexes: [],
-      imageIdList,
-      imageKeys: chunkImageKeys
-    };
-  });
+  return chunks.map((item) => ({
+    q: item,
+    a: '',
+    indexes: [],
+    imageIdList
+  }));
 };
diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts
index 8bd35d32cc48..fec4f1bc886e 100644
--- a/packages/service/core/dataset/training/controller.ts
+++ b/packages/service/core/dataset/training/controller.ts
@@ -122,7 +122,6 @@ export async function pushDataListToTrainingQueue({
         ...(item.q && { q: item.q }),
         ...(item.a && { a: item.a }),
         ...(item.imageId && { imageId: item.imageId }),
-        imageKeys: item.imageKeys || [],
         chunkIndex: item.chunkIndex ?? 0,
         indexSize,
         weight: weight ?? 0,
diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts
index a637e9e898fe..a8f723a37d41 100644
--- a/packages/service/core/dataset/training/schema.ts
+++ b/packages/service/core/dataset/training/schema.ts
@@ -64,10 +64,6 @@ const TrainingDataSchema = new Schema({
     default: ''
   },
   imageId: String,
-  imageKeys: {
-    type: [String],
-    default: []
-  },
   imageDescMap: Object,
   chunkIndex: {
     type: Number,
diff --git a/packages/service/core/dataset/utils.ts b/packages/service/core/dataset/utils.ts
index 6d4b34988fee..f8cdd6ab2f06 100644
--- a/packages/service/core/dataset/utils.ts
+++ b/packages/service/core/dataset/utils.ts
@@ -35,21 +35,8 @@ export const filterDatasetsByTmbId = async ({
   return datasetIds.filter((_, index) => permissions[index]);
 };
 
-export async function getFileDatasetInfo(key: string): Promise<{
-  _id: string;
-  datasetId: string;
-  collectionId: string;
-} | null> {
-  return await MongoDatasetData.findOne(
-    { $or: [{ imageKeys: { $in: [key] } }, { imageId: key }] },
-    'datasetId collectionId'
-  )
-    .lean()
-    .exec();
-}
-
 /**
- * Replace S3 object keys in link-format references of dataset-quote markdown with JWT-signed URLs
+ * Replace S3 object keys in image links of dataset-quote markdown text with JWT-signed URLs
  *
  * @param datasetQuoteText the dataset quote text
  * @returns the text after replacement
  */
@@ -80,7 +67,8 @@ export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) {
     const [full, bang, alt, objectKey] = match;
 
     if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) {
-      const url = `${EndpointUrl}/api/system/file/${jwtSignS3ObjectKey(objectKey)}`;
+      const token = jwtSignS3ObjectKey(objectKey);
+      const url = `${EndpointUrl}/api/system/file/${token}`;
       const replacement = `${bang}[${alt}](${url})`;
       content =
         content.slice(0, match.index) + replacement + content.slice(match.index + full.length);
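For context, replaceDatasetQuoteTextWithJWT rewrites non-HTTP image links such as ![alt](dataset/...) into signed /api/system/file/<token> URLs. A minimal sketch of the signing half, with a placeholder object key and relying on the FILE_TOKEN_KEY environment variable used above:

import jwt from 'jsonwebtoken';

const objectKey = 'dataset/<datasetId>/aB3xYz-chart.png'; // hypothetical object key
const token = jwt.sign({ objectKey }, process.env.FILE_TOKEN_KEY as string, {
  expiresIn: 7_776_000 // 90 days, in seconds (see the utils.ts fix earlier in this patch)
});
const url = `/api/system/file/${token}`;
// '![chart](dataset/...)' becomes `![chart](${url})`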
0,
diff --git a/packages/service/core/workflow/dispatch/ai/chat.ts b/packages/service/core/workflow/dispatch/ai/chat.ts
index 370a6a8a65a3..26293bb8d7ae 100644
--- a/packages/service/core/workflow/dispatch/ai/chat.ts
+++ b/packages/service/core/workflow/dispatch/ai/chat.ts
@@ -42,6 +42,7 @@ import { postTextCensor } from '../../../chat/postTextCensor';
 import { createLLMResponse } from '../../../ai/llm/request';
 import { formatModelChars2Points } from '../../../../support/wallet/usage/utils';
 import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils';
+import { ParsedFileContentS3Key } from '../../../../common/s3/utils';
 
 export type ChatProps = ModuleDispatchProps<
   AIChatNodeProps & {
@@ -136,12 +137,13 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise
 
     if (stringQuoteText) {
@@ -379,9 +377,7 @@
       tmbId: runningUserInfo.tmbId,
       customPdfParse,
       usageId,
-      appId,
-      chatId,
-      uId
+      fileS3Prefix
     });
 
   return {
diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts b/packages/service/core/workflow/dispatch/tools/readFiles.ts
index 2b8c8f6dc8af..fe97de4e1773 100644
--- a/packages/service/core/workflow/dispatch/tools/readFiles.ts
+++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts
@@ -7,10 +7,7 @@ import axios from 'axios';
 import { serverRequestBaseUrl } from '../../../../common/api/serverRequest';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { detectFileEncoding, parseUrlToFileType } from '@fastgpt/global/common/file/tools';
-import {
-  parsedFileContentS3Key,
-  readS3FileContentByBuffer
-} from '../../../../common/file/read/utils';
+import { readS3FileContentByBuffer } from 
'../../../../common/file/read/utils'; import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants'; import { type ChatItemType, type UserChatItemValueItemType } from '@fastgpt/global/core/chat/type'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; @@ -20,6 +17,7 @@ import { addMinutes } from 'date-fns'; import { getNodeErrResponse } from '../utils'; import { isInternalAddress } from '../../../../common/system/utils'; import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils'; +import { ParsedFileContentS3Key } from '../../../../common/s3/utils'; type Props = ModuleDispatchProps<{ [NodeInputKeyEnum.fileUrlList]: string[]; @@ -76,9 +74,11 @@ export const dispatchReadFiles = async (props: Props): Promise => { tmbId, customPdfParse, usageId, - appId: props.runningAppInfo.id, - chatId: props.chatId, - uId: props.uid + fileS3Prefix: ParsedFileContentS3Key.chat({ + appId: props.runningAppInfo.id, + chatId: props.chatId!, + uId: props.uid + }) }); return { @@ -132,9 +132,7 @@ export const getFileContentFromLinks = async ({ tmbId, customPdfParse, usageId, - appId, - chatId, - uId + fileS3Prefix }: { urls: string[]; requestOrigin?: string; @@ -143,9 +141,7 @@ export const getFileContentFromLinks = async ({ tmbId: string; customPdfParse?: boolean; usageId?: string; - appId: string; - chatId?: string; - uId: string; + fileS3Prefix: string; }) => { const parseUrlList = urls // Remove invalid urls @@ -236,8 +232,7 @@ export const getFileContentFromLinks = async ({ return detectFileEncoding(buffer); })(); - // Read file - const { rawText, imageKeys } = await readS3FileContentByBuffer({ + const { rawText } = await readS3FileContentByBuffer({ extension, teamId, tmbId, @@ -246,8 +241,7 @@ export const getFileContentFromLinks = async ({ customPdfParse, getFormatText: true, imageKeyOptions: { - prefix: parsedFileContentS3Key.chat({ appId, chatId: chatId!, uId }), - hasTTL: false + prefix: fileS3Prefix }, usageId }); @@ -259,8 +253,7 @@ export const getFileContentFromLinks = async ({ sourceId: url, sourceName: filename, text: replacedText, - expiredTime: addMinutes(new Date(), 20), - imageKeys + expiredTime: addMinutes(new Date(), 20) }); return formatResponseObject({ filename, url, content: replacedText }); diff --git a/packages/service/support/permission/dataset/auth.ts b/packages/service/support/permission/dataset/auth.ts index 1deff53a5b1a..47d61405af49 100644 --- a/packages/service/support/permission/dataset/auth.ts +++ b/packages/service/support/permission/dataset/auth.ts @@ -250,12 +250,11 @@ export async function authDatasetData({ q: datasetData.q, a: datasetData.a, imageId: datasetData.imageId, - imageKeys: datasetData.imageKeys, imagePreivewUrl: datasetData.imageId ? s3DatasetSource.isDatasetObjectKey(datasetData.imageId) ? 
await s3DatasetSource.createGetDatasetFileURL({ key: datasetData.imageId, - expiredHours: 24 + expiredHours: 1 }) : getDatasetImagePreviewUrl({ imageId: datasetData.imageId, diff --git a/projects/app/src/components/Markdown/img/Image.tsx b/projects/app/src/components/Markdown/img/Image.tsx index a42e234af740..30c41e4a5443 100644 --- a/projects/app/src/components/Markdown/img/Image.tsx +++ b/projects/app/src/components/Markdown/img/Image.tsx @@ -16,7 +16,6 @@ const MdImage = ({ const [renderSrc, setRenderSrc] = useState(src); const [isLoading, setIsLoading] = useState(false); - // TODO: 在我迁移完到 JWT 后移除这个 useEffect useEffect(() => { if (!src || (!src.startsWith('dataset/') && !src.startsWith('chat/'))) { setRenderSrc(src); diff --git a/projects/app/src/pages/api/core/app/create.ts b/projects/app/src/pages/api/core/app/create.ts index 43a6b68fc1c8..63660b6ce341 100644 --- a/projects/app/src/pages/api/core/app/create.ts +++ b/projects/app/src/pages/api/core/app/create.ts @@ -175,13 +175,12 @@ export const onCreateApp = async ({ return `${getNanoid(6)}-${last}`; })(); - const copiedAvatar = await s3AvatarSource.copyAvatar({ + return await s3AvatarSource.copyAvatar({ key: template.avatar, teamId, filename, temporary: true }); - return copiedAvatar; })(); const [app] = await MongoApp.create( diff --git a/projects/app/src/pages/api/core/dataset/collection/create/backup.ts b/projects/app/src/pages/api/core/dataset/collection/create/backup.ts index 467bfe916f4b..eec9cd21e227 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/backup.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/backup.ts @@ -3,10 +3,7 @@ import { NextAPI } from '@/service/middleware/entry'; import { getUploadModel } from '@fastgpt/service/common/file/multer'; import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { addLog } from '@fastgpt/service/common/system/log'; -import { - parsedFileContentS3Key, - readRawTextByLocalFile -} from '@fastgpt/service/common/file/read/utils'; +import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; @@ -16,6 +13,7 @@ import { } from '@fastgpt/global/core/dataset/constants'; import { i18nT } from '@fastgpt/web/i18n/utils'; import { uploadFile } from '@fastgpt/service/common/file/gridfs/controller'; +import { ParsedFileContentS3Key } from '@fastgpt/service/common/s3/utils'; export type backupQuery = {}; @@ -52,7 +50,7 @@ async function handler(req: ApiRequestProps, res: ApiRe path: file.path, encoding: file.encoding, getFormatText: false, - uploadKey: parsedFileContentS3Key.dataset({ + uploadKey: ParsedFileContentS3Key.dataset({ datasetId: dataset._id, mimetype: file.mimetype, filename: file.originalname diff --git a/projects/app/src/pages/api/core/dataset/collection/create/images.ts b/projects/app/src/pages/api/core/dataset/collection/create/images.ts index 122071987586..cb4151ea5e51 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/images.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/images.ts @@ -14,13 +14,14 @@ import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import type { NextApiResponse } from 'next'; import { i18nT } from '@fastgpt/web/i18n/utils'; import { authFrequencyLimit } from 
'@/service/common/frequencyLimit/api'; -import { addSeconds } from 'date-fns'; +import { addDays, addSeconds } from 'date-fns'; import { createDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; import { S3Sources } from '@fastgpt/service/common/s3/type'; import { getNanoid } from '@fastgpt/global/common/string/tools'; import fsp from 'node:fs/promises'; import path from 'node:path'; +import { uploadImage2S3Bucket } from '@fastgpt/service/common/s3/utils'; const authUploadLimit = (tmbId: string, num: number) => { if (!global.feConfigs.uploadFileMaxAmount) return; @@ -66,11 +67,12 @@ async function handler( files.map(async (file) => { const filename = path.basename(file.filename); const uploadKey = [S3Sources.dataset, datasetId, `${getNanoid(6)}-${filename}`].join('/'); - return getS3DatasetSource().uploadDatasetImage({ + return uploadImage2S3Bucket('private', { + base64Img: (await fsp.readFile(file.path)).toString('base64'), uploadKey, mimetype: file.mimetype, filename, - base64Img: (await fsp.readFile(file.path)).toString('base64') + expiredTime: addDays(new Date(), 7) }); }) ); diff --git a/projects/app/src/pages/api/core/dataset/collection/create/template.ts b/projects/app/src/pages/api/core/dataset/collection/create/template.ts index 0ad876379928..480a80b6a479 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/template.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/template.ts @@ -3,10 +3,7 @@ import { NextAPI } from '@/service/middleware/entry'; import { getUploadModel } from '@fastgpt/service/common/file/multer'; import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { addLog } from '@fastgpt/service/common/system/log'; -import { - parsedFileContentS3Key, - readRawTextByLocalFile -} from '@fastgpt/service/common/file/read/utils'; +import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; @@ -16,6 +13,7 @@ import { } from '@fastgpt/global/core/dataset/constants'; import { i18nT } from '@fastgpt/web/i18n/utils'; import { uploadFile } from '@fastgpt/service/common/file/gridfs/controller'; +import { ParsedFileContentS3Key } from '@fastgpt/service/common/s3/utils'; export type templateImportQuery = {}; @@ -55,7 +53,7 @@ async function handler( path: file.path, encoding: file.encoding, getFormatText: false, - uploadKey: parsedFileContentS3Key.dataset({ + uploadKey: ParsedFileContentS3Key.dataset({ datasetId: dataset._id, mimetype: file.mimetype, filename: file.originalname diff --git a/projects/app/src/pages/api/core/dataset/collection/create/text.ts b/projects/app/src/pages/api/core/dataset/collection/create/text.ts index 6100be5b79d7..7d778eba0392 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/text.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/text.ts @@ -7,6 +7,7 @@ import { NextAPI } from '@/service/middleware/entry'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { type CreateCollectionResponse } from '@/global/core/dataset/api'; import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; +import { removeS3TTL } from 
'@fastgpt/service/common/s3/utils'; async function handler(req: NextApiRequest): CreateCollectionResponse { const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams; @@ -40,7 +41,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse { } }); - await s3DatasetSource.removeDatasetFileTTL(key); + await removeS3TTL({ key, bucketName: 'private' }); return { collectionId, diff --git a/projects/app/src/pages/api/core/dataset/collection/delete.ts b/projects/app/src/pages/api/core/dataset/collection/delete.ts index 36488944f40a..3840659940e8 100644 --- a/projects/app/src/pages/api/core/dataset/collection/delete.ts +++ b/projects/app/src/pages/api/core/dataset/collection/delete.ts @@ -43,7 +43,7 @@ async function handler(req: ApiRequestProps) teamId, datasetId: collection.datasetId, collectionId, - fields: '_id teamId datasetId fileId metadata' + fields: '_id teamId type datasetId fileId metadata' }); }) ).then((res) => { diff --git a/projects/app/src/pages/api/core/dataset/data/insertImages.ts b/projects/app/src/pages/api/core/dataset/data/insertImages.ts index 87f83c0a4344..2297c69388d6 100644 --- a/projects/app/src/pages/api/core/dataset/data/insertImages.ts +++ b/projects/app/src/pages/api/core/dataset/data/insertImages.ts @@ -1,12 +1,11 @@ import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next'; import { NextAPI } from '@/service/middleware/entry'; import { authFrequencyLimit } from '@/service/common/frequencyLimit/api'; -import { addSeconds } from 'date-fns'; +import { addDays, addSeconds } from 'date-fns'; import { removeFilesByPaths } from '@fastgpt/service/common/file/utils'; import { getUploadModel } from '@fastgpt/service/common/file/multer'; import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; -import { createDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'; @@ -16,7 +15,7 @@ import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; import path from 'node:path'; import fsp from 'node:fs/promises'; -import { parsedFileContentS3Key } from '@fastgpt/service/common/file/read/utils'; +import { ParsedFileContentS3Key, uploadImage2S3Bucket } from '@fastgpt/service/common/s3/utils'; export type insertImagesQuery = {}; @@ -66,15 +65,16 @@ async function handler( // 1. 
Upload images to S3 const imageIds = await Promise.all( files.map(async (file) => - getS3DatasetSource().uploadDatasetImage({ - uploadKey: parsedFileContentS3Key.dataset({ + uploadImage2S3Bucket('private', { + base64Img: (await fsp.readFile(file.path)).toString('base64'), + uploadKey: ParsedFileContentS3Key.dataset({ datasetId: dataset._id, mimetype: file.mimetype, filename: path.basename(file.filename) }).key, mimetype: file.mimetype, filename: path.basename(file.filename), - base64Img: (await fsp.readFile(file.path)).toString('base64') + expiredTime: addDays(new Date(), 7) }) ) ); diff --git a/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts index 03833b5cfc19..6b3e7ea0e1c0 100644 --- a/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts +++ b/projects/app/src/pages/api/core/dataset/presignDatasetFileGetUrl.ts @@ -15,7 +15,6 @@ import { createFileToken } from '@fastgpt/service/support/permission/auth/file'; import { BucketNameEnum, ReadFileBaseUrl } from '@fastgpt/global/common/file/constants'; import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { UserError } from '@fastgpt/global/common/error/utils'; -import { getFileDatasetInfo } from '@fastgpt/service/core/dataset/utils'; async function handler(req: ApiRequestProps) { const parsed = PresignDatasetFileGetUrlSchema.parse(req.body); @@ -25,26 +24,14 @@ async function handler(req: ApiRequestProps) { if ('key' in parsed) { const { key } = parsed; - const dataset = await getFileDatasetInfo(key); - if (!dataset) { - // 如果 `dataset_datas` 中没有找到记录,则这次的请求应该是图片的预览请求,验证 datasetId 的权限即可 - const datasetId = key.split('/')[1] || ''; - await authDataset({ - datasetId, - per: ReadPermissionVal, - req, - authToken: true, - authApiKey: true - }); - } else { - await authDatasetCollection({ - req, - authToken: true, - authApiKey: true, - per: ReadPermissionVal, - collectionId: dataset.collectionId - }); - } + const datasetId = key.split('/')[1] || ''; + await authDataset({ + datasetId, + per: ReadPermissionVal, + req, + authToken: true, + authApiKey: true + }); return await s3DatasetSource.createGetDatasetFileURL({ key, expiredHours: 24 }); } diff --git a/projects/app/src/pages/api/file/temp.ts b/projects/app/src/pages/api/file/temp.ts deleted file mode 100644 index bb5b393e526b..000000000000 --- a/projects/app/src/pages/api/file/temp.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { getGlobalRedisConnection } from '@fastgpt/service/common/redis'; -import { type ApiRequestProps } from '@fastgpt/service/type/next'; -import { - ShortPreviewLinkSchema, - type ShortPreviewLinkParams -} from '@fastgpt/global/core/dataset/v2/api'; -import { authCert } from '@fastgpt/service/support/permission/auth/common'; -import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; -import { NextAPI } from '@/service/middleware/entry'; -import { getS3ChatSource } from '@fastgpt/service/common/s3/sources/chat'; -import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; -import type { NextApiResponse } from 'next'; - -// Short Preview Link -async function handler(req: ApiRequestProps, res: NextApiResponse) { - const parsed = ShortPreviewLinkSchema.parse(req.query); - const { k: redisKey } = parsed; - - await authCert({ req, authToken: true }); - - const redis = getGlobalRedisConnection(); - const objectKey = await redis.get(redisKey); - if (!objectKey) { - res.status(404).end(); - return; - } - - const 
s3ChatSource = getS3ChatSource();
-  const s3DatasetSource = getS3DatasetSource();
-
-  if (s3ChatSource.isChatFileKey(objectKey)) {
-    res.redirect(302, await s3ChatSource.createGetChatFileURL({ key: objectKey, external: true }));
-  } else if (s3DatasetSource.isDatasetObjectKey(objectKey)) {
-    res.redirect(
-      302,
-      await s3DatasetSource.createGetDatasetFileURL({ key: objectKey, external: true })
-    );
-  }
-
-  res.status(404).end();
-}
-
-export default NextAPI(handler);
diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts
index 69f405fc9d4f..82077b635d35 100644
--- a/projects/app/src/service/core/dataset/data/controller.ts
+++ b/projects/app/src/service/core/dataset/data/controller.ts
@@ -18,12 +18,9 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
 import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller';
-import { addLog } from '@fastgpt/service/common/system/log';
 import { text2Chunks } from '@fastgpt/service/worker/function';
 import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset';
-import { getGlobalRedisConnection } from '@fastgpt/service/common/redis';
-import { S3Sources } from '@fastgpt/service/common/s3/type';
-import _ from 'lodash';
+import { removeS3TTL } from '@fastgpt/service/common/s3/utils';
 
 const formatIndexes = async ({
   indexes = [],
@@ -173,7 +170,6 @@
   q,
   a,
   imageId,
-  imageKeys,
   chunkIndex = 0,
   indexSize = 512,
   indexes,
@@ -231,7 +227,6 @@
       q,
       a,
       imageId,
-      imageKeys,
       imageDescMap,
       chunkIndex,
       indexes: results
@@ -254,6 +249,11 @@
     { session, ordered: true }
   );
 
+  // Only remove TTL records for images belonging to image datasets
+  if (getS3DatasetSource().isDatasetObjectKey(imageId)) {
+    await removeS3TTL({ key: imageId, bucketName: 'private', session });
+  }
+
   return {
     insertId: _id,
     tokens
@@ -412,42 +412,6 @@
         idList: deleteVectorIdList
       });
     }
-
-    // Check if there are any images need to be deleted
-    const retrieveS3PreviewKeys = async (q: string) => {
-      const redis = getGlobalRedisConnection();
-      const prefixPattern = Object.values(S3Sources)
-        .map((pattern) => `${pattern}\\/[^\\s)]+`)
-        .join('|');
-      const regex = new RegExp(
-        String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`,
-        'g'
-      );
-
-      const matches = Array.from(q.matchAll(regex));
-      const objectKeys = [];
-
-      for (const match of matches.slice().reverse()) {
-        const [, , , objectKey] = match;
-
-        if (getS3DatasetSource().isDatasetObjectKey(objectKey)) {
-          objectKeys.push(objectKey);
-        }
-      }
-
-      return objectKeys;
-    };
-
-    const objectKeys = await retrieveS3PreviewKeys(q);
-    const differenceKeys = _.difference(mongoData.imageKeys || [], objectKeys);
-    if (differenceKeys.length > 0) {
-      await getS3DatasetSource().deleteDatasetFilesByKeys(differenceKeys);
-    }
-    await MongoDatasetData.updateOne(
-      { _id: mongoData._id },
-      { $set: { imageKeys: objectKeys } },
-      { session }
-    );
   });
 
   return {
@@ -466,9 +430,7 @@
   if (data.imageId) {
     await deleteDatasetImage(data.imageId);
   }
 
-  if (data.imageKeys && data.imageKeys.length > 0) {
-    await getS3DatasetSource().deleteDatasetFilesByKeys(data.imageKeys);
-  }
+  // Note: 
We don't delete parsed images from S3 here - they will be cleaned up when the collection is deleted // 3. Delete vector data await deleteDatasetDataVector({ diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index b5e6a92da754..e6a1878a8d8a 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -236,14 +236,7 @@ export const datasetParseQueue = async (): Promise => { continue; } - // 2. Read source - addLog.info('[Parse Queue] === START PARSING ===', { - collectionId: collection._id, - fileId: collection.fileId, - type: collection.type - }); - - let { title, rawText, imageKeys } = await readDatasetSourceRawText({ + let { title, rawText } = await readDatasetSourceRawText({ teamId: data.teamId, tmbId: data.tmbId, customPdfParse: collection.customPdfParse, @@ -252,13 +245,6 @@ export const datasetParseQueue = async (): Promise => { ...sourceReadType }); - addLog.info('[Parse Queue] Read source result', { - title, - rawTextLength: rawText.length, - imageKeysCount: imageKeys?.length || 0, - imageKeys - }); - // 3. LLM Pargraph const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({ rawText, @@ -288,20 +274,7 @@ export const datasetParseQueue = async (): Promise => { overlapRatio: collection.trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, customReg: collection.chunkSplitter ? [collection.chunkSplitter] : [], - backupParse: collection.trainingType === DatasetCollectionDataProcessModeEnum.backup, - imageKeys - }); - - addLog.debug('[Parse Queue] After chunk split', { - chunksCount: chunks.length, - firstChunkImageKeys: chunks[0]?.imageKeys, - allChunksHaveImageKeys: chunks.every((chunk) => chunk.imageKeys), - detailedChunks: chunks.map((chunk, idx) => ({ - index: idx, - qPreview: chunk.q?.substring(0, 100), - imageKeysCount: chunk.imageKeys?.length || 0, - imageKeys: chunk.imageKeys - })) + backupParse: collection.trainingType === DatasetCollectionDataProcessModeEnum.backup }); // Check dataset limit @@ -345,18 +318,6 @@ export const datasetParseQueue = async (): Promise => { chunkIndex: index })); - addLog.debug('[Parse Queue] Before push to training queue', { - trainingDataCount: trainingData.length, - firstItemImageKeys: trainingData[0]?.imageKeys, - hasImageKeys: trainingData.some((item) => item.imageKeys && item.imageKeys.length > 0), - detailedTrainingData: trainingData.map((item, idx) => ({ - index: idx, - qPreview: item.q?.substring(0, 100), - imageKeysCount: item.imageKeys?.length || 0, - imageKeys: item.imageKeys - })) - }); - await pushDataListToTrainingQueue({ teamId: data.teamId, tmbId: data.tmbId, @@ -383,19 +344,9 @@ export const datasetParseQueue = async (): Promise => { ); // 8. 
Remove file TTL (images TTL will be removed after successful insertion to dataset_datas) - const s3DatasetSource = getS3DatasetSource(); - - addLog.info('[Parse Queue] Before removing file TTL', { - hasFileId: !!collection.fileId, - isS3File: collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId) - }); - // 8.1 For S3 files, remove file TTL only - if (collection.fileId && s3DatasetSource.isDatasetObjectKey(collection.fileId)) { - // Remove file TTL - await s3DatasetSource.removeDatasetFileTTL(collection.fileId, session); - addLog.info('[Parse Queue] Removed file TTL', { fileId: collection.fileId }); - // Note: Image TTLs will be removed in generateVector queue after successful insertion + if (collection.fileId && getS3DatasetSource().isDatasetObjectKey(collection.fileId)) { + // await removeS3TTL({ key: collection.fileId, bucketName: 'private', session }); } // 8.2 For GridFS files (legacy), remove MongoDB image TTL else { diff --git a/projects/app/src/service/core/dataset/queues/generateVector.ts b/projects/app/src/service/core/dataset/queues/generateVector.ts index be52b312ab61..62f475f62b7b 100644 --- a/projects/app/src/service/core/dataset/queues/generateVector.ts +++ b/projects/app/src/service/core/dataset/queues/generateVector.ts @@ -20,8 +20,6 @@ import type { } from '@fastgpt/global/core/dataset/type'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { delay } from '@fastgpt/service/common/bullmq'; -import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; -import { getS3ChatSource } from '@fastgpt/service/common/s3/sources/chat'; const reduceQueue = () => { global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0; @@ -81,7 +79,7 @@ export async function generateVector(): Promise { } ]) .select( - 'teamId tmbId datasetId collectionId q a imageId imageKeys imageDescMap chunkIndex indexSize billId mode retryCount lockTime indexes' + 'teamId tmbId datasetId collectionId q a imageId imageDescMap chunkIndex indexSize billId mode retryCount lockTime indexes' ) .lean(); @@ -262,13 +260,6 @@ const rebuildData = async ({ trainingData }: { trainingData: TrainingDataType }) const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) => { return mongoSessionRun(async (session) => { - addLog.debug('[Vector Queue] insertData - before insert', { - trainingDataId: trainingData._id, - qPreview: trainingData.q?.substring(0, 100), - imageKeysCount: trainingData.imageKeys?.length || 0, - imageKeys: trainingData.imageKeys - }); - // insert new data to dataset const { tokens } = await insertData2Dataset({ teamId: trainingData.teamId, @@ -278,7 +269,6 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) q: trainingData.q, a: trainingData.a, imageId: trainingData.imageId, - imageKeys: trainingData.imageKeys, imageDescMap: trainingData.imageDescMap, chunkIndex: trainingData.chunkIndex, indexSize: @@ -292,19 +282,6 @@ const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) session }); - await (async () => { - const s3DatasetSource = getS3DatasetSource(); - const keys = Array.from( - new Set([...(trainingData.imageKeys ?? []), trainingData.imageId ?? 
'']) - ) - .flat() - .filter((key) => s3DatasetSource.isDatasetObjectKey(key)); - - if (keys.length <= 0) return; - - await s3DatasetSource.removeDatasetImagesTTL(keys, session); - })(); - // delete data from training await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session }); From c9ece0f157ab370e5ef072cf15cd062708ea514a Mon Sep 17 00:00:00 2001 From: xqvvu Date: Thu, 20 Nov 2025 17:38:23 +0800 Subject: [PATCH 6/6] fix: remove parsed images' TTL --- packages/service/common/file/read/utils.ts | 3 +- packages/service/common/s3/controller.ts | 20 ++++++ .../service/common/s3/sources/chat/index.ts | 8 ++- .../service/common/s3/sources/chat/type.ts | 3 +- packages/service/common/s3/utils.ts | 27 ++++++-- .../service/core/dataset/data/controller.ts | 17 +++-- .../service/core/dataset/search/controller.ts | 13 +++- packages/service/core/dataset/utils.ts | 8 +-- .../service/core/workflow/dispatch/ai/chat.ts | 6 +- .../core/workflow/dispatch/ai/tool/index.ts | 9 ++- .../core/workflow/dispatch/tools/readFiles.ts | 25 +++++-- .../app/src/components/Markdown/img/Image.tsx | 66 +++++++++---------- .../pages/api/core/dataset/data/v2/list.ts | 12 ++-- .../api/core/dataset/file/getPreviewChunks.ts | 9 ++- 14 files changed, 152 insertions(+), 74 deletions(-) diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index 1049ac20f742..822d9bd15dc2 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -180,7 +180,7 @@ export const readS3FileContentByBuffer = async ({ base64Img: `data:${item.mime};base64,${item.base64}`, uploadKey: `${prefix}/${item.uuid}.${ext}`, mimetype: Mimes[ext as keyof typeof Mimes], - filename: `${item.uuid}.${ext}`, + filename: `${item.uuid}${ext}`, expiredTime }); } catch (error) { @@ -188,6 +188,7 @@ export const readS3FileContentByBuffer = async ({ } })(); rawText = rawText.replace(item.uuid, src); + // rawText = rawText.replace(item.uuid, jwtSignS3ObjectKey(src, addDays(new Date(), 90))); if (formatText) { formatText = formatText.replace(item.uuid, src); } diff --git a/packages/service/common/s3/controller.ts b/packages/service/common/s3/controller.ts index c77112e05b84..d1bb51c05637 100644 --- a/packages/service/common/s3/controller.ts +++ b/packages/service/common/s3/controller.ts @@ -3,6 +3,7 @@ import { addLog } from '../system/log'; import { setCron } from '../system/cron'; import { checkTimerLock } from '../system/timerLock/utils'; import { TimerIdEnum } from '../system/timerLock/constants'; +import path from 'node:path'; export async function clearExpiredMinioFiles() { try { @@ -26,6 +27,25 @@ export async function clearExpiredMinioFiles() { if (bucket) { await bucket.delete(file.minioKey); + + if (!file.minioKey.includes('-parsed/')) { + try { + const dir = path.dirname(file.minioKey); + const basename = path.basename(file.minioKey); + const ext = path.extname(basename); + + if (ext) { + const nameWithoutExt = path.basename(basename, ext); + const parsedPrefix = `${dir}/${nameWithoutExt}-parsed`; + + await bucket.addDeleteJob({ prefix: parsedPrefix }); + addLog.info(`Scheduled deletion of parsed images: ${parsedPrefix}`); + } + } catch (error) { + addLog.debug(`Failed to schedule parsed images deletion for ${file.minioKey}`); + } + } + await MongoS3TTL.deleteOne({ _id: file._id }); success++; diff --git a/packages/service/common/s3/sources/chat/index.ts b/packages/service/common/s3/sources/chat/index.ts index a98ca1a8e95f..cebaddce0770 100644 --- 
a/packages/service/common/s3/sources/chat/index.ts +++ b/packages/service/common/s3/sources/chat/index.ts @@ -7,6 +7,7 @@ import { ChatFileUploadSchema, DelChatFileByPrefixSchema } from './type'; +import { addHours, differenceInHours } from 'date-fns'; class S3ChatSource { private bucket: S3PrivateBucket; @@ -61,9 +62,12 @@ class S3ChatSource { } async createUploadChatFileURL(params: CheckChatFileKeys) { - const { appId, chatId, uId, filename } = ChatFileUploadSchema.parse(params); + const { appId, chatId, uId, filename, expiredTime } = ChatFileUploadSchema.parse(params); const rawKey = [S3Sources.chat, appId, uId, chatId, `${getNanoid(6)}-${filename}`].join('/'); - return await this.bucket.createPostPresignedUrl({ rawKey, filename }, { expiredHours: 24 }); + return await this.bucket.createPostPresignedUrl( + { rawKey, filename }, + { expiredHours: expiredTime ? differenceInHours(expiredTime, new Date()) : 24 } + ); } deleteChatFilesByPrefix(params: DelChatFileByPrefixParams) { diff --git a/packages/service/common/s3/sources/chat/type.ts index bd245d018c13..bccd7942f9a1 100644 --- a/packages/service/common/s3/sources/chat/type.ts +++ b/packages/service/common/s3/sources/chat/type.ts @@ -5,7 +5,8 @@ export const ChatFileUploadSchema = z.object({ appId: ObjectIdSchema, chatId: z.string().nonempty(), uId: z.string().nonempty(), - filename: z.string().nonempty() + filename: z.string().nonempty(), + expiredTime: z.date().optional() }); export type CheckChatFileKeys = z.infer<typeof ChatFileUploadSchema>; diff --git a/packages/service/common/s3/utils.ts index ede3d9c8078c..2fb5761394d1 100644 --- a/packages/service/common/s3/utils.ts +++ b/packages/service/common/s3/utils.ts @@ -11,14 +11,20 @@ import { getNanoid } from '@fastgpt/global/common/string/tools'; import path from 'node:path'; import { randomUUID } from 'node:crypto'; import type { ParsedFileContentS3KeyParams } from './sources/dataset/type'; - -export function jwtSignS3ObjectKey(objectKey: string) { +import { EndpointUrl } from '@fastgpt/global/common/file/constants'; + +/** + * Sign an S3 object key into a JWT and wrap it in a temporary access URL + * @param objectKey the S3 object key to sign + * @param expiredTime when the signed URL expires + * @returns a URL served by /api/system/file/[jwt] that resolves to the object + */ +export function jwtSignS3ObjectKey(objectKey: string, expiredTime: Date) { const secret = process.env.FILE_TOKEN_KEY as string; - const now = new Date(); - const expiresIn = differenceInSeconds(addDays(now, 90), now); + const expiresIn = differenceInSeconds(expiredTime, new Date()); const token = jwt.sign({ objectKey }, secret, { expiresIn }); - return token; + return `${EndpointUrl}/api/system/file/${token}`; } export function jwtVerifyS3ObjectKey(token: string) { @@ -95,15 +101,22 @@ export async function uploadImage2S3Bucket( return uploadKey; } +export const getFileNameFromPresignedURL = (presignedURL: string) => { + const url = new URL(presignedURL); + const fullname = url.pathname.split('/').pop()!; + const filename = path.basename(fullname, path.extname(fullname)); + return decodeURIComponent(filename); +}; + export const ParsedFileContentS3Key = { // Temporary file path (e.g. for evaluation) temp: (appId: string) => { - return `${S3Sources.chat}/${appId}/temp/${randomUUID()}`; + return `${S3Sources.tmp}/${appId}/temp/${randomUUID()}`; }, // Key for images parsed from files uploaded in a chat chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) => { - return `${S3Sources.chat}/${appId}/${uId}/${chatId}/parsed`; + return `${S3Sources.chat}/${appId}/${uId}/${chatId}`; }, // Key for images parsed from files uploaded to a dataset diff --git a/packages/service/core/dataset/data/controller.ts 
b/packages/service/core/dataset/data/controller.ts index 38dd004c0d2a..a29c5a9b5b6a 100644 --- a/packages/service/core/dataset/data/controller.ts +++ b/packages/service/core/dataset/data/controller.ts @@ -58,15 +58,14 @@ export const formatDatasetDataValue = ({ }; } - const previewUrl = - getS3DatasetSource().isDatasetObjectKey(imageId) || getS3ChatSource().isChatFileKey(imageId) - ? imageId - : getDatasetImagePreviewUrl({ - imageId, - teamId, - datasetId, - expiredMinutes: 60 * 24 * 7 // 7 days - }); + const previewUrl = getS3DatasetSource().isDatasetObjectKey(imageId) + ? imageId + : getDatasetImagePreviewUrl({ + imageId, + teamId, + datasetId, + expiredMinutes: 60 * 24 * 7 // 7 days + }); return { q: `![${q.replaceAll('\n', '')}](${previewUrl})`, diff --git a/packages/service/core/dataset/search/controller.ts index d0662dd374d0..bb54f812472f 100644 --- a/packages/service/core/dataset/search/controller.ts +++ b/packages/service/core/dataset/search/controller.ts @@ -33,6 +33,8 @@ import { datasetSearchQueryExtension } from './utils'; import type { RerankModelItemType } from '@fastgpt/global/core/ai/model.d'; import { formatDatasetDataValue } from '../data/controller'; import { pushTrack } from '../../../common/middle/tracks/utils'; +import { replaceDatasetQuoteTextWithJWT } from '../../../core/dataset/utils'; +import { addHours } from 'date-fns'; export type SearchDatasetDataProps = { histories: ChatItemType[]; @@ -53,7 +55,7 @@ export type SearchDatasetDataProps = { [NodeInputKeyEnum.datasetSearchRerankModel]?: RerankModelItemType; [NodeInputKeyEnum.datasetSearchRerankWeight]?: number; - /* + /* { tags: { $and: ["str1","str2"], @@ -230,7 +232,7 @@ export async function searchDatasetData( }; }; - /* + /* Collection metadata filter Tag filtering: 1. and conditions take effect first @@ -903,10 +905,15 @@ export async function searchDatasetData( // token filter const filterMaxTokensResult = await filterDatasetDataByMaxTokens(scoreFilter, maxTokens); + const finalResult = filterMaxTokensResult.map((item) => { + item.q = replaceDatasetQuoteTextWithJWT(item.q, addHours(new Date(), 1)); + return item; + }); + pushTrack.datasetSearch({ datasetIds, teamId }); return { - searchRes: filterMaxTokensResult, + searchRes: finalResult, embeddingTokens, reRankInputTokens, searchMode, diff --git a/packages/service/core/dataset/utils.ts index f8cdd6ab2f06..d637bc6b86b0 100644 --- a/packages/service/core/dataset/utils.ts +++ b/packages/service/core/dataset/utils.ts @@ -39,18 +39,19 @@ export const filterDatasetsByTmbId = async ({ * Replace the S3 object keys used as image links in dataset quote markdown with JWT-signed URLs * * @param datasetQuoteText dataset quote text + * @param expiredTime expiry time of the signed URLs * @returns the replaced text * * @example * * ```typescript * const datasetQuoteText = '![image.png](dataset/68fee42e1d416bb5ddc85b19/6901c3071ba2bea567e8d8db/aZos7D-214afce5-4d42-4356-9e05-8164d51c59ae.png)'; - * const replacedText = await replaceDatasetQuoteTextWithJWT(datasetQuoteText) + * const replacedText = replaceDatasetQuoteTextWithJWT(datasetQuoteText, addDays(new Date(), 90)) * console.log(replacedText) * // '![image.png](http://localhost:3000/api/system/file/eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvYmplY3RLZXkiOiJjaGF0LzY5MWFlMjlkNDA0ZDA0Njg3MTdkZDc0Ny82OGFkODVhNzQ2MzAwNmM5NjM3OTlhMDcvalhmWHk4eWZHQUZzOVdKcGNXUmJBaFYyL3BhcnNlZC85YTBmNGZlZC00ZWRmLTQ2MTMtYThkNi01MzNhZjVhZTUxZGMucG5nIiwiaWF0IjoxNzYzMzcwOTYwLCJleHAiOjk1MzkzNzA5NjB9.tMDWg0-ZWRnWPNp9Hakd0w1hhaO8jj2oD98SU0wAQYQ)' * ``` */ -export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) { +export function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string, expiredTime: Date) { if (!datasetQuoteText || typeof datasetQuoteText !== 'string') return datasetQuoteText as string; const prefixPattern = Object.values(S3Sources) @@ -67,8 +68,7 @@ export async function replaceDatasetQuoteTextWithJWT(datasetQuoteText: string) { const [full, bang, alt, objectKey] = match; if (s3DatasetSource.isDatasetObjectKey(objectKey) || s3ChatSource.isChatFileKey(objectKey)) { - const token = jwtSignS3ObjectKey(objectKey); - const url = `${EndpointUrl}/api/system/file/${token}`; + const url = jwtSignS3ObjectKey(objectKey, expiredTime); const replacement = `${bang}[${alt}](${url})`; content = content.slice(0, match.index) + replacement + content.slice(match.index + full.length); diff --git a/packages/service/core/workflow/dispatch/ai/chat.ts index 26293bb8d7ae..39bb8ee104a4 100644 --- a/packages/service/core/workflow/dispatch/ai/chat.ts +++ b/packages/service/core/workflow/dispatch/ai/chat.ts @@ -42,7 +42,8 @@ import { postTextCensor } from '../../../chat/postTextCensor'; import { createLLMResponse } from '../../../ai/llm/request'; import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'; import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils'; -import { ParsedFileContentS3Key } from '../../../../common/s3/utils'; +import { getFileNameFromPresignedURL, ParsedFileContentS3Key } from '../../../../common/s3/utils'; +import { addDays } from 'date-fns'; export type ChatProps = ModuleDispatchProps< AIChatNodeProps & { @@ -311,7 +312,8 @@ async function filterDatasetQuote({ : ''; return { - datasetQuoteText: await 
replaceDatasetQuoteTextWithJWT(datasetQuoteText) + // datasetQuoteText: replaceDatasetQuoteTextWithJWT(datasetQuoteText, addDays(new Date(), 90)) + datasetQuoteText }; } diff --git a/packages/service/core/workflow/dispatch/ai/tool/index.ts index a6ddda52839a..6bd3fe76d58b 100644 --- a/packages/service/core/workflow/dispatch/ai/tool/index.ts +++ b/packages/service/core/workflow/dispatch/ai/tool/index.ts @@ -31,6 +31,7 @@ import { postTextCensor } from '../../../../chat/postTextCensor'; import type { FlowNodeInputItemType } from '@fastgpt/global/core/workflow/type/io'; import type { McpToolDataType } from '@fastgpt/global/core/app/tool/mcpTool/type'; import type { JSONSchemaInputType } from '@fastgpt/global/core/app/jsonschema'; +import { ParsedFileContentS3Key } from '../../../../../common/s3/utils'; type Response = DispatchNodeResultType<{ [NodeOutputKeyEnum.answerText]: string; @@ -326,9 +327,11 @@ const getMultiInput = async ({ usageId, teamId: runningUserInfo.teamId, tmbId: runningUserInfo.tmbId, - appId, - chatId, - uId + fileS3Prefix: ParsedFileContentS3Key.chat({ + appId, + chatId: chatId!, + uId + }) }); return { diff --git a/packages/service/core/workflow/dispatch/tools/readFiles.ts index fe97de4e1773..801f6e2deec3 100644 --- a/packages/service/core/workflow/dispatch/tools/readFiles.ts +++ b/packages/service/core/workflow/dispatch/tools/readFiles.ts @@ -13,11 +13,11 @@ import { type ChatItemType, type UserChatItemValueItemType } from '@fastgpt/global/core/chat/type.d'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; import { addLog } from '../../../../common/system/log'; import { addRawTextBuffer, getRawTextBuffer } from '../../../../common/buffer/rawText/controller'; -import { addMinutes } from 'date-fns'; +import { addDays, addMinutes } from 'date-fns'; import { getNodeErrResponse } from '../utils'; import { isInternalAddress } from '../../../../common/system/utils'; import { replaceDatasetQuoteTextWithJWT } from '../../../dataset/utils'; -import { ParsedFileContentS3Key } from '../../../../common/s3/utils'; +import { getFileNameFromPresignedURL, ParsedFileContentS3Key } from '../../../../common/s3/utils'; type Props = ModuleDispatchProps<{ [NodeInputKeyEnum.fileUrlList]: string[]; @@ -241,12 +241,12 @@ export const getFileContentFromLinks = async ({ customPdfParse, getFormatText: true, imageKeyOptions: { - prefix: fileS3Prefix + prefix: `${fileS3Prefix}/${getFileNameFromPresignedURL(url)}-parsed` }, usageId }); - const replacedText = await replaceDatasetQuoteTextWithJWT(rawText); + const replacedText = replaceDatasetQuoteTextWithJWT(rawText, addDays(new Date(), 90)); // Add to buffer addRawTextBuffer({ diff --git a/projects/app/src/components/Markdown/img/Image.tsx index 30c41e4a5443..6139d3b8e52f 100644 --- a/projects/app/src/components/Markdown/img/Image.tsx +++ b/projects/app/src/components/Markdown/img/Image.tsx @@ -16,41 +16,41 @@ const MdImage = ({ const [renderSrc, setRenderSrc] = useState(src); const [isLoading, setIsLoading] = useState(false); - useEffect(() => { - if (!src || (!src.startsWith('dataset/') && !src.startsWith('chat/'))) { - setRenderSrc(src); - return; - } + // useEffect(() => { + // if (!src || (!src.startsWith('dataset/') && !src.startsWith('chat/'))) { + // setRenderSrc(src); + // return; + // } - const loadS3Image = async () => { - try { - setIsLoading(true); - if (src.startsWith('dataset/')) { - const url = await getPresignedDatasetFileGetUrl({ key: src }); - setRenderSrc(url); - } else if (src.startsWith('chat/')) { - const url = await getPresignedChatFileGetUrl({ - key: src, - appId: props.chatAuthData?.appId || '', - outLinkAuthData: { - shareId: props.chatAuthData?.shareId, - outLinkUid: props.chatAuthData?.outLinkUid, - teamId: props.chatAuthData?.teamId, - teamToken: props.chatAuthData?.teamToken - } - }); - setRenderSrc(url); - } - } catch (error) { - console.error('Failed to sign S3 image:', error); - setRenderSrc('/imgs/errImg.png'); - } finally { - setIsLoading(false); - } - }; + // const loadS3Image = async () => { + // try { + // setIsLoading(true); + // if (src.startsWith('dataset/')) { + // const url = await getPresignedDatasetFileGetUrl({ key: src }); + // setRenderSrc(url); + // } else if (src.startsWith('chat/')) { + // const url = await getPresignedChatFileGetUrl({ + // key: src, + // appId: props.chatAuthData?.appId || '', + // outLinkAuthData: { + // shareId: props.chatAuthData?.shareId, + // outLinkUid: props.chatAuthData?.outLinkUid, + // teamId: props.chatAuthData?.teamId, + // teamToken: props.chatAuthData?.teamToken + // } + // }); + // setRenderSrc(url); + // } + // } catch (error) { + // console.error('Failed to sign S3 image:', error); + // setRenderSrc('/imgs/errImg.png'); + // } finally { + // setIsLoading(false); + // } + // }; - loadS3Image(); - }, [src, props.chatAuthData]); + // loadS3Image(); + // }, [src, props.chatAuthData]); if (src?.includes('base64') && !src.startsWith('data:image')) { return Invalid base64 image; diff --git a/projects/app/src/pages/api/core/dataset/data/v2/list.ts index dc7262b1d723..594a2b9acdc8 100644 --- 
a/projects/app/src/pages/api/core/dataset/data/v2/list.ts +++ b/projects/app/src/pages/api/core/dataset/data/v2/list.ts @@ -11,6 +11,9 @@ import { MongoDatasetImageSchema } from '@fastgpt/service/core/dataset/image/sch import { readFromSecondary } from '@fastgpt/service/common/mongo/utils'; import { getDatasetImagePreviewUrl } from '@fastgpt/service/core/dataset/image/utils'; import { getS3DatasetSource } from '@fastgpt/service/common/s3/sources/dataset'; +import { addDays } from 'date-fns'; +import { jwtSignS3ObjectKey } from '@fastgpt/service/common/s3/utils'; +import { replaceDatasetQuoteTextWithJWT } from '@fastgpt/service/core/dataset/utils'; export type GetDatasetDataListProps = PaginationProps & { searchText?: string; @@ -55,6 +58,10 @@ async function handler( MongoDatasetData.countDocuments(match) ]); + list.forEach( + (item) => void (item.q = replaceDatasetQuoteTextWithJWT(item.q, addDays(new Date(), 1))) + ); + const imageIds = list.map((item) => item.imageId!).filter(Boolean); const imageSizeMap = new Map(); const s3DatasetSource = getS3DatasetSource(); @@ -84,10 +91,7 @@ async function handler( const imageSize = item.imageId ? imageSizeMap.get(String(item.imageId)) : undefined; const imagePreviewUrl = item.imageId ? s3DatasetSource.isDatasetObjectKey(item.imageId) - ? await getS3DatasetSource().createGetDatasetFileURL({ - key: item.imageId, - expiredHours: 24 - }) + ? jwtSignS3ObjectKey(item.imageId, addDays(new Date(), 1)) : getDatasetImagePreviewUrl({ imageId: item.imageId, teamId, diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index c50d9cc53190..26c129593689 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -15,6 +15,8 @@ import { import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model'; import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; +import { replaceDatasetQuoteTextWithJWT } from '@fastgpt/service/core/dataset/utils'; +import { addDays } from 'date-fns'; export type PostPreviewFilesChunksProps = ChunkSettingsType & { datasetId: string; @@ -111,8 +113,13 @@ async function handler( customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : [] }); + const chunksWithJWT = chunks.slice(0, 10).map((chunk) => ({ + q: replaceDatasetQuoteTextWithJWT(chunk.q, addDays(new Date(), 1)), + a: replaceDatasetQuoteTextWithJWT(chunk.a, addDays(new Date(), 1)) + })); + return { - chunks: chunks.slice(0, 10), + chunks: chunksWithJWT, total: chunks.length }; }
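
Taken together, these patches converge on one access pattern: instead of handing clients a presigned S3 URL directly, jwtSignS3ObjectKey wraps the object key in a JWT served via /api/system/file/[jwt], which verifies the token and 302-redirects to a presigned GET URL minted at request time. Below is a minimal sketch of that sign/verify round-trip, assuming jsonwebtoken and date-fns as used in the patch; signObjectKey and verifyObjectKey are illustrative stand-ins for jwtSignS3ObjectKey and jwtVerifyS3ObjectKey, and the object key is a made-up example.

```typescript
import jwt from 'jsonwebtoken';
import { addDays, differenceInSeconds } from 'date-fns';

const secret = process.env.FILE_TOKEN_KEY as string;

// Embed the object key in a short-lived JWT. Note the argument order:
// differenceInSeconds(later, earlier) must yield a positive lifetime, the
// same pitfall as the expiredHours computation in createUploadChatFileURL.
function signObjectKey(objectKey: string, expiredTime: Date): string {
  const expiresIn = differenceInSeconds(expiredTime, new Date());
  return jwt.sign({ objectKey }, secret, { expiresIn });
}

// The /api/system/file/[jwt] route does the inverse: verify the token,
// recover the key, then redirect to a presigned S3 GET URL for it.
function verifyObjectKey(token: string): string {
  const payload = jwt.verify(token, secret) as { objectKey: string };
  return payload.objectKey;
}

// Round-trip with a hypothetical dataset key:
const token = signObjectKey('dataset/someDatasetId/someCollectionId/example.png', addDays(new Date(), 1));
console.log(verifyObjectKey(token)); // dataset/someDatasetId/someCollectionId/example.png
```

A consequence of this design is that expiry lives in the token rather than in S3: the object store only ever sees the presigned URL minted at redirect time, so each caller picks an expiry suited to its context (one hour for search results, one day for data lists and preview chunks, 90 days for chat file parsing).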
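The rewrite itself is a single regex pass shared by those call sites. The following is a self-contained sketch of the matching logic in replaceDatasetQuoteTextWithJWT, with two simplifications: S3Sources is reduced to two mock prefixes (the real enum lives in packages/service/common/s3/type and has more members, e.g. tmp), the signer is injected as a callback, and every match is signed rather than re-validated against key ownership as the real function does.

```typescript
// Mocked prefixes; the real S3Sources enum is imported in the patch.
const S3Sources = { dataset: 'dataset', chat: 'chat' } as const;

function rewriteQuoteImageLinks(text: string, sign: (key: string) => string): string {
  const prefixPattern = Object.values(S3Sources)
    .map((prefix) => `${prefix}\\/[^\\s)]+`)
    .join('|');
  // Matches ![alt](key) or [alt](key) whose target is a bare S3 object key.
  // The (?!https?:\/\/) guard skips targets that are already real URLs,
  // which is what makes the rewrite safe to run more than once.
  const regex = new RegExp(
    String.raw`(!?)\[([^\]]+)\]\((?!https?:\/\/)(${prefixPattern})\)`,
    'g'
  );
  return text.replace(regex, (_full, bang, alt, key) => `${bang}[${alt}](${sign(key)})`);
}

// Usage with a fake signer:
const once = rewriteQuoteImageLinks(
  '![image.png](dataset/x/y/z.png)',
  (key) => `https://example.com/api/system/file/token-for-${encodeURIComponent(key)}`
);
console.log(once); // ![image.png](https://example.com/api/system/file/token-for-dataset%2Fx%2Fy%2Fz.png)
console.log(rewriteQuoteImageLinks(once, (key) => key) === once); // true
```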