Skip to content

Commit 0e69f94

Browse files
committed
feat: integrate S3 for dataset with compatibility
1 parent a99b927 commit 0e69f94

File tree

72 files changed

+1780
-420
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

72 files changed

+1780
-420
lines changed

packages/global/common/file/image/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ export const FolderIcon = 'file/fill/folder';
44
export const FolderImgUrl = '/imgs/files/folder.svg';
55
export const HttpPluginImgUrl = '/imgs/app/httpPluginFill.svg';
66
export const HttpImgUrl = '/imgs/workflow/http.png';
7+
export const TempFileURL = '/api/file/temp';

packages/global/core/dataset/api.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ export type PushDatasetDataChunkProps = {
139139
q?: string;
140140
a?: string;
141141
imageId?: string;
142+
imageKeys?: string[];
142143
chunkIndex?: number;
143144
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
144145
};

packages/global/core/dataset/apiDataset/type.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ export type ApiDatasetServerType = {
4040
export type ApiFileReadContentResponse = {
4141
title?: string;
4242
rawText: string;
43+
imageKeys?: string[];
4344
};
4445

4546
export type APIFileReadResponse = {

packages/global/core/dataset/controller.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export type CreateDatasetDataProps = {
99
q: string;
1010
a?: string;
1111
imageId?: string;
12+
imageKeys?: string[];
1213
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
1314
indexPrefix?: string;
1415
};

packages/global/core/dataset/training/type.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ export type PushDataToTrainingQueueProps = {
99

1010
data: PushDatasetDataChunkProps[];
1111
mode?: TrainingModeEnum;
12-
data: PushDatasetDataChunkProps[];
1312

1413
agentModel: string;
1514
vectorModel: string;

packages/global/core/dataset/type.d.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ export type DatasetCollectionSchemaType = ChunkSettingsType & {
117117

118118
rawTextLength?: number;
119119
hashRawText?: string;
120+
120121
metadata?: {
121122
webPageSelector?: string;
122123
relatedImgId?: string; // The id of the associated image collections
@@ -146,6 +147,7 @@ export type DatasetDataFieldType = {
146147
q: string; // large chunks or question
147148
a?: string; // answer or custom content
148149
imageId?: string;
150+
imageKeys?: string[];
149151
};
150152
export type DatasetDataSchemaType = DatasetDataFieldType & {
151153
_id: string;
@@ -190,6 +192,7 @@ export type DatasetTrainingSchemaType = {
190192
q: string;
191193
a: string;
192194
imageId?: string;
195+
imageKeys?: string[];
193196
imageDescMap?: Record<string, string>;
194197
chunkIndex: number;
195198
indexSize?: number;
@@ -249,7 +252,10 @@ export type TagUsageType = {
249252
export type DatasetCollectionItemType = CollectionWithDatasetType & {
250253
sourceName: string;
251254
sourceId?: string;
252-
file?: DatasetFileSchema;
255+
file?: {
256+
filename?: string;
257+
contentLength?: number;
258+
};
253259
permission: DatasetPermission;
254260
indexAmount: number;
255261
errorCount?: number;
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import { ObjectIdSchema } from '../../../common/type/mongo';
2+
import z from 'zod';
3+
4+
export const PresignDatasetFileGetUrlSchema = z.union([
5+
z.object({
6+
key: z
7+
.string()
8+
.nonempty()
9+
.refine((key) => key.startsWith('dataset/'), {
10+
message: 'Invalid key format: must start with "dataset/"'
11+
})
12+
.transform((k) => decodeURIComponent(k)),
13+
preview: z.boolean().optional()
14+
}),
15+
z.object({
16+
collectionId: ObjectIdSchema
17+
// datasetId: ObjectIdSchema
18+
})
19+
]);
20+
export type PresignDatasetFileGetUrlParams = z.infer<typeof PresignDatasetFileGetUrlSchema>;
21+
22+
export const PresignDatasetFilePostUrlSchema = z.object({
23+
filename: z.string().min(1),
24+
datasetId: ObjectIdSchema
25+
});
26+
export type PresignDatasetFilePostUrlParams = z.infer<typeof PresignDatasetFilePostUrlSchema>;
27+
28+
export const ShortPreviewLinkSchema = z.object({
29+
k: z
30+
.string()
31+
.nonempty()
32+
.transform((k) => `chat:temp_file:${decodeURIComponent(k)}`)
33+
});
34+
export type ShortPreviewLinkParams = z.infer<typeof ShortPreviewLinkSchema>;

packages/service/common/buffer/rawText/controller.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,21 @@ export const addRawTextBuffer = async ({
1818
sourceId,
1919
sourceName,
2020
text,
21-
expiredTime
21+
expiredTime,
22+
imageKeys = []
2223
}: {
2324
sourceId: string;
2425
sourceName: string;
2526
text: string;
2627
expiredTime: Date;
28+
imageKeys?: string[];
2729
}) => {
2830
const gridBucket = getGridBucket();
2931
const metadata = {
3032
sourceId,
3133
sourceName,
32-
expiredTime
34+
expiredTime,
35+
imageKeys
3336
};
3437

3538
const buffer = Buffer.from(text);
@@ -106,7 +109,8 @@ export const getRawTextBuffer = async (sourceId: string) => {
106109

107110
return {
108111
text: rawText,
109-
sourceName: bufferData.metadata?.sourceName || ''
112+
sourceName: bufferData.metadata?.sourceName || '',
113+
imageKeys: bufferData.metadata?.imageKeys || []
110114
};
111115
});
112116
};

packages/service/common/buffer/rawText/schema.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ const RawTextBufferSchema = new Schema({
66
metadata: {
77
sourceId: { type: String, required: true },
88
sourceName: { type: String, required: true },
9-
expiredTime: { type: Date, required: true }
9+
expiredTime: { type: Date, required: true },
10+
imageKeys: { type: [String], required: true }
1011
}
1112
});
1213
RawTextBufferSchema.index({ 'metadata.sourceId': 'hashed' });
@@ -18,5 +19,6 @@ export const MongoRawTextBufferSchema = getMongoModel<{
1819
sourceId: string;
1920
sourceName: string;
2021
expiredTime: Date;
22+
imageKeys: string[];
2123
};
2224
}>(`${bucketName}.files`, RawTextBufferSchema);

packages/service/common/file/gridfs/controller.ts

Lines changed: 10 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,12 @@ import fsp from 'fs/promises';
44
import fs from 'fs';
55
import { type DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
66
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
7-
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
8-
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
9-
import { readRawContentByFileBuffer } from '../read/utils';
10-
import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils';
7+
import { detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
8+
import { computeGridFsChunSize, stream2Encoding } from './utils';
119
import { addLog } from '../../system/log';
12-
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
1310
import { Readable } from 'stream';
14-
import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller';
15-
import { addMinutes } from 'date-fns';
1611
import { retryFn } from '@fastgpt/global/common/system/utils';
12+
import { getS3DatasetSource } from '../../s3/sources/dataset';
1713

1814
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
1915
MongoDatasetFileSchema;
@@ -162,11 +158,17 @@ export async function delFileByFileIdList({
162158
fileIdList: string[];
163159
}): Promise<any> {
164160
return retryFn(async () => {
161+
const s3DatasetSource = getS3DatasetSource();
162+
165163
const bucket = getGridBucket(bucketName);
166164

167165
for await (const fileId of fileIdList) {
168166
try {
169-
await bucket.delete(new Types.ObjectId(String(fileId)));
167+
if (s3DatasetSource.isDatasetObjectKey(fileId)) {
168+
await s3DatasetSource.deleteDatasetFileByKey(fileId);
169+
} else {
170+
await bucket.delete(new Types.ObjectId(String(fileId)));
171+
}
170172
} catch (error: any) {
171173
if (typeof error?.message === 'string' && error.message.includes('File not found')) {
172174
addLog.warn('File not found', { fileId });
@@ -189,78 +191,3 @@ export async function getDownloadStream({
189191

190192
return bucket.openDownloadStream(new Types.ObjectId(fileId));
191193
}
192-
193-
export const readFileContentFromMongo = async ({
194-
teamId,
195-
tmbId,
196-
bucketName,
197-
fileId,
198-
customPdfParse = false,
199-
getFormatText,
200-
usageId
201-
}: {
202-
teamId: string;
203-
tmbId: string;
204-
bucketName: `${BucketNameEnum}`;
205-
fileId: string;
206-
customPdfParse?: boolean;
207-
getFormatText?: boolean; // 数据类型都尽可能转化成 markdown 格式
208-
usageId?: string;
209-
}): Promise<{
210-
rawText: string;
211-
filename: string;
212-
}> => {
213-
const bufferId = `${String(fileId)}-${customPdfParse}`;
214-
// read buffer
215-
const fileBuffer = await getRawTextBuffer(bufferId);
216-
if (fileBuffer) {
217-
return {
218-
rawText: fileBuffer.text,
219-
filename: fileBuffer?.sourceName
220-
};
221-
}
222-
223-
const [file, fileStream] = await Promise.all([
224-
getFileById({ bucketName, fileId }),
225-
getDownloadStream({ bucketName, fileId })
226-
]);
227-
if (!file) {
228-
return Promise.reject(CommonErrEnum.fileNotFound);
229-
}
230-
231-
const extension = parseFileExtensionFromUrl(file?.filename);
232-
233-
const start = Date.now();
234-
const fileBuffers = await gridFsStream2Buffer(fileStream);
235-
addLog.debug('get file buffer', { time: Date.now() - start });
236-
237-
const encoding = file?.metadata?.encoding || detectFileEncoding(fileBuffers);
238-
239-
// Get raw text
240-
const { rawText } = await readRawContentByFileBuffer({
241-
customPdfParse,
242-
usageId,
243-
getFormatText,
244-
extension,
245-
teamId,
246-
tmbId,
247-
buffer: fileBuffers,
248-
encoding,
249-
metadata: {
250-
relatedId: fileId
251-
}
252-
});
253-
254-
// Add buffer
255-
addRawTextBuffer({
256-
sourceId: bufferId,
257-
sourceName: file.filename,
258-
text: rawText,
259-
expiredTime: addMinutes(new Date(), 20)
260-
});
261-
262-
return {
263-
rawText,
264-
filename: file.filename
265-
};
266-
};

0 commit comments

Comments
 (0)