Skip to content

Commit 0e69f94

Browse files
committed
feat: integrate S3 for dataset with compatibility
1 parent a99b927 commit 0e69f94

File tree

72 files changed

+1780
-420
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

72 files changed

+1780
-420
lines changed

packages/global/common/file/image/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ export const FolderIcon = 'file/fill/folder';
44
export const FolderImgUrl = '/imgs/files/folder.svg';
55
export const HttpPluginImgUrl = '/imgs/app/httpPluginFill.svg';
66
export const HttpImgUrl = '/imgs/workflow/http.png';
7+
export const TempFileURL = '/api/file/temp';

packages/global/core/dataset/api.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ export type PushDatasetDataChunkProps = {
139139
q?: string;
140140
a?: string;
141141
imageId?: string;
142+
imageKeys?: string[];
142143
chunkIndex?: number;
143144
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
144145
};

packages/global/core/dataset/apiDataset/type.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ export type ApiDatasetServerType = {
4040
export type ApiFileReadContentResponse = {
4141
title?: string;
4242
rawText: string;
43+
imageKeys?: string[];
4344
};
4445

4546
export type APIFileReadResponse = {

packages/global/core/dataset/controller.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export type CreateDatasetDataProps = {
99
q: string;
1010
a?: string;
1111
imageId?: string;
12+
imageKeys?: string[];
1213
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
1314
indexPrefix?: string;
1415
};

packages/global/core/dataset/training/type.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ export type PushDataToTrainingQueueProps = {
99

1010
data: PushDatasetDataChunkProps[];
1111
mode?: TrainingModeEnum;
12-
data: PushDatasetDataChunkProps[];
1312

1413
agentModel: string;
1514
vectorModel: string;

packages/global/core/dataset/type.d.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ export type DatasetCollectionSchemaType = ChunkSettingsType & {
117117

118118
rawTextLength?: number;
119119
hashRawText?: string;
120+
120121
metadata?: {
121122
webPageSelector?: string;
122123
relatedImgId?: string; // The id of the associated image collections
@@ -146,6 +147,7 @@ export type DatasetDataFieldType = {
146147
q: string; // large chunks or question
147148
a?: string; // answer or custom content
148149
imageId?: string;
150+
imageKeys?: string[];
149151
};
150152
export type DatasetDataSchemaType = DatasetDataFieldType & {
151153
_id: string;
@@ -190,6 +192,7 @@ export type DatasetTrainingSchemaType = {
190192
q: string;
191193
a: string;
192194
imageId?: string;
195+
imageKeys?: string[];
193196
imageDescMap?: Record<string, string>;
194197
chunkIndex: number;
195198
indexSize?: number;
@@ -249,7 +252,10 @@ export type TagUsageType = {
249252
export type DatasetCollectionItemType = CollectionWithDatasetType & {
250253
sourceName: string;
251254
sourceId?: string;
252-
file?: DatasetFileSchema;
255+
file?: {
256+
filename?: string;
257+
contentLength?: number;
258+
};
253259
permission: DatasetPermission;
254260
indexAmount: number;
255261
errorCount?: number;
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import { ObjectIdSchema } from '../../../common/type/mongo';
2+
import z from 'zod';
3+
4+
export const PresignDatasetFileGetUrlSchema = z.union([
5+
z.object({
6+
key: z
7+
.string()
8+
.nonempty()
9+
.refine((key) => key.startsWith('dataset/'), {
10+
message: 'Invalid key format: must start with "dataset/"'
11+
})
12+
.transform((k) => decodeURIComponent(k)),
13+
preview: z.boolean().optional()
14+
}),
15+
z.object({
16+
collectionId: ObjectIdSchema
17+
// datasetId: ObjectIdSchema
18+
})
19+
]);
20+
export type PresignDatasetFileGetUrlParams = z.infer<typeof PresignDatasetFileGetUrlSchema>;
21+
22+
export const PresignDatasetFilePostUrlSchema = z.object({
23+
filename: z.string().min(1),
24+
datasetId: ObjectIdSchema
25+
});
26+
export type PresignDatasetFilePostUrlParams = z.infer<typeof PresignDatasetFilePostUrlSchema>;
27+
28+
export const ShortPreviewLinkSchema = z.object({
29+
k: z
30+
.string()
31+
.nonempty()
32+
.transform((k) => `chat:temp_file:${decodeURIComponent(k)}`)
33+
});
34+
export type ShortPreviewLinkParams = z.infer<typeof ShortPreviewLinkSchema>;

packages/service/common/buffer/rawText/controller.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,21 @@ export const addRawTextBuffer = async ({
1818
sourceId,
1919
sourceName,
2020
text,
21-
expiredTime
21+
expiredTime,
22+
imageKeys = []
2223
}: {
2324
sourceId: string;
2425
sourceName: string;
2526
text: string;
2627
expiredTime: Date;
28+
imageKeys?: string[];
2729
}) => {
2830
const gridBucket = getGridBucket();
2931
const metadata = {
3032
sourceId,
3133
sourceName,
32-
expiredTime
34+
expiredTime,
35+
imageKeys
3336
};
3437

3538
const buffer = Buffer.from(text);
@@ -106,7 +109,8 @@ export const getRawTextBuffer = async (sourceId: string) => {
106109

107110
return {
108111
text: rawText,
109-
sourceName: bufferData.metadata?.sourceName || ''
112+
sourceName: bufferData.metadata?.sourceName || '',
113+
imageKeys: bufferData.metadata?.imageKeys || []
110114
};
111115
});
112116
};

packages/service/common/buffer/rawText/schema.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ const RawTextBufferSchema = new Schema({
66
metadata: {
77
sourceId: { type: String, required: true },
88
sourceName: { type: String, required: true },
9-
expiredTime: { type: Date, required: true }
9+
expiredTime: { type: Date, required: true },
10+
imageKeys: { type: [String], required: true }
1011
}
1112
});
1213
RawTextBufferSchema.index({ 'metadata.sourceId': 'hashed' });
@@ -18,5 +19,6 @@ export const MongoRawTextBufferSchema = getMongoModel<{
1819
sourceId: string;
1920
sourceName: string;
2021
expiredTime: Date;
22+
imageKeys: string[];
2123
};
2224
}>(`${bucketName}.files`, RawTextBufferSchema);

packages/service/common/file/gridfs/controller.ts

Lines changed: 10 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,12 @@ import fsp from 'fs/promises';
44
import fs from 'fs';
55
import { type DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
66
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
7-
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
8-
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
9-
import { readRawContentByFileBuffer } from '../read/utils';
10-
import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils';
7+
import { detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
8+
import { computeGridFsChunSize, stream2Encoding } from './utils';
119
import { addLog } from '../../system/log';
12-
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
1310
import { Readable } from 'stream';
14-
import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller';
15-
import { addMinutes } from 'date-fns';
1611
import { retryFn } from '@fastgpt/global/common/system/utils';
12+
import { getS3DatasetSource } from '../../s3/sources/dataset';
1713

1814
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
1915
MongoDatasetFileSchema;
@@ -162,11 +158,17 @@ export async function delFileByFileIdList({
162158
fileIdList: string[];
163159
}): Promise<any> {
164160
return retryFn(async () => {
161+
const s3DatasetSource = getS3DatasetSource();
162+
165163
const bucket = getGridBucket(bucketName);
166164

167165
for await (const fileId of fileIdList) {
168166
try {
169-
await bucket.delete(new Types.ObjectId(String(fileId)));
167+
if (s3DatasetSource.isDatasetObjectKey(fileId)) {
168+
await s3DatasetSource.deleteDatasetFileByKey(fileId);
169+
} else {
170+
await bucket.delete(new Types.ObjectId(String(fileId)));
171+
}
170172
} catch (error: any) {
171173
if (typeof error?.message === 'string' && error.message.includes('File not found')) {
172174
addLog.warn('File not found', { fileId });
@@ -189,78 +191,3 @@ export async function getDownloadStream({
189191

190192
return bucket.openDownloadStream(new Types.ObjectId(fileId));
191193
}
192-
193-
export const readFileContentFromMongo = async ({
194-
teamId,
195-
tmbId,
196-
bucketName,
197-
fileId,
198-
customPdfParse = false,
199-
getFormatText,
200-
usageId
201-
}: {
202-
teamId: string;
203-
tmbId: string;
204-
bucketName: `${BucketNameEnum}`;
205-
fileId: string;
206-
customPdfParse?: boolean;
207-
getFormatText?: boolean; // 数据类型都尽可能转化成 markdown 格式
208-
usageId?: string;
209-
}): Promise<{
210-
rawText: string;
211-
filename: string;
212-
}> => {
213-
const bufferId = `${String(fileId)}-${customPdfParse}`;
214-
// read buffer
215-
const fileBuffer = await getRawTextBuffer(bufferId);
216-
if (fileBuffer) {
217-
return {
218-
rawText: fileBuffer.text,
219-
filename: fileBuffer?.sourceName
220-
};
221-
}
222-
223-
const [file, fileStream] = await Promise.all([
224-
getFileById({ bucketName, fileId }),
225-
getDownloadStream({ bucketName, fileId })
226-
]);
227-
if (!file) {
228-
return Promise.reject(CommonErrEnum.fileNotFound);
229-
}
230-
231-
const extension = parseFileExtensionFromUrl(file?.filename);
232-
233-
const start = Date.now();
234-
const fileBuffers = await gridFsStream2Buffer(fileStream);
235-
addLog.debug('get file buffer', { time: Date.now() - start });
236-
237-
const encoding = file?.metadata?.encoding || detectFileEncoding(fileBuffers);
238-
239-
// Get raw text
240-
const { rawText } = await readRawContentByFileBuffer({
241-
customPdfParse,
242-
usageId,
243-
getFormatText,
244-
extension,
245-
teamId,
246-
tmbId,
247-
buffer: fileBuffers,
248-
encoding,
249-
metadata: {
250-
relatedId: fileId
251-
}
252-
});
253-
254-
// Add buffer
255-
addRawTextBuffer({
256-
sourceId: bufferId,
257-
sourceName: file.filename,
258-
text: rawText,
259-
expiredTime: addMinutes(new Date(), 20)
260-
});
261-
262-
return {
263-
rawText,
264-
filename: file.filename
265-
};
266-
};

0 commit comments

Comments
 (0)