Skip to content

Commit 8e8d492

Browse files
committed
fix: remove imageKeys
1 parent b6da886 commit 8e8d492

File tree

36 files changed

+289
-582
lines changed

36 files changed

+289
-582
lines changed

packages/global/core/dataset/api.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ export type PushDatasetDataChunkProps = {
139139
q?: string;
140140
a?: string;
141141
imageId?: string;
142-
imageKeys?: string[];
143142
chunkIndex?: number;
144143
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
145144
};

packages/global/core/dataset/apiDataset/type.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ export type ApiDatasetServerType = {
4040
export type ApiFileReadContentResponse = {
4141
title?: string;
4242
rawText: string;
43-
imageKeys?: string[];
4443
};
4544

4645
export type APIFileReadResponse = {

packages/global/core/dataset/controller.d.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ export type CreateDatasetDataProps = {
99
q: string;
1010
a?: string;
1111
imageId?: string;
12-
imageKeys?: string[];
1312
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
1413
indexPrefix?: string;
1514
};

packages/global/core/dataset/type.d.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,6 @@ export type DatasetDataFieldType = {
148148
q: string; // large chunks or question
149149
a?: string; // answer or custom content
150150
imageId?: string;
151-
imageKeys?: string[];
152151
};
153152
export type DatasetDataSchemaType = DatasetDataFieldType & {
154153
_id: string;
@@ -193,7 +192,6 @@ export type DatasetTrainingSchemaType = {
193192
q: string;
194193
a: string;
195194
imageId?: string;
196-
imageKeys?: string[];
197195
imageDescMap?: Record<string, string>;
198196
chunkIndex: number;
199197
indexSize?: number;

packages/service/common/buffer/rawText/controller.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,18 @@ export const addRawTextBuffer = async ({
1818
sourceId,
1919
sourceName,
2020
text,
21-
expiredTime,
22-
imageKeys = []
21+
expiredTime
2322
}: {
2423
sourceId: string;
2524
sourceName: string;
2625
text: string;
2726
expiredTime: Date;
28-
imageKeys?: string[];
2927
}) => {
3028
const gridBucket = getGridBucket();
3129
const metadata = {
3230
sourceId,
3331
sourceName,
34-
expiredTime,
35-
imageKeys
32+
expiredTime
3633
};
3734

3835
const buffer = Buffer.from(text);
@@ -109,8 +106,7 @@ export const getRawTextBuffer = async (sourceId: string) => {
109106

110107
return {
111108
text: rawText,
112-
sourceName: bufferData.metadata?.sourceName || '',
113-
imageKeys: bufferData.metadata?.imageKeys || []
109+
sourceName: bufferData.metadata?.sourceName || ''
114110
};
115111
});
116112
};

packages/service/common/buffer/rawText/schema.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@ const RawTextBufferSchema = new Schema({
66
metadata: {
77
sourceId: { type: String, required: true },
88
sourceName: { type: String, required: true },
9-
expiredTime: { type: Date, required: true },
10-
imageKeys: { type: [String], required: true }
9+
expiredTime: { type: Date, required: true }
1110
}
1211
});
1312
RawTextBufferSchema.index({ 'metadata.sourceId': 'hashed' });
@@ -19,6 +18,5 @@ export const MongoRawTextBufferSchema = getMongoModel<{
1918
sourceId: string;
2019
sourceName: string;
2120
expiredTime: Date;
22-
imageKeys: string[];
2321
};
2422
}>(`${bucketName}.files`, RawTextBufferSchema);

packages/service/common/file/read/utils.ts

Lines changed: 14 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { uploadMongoImg } from '../image/controller';
21
import FormData from 'form-data';
32
import fs from 'fs';
43
import type { ReadFileResponse } from '../../../worker/readFile/type';
@@ -9,12 +8,9 @@ import { matchMdImg } from '@fastgpt/global/common/string/markdown';
98
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
109
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
1110
import { readRawContentFromBuffer } from '../../../worker/function';
12-
import { getS3DatasetSource } from '../../s3/sources/dataset';
13-
import type { ParsedFileContentS3KeyParams } from '../../s3/sources/dataset/type';
14-
import { getNanoid } from '@fastgpt/global/common/string/tools';
15-
import path from 'path';
16-
import { S3Sources } from '../../s3/type';
17-
import { randomUUID } from 'crypto';
11+
import { uploadImage2S3Bucket } from '../../s3/utils';
12+
import { Mimes } from '../../s3/constants';
13+
import { addDays } from 'date-fns';
1814

1915
export type readRawTextByLocalFileParams = {
2016
teamId: string;
@@ -42,7 +38,8 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam
4238
encoding: params.encoding,
4339
buffer,
4440
imageKeyOptions: {
45-
prefix: params.uploadKey
41+
prefix: params.uploadKey,
42+
expiredTime: addDays(new Date(), 1)
4643
}
4744
});
4845
};
@@ -71,11 +68,10 @@ export const readS3FileContentByBuffer = async ({
7168
getFormatText?: boolean;
7269
imageKeyOptions: {
7370
prefix: string;
74-
hasTTL?: boolean;
71+
expiredTime?: Date;
7572
};
7673
}): Promise<{
7774
rawText: string;
78-
imageKeys?: string[];
7975
}> => {
8076
const systemParse = () =>
8177
readRawContentFromBuffer({
@@ -171,26 +167,23 @@ export const readS3FileContentByBuffer = async ({
171167
addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);
172168

173169
// markdown data format
174-
const uploadedImageKeys: string[] = [];
175170
if (imageList && imageList.length > 0) {
176171
addLog.debug(`Processing ${imageList.length} images from parsed document`);
177172

178173
await batchRun(imageList, async (item) => {
179174
const src = await (async () => {
180175
try {
181-
const { prefix, hasTTL } = imageKeyOptions;
182-
const ext = item.mime.split('/')[1].replace('x-', '');
183-
const imageKey = await getS3DatasetSource().uploadDatasetImage({
176+
const { prefix, expiredTime } = imageKeyOptions;
177+
const ext = `.${item.mime.split('/')[1].replace('x-', '')}`;
178+
179+
return await uploadImage2S3Bucket('private', {
184180
base64Img: `data:${item.mime};base64,${item.base64}`,
185-
mimetype: `${ext}`,
186-
filename: `${item.uuid}.${ext}`,
187181
uploadKey: `${prefix}/${item.uuid}.${ext}`,
188-
hasTTL
182+
mimetype: Mimes[ext as keyof typeof Mimes],
183+
filename: `${item.uuid}.${ext}`,
184+
expiredTime
189185
});
190-
uploadedImageKeys.push(imageKey);
191-
return imageKey;
192186
} catch (error) {
193-
// Don't add to uploadedImageKeys if upload failed, but still continue processing
194187
return `[Image Upload Failed: ${item.uuid}]`;
195188
}
196189
})();
@@ -199,63 +192,9 @@ export const readS3FileContentByBuffer = async ({
199192
formatText = formatText.replace(item.uuid, src);
200193
}
201194
});
202-
203-
// Log summary of image processing
204-
addLog.info(`Image processing completed`, {
205-
total: imageList.length,
206-
successful: uploadedImageKeys.length,
207-
failed: imageList.length - uploadedImageKeys.length
208-
});
209195
}
210196

211-
addLog.debug(`Upload file to S3 success, time: ${Date.now() - start}ms`, {
212-
uploadedImageKeysCount: uploadedImageKeys.length,
213-
uploadedImageKeys
214-
});
215-
216197
return {
217-
rawText: getFormatText ? formatText || rawText : rawText,
218-
imageKeys: uploadedImageKeys
198+
rawText: getFormatText ? formatText || rawText : rawText
219199
};
220200
};
221-
222-
export const parsedFileContentS3Key = {
223-
// 临时的文件路径(比如 evaluation)
224-
temp: (appId: string) => `chat/${appId}/temp/parsed/${randomUUID()}`,
225-
226-
// 对话中上传的文件的解析结果的图片的 Key
227-
chat: ({ appId, chatId, uId }: { chatId: string; uId: string; appId: string }) =>
228-
`chat/${appId}/${uId}/${chatId}/parsed`,
229-
230-
// 上传数据集的文件的解析结果的图片的 Key
231-
dataset: (params: ParsedFileContentS3KeyParams) => {
232-
const { datasetId, mimetype, filename, parentFileKey } = params;
233-
234-
const extension = mimetype;
235-
const image = (() => {
236-
if (filename) {
237-
return Boolean(path.extname(filename))
238-
? `${getNanoid(6)}-${filename}`
239-
: `${getNanoid(6)}-${filename}.${extension}`;
240-
}
241-
return `${getNanoid(6)}.${extension}`;
242-
})();
243-
244-
const parentFilename = parentFileKey?.slice().split('/').at(-1);
245-
const parsedParentFilename = parentFilename
246-
? `parsed-${path.basename(parentFilename, path.extname(parentFilename))}`
247-
: '';
248-
const parsedParentFileKey = parentFileKey
249-
?.split('/')
250-
.slice(0, -1)
251-
.concat(parsedParentFilename)
252-
.join('/');
253-
254-
return {
255-
key: parsedParentFileKey
256-
? `${parsedParentFileKey}/${image}`
257-
: [S3Sources.dataset, datasetId, image].join('/'),
258-
filename: image
259-
};
260-
}
261-
};

packages/service/common/s3/sources/dataset/index.ts

Lines changed: 11 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,7 @@ import {
1111
type GetDatasetFileContentParams,
1212
GetDatasetFileContentParamsSchema,
1313
type UploadDatasetFileByBufferParams,
14-
UploadDatasetFileByBufferParamsSchema,
15-
type UploadDatasetImageParams,
16-
UploadDatasetImageParamsSchema
14+
UploadDatasetFileByBufferParamsSchema
1715
} from './type';
1816
import { MongoS3TTL } from '../../schema';
1917
import {
@@ -37,7 +35,7 @@ import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
3735
type DatasetObjectKey = `${typeof S3Sources.dataset}/${string}`;
3836

3937
class S3DatasetSource {
40-
private bucket: S3PrivateBucket;
38+
public bucket: S3PrivateBucket;
4139
private static instance: S3DatasetSource;
4240

4341
constructor() {
@@ -65,7 +63,11 @@ class S3DatasetSource {
6563
return await this.bucket.createPostPresignedUrl({ rawKey, filename }, { expiredHours: 3 });
6664
}
6765

68-
// 前缀删除
66+
/**
67+
* 可以根据 datasetId 或者 prefix 删除文件
68+
* 如果存在 rawPrefix 则优先使用 rawPrefix 去删除文件,否则使用 datasetId 拼接前缀去删除文件
69+
* 比如根据被解析的文档前缀去删除解析出来的图片
70+
**/
6971
deleteDatasetFilesByPrefix(params: DeleteDatasetFilesByPrefixParams) {
7072
const { datasetId, rawPrefix } = DeleteDatasetFilesByPrefixParamsSchema.parse(params);
7173
const prefix = rawPrefix || [S3Sources.dataset, datasetId].filter(Boolean).join('/');
@@ -132,8 +134,7 @@ class S3DatasetSource {
132134
if (fileBuffer) {
133135
return {
134136
rawText: fileBuffer.text,
135-
filename: fileBuffer.sourceName,
136-
imageKeys: fileBuffer.imageKeys
137+
filename: fileBuffer.sourceName
137138
};
138139
}
139140

@@ -151,7 +152,7 @@ class S3DatasetSource {
151152

152153
const encoding = detectFileEncoding(buffer);
153154
const prefix = `${path.dirname(fileId)}/${path.basename(fileId, path.extname(fileId))}-parsed`;
154-
const { rawText, imageKeys } = await readS3FileContentByBuffer({
155+
const { rawText } = await readS3FileContentByBuffer({
155156
teamId,
156157
tmbId,
157158
extension,
@@ -169,47 +170,15 @@ class S3DatasetSource {
169170
sourceId: bufferId,
170171
sourceName: filename,
171172
text: rawText,
172-
expiredTime: addMinutes(new Date(), 20),
173-
imageKeys
173+
expiredTime: addMinutes(new Date(), 20)
174174
});
175175

176176
return {
177177
rawText,
178-
filename,
179-
imageKeys
178+
filename
180179
};
181180
}
182181

183-
// 上传图片
184-
async uploadDatasetImage(params: UploadDatasetImageParams): Promise<string> {
185-
const {
186-
uploadKey,
187-
base64Img,
188-
mimetype,
189-
filename,
190-
hasTTL = true
191-
} = UploadDatasetImageParamsSchema.parse(params);
192-
193-
const base64Data = base64Img.split(',')[1] || base64Img;
194-
const buffer = Buffer.from(base64Data, 'base64');
195-
196-
await this.bucket.putObject(uploadKey, buffer, buffer.length, {
197-
'content-type': mimetype,
198-
'upload-time': new Date().toISOString(),
199-
'origin-filename': encodeURIComponent(filename)
200-
});
201-
202-
if (hasTTL) {
203-
await MongoS3TTL.create({
204-
minioKey: uploadKey,
205-
bucketName: this.bucket.name,
206-
expiredTime: addDays(new Date(), 7)
207-
});
208-
}
209-
210-
return uploadKey;
211-
}
212-
213182
// 根据文件 Buffer 上传文件
214183
async uploadDatasetFileByBuffer(params: UploadDatasetFileByBufferParams): Promise<string> {
215184
const { datasetId, buffer, filename } = UploadDatasetFileByBufferParamsSchema.parse(params);
@@ -227,37 +196,6 @@ class S3DatasetSource {
227196
});
228197
return key;
229198
}
230-
231-
// 移除单个文件的 TTL 记录
232-
async removeDatasetFileTTL(fileKey: string, session?: ClientSession): Promise<void> {
233-
await MongoS3TTL.deleteOne(
234-
{
235-
minioKey: fileKey,
236-
bucketName: this.bucket.name
237-
},
238-
{ session }
239-
);
240-
241-
addLog.debug('Removed TTL for dataset file', { fileKey });
242-
}
243-
244-
// 移除多个图片的 TTL 记录
245-
async removeDatasetImagesTTL(imageKeys: string[], session?: ClientSession): Promise<void> {
246-
if (imageKeys.length === 0) return;
247-
248-
const result = await MongoS3TTL.deleteMany(
249-
{
250-
minioKey: { $in: imageKeys },
251-
bucketName: this.bucket.name
252-
},
253-
{ session }
254-
);
255-
256-
addLog.debug('Removed TTL for dataset images', {
257-
imageKeysCount: imageKeys.length,
258-
deletedCount: result.deletedCount
259-
});
260-
}
261199
}
262200

263201
export function getS3DatasetSource() {

packages/service/common/s3/sources/dataset/type.ts

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,11 @@ export const UploadParsedDatasetImagesParamsSchema = z.object({
3838
});
3939
export type UploadParsedDatasetImagesParams = z.infer<typeof UploadParsedDatasetImagesParamsSchema>;
4040

41-
export const UploadDatasetImageParamsSchema = z.object({
42-
base64Img: z.string().nonempty(),
43-
uploadKey: z.string().nonempty(),
44-
mimetype: z.string().nonempty(),
45-
filename: z.string().nonempty(),
46-
hasTTL: z.boolean().optional()
47-
});
48-
export type UploadDatasetImageParams = z.infer<typeof UploadDatasetImageParamsSchema>;
49-
5041
export const ParsedFileContentS3KeyParamsSchema = z.object({
5142
datasetId: ObjectIdSchema,
5243
mimetype: z.string().nonempty(),
5344
filename: z.string().optional(),
54-
parentFileKey: z.string().optional() // 被解析的文件的完整key,作为图片的父目录
45+
parsedFileKey: z.string().optional() // 被解析的文件的完整 key,作为图片的父目录
5546
});
5647
export type ParsedFileContentS3KeyParams = z.infer<typeof ParsedFileContentS3KeyParamsSchema>;
5748

0 commit comments

Comments
 (0)