1 change: 1 addition & 0 deletions packages/global/common/file/image/constants.ts
@@ -4,3 +4,4 @@ export const FolderIcon = 'file/fill/folder';
export const FolderImgUrl = '/imgs/files/folder.svg';
export const HttpPluginImgUrl = '/imgs/app/httpPluginFill.svg';
export const HttpImgUrl = '/imgs/workflow/http.png';
export const TempFileURL = '/api/file/temp';
1 change: 0 additions & 1 deletion packages/global/core/dataset/training/type.d.ts
@@ -9,7 +9,6 @@ export type PushDataToTrainingQueueProps = {

data: PushDatasetDataChunkProps[];
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];

agentModel: string;
vectorModel: string;
6 changes: 5 additions & 1 deletion packages/global/core/dataset/type.d.ts
@@ -118,6 +118,7 @@ export type DatasetCollectionSchemaType = ChunkSettingsType & {

rawTextLength?: number;
hashRawText?: string;

metadata?: {
webPageSelector?: string;
relatedImgId?: string; // The id of the associated image collections
@@ -250,7 +251,10 @@ export type TagUsageType = {
export type DatasetCollectionItemType = CollectionWithDatasetType & {
sourceName: string;
sourceId?: string;
file?: DatasetFileSchema;
file?: {
filename?: string;
contentLength?: number;
};
permission: DatasetPermission;
indexAmount: number;
errorCount?: number;
34 changes: 34 additions & 0 deletions packages/global/core/dataset/v2/api.ts
@@ -0,0 +1,34 @@
import { ObjectIdSchema } from '../../../common/type/mongo';
import z from 'zod';

export const PresignDatasetFileGetUrlSchema = z.union([
z.object({
key: z
.string()
.nonempty()
.refine((key) => key.startsWith('dataset/'), {
message: 'Invalid key format: must start with "dataset/"'
})
.transform((k) => decodeURIComponent(k)),
preview: z.boolean().optional()
}),
z.object({
collectionId: ObjectIdSchema
// datasetId: ObjectIdSchema
})
]);
export type PresignDatasetFileGetUrlParams = z.infer<typeof PresignDatasetFileGetUrlSchema>;

export const PresignDatasetFilePostUrlSchema = z.object({
filename: z.string().min(1),
datasetId: ObjectIdSchema
});
export type PresignDatasetFilePostUrlParams = z.infer<typeof PresignDatasetFilePostUrlSchema>;

export const ShortPreviewLinkSchema = z.object({
k: z
.string()
.nonempty()
.transform((k) => `chat:temp_file:${decodeURIComponent(k)}`)
});
export type ShortPreviewLinkParams = z.infer<typeof ShortPreviewLinkSchema>;
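
For context, a minimal sketch of how these schemas might be consumed; the import path and the key layout after the `dataset/` prefix are assumptions, not part of this diff:

```ts
import { PresignDatasetFileGetUrlSchema } from '@fastgpt/global/core/dataset/v2/api';

// Key-based variant: the refine rejects keys outside the "dataset/" prefix,
// and the transform URL-decodes the key before it reaches business logic.
const parsed = PresignDatasetFileGetUrlSchema.parse({
  key: 'dataset/team-id/report%20v2.pdf', // assumed key layout
  preview: true
});
if ('key' in parsed) {
  console.log(parsed.key); // "dataset/team-id/report v2.pdf"
}

// Collection-based variant: a bare collectionId also satisfies the union.
PresignDatasetFileGetUrlSchema.parse({ collectionId: '675fd3b2e4b0c8a1d2f3a4b5' });
```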
93 changes: 10 additions & 83 deletions packages/service/common/file/gridfs/controller.ts
@@ -4,16 +4,12 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { type DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { readRawContentByFileBuffer } from '../read/utils';
import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils';
import { detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { computeGridFsChunSize, stream2Encoding } from './utils';
import { addLog } from '../../system/log';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { Readable } from 'stream';
import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller';
import { addMinutes } from 'date-fns';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getS3DatasetSource } from '../../s3/sources/dataset';

export function getGFSCollection(bucket: `${BucketNameEnum}`) {
MongoDatasetFileSchema;
@@ -162,11 +158,17 @@ export async function delFileByFileIdList({
fileIdList: string[];
}): Promise<any> {
return retryFn(async () => {
const s3DatasetSource = getS3DatasetSource();

const bucket = getGridBucket(bucketName);

for await (const fileId of fileIdList) {
try {
await bucket.delete(new Types.ObjectId(String(fileId)));
if (s3DatasetSource.isDatasetObjectKey(fileId)) {
await s3DatasetSource.deleteDatasetFileByKey(fileId);
} else {
await bucket.delete(new Types.ObjectId(String(fileId)));
}
} catch (error: any) {
if (typeof error?.message === 'string' && error.message.includes('File not found')) {
addLog.warn('File not found', { fileId });
@@ -189,78 +191,3 @@ export async function getDownloadStream({

return bucket.openDownloadStream(new Types.ObjectId(fileId));
}

export const readFileContentFromMongo = async ({
teamId,
tmbId,
bucketName,
fileId,
customPdfParse = false,
getFormatText,
usageId
}: {
teamId: string;
tmbId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
customPdfParse?: boolean;
getFormatText?: boolean; // Convert all data types to markdown format where possible
usageId?: string;
}): Promise<{
rawText: string;
filename: string;
}> => {
const bufferId = `${String(fileId)}-${customPdfParse}`;
// read buffer
const fileBuffer = await getRawTextBuffer(bufferId);
if (fileBuffer) {
return {
rawText: fileBuffer.text,
filename: fileBuffer?.sourceName
};
}

const [file, fileStream] = await Promise.all([
getFileById({ bucketName, fileId }),
getDownloadStream({ bucketName, fileId })
]);
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}

const extension = parseFileExtensionFromUrl(file?.filename);

const start = Date.now();
const fileBuffers = await gridFsStream2Buffer(fileStream);
addLog.debug('get file buffer', { time: Date.now() - start });

const encoding = file?.metadata?.encoding || detectFileEncoding(fileBuffers);

// Get raw text
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
usageId,
getFormatText,
extension,
teamId,
tmbId,
buffer: fileBuffers,
encoding,
metadata: {
relatedId: fileId
}
});

// Add buffer
addRawTextBuffer({
sourceId: bufferId,
sourceName: file.filename,
text: rawText,
expiredTime: addMinutes(new Date(), 20)
});

return {
rawText,
filename: file.filename
};
};
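
The new branch in `delFileByFileIdList` means a stored `fileId` can now be either an S3 object key or a legacy GridFS ObjectId. A minimal sketch of that discriminator, assuming `isDatasetObjectKey` reduces to the same `dataset/` prefix check the Zod schema above enforces (the real implementation lives behind `getS3DatasetSource` and is not shown in this diff):

```ts
import { Types } from 'mongoose';

// Assumed discriminator: dataset S3 keys carry the "dataset/" prefix,
// while legacy GridFS ids are 24-character hex ObjectIds.
const isDatasetObjectKey = (fileId: string) => fileId.startsWith('dataset/');

console.log(isDatasetObjectKey('dataset/team-id/abc-report.pdf')); // true  -> S3 delete
console.log(isDatasetObjectKey('675fd3b2e4b0c8a1d2f3a4b5'));       // false -> GridFS delete
console.log(Types.ObjectId.isValid('675fd3b2e4b0c8a1d2f3a4b5'));   // true
```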
29 changes: 19 additions & 10 deletions packages/service/common/file/image/controller.ts
@@ -64,23 +64,28 @@ export async function uploadMongoImg({
export const copyAvatarImage = async ({
teamId,
imageUrl,
ttl,
temporary,
session
}: {
teamId: string;
imageUrl: string;
ttl: boolean;
temporary: boolean;
session?: ClientSession;
}) => {
if (!imageUrl) return;

// S3
if (imageUrl.startsWith(`${imageBaseUrl}/${S3Sources.avatar}`)) {
const extendName = path.extname(imageUrl);
const avatarSource = getS3AvatarSource();
if (avatarSource.isAvatarKey(imageUrl)) {
const filename = (() => {
const last = imageUrl.split('/').pop()?.split('-')[1];
if (!last) return getNanoid(6).concat(path.extname(imageUrl));
return `${getNanoid(6)}-${last}`;
})();
const key = await getS3AvatarSource().copyAvatar({
sourceKey: imageUrl.slice(imageBaseUrl.length),
targetKey: `${S3Sources.avatar}/${teamId}/${getNanoid(6)}${extendName}`,
ttl
key: imageUrl,
teamId,
filename,
temporary
});
return key;
}
@@ -130,9 +135,13 @@ export const removeImageByPath = (path?: string, session?: ClientSession) => {
if (!name) return;

const id = name.split('.')[0];
if (!id || !Types.ObjectId.isValid(id)) return;
if (!id) return;

return MongoImage.deleteOne({ _id: id }, { session });
if (Types.ObjectId.isValid(id)) {
return MongoImage.deleteOne({ _id: id }, { session });
} else if (getS3AvatarSource().isAvatarKey(path)) {
return getS3AvatarSource().deleteAvatar(path, session);
}
};

export async function readMongoImg({ id }: { id: string }) {
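
The filename IIFE in `copyAvatarImage` is easier to follow with concrete inputs. A small sketch of the same derivation, with `getNanoid` stubbed and the example keys made up:

```ts
import path from 'path';

// Stub for illustration; the real getNanoid comes from @fastgpt/global.
const getNanoid = (len: number) => 'xY9zQ2'.slice(0, len);

const deriveCopiedAvatarName = (imageUrl: string) => {
  // Keep what follows the first "-" of the last path segment.
  const last = imageUrl.split('/').pop()?.split('-')[1];
  if (!last) return getNanoid(6).concat(path.extname(imageUrl));
  return `${getNanoid(6)}-${last}`;
};

deriveCopiedAvatarName('avatar/team1/abc123-logo.png'); // "xY9zQ2-logo.png"
deriveCopiedAvatarName('avatar/team1/logo.png');        // "xY9zQ2.png" (no "-", falls back)
// Note: split('-')[1] keeps only the segment between the first and second "-",
// so "abc123-my-logo.png" would yield "xY9zQ2-my".
```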
50 changes: 31 additions & 19 deletions packages/service/common/file/read/utils.ts
@@ -1,4 +1,3 @@
import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
@@ -9,6 +8,9 @@ import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
import { readRawContentFromBuffer } from '../../../worker/function';
import { uploadImage2S3Bucket } from '../../s3/utils';
import { Mimes } from '../../s3/constants';
import { addDays } from 'date-fns';

export type readRawTextByLocalFileParams = {
teamId: string;
@@ -17,6 +19,7 @@ export type readRawTextByLocalFileParams = {
encoding: string;
customPdfParse?: boolean;
getFormatText?: boolean;
uploadKey: string;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
@@ -26,41 +29,47 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam

const buffer = await fs.promises.readFile(path);

return readRawContentByFileBuffer({
return readS3FileContentByBuffer({
extension,
customPdfParse: params.customPdfParse,
getFormatText: params.getFormatText,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
metadata: params.metadata
imageKeyOptions: {
prefix: params.uploadKey,
expiredTime: addDays(new Date(), 1)
}
});
};

export const readRawContentByFileBuffer = async ({
export const readS3FileContentByBuffer = async ({
teamId,
tmbId,

extension,
buffer,
encoding,
metadata,
customPdfParse = false,
usageId,
getFormatText = true
getFormatText = true,
imageKeyOptions
}: {
teamId: string;
tmbId: string;

extension: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;

customPdfParse?: boolean;
usageId?: string;
getFormatText?: boolean;
imageKeyOptions: {
prefix: string;
expiredTime?: Date;
};
}): Promise<{
rawText: string;
}> => {
Expand Down Expand Up @@ -158,21 +167,24 @@ export const readRawContentByFileBuffer = async ({
addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);

// markdown data format
if (imageList) {
if (imageList && imageList.length > 0) {
addLog.debug(`Processing ${imageList.length} images from parsed document`);

await batchRun(imageList, async (item) => {
const src = await (async () => {
try {
return await uploadMongoImg({
const { prefix, expiredTime } = imageKeyOptions;
const ext = `.${item.mime.split('/')[1].replace('x-', '')}`;

return await uploadImage2S3Bucket('private', {
base64Img: `data:${item.mime};base64,${item.base64}`,
teamId,
metadata: {
...metadata,
mime: item.mime
}
uploadKey: `${prefix}/${item.uuid}${ext}`,
mimetype: Mimes[ext as keyof typeof Mimes],
filename: `${item.uuid}${ext}`,
expiredTime
});
} catch (error) {
addLog.warn('Upload file image error', { error });
return 'Upload load image error';
return `[Image Upload Failed: ${item.uuid}]`;
Copilot AI commented on Nov 19, 2025: Unclear error handling: The error message "Upload load image error" on line 187 should be "Upload image error" or "Failed to upload image". The word "load" seems misplaced.
}
})();
rawText = rawText.replace(item.uuid, src);
@@ -182,7 +194,7 @@ export const readRawContentByFileBuffer = async ({
});
}

addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);

return { rawText: getFormatText ? formatText || rawText : rawText };
return {
rawText: getFormatText ? formatText || rawText : rawText
};
};
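
A sketch of a call site for the renamed reader; the ids, file path, prefix, and local import are placeholders, and the parameters follow the signature above:

```ts
import fs from 'fs';
import { addDays } from 'date-fns';
import { readS3FileContentByBuffer } from './utils'; // assumed local import

async function parseUploadedPdf() {
  const buffer = await fs.promises.readFile('/tmp/upload/report.pdf');

  const { rawText } = await readS3FileContentByBuffer({
    teamId: 'team-id',
    tmbId: 'tmb-id',
    extension: 'pdf',
    buffer,
    encoding: 'utf-8',
    // Images extracted during parsing are uploaded to the private bucket under
    // this prefix; each image uuid placeholder in rawText is then replaced
    // with the uploaded image reference (or a failure marker on error).
    imageKeyOptions: {
      prefix: 'dataset/team-id/report-images',
      expiredTime: addDays(new Date(), 1)
    }
  });

  return rawText;
}
```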