Feat: pptx and xlsx loader (#1118)
* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker file
c121914yu authored Apr 1, 2024
1 parent f9d266a commit 21288d1
Showing 90 changed files with 2,703 additions and 1,674 deletions.
2 changes: 1 addition & 1 deletion docSite/content/docs/workflow/modules/tool.md
@@ -22,7 +22,7 @@ weight: 356

## How a tool runs

- To understand how a tool is permitted, you first need to know the conditions under which it runs.
+ To understand how a tool runs, you first need to know the conditions under which it runs.

1. The tool needs an introduction (or description). It tells the LLM what the tool does; based on the conversation context, the LLM decides whether to call it.
2. The tool's parameters. Some tools may require special parameters when called. A parameter carries two key values: `参数介绍` (parameter description) and `是否必须` (whether it is required).
7 changes: 6 additions & 1 deletion packages/global/common/error/code/common.ts
@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
-  fileNotFound = 'fileNotFound'
+  fileNotFound = 'fileNotFound',
+  unAuthFile = 'unAuthFile'
}
const datasetErr = [
  {
    statusText: CommonErrEnum.fileNotFound,
    message: 'error.fileNotFound'
  },
+  {
+    statusText: CommonErrEnum.unAuthFile,
+    message: 'error.unAuthFile'
+  }
];
export default datasetErr.reduce((acc, cur, index) => {
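Note: the truncated `reduce` above presumably folds `datasetErr` into a code-keyed error map starting at `startCode`. A hedged sketch — the exact `ErrType` shape is an assumption, not the repo's code:

```ts
// Hedged sketch: assign sequential codes from startCode and key by statusText.
type ErrItem = { statusText: string; message: string };

const startCode = 507000;
const datasetErr: ErrItem[] = [
  { statusText: 'fileNotFound', message: 'error.fileNotFound' },
  { statusText: 'unAuthFile', message: 'error.unAuthFile' }
];

const errMap = datasetErr.reduce<Record<string, ErrItem & { code: number }>>(
  (acc, cur, index) => {
    acc[cur.statusText] = { code: startCode + index, ...cur };
    return acc;
  },
  {}
);
// errMap.unAuthFile -> { code: 507001, statusText: 'unAuthFile', message: 'error.unAuthFile' }
```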
7 changes: 4 additions & 3 deletions packages/global/common/string/textSplitter.ts
@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
    { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },

    { reg: /([\n](```|~~~))/g, maxLen: chunkLen * 4 }, // code block
-   { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
+   { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // enlarge the chunk so it stays a complete paragraph where possible. (?![\*\-|>`0-9]): markdown special char
    { reg: /([\n])/g, maxLen: chunkLen * 1.2 },

    // ------ no overlap is applied to the separators above
    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@
  const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
  const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;

-  // if use markdown title split, Separate record title title
+  // if use markdown title split, Separate record title
  const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
    if (step >= stepReges.length) {
      return [
@@ -97,6 +97,7 @@
      .filter((item) => item.text.trim());
  };

+  /* Gets the overlap at the end of a text as the beginning of the next block */
  const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
    const forbidOverlap = checkForbidOverlap(step);
    const maxOverlapLen = chunkLen * 0.4;
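Note: a hedged usage sketch of the splitter this hunk touches. Only `text` and `chunkLen` are visible in the diff; the `chunks` return field is an assumption for illustration:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Separators are tried in priority order (markdown headings, code fences,
// paragraphs, then sentence punctuation); a chunk's tail can be re-used as
// overlap at the head of the next chunk, capped at 0.4 * chunkLen.
const { chunks } = splitText2Chunks({
  text: '# Title\n\nFirst paragraph...\n\nSecond paragraph...',
  chunkLen: 512
});
```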
1 change: 1 addition & 0 deletions packages/global/common/system/types/index.d.ts
@@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = {
  customApiDomain?: string;
  customSharePageDomain?: string;

+  uploadFileMaxAmount?: number;
  uploadFileMaxSize?: number;
};

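Note: a hedged sketch of how a client might enforce the two limits; the field meanings (file count, size in MB) are assumptions inferred from the names:

```ts
// Hypothetical client-side validation against the new feConfigs fields.
function checkUpload(
  files: File[],
  feConfigs: { uploadFileMaxAmount?: number; uploadFileMaxSize?: number }
) {
  const maxAmount = feConfigs.uploadFileMaxAmount ?? 10; // assumed default
  const maxSizeMb = feConfigs.uploadFileMaxSize ?? 500; // assumed unit: MB
  if (files.length > maxAmount) throw new Error(`At most ${maxAmount} files per upload`);
  for (const f of files) {
    if (f.size > maxSizeMb * 1024 * 1024) throw new Error(`${f.name} exceeds ${maxSizeMb} MB`);
  }
}
```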
12 changes: 8 additions & 4 deletions packages/global/core/dataset/api.d.ts
@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  link: string;
};
+ export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
+   fileId: string;
+ };
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  name: string;
  rawTextLength: number;
  hashRawText: string;

  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
};
+ export type CsvTableCreateDatasetCollectionParams = {
+   datasetId: string;
+   parentId?: string;
+   fileId: string;
+ };

/* ================= data ===================== */
export type PgSearchRawType = {
7 changes: 7 additions & 0 deletions packages/global/core/dataset/constants.ts
@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
/* ------------ data -------------- */

/* ------------ training -------------- */
+ export enum ImportDataSourceEnum {
+   fileLocal = 'fileLocal',
+   fileLink = 'fileLink',
+   fileCustom = 'fileCustom',
+   csvTable = 'csvTable'
+ }
+
export enum TrainingModeEnum {
  chunk = 'chunk',
  auto = 'auto',
12 changes: 6 additions & 6 deletions packages/global/package.json
@@ -2,18 +2,18 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"@apidevtools/swagger-parser": "^10.1.0",
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"openapi-types": "^12.1.3",
"openai": "4.28.0",
"nanoid": "^4.0.1",
"js-yaml": "^4.1.0",
"timezones-list": "^3.0.2",
"next": "13.5.2",
"jschardet": "3.1.1",
"@apidevtools/swagger-parser": "^10.1.0"
"nanoid": "^4.0.1",
"next": "13.5.2",
"openai": "4.28.0",
"openapi-types": "^12.1.3",
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
33 changes: 33 additions & 0 deletions packages/service/common/buffer/rawText/schema.ts
@@ -0,0 +1,33 @@
import { connectionMongo, type Model } from '../../mongo';
const { Schema, model, models } = connectionMongo;
import { RawTextBufferSchemaType } from './type';

export const collectionName = 'buffer.rawText';

const RawTextBufferSchema = new Schema({
  sourceId: {
    type: String,
    required: true
  },
  rawText: {
    type: String,
    default: ''
  },
  createTime: {
    type: Date,
    default: () => new Date()
  },
  metadata: Object
});

try {
  RawTextBufferSchema.index({ sourceId: 1 });
  // 20 minutes
  RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
} catch (error) {
  console.log(error);
}

export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
  models[collectionName] || model(collectionName, RawTextBufferSchema);
MongoRwaTextBuffer.syncIndexes();
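Note: the TTL index above lets MongoDB evict buffered raw text roughly 20 minutes after `createTime`. A minimal read-through sketch, mirroring how `readFileContent` consumes it further down:

```ts
import { MongoRwaTextBuffer } from './schema';

// Return the buffered text if the TTL index has not evicted it yet;
// otherwise the caller re-extracts the file and creates a fresh buffer doc.
async function getBufferedRawText(sourceId: string): Promise<string | null> {
  const hit = await MongoRwaTextBuffer.findOne({ sourceId }).lean();
  return hit ? hit.rawText : null;
}
```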
8 changes: 8 additions & 0 deletions packages/service/common/buffer/rawText/type.d.ts
@@ -0,0 +1,8 @@
export type RawTextBufferSchemaType = {
  sourceId: string;
  rawText: string;
  createTime: Date;
  metadata?: {
    filename: string;
  };
};
2 changes: 1 addition & 1 deletion packages/service/common/buffer/tts/schema.ts
@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { TTSBufferSchemaType } from './type.d';

- export const collectionName = 'ttsbuffers';
+ export const collectionName = 'buffer.tts';

const TTSBufferSchema = new Schema({
  bufferId: {
148 changes: 148 additions & 0 deletions packages/service/common/file/gridfs/controller.ts
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
+ import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
+ import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+ import { readFileRawText } from '../read/rawText';
+ import { ReadFileByBufferParams } from '../read/type';
+ import { readMarkdown } from '../read/markdown';
+ import { readHtmlRawText } from '../read/html';
+ import { readPdfFile } from '../read/pdf';
+ import { readWordFile } from '../read/word';
+ import { readCsvRawText } from '../read/csv';
+ import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
+ import { readPptxRawText } from '../read/pptx';
+ import { readXlsxRawText } from '../read/xlsx';

export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +123,139 @@

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
}

export const readFileEncode = async ({
  bucketName,
  fileId
}: {
  bucketName: `${BucketNameEnum}`;
  fileId: string;
}) => {
  const encodeStream = await getDownloadStream({ bucketName, fileId });
  let buffers: Buffer = Buffer.from([]);
  for await (const chunk of encodeStream) {
    buffers = Buffer.concat([buffers, chunk]);
    if (buffers.length > 10) {
      encodeStream.abort();
      break;
    }
  }

  const encoding = detectFileEncoding(buffers);

  return encoding as BufferEncoding;
};
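Note: `readFileEncode` aborts the download once more than 10 bytes have accumulated, so only the first stream chunk is sampled by `detectFileEncoding`. A hedged usage sketch ('dataset' as the bucket name is an assumption):

```ts
import { readFileEncode } from './controller';

// Detect the encoding from the file's first bytes, then decode a full buffer.
async function decodeFile(fileId: string, fileBuffer: Buffer): Promise<string> {
  const encoding = await readFileEncode({ bucketName: 'dataset', fileId });
  return fileBuffer.toString(encoding);
}
```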

export const readFileContent = async ({
  teamId,
  bucketName,
  fileId,
  csvFormat = false
}: {
  teamId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  csvFormat?: boolean;
}): Promise<{
  rawText: string;
  filename: string;
}> => {
  // read buffer
  const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
  if (fileBuffer) {
    return {
      rawText: fileBuffer.rawText,
      filename: fileBuffer.metadata?.filename || ''
    };
  }

  const [file, encoding, fileStream] = await Promise.all([
    getFileById({ bucketName, fileId }),
    readFileEncode({ bucketName, fileId }),
    getDownloadStream({ bucketName, fileId })
  ]);

  if (!file) {
    return Promise.reject(CommonErrEnum.fileNotFound);
  }

  const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';

  const fileBuffers = await (() => {
    return new Promise<Buffer>((resolve, reject) => {
      let buffers = Buffer.from([]);
      fileStream.on('data', (chunk) => {
        buffers = Buffer.concat([buffers, chunk]);
      });
      fileStream.on('end', () => {
        resolve(buffers);
      });
      fileStream.on('error', (err) => {
        reject(err);
      });
    });
  })();

  const params: ReadFileByBufferParams = {
    teamId,
    buffer: fileBuffers,
    encoding,
    metadata: {
      relatedId: fileId
    }
  };

  const { rawText } = await (async () => {
    switch (extension) {
      case 'txt':
        return readFileRawText(params);
      case 'md':
        return readMarkdown(params);
      case 'html':
        return readHtmlRawText(params);
      case 'pdf':
        return readPdfFile(params);
      case 'docx':
        return readWordFile(params);
      case 'pptx':
        return readPptxRawText(params);
      case 'xlsx':
        const xlsxResult = await readXlsxRawText(params);
        if (csvFormat) {
          return {
            rawText: xlsxResult.formatText || ''
          };
        }
        return {
          rawText: xlsxResult.rawText
        };
      case 'csv':
        const csvResult = await readCsvRawText(params);
        if (csvFormat) {
          return {
            rawText: csvResult.formatText || ''
          };
        }
        return {
          rawText: csvResult.rawText
        };
      default:
        return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, .pptx, .csv, .xlsx');
    }
  })();

  if (rawText.trim()) {
    await MongoRwaTextBuffer.create({
      sourceId: fileId,
      rawText,
      metadata: {
        filename: file.filename
      }
    });
  }

  return {
    rawText,
    filename: file.filename
  };
};
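Note: a hedged call-site sketch for the new reader ('dataset' as the bucket name is an assumption). `csvFormat: true` selects the `header:value` rendering for xlsx/csv files:

```ts
import { readFileContent } from './controller';

// Read a stored file's text; pptx/xlsx/etc. are dispatched by extension above.
async function loadDatasetFileText(teamId: string, fileId: string) {
  const { rawText, filename } = await readFileContent({
    teamId,
    bucketName: 'dataset', // assumed BucketNameEnum member for dataset files
    fileId,
    csvFormat: true
  });
  return { rawText, filename };
}
```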
4 changes: 1 addition & 3 deletions packages/service/common/file/image/controller.ts
@@ -14,7 +14,6 @@ export async function uploadMongoImg({
  teamId,
  expiredTime,
  metadata,
-
  shareId
}: UploadImgProps & {
  teamId: string;
@@ -30,9 +29,8 @@
    type,
    teamId,
    binary,
-   expiredTime: expiredTime,
+   expiredTime,
    metadata,
-
    shareId
  });

2 changes: 1 addition & 1 deletion packages/service/common/file/image/schema.ts
@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
    enum: Object.keys(mongoImageTypeMap),
    required: true
  },
-
  metadata: {
    type: Object
  }
});

try {
+  // tts expired
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
  ImageSchema.index({ createTime: 1 });
21 changes: 21 additions & 0 deletions packages/service/common/file/read/csv.ts
@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';

// Load the source file content and format CSV rows as header:value pairs
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];

  const header = csvArr[0];

  const formatText = header
    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
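Note: a worked example of `formatText` as defined above; the header row itself is also rendered as `header:header`:

```ts
import Papa from 'papaparse';

// Tiny CSV run through the same formatting as above.
const rawText = 'name,age\nAlice,30';
const csvArr = Papa.parse(rawText).data as string[][]; // [['name','age'], ['Alice','30']]
const header = csvArr[0];
const formatText = csvArr
  .map((row) => row.map((cell, i) => `${header[i]}:${cell}`).join('\n'))
  .join('\n');
// formatText === 'name:name\nage:age\nname:Alice\nage:30'
```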