Feat: pptx and xlsx loader (#1118)
* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker file
c121914yu authored Apr 1, 2024
1 parent f9d266a commit 21288d1
Showing 90 changed files with 2,703 additions and 1,674 deletions.
2 changes: 1 addition & 1 deletion docSite/content/docs/workflow/modules/tool.md
@@ -22,7 +22,7 @@ weight: 356

## How a tool runs

- To understand how a tool is permitted, you first need to know the conditions under which it runs.
+ To understand how a tool runs, you first need to know the conditions under which it runs.

1. The tool needs an introduction (or description). It tells the LLM what the tool does; based on the conversation context, the LLM decides whether to call it.
2. The tool's parameters. Some tools may require special parameters when called. A parameter carries two key values: `参数介绍` (parameter description) and `是否必须` (whether it is required).
7 changes: 6 additions & 1 deletion packages/global/common/error/code/common.ts
@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
-  fileNotFound = 'fileNotFound'
+  fileNotFound = 'fileNotFound',
+  unAuthFile = 'unAuthFile'
}
const datasetErr = [
  {
    statusText: CommonErrEnum.fileNotFound,
    message: 'error.fileNotFound'
  },
+  {
+    statusText: CommonErrEnum.unAuthFile,
+    message: 'error.unAuthFile'
+  }
];
export default datasetErr.reduce((acc, cur, index) => {
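Note: the truncated `reduce` above presumably folds `datasetErr` into a code-keyed error map starting at `startCode`. A hedged sketch — the exact `ErrType` shape is an assumption, not the repo's code:

```ts
// Hedged sketch: assign sequential codes from startCode and key by statusText.
type ErrItem = { statusText: string; message: string };

const startCode = 507000;
const datasetErr: ErrItem[] = [
  { statusText: 'fileNotFound', message: 'error.fileNotFound' },
  { statusText: 'unAuthFile', message: 'error.unAuthFile' }
];

const errMap = datasetErr.reduce<Record<string, ErrItem & { code: number }>>(
  (acc, cur, index) => {
    acc[cur.statusText] = { code: startCode + index, ...cur };
    return acc;
  },
  {}
);
// errMap.unAuthFile -> { code: 507001, statusText: 'unAuthFile', message: 'error.unAuthFile' }
```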
7 changes: 4 additions & 3 deletions packages/global/common/string/textSplitter.ts
@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
    { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },

    { reg: /([\n](```|~~~))/g, maxLen: chunkLen * 4 }, // code block
-   { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
+   { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // enlarge the chunk so it stays a complete paragraph where possible. (?![\*\-|>`0-9]): markdown special char
    { reg: /([\n])/g, maxLen: chunkLen * 1.2 },

    // ------ no overlap is applied to the separators above
    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@
  const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
  const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;

-  // if use markdown title split, Separate record title title
+  // if use markdown title split, Separate record title
  const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
    if (step >= stepReges.length) {
      return [
@@ -97,6 +97,7 @@
      .filter((item) => item.text.trim());
  };

+  /* Gets the overlap at the end of a text as the beginning of the next block */
  const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
    const forbidOverlap = checkForbidOverlap(step);
    const maxOverlapLen = chunkLen * 0.4;
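Note: a hedged usage sketch of the splitter this hunk touches. Only `text` and `chunkLen` are visible in the diff; the `chunks` return field is an assumption for illustration:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Separators are tried in priority order (markdown headings, code fences,
// paragraphs, then sentence punctuation); a chunk's tail can be re-used as
// overlap at the head of the next chunk, capped at 0.4 * chunkLen.
const { chunks } = splitText2Chunks({
  text: '# Title\n\nFirst paragraph...\n\nSecond paragraph...',
  chunkLen: 512
});
```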
1 change: 1 addition & 0 deletions packages/global/common/system/types/index.d.ts
@@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = {
  customApiDomain?: string;
  customSharePageDomain?: string;

+  uploadFileMaxAmount?: number;
  uploadFileMaxSize?: number;
};

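Note: a hedged sketch of how a client might enforce the two limits; the field meanings (file count, size in MB) are assumptions inferred from the names:

```ts
// Hypothetical client-side validation against the new feConfigs fields.
function checkUpload(
  files: File[],
  feConfigs: { uploadFileMaxAmount?: number; uploadFileMaxSize?: number }
) {
  const maxAmount = feConfigs.uploadFileMaxAmount ?? 10; // assumed default
  const maxSizeMb = feConfigs.uploadFileMaxSize ?? 500; // assumed unit: MB
  if (files.length > maxAmount) throw new Error(`At most ${maxAmount} files per upload`);
  for (const f of files) {
    if (f.size > maxSizeMb * 1024 * 1024) throw new Error(`${f.name} exceeds ${maxSizeMb} MB`);
  }
}
```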
12 changes: 8 additions & 4 deletions packages/global/core/dataset/api.d.ts
@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  link: string;
};
+ export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
+   fileId: string;
+ };
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  name: string;
  rawTextLength: number;
  hashRawText: string;

  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
};
+ export type CsvTableCreateDatasetCollectionParams = {
+   datasetId: string;
+   parentId?: string;
+   fileId: string;
+ };

/* ================= data ===================== */
export type PgSearchRawType = {
7 changes: 7 additions & 0 deletions packages/global/core/dataset/constants.ts
@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
/* ------------ data -------------- */

/* ------------ training -------------- */
+ export enum ImportDataSourceEnum {
+   fileLocal = 'fileLocal',
+   fileLink = 'fileLink',
+   fileCustom = 'fileCustom',
+   csvTable = 'csvTable'
+ }
+
export enum TrainingModeEnum {
  chunk = 'chunk',
  auto = 'auto',
12 changes: 6 additions & 6 deletions packages/global/package.json
@@ -2,18 +2,18 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"@apidevtools/swagger-parser": "^10.1.0",
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"openapi-types": "^12.1.3",
"openai": "4.28.0",
"nanoid": "^4.0.1",
"js-yaml": "^4.1.0",
"timezones-list": "^3.0.2",
"next": "13.5.2",
"jschardet": "3.1.1",
"@apidevtools/swagger-parser": "^10.1.0"
"nanoid": "^4.0.1",
"next": "13.5.2",
"openai": "4.28.0",
"openapi-types": "^12.1.3",
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
33 changes: 33 additions & 0 deletions packages/service/common/buffer/rawText/schema.ts
@@ -0,0 +1,33 @@
import { connectionMongo, type Model } from '../../mongo';
const { Schema, model, models } = connectionMongo;
import { RawTextBufferSchemaType } from './type';

export const collectionName = 'buffer.rawText';

const RawTextBufferSchema = new Schema({
  sourceId: {
    type: String,
    required: true
  },
  rawText: {
    type: String,
    default: ''
  },
  createTime: {
    type: Date,
    default: () => new Date()
  },
  metadata: Object
});

try {
  RawTextBufferSchema.index({ sourceId: 1 });
  // 20 minutes
  RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
} catch (error) {
  console.log(error);
}

export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
  models[collectionName] || model(collectionName, RawTextBufferSchema);
MongoRwaTextBuffer.syncIndexes();
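Note: the TTL index above lets MongoDB evict buffered raw text roughly 20 minutes after `createTime`. A minimal read-through sketch, mirroring how `readFileContent` consumes it further down:

```ts
import { MongoRwaTextBuffer } from './schema';

// Return the buffered text if the TTL index has not evicted it yet;
// otherwise the caller re-extracts the file and creates a fresh buffer doc.
async function getBufferedRawText(sourceId: string): Promise<string | null> {
  const hit = await MongoRwaTextBuffer.findOne({ sourceId }).lean();
  return hit ? hit.rawText : null;
}
```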
8 changes: 8 additions & 0 deletions packages/service/common/buffer/rawText/type.d.ts
@@ -0,0 +1,8 @@
export type RawTextBufferSchemaType = {
  sourceId: string;
  rawText: string;
  createTime: Date;
  metadata?: {
    filename: string;
  };
};
2 changes: 1 addition & 1 deletion packages/service/common/buffer/tts/schema.ts
@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { TTSBufferSchemaType } from './type.d';

- export const collectionName = 'ttsbuffers';
+ export const collectionName = 'buffer.tts';

const TTSBufferSchema = new Schema({
  bufferId: {
148 changes: 148 additions & 0 deletions packages/service/common/file/gridfs/controller.ts
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
+ import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
+ import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+ import { readFileRawText } from '../read/rawText';
+ import { ReadFileByBufferParams } from '../read/type';
+ import { readMarkdown } from '../read/markdown';
+ import { readHtmlRawText } from '../read/html';
+ import { readPdfFile } from '../read/pdf';
+ import { readWordFile } from '../read/word';
+ import { readCsvRawText } from '../read/csv';
+ import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
+ import { readPptxRawText } from '../read/pptx';
+ import { readXlsxRawText } from '../read/xlsx';

export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +123,139 @@

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
}

export const readFileEncode = async ({
  bucketName,
  fileId
}: {
  bucketName: `${BucketNameEnum}`;
  fileId: string;
}) => {
  const encodeStream = await getDownloadStream({ bucketName, fileId });
  let buffers: Buffer = Buffer.from([]);
  for await (const chunk of encodeStream) {
    buffers = Buffer.concat([buffers, chunk]);
    if (buffers.length > 10) {
      encodeStream.abort();
      break;
    }
  }

  const encoding = detectFileEncoding(buffers);

  return encoding as BufferEncoding;
};
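Note: `readFileEncode` aborts the download once more than 10 bytes have accumulated, so only the first stream chunk is sampled by `detectFileEncoding`. A hedged usage sketch ('dataset' as the bucket name is an assumption):

```ts
import { readFileEncode } from './controller';

// Detect the encoding from the file's first bytes, then decode a full buffer.
async function decodeFile(fileId: string, fileBuffer: Buffer): Promise<string> {
  const encoding = await readFileEncode({ bucketName: 'dataset', fileId });
  return fileBuffer.toString(encoding);
}
```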

export const readFileContent = async ({
  teamId,
  bucketName,
  fileId,
  csvFormat = false
}: {
  teamId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  csvFormat?: boolean;
}): Promise<{
  rawText: string;
  filename: string;
}> => {
  // read buffer
  const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
  if (fileBuffer) {
    return {
      rawText: fileBuffer.rawText,
      filename: fileBuffer.metadata?.filename || ''
    };
  }

  const [file, encoding, fileStream] = await Promise.all([
    getFileById({ bucketName, fileId }),
    readFileEncode({ bucketName, fileId }),
    getDownloadStream({ bucketName, fileId })
  ]);

  if (!file) {
    return Promise.reject(CommonErrEnum.fileNotFound);
  }

  const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';

  const fileBuffers = await (() => {
    return new Promise<Buffer>((resolve, reject) => {
      let buffers = Buffer.from([]);
      fileStream.on('data', (chunk) => {
        buffers = Buffer.concat([buffers, chunk]);
      });
      fileStream.on('end', () => {
        resolve(buffers);
      });
      fileStream.on('error', (err) => {
        reject(err);
      });
    });
  })();

  const params: ReadFileByBufferParams = {
    teamId,
    buffer: fileBuffers,
    encoding,
    metadata: {
      relatedId: fileId
    }
  };

  const { rawText } = await (async () => {
    switch (extension) {
      case 'txt':
        return readFileRawText(params);
      case 'md':
        return readMarkdown(params);
      case 'html':
        return readHtmlRawText(params);
      case 'pdf':
        return readPdfFile(params);
      case 'docx':
        return readWordFile(params);
      case 'pptx':
        return readPptxRawText(params);
      case 'xlsx':
        const xlsxResult = await readXlsxRawText(params);
        if (csvFormat) {
          return {
            rawText: xlsxResult.formatText || ''
          };
        }
        return {
          rawText: xlsxResult.rawText
        };
      case 'csv':
        const csvResult = await readCsvRawText(params);
        if (csvFormat) {
          return {
            rawText: csvResult.formatText || ''
          };
        }
        return {
          rawText: csvResult.rawText
        };
      default:
        return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, .pptx, .csv, .xlsx');
    }
  })();

  if (rawText.trim()) {
    await MongoRwaTextBuffer.create({
      sourceId: fileId,
      rawText,
      metadata: {
        filename: file.filename
      }
    });
  }

  return {
    rawText,
    filename: file.filename
  };
};
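Note: a hedged call-site sketch for the new reader ('dataset' as the bucket name is an assumption). `csvFormat: true` selects the `header:value` rendering for xlsx/csv files:

```ts
import { readFileContent } from './controller';

// Read a stored file's text; pptx/xlsx/etc. are dispatched by extension above.
async function loadDatasetFileText(teamId: string, fileId: string) {
  const { rawText, filename } = await readFileContent({
    teamId,
    bucketName: 'dataset', // assumed BucketNameEnum member for dataset files
    fileId,
    csvFormat: true
  });
  return { rawText, filename };
}
```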
4 changes: 1 addition & 3 deletions packages/service/common/file/image/controller.ts
@@ -14,7 +14,6 @@ export async function uploadMongoImg({
  teamId,
  expiredTime,
  metadata,
-
  shareId
}: UploadImgProps & {
  teamId: string;
@@ -30,9 +29,8 @@
    type,
    teamId,
    binary,
-   expiredTime: expiredTime,
+   expiredTime,
    metadata,
-
    shareId
  });

2 changes: 1 addition & 1 deletion packages/service/common/file/image/schema.ts
@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
    enum: Object.keys(mongoImageTypeMap),
    required: true
  },
-
  metadata: {
    type: Object
  }
});

try {
+  // tts expired
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
  ImageSchema.index({ createTime: 1 });
21 changes: 21 additions & 0 deletions packages/service/common/file/read/csv.ts
@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';

// Load the source file content and format CSV rows as header:value pairs
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];

  const header = csvArr[0];

  const formatText = header
    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
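Note: a worked example of `formatText` as defined above; the header row itself is also rendered as `header:header`:

```ts
import Papa from 'papaparse';

// Tiny CSV run through the same formatting as above.
const rawText = 'name,age\nAlice,30';
const csvArr = Papa.parse(rawText).data as string[][]; // [['name','age'], ['Alice','30']]
const header = csvArr[0];
const formatText = csvArr
  .map((row) => row.map((cell, i) => `${header[i]}:${cell}`).join('\n'))
  .join('\n');
// formatText === 'name:name\nage:age\nname:Alice\nage:30'
```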