diff --git a/docSite/content/docs/workflow/modules/tool.md b/docSite/content/docs/workflow/modules/tool.md index 0dda1e079298..fb0a771816c0 100644 --- a/docSite/content/docs/workflow/modules/tool.md +++ b/docSite/content/docs/workflow/modules/tool.md @@ -22,7 +22,7 @@ weight: 356 ## 工具是如何运行的 -要了解工具如何允许，首先需要知道它的运行条件。 +要了解工具是如何运行的，首先需要知道它的运行条件。 1. 需要工具的介绍(或者叫描述)。这个介绍会告诉LLM，这个工具的作用是什么，LLM会根据上下文语义，决定是否需要调用这个工具。 2. 工具的参数。有些工具调用时，可能需要一些特殊的参数。参数中有2个关键的值：`参数介绍`和`是否必须`。 diff --git a/packages/global/common/error/code/common.ts b/packages/global/common/error/code/common.ts index 323ed8ad7970..7bb787d2c27f 100644 --- a/packages/global/common/error/code/common.ts +++ b/packages/global/common/error/code/common.ts @@ -3,12 +3,17 @@ import { ErrType } from '../errorCode'; /* dataset: 507000 */ const startCode = 507000; export enum CommonErrEnum { - fileNotFound = 'fileNotFound' + fileNotFound = 'fileNotFound', + unAuthFile = 'unAuthFile' } const datasetErr = [ { statusText: CommonErrEnum.fileNotFound, message: 'error.fileNotFound' + }, + { + statusText: CommonErrEnum.unAuthFile, + message: 'error.unAuthFile' } ]; export default datasetErr.reduce((acc, cur, index) => { diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index 1f8591b3a04d..8ee585721b49 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -40,9 +40,9 @@ export const splitText2Chunks = (props: { { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 }, { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block - { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char + { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // Use a larger chunk here so a whole paragraph stays in one piece where possible. (?![\*\-|>`0-9]): markdown special char { reg: /([\n])/g, maxLen: chunkLen * 1.2 }, - + // ------ No overlap is applied to the separators above { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 }, { reg: /([！]|!\s)/g, maxLen: chunkLen * 1.2 }, { reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.4 }, @@ -56,7 +56,7 @@ export const splitText2Chunks = (props: { const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen; const checkForbidOverlap = (step: number) => step <= 6 + customRegLen; - // if use markdown title split, Separate record title title + // if using markdown title split, record the title separately const getSplitTexts = ({ text, step }: { text: string; step: number }) => { if (step >= stepReges.length) { return [ @@ -97,6 +97,7 @@ export const splitText2Chunks = (props: { .filter((item) => item.text.trim()); }; + /* Gets the overlap at the end of a text as the beginning of the next block */ const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => { const forbidOverlap = checkForbidOverlap(step); const maxOverlapLen = chunkLen * 0.4; diff --git a/packages/global/common/system/types/index.d.ts b/packages/global/common/system/types/index.d.ts index 94abbd74ac66..81def51f3d35 100644 --- a/packages/global/common/system/types/index.d.ts +++ b/packages/global/common/system/types/index.d.ts @@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = { customApiDomain?: string; customSharePageDomain?: string; + uploadFileMaxAmount?: number; uploadFileMaxSize?: number; }; diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 3c487db2b4ae..7a34b6c8c7d1 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -44,14 
+44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & { link: string; }; +export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & { + fileId: string; +}; export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & { - name: string; - rawTextLength: number; - hashRawText: string; - fileMetadata?: Record<string, any>; collectionMetadata?: Record<string, any>; }; +export type CsvTableCreateDatasetCollectionParams = { + datasetId: string; + parentId?: string; + fileId: string; +}; /* ================= data ===================== */ export type PgSearchRawType = { diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index bd89bc8251a8..0e8be0f5d766 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = { /* ------------ data -------------- */ /* ------------ training -------------- */ +export enum ImportDataSourceEnum { + fileLocal = 'fileLocal', + fileLink = 'fileLink', + fileCustom = 'fileCustom', + csvTable = 'csvTable' +} + export enum TrainingModeEnum { chunk = 'chunk', auto = 'auto', diff --git a/packages/global/package.json b/packages/global/package.json index d8b5559e43db..5354d772c870 100644 --- a/packages/global/package.json +++ b/packages/global/package.json @@ -2,18 +2,18 @@ "name": "@fastgpt/global", "version": "1.0.0", "dependencies": { + "@apidevtools/swagger-parser": "^10.1.0", "axios": "^1.5.1", "dayjs": "^1.11.7", "encoding": "^0.1.13", "js-tiktoken": "^1.0.7", - "openapi-types": "^12.1.3", - "openai": "4.28.0", - "nanoid": "^4.0.1", "js-yaml": "^4.1.0", - "timezones-list": "^3.0.2", - "next": "13.5.2", "jschardet": "3.1.1", - "@apidevtools/swagger-parser": "^10.1.0" + "nanoid": "^4.0.1", + "next": "13.5.2", + "openai": "4.28.0", + "openapi-types": "^12.1.3", + "timezones-list": "^3.0.2" }, "devDependencies": { "@types/js-yaml": "^4.0.9", diff --git a/packages/service/common/buffer/rawText/schema.ts b/packages/service/common/buffer/rawText/schema.ts new file mode 100644 index 000000000000..b0735e5c64ce --- /dev/null +++ b/packages/service/common/buffer/rawText/schema.ts @@ -0,0 +1,33 @@ +import { connectionMongo, type Model } from '../../mongo'; +const { Schema, model, models } = connectionMongo; +import { RawTextBufferSchemaType } from './type'; + +export const collectionName = 'buffer.rawText'; + +const RawTextBufferSchema = new Schema({ + sourceId: { + type: String, + required: true + }, + rawText: { + type: String, + default: '' + }, + createTime: { + type: Date, + default: () => new Date() + }, + metadata: Object +}); + +try { + RawTextBufferSchema.index({ sourceId: 1 }); + // 20 minutes + RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 }); +} catch (error) { + console.log(error); +} + +export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> = + models[collectionName] || model<RawTextBufferSchemaType>(collectionName, RawTextBufferSchema); +MongoRwaTextBuffer.syncIndexes(); diff --git a/packages/service/common/buffer/rawText/type.d.ts b/packages/service/common/buffer/rawText/type.d.ts new file mode 100644 index 000000000000..43a793adc60d --- /dev/null +++ b/packages/service/common/buffer/rawText/type.d.ts @@ -0,0 +1,8 @@ +export type RawTextBufferSchemaType = { + sourceId: string; + rawText: string; + createTime: Date; + metadata?: { + filename: string; + }; +}; diff --git 
a/packages/service/common/buffer/tts/schema.ts b/packages/service/common/buffer/tts/schema.ts index 3004325d2a9d..670d1ed15793 100644 --- a/packages/service/common/buffer/tts/schema.ts +++ b/packages/service/common/buffer/tts/schema.ts @@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo'; const { Schema, model, models } = connectionMongo; import { TTSBufferSchemaType } from './type.d'; -export const collectionName = 'ttsbuffers'; +export const collectionName = 'buffer.tts'; const TTSBufferSchema = new Schema({ bufferId: { diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index 729c9fb02726..2dd1bf12bae9 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -4,6 +4,18 @@ import fsp from 'fs/promises'; import fs from 'fs'; import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type'; import { MongoFileSchema } from './schema'; +import { detectFileEncoding } from '@fastgpt/global/common/file/tools'; +import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; +import { readFileRawText } from '../read/rawText'; +import { ReadFileByBufferParams } from '../read/type'; +import { readMarkdown } from '../read/markdown'; +import { readHtmlRawText } from '../read/html'; +import { readPdfFile } from '../read/pdf'; +import { readWordFile } from '../read/word'; +import { readCsvRawText } from '../read/csv'; +import { MongoRwaTextBuffer } from '../../buffer/rawText/schema'; +import { readPptxRawText } from '../read/pptx'; +import { readXlsxRawText } from '../read/xlsx'; export function getGFSCollection(bucket: `${BucketNameEnum}`) { MongoFileSchema; @@ -111,3 +123,139 @@ export async function getDownloadStream({ return bucket.openDownloadStream(new Types.ObjectId(fileId)); } + +export const readFileEncode = async ({ + bucketName, + fileId +}: { + bucketName: `${BucketNameEnum}`; + fileId: string; +}) => { + const encodeStream = await getDownloadStream({ bucketName, fileId }); + let buffers: Buffer = Buffer.from([]); + for await (const chunk of encodeStream) { + buffers = Buffer.concat([buffers, chunk]); + if (buffers.length > 10) { + encodeStream.abort(); + break; + } + } + + const encoding = detectFileEncoding(buffers); + + return encoding as BufferEncoding; +}; + +export const readFileContent = async ({ + teamId, + bucketName, + fileId, + csvFormat = false +}: { + teamId: string; + bucketName: `${BucketNameEnum}`; + fileId: string; + csvFormat?: boolean; +}): Promise<{ + rawText: string; + filename: string; +}> => { + // read buffer + const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean(); + if (fileBuffer) { + return { + rawText: fileBuffer.rawText, + filename: fileBuffer.metadata?.filename || '' + }; + } + + const [file, encoding, fileStream] = await Promise.all([ + getFileById({ bucketName, fileId }), + readFileEncode({ bucketName, fileId }), + getDownloadStream({ bucketName, fileId }) + ]); + + if (!file) { + return Promise.reject(CommonErrEnum.fileNotFound); + } + + const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || ''; + + const fileBuffers = await (() => { + return new Promise<Buffer>((resolve, reject) => { + let buffers = Buffer.from([]); + fileStream.on('data', (chunk) => { + buffers = Buffer.concat([buffers, chunk]); + }); + fileStream.on('end', () => { + resolve(buffers); + }); + fileStream.on('error', (err) => { + reject(err); + }); + }); + })(); + + const params: 
ReadFileByBufferParams = { + teamId, + buffer: fileBuffers, + encoding, + metadata: { + relatedId: fileId + } + }; + + const { rawText } = await (async () => { + switch (extension) { + case 'txt': + return readFileRawText(params); + case 'md': + return readMarkdown(params); + case 'html': + return readHtmlRawText(params); + case 'pdf': + return readPdfFile(params); + case 'docx': + return readWordFile(params); + case 'pptx': + return readPptxRawText(params); + case 'xlsx': + const xlsxResult = await readXlsxRawText(params); + if (csvFormat) { + return { + rawText: xlsxResult.formatText || '' + }; + } + return { + rawText: xlsxResult.rawText + }; + case 'csv': + const csvResult = await readCsvRawText(params); + if (csvFormat) { + return { + rawText: csvResult.formatText || '' + }; + } + return { + rawText: csvResult.rawText + }; + default: + return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, .pptx, .csv, .xlsx'); + } + })(); + + if (rawText.trim()) { + await MongoRwaTextBuffer.create({ + sourceId: fileId, + rawText, + metadata: { + filename: file.filename + } + }); + } + + return { + rawText, + filename: file.filename + }; +}; diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts index 398d8bf7220c..be1672907a37 100644 --- a/packages/service/common/file/image/controller.ts +++ b/packages/service/common/file/image/controller.ts @@ -14,7 +14,6 @@ export async function uploadMongoImg({ teamId, expiredTime, metadata, - shareId }: UploadImgProps & { teamId: string; @@ -30,9 +29,8 @@ type, teamId, binary, - expiredTime: expiredTime, + expiredTime, metadata, - shareId }); diff --git a/packages/service/common/file/image/schema.ts b/packages/service/common/file/image/schema.ts index 2eb2593d3415..cc43ad03b70d 100644 --- a/packages/service/common/file/image/schema.ts +++ b/packages/service/common/file/image/schema.ts @@ -25,13 +25,13 @@ const ImageSchema = new Schema({ enum: Object.keys(mongoImageTypeMap), required: true }, - metadata: { type: Object } }); try { + // tts expired ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 }); ImageSchema.index({ type: 1 }); ImageSchema.index({ createTime: 1 }); diff --git a/packages/service/common/file/read/csv.ts b/packages/service/common/file/read/csv.ts new file mode 100644 index 000000000000..81cc6fb499fd --- /dev/null +++ b/packages/service/common/file/read/csv.ts @@ -0,0 +1,21 @@ +import Papa from 'papaparse'; +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; +import { readFileRawText } from './rawText'; + +// Load the source file content +export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => { + const { rawText } = readFileRawText(params); + + const csvArr = Papa.parse(rawText).data as string[][]; + + const header = csvArr[0]; + + const formatText = header + ? 
csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n') + : ''; + + return { + rawText, + formatText + }; +}; diff --git a/packages/service/common/file/read/html.ts b/packages/service/common/file/read/html.ts new file mode 100644 index 000000000000..b3f9476de2f0 --- /dev/null +++ b/packages/service/common/file/read/html.ts @@ -0,0 +1,23 @@ +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; +import { initMarkdownText } from './utils'; +import { htmlToMarkdown } from '../../string/markdown'; +import { readFileRawText } from './rawText'; + +export const readHtmlRawText = async ( + params: ReadFileByBufferParams +): Promise<ReadFileResponse> => { + const { teamId, metadata } = params; + const { rawText: html } = readFileRawText(params); + + const md = await htmlToMarkdown(html); + + const rawText = await initMarkdownText({ + teamId, + md, + metadata + }); + + return { + rawText + }; +}; diff --git a/packages/service/common/file/read/markdown.ts b/packages/service/common/file/read/markdown.ts new file mode 100644 index 000000000000..982e75240d43 --- /dev/null +++ b/packages/service/common/file/read/markdown.ts @@ -0,0 +1,18 @@ +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; +import { initMarkdownText } from './utils'; +import { readFileRawText } from './rawText'; + +export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => { + const { teamId, metadata } = params; + const { rawText: md } = readFileRawText(params); + + const rawText = await initMarkdownText({ + teamId, + md, + metadata + }); + + return { + rawText + }; +}; diff --git a/packages/service/common/file/read/parseOffice.ts b/packages/service/common/file/read/parseOffice.ts new file mode 100644 index 000000000000..327b120e949e --- /dev/null +++ b/packages/service/common/file/read/parseOffice.ts @@ -0,0 +1,119 @@ +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import fs from 'fs'; +import decompress from 'decompress'; +import { DOMParser } from '@xmldom/xmldom'; +import { clearDirFiles } from '../utils'; +import { addLog } from '../../system/log'; + +const DEFAULTDECOMPRESSSUBLOCATION = '/tmp'; + +function getNewFileName(ext: string) { + return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`; +} + +const parseString = (xml: string) => { + let parser = new DOMParser(); + return parser.parseFromString(xml, 'text/xml'); +}; + +const parsePowerPoint = async ({ + filepath, + decompressPath, + encoding +}: { + filepath: string; + decompressPath: string; + encoding: BufferEncoding; +}) => { + // Regexes for the files that hold our content of interest + const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g; + const slidesRegex = /ppt\/slides\/slide\d+.xml/g; + + /** The decompress location which contains the filename in it */ + + const files = await decompress(filepath, decompressPath, { + filter: (x) => !!x.path.match(allFilesRegex) + }); + + // Verify if at least the slides xml files exist in the extracted files list. 
+ if ( + files.length == 0 || + !files.map((file) => file.path).some((filename) => filename.match(slidesRegex)) + ) { + return Promise.reject('解析 PPT 失败'); + } + + // Returning an array of all the xml contents read using fs.readFileSync + const xmlContentArray = files.map((file) => + fs.readFileSync(`${decompressPath}/${file.path}`, encoding) + ); + + let responseArr: string[] = []; + + xmlContentArray.forEach((xmlContent) => { + /** Find text nodes with a:p tags */ + const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p'); + + /** Store all the text content to respond */ + responseArr.push( + Array.from(xmlParagraphNodesList) + // Filter out paragraph nodes that do not have any text nodes, which are identifiable by a:t tag + .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0) + .map((paragraphNode) => { + /** Find text nodes with a:t tags */ + const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t'); + return Array.from(xmlTextNodeList) + .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue) + .map((textNode) => textNode.childNodes[0].nodeValue) + .join(''); + }) + .join('\n') + ); + }); + + return responseArr.join('\n'); +}; + +export const parseOffice = async ({ + buffer, + encoding, + extension +}: { + buffer: Buffer; + encoding: BufferEncoding; + extension: string; +}) => { + // Prepare file for processing + // create temp file subdirectory if it does not exist + if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) { + fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true }); + } + + // temp file name + const filepath = getNewFileName(extension); + const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`; + // const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`; + + // write new file + fs.writeFileSync(filepath, buffer, { + encoding + }); + + const text = await (async () => { + try { + switch (extension) { + case 'pptx': + return parsePowerPoint({ filepath, decompressPath, encoding }); + default: + return Promise.reject('只能读取 .pptx 文件'); + } + } catch (error) { + addLog.error(`Load ppt error`, { error }); + } + return ''; + })(); + + fs.unlinkSync(filepath); + clearDirFiles(decompressPath); + return text; +}; diff --git a/packages/web/common/file/read/pdf.ts b/packages/service/common/file/read/pdf.ts similarity index 75% rename from packages/web/common/file/read/pdf.ts rename to packages/service/common/file/read/pdf.ts index 2e7ac97eb3f6..270e3b4592ba 100644 --- a/packages/web/common/file/read/pdf.ts +++ b/packages/service/common/file/read/pdf.ts @@ -1,5 +1,7 @@ -/* read file to txt */ -import * as pdfjsLib from 'pdfjs-dist'; +import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs'; +// @ts-ignore +import('pdfjs-dist/legacy/build/pdf.worker.min.mjs'); +import { ReadFileByBufferParams, ReadFileResponse } from './type'; type TokenType = { str: string; dir: string; width: number; height: number; transform: number[]; fontName: string; hasEOL: boolean; }; -export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => { - pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js'; - +export const readPdfFile = async ({ + buffer +}: ReadFileByBufferParams): Promise<ReadFileResponse> => { const readPDFPage = async (doc: any, pageNo: number) => { const page = await doc.getPage(pageNo); const tokenizedText = await page.getTextContent(); @@ -51,14 +53,19 @@ export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => { .join(''); }; - const doc = await pdfjsLib.getDocument(pdf).promise; + const loadingTask = 
pdfjs.getDocument(buffer.buffer); + const doc = await loadingTask.promise; + const pageTextPromises = []; for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) { pageTextPromises.push(readPDFPage(doc, pageNo)); } const pageTexts = await Promise.all(pageTextPromises); + loadingTask.destroy(); + return { - rawText: pageTexts.join('') + rawText: pageTexts.join(''), + metadata: {} }; }; diff --git a/packages/service/common/file/read/pptx.ts b/packages/service/common/file/read/pptx.ts new file mode 100644 index 000000000000..1a9ee1735730 --- /dev/null +++ b/packages/service/common/file/read/pptx.ts @@ -0,0 +1,14 @@ +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; +// import { parseOfficeAsync } from 'officeparser'; +import { parseOffice } from './parseOffice'; + +export const readPptxRawText = async ({ + buffer, + encoding +}: ReadFileByBufferParams): Promise<ReadFileResponse> => { + const result = await parseOffice({ buffer, encoding, extension: 'pptx' }); + + return { + rawText: result + }; +}; diff --git a/packages/service/common/file/read/rawText.ts b/packages/service/common/file/read/rawText.ts new file mode 100644 index 000000000000..af6902a6ec15 --- /dev/null +++ b/packages/service/common/file/read/rawText.ts @@ -0,0 +1,10 @@ +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; + +// Load the source file content +export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => { + const content = buffer.toString(encoding); + + return { + rawText: content + }; +}; diff --git a/packages/service/common/file/read/type.d.ts b/packages/service/common/file/read/type.d.ts new file mode 100644 index 000000000000..9f7d7336d685 --- /dev/null +++ b/packages/service/common/file/read/type.d.ts @@ -0,0 +1,12 @@ +export type ReadFileByBufferParams = { + teamId: string; + buffer: Buffer; + encoding: BufferEncoding; + metadata?: Record<string, any>; +}; + +export type ReadFileResponse = { + rawText: string; + formatText?: string; + metadata?: Record<string, any>; +}; diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts new file mode 100644 index 000000000000..a78683d2df38 --- /dev/null +++ b/packages/service/common/file/read/utils.ts @@ -0,0 +1,25 @@ +import { markdownProcess } from '@fastgpt/global/common/string/markdown'; +import { uploadMongoImg } from '../image/controller'; +import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants'; +import { addHours } from 'date-fns'; + +export const initMarkdownText = ({ + teamId, + md, + metadata +}: { + md: string; + teamId: string; + metadata?: Record<string, any>; +}) => + markdownProcess({ + rawText: md, + uploadImgController: (base64Img) => + uploadMongoImg({ + type: MongoImageTypeEnum.collectionImage, + base64Img, + teamId, + metadata, + expiredTime: addHours(new Date(), 2) + }) + }); diff --git a/packages/service/common/file/read/word.ts b/packages/service/common/file/read/word.ts new file mode 100644 index 000000000000..0ef5baa81e02 --- /dev/null +++ b/packages/service/common/file/read/word.ts @@ -0,0 +1,35 @@ +import mammoth from 'mammoth'; +import { htmlToMarkdown } from '../../string/markdown'; +import { ReadFileByBufferParams, ReadFileResponse } from './type'; +import { initMarkdownText } from './utils'; + +/** + * read docx to markdown + */ +export const readWordFile = async ({ + teamId, + buffer, + metadata = {} +}: ReadFileByBufferParams): Promise<ReadFileResponse> => { + try { + const { value: html } = await mammoth.convertToHtml({ + buffer + }); + + const md = await htmlToMarkdown(html); + + const 
rawText = await initMarkdownText({ + teamId, + md, + metadata + }); + + return { + rawText, + metadata: {} + }; + } catch (error) { + console.log('error doc read:', error); + return Promise.reject('Cannot read doc file, please convert to PDF'); + } +}; diff --git a/packages/service/common/file/read/xlsx.ts b/packages/service/common/file/read/xlsx.ts new file mode 100644 index 000000000000..774e8e7d26e4 --- /dev/null +++ b/packages/service/common/file/read/xlsx.ts @@ -0,0 +1,45 @@ +import { ReadFileByBufferParams, ReadFileResponse } from './type.d'; +import xlsx from 'node-xlsx'; +import Papa from 'papaparse'; + +export const readXlsxRawText = async ({ + buffer +}: ReadFileByBufferParams): Promise<ReadFileResponse> => { + const result = xlsx.parse(buffer, { + skipHidden: false, + defval: '' + }); + + const format2Csv = result.map(({ name, data }) => { + return { + title: `#${name}`, + csvText: data.map((item) => item.join(',')).join('\n') + }; + }); + + const rawText = format2Csv.map((item) => item.csvText).join('\n'); + const formatText = format2Csv + .map((item) => { + const csvArr = Papa.parse(item.csvText).data as string[][]; + const header = csvArr[0]; + + const formatText = header + ? csvArr + .map((item) => + item + .map((item, i) => (item ? `${header[i]}:${item}` : '')) + .filter(Boolean) + .join('\n') + ) + .join('\n') + : ''; + + return `${item.title}\n${formatText}`; + }) + .join('\n'); + + return { + rawText: rawText, + formatText + }; +}; diff --git a/packages/service/common/file/utils.ts b/packages/service/common/file/utils.ts index bf3b50cbfa78..231771832082 100644 --- a/packages/service/common/file/utils.ts +++ b/packages/service/common/file/utils.ts @@ -35,13 +35,8 @@ export const clearDirFiles = (dirPath: string) => { return; } - fs.readdirSync(dirPath).forEach((file) => { - const curPath = `${dirPath}/${file}`; - if (fs.lstatSync(curPath).isDirectory()) { - clearDirFiles(curPath); - } else { - fs.unlinkSync(curPath); - } + fs.rmdirSync(dirPath, { + recursive: true }); }; diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 9a5513050b4c..cc25e9a84116 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -9,7 +9,6 @@ import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type'; import { MongoDatasetTraining } from '../training/schema'; -import { delay } from '@fastgpt/global/common/system/utils'; import { MongoDatasetData } from '../data/schema'; import { delImgByRelatedId } from '../../../common/file/image/controller'; import { deleteDatasetDataVector } from '../../../common/vectorStore/controller'; diff --git a/packages/service/core/dataset/training/constants.ts b/packages/service/core/dataset/training/constants.ts new file mode 100644 index 000000000000..fb5b6be07f75 --- /dev/null +++ b/packages/service/core/dataset/training/constants.ts @@ -0,0 +1,6 @@ +export enum ImportDataSourceEnum { + fileLocal = 'fileLocal', + fileLink = 'fileLink', + fileCustom = 'fileCustom', + tableLocal = 'tableLocal' +} diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index 39f0d532b303..342c55cd1aaf 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -1,14 +1,16 @@ -import { delay } from '@fastgpt/global/common/system/utils'; import { MongoDatasetTraining } from './schema'; import type { 
PushDatasetDataChunkProps, PushDatasetDataProps, PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api.d'; -import { getCollectionWithDataset } from '../controller'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { simpleText } from '@fastgpt/global/common/string/tools'; import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken'; +import { ClientSession } from '../../../common/mongo'; +import { getLLMModel, getVectorModel } from '../../ai/model'; +import { addLog } from '../../../common/system/log'; +import { getCollectionWithDataset } from '../controller'; export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => { try { @@ -23,31 +25,52 @@ } catch (error) {} }; +export const pushDataListToTrainingQueueByCollectionId = async ({ + collectionId, + ...props }: { + teamId: string; + tmbId: string; + session?: ClientSession; +} & PushDatasetDataProps) => { + const { + datasetId: { _id: datasetId, agentModel, vectorModel } + } = await getCollectionWithDataset(collectionId); + return pushDataListToTrainingQueue({ + ...props, + datasetId, + collectionId, + agentModel, + vectorModel + }); +}; + export async function pushDataListToTrainingQueue({ teamId, tmbId, + datasetId, collectionId, + agentModel, + vectorModel, data, prompt, billId, - trainingMode = TrainingModeEnum.chunk + trainingMode = TrainingModeEnum.chunk, + session }: { teamId: string; tmbId: string; + datasetId: string; + agentModel: string; + vectorModel: string; + session?: ClientSession; } & PushDatasetDataProps): Promise<PushDatasetDataResponse> { - const vectorModelList = global.vectorModels; - const datasetModelList = global.llmModels; - - const { - datasetId: { _id: datasetId, vectorModel, agentModel } - } = await getCollectionWithDataset(collectionId); - const checkModelValid = async () => { - const agentModelData = datasetModelList?.find((item) => item.model === agentModel); + const agentModelData = getLLMModel(agentModel); if (!agentModelData) { return Promise.reject(`File model ${agentModel} is invalid`); } - const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel); + const vectorModelData = getVectorModel(vectorModel); if (!vectorModelData) { return Promise.reject(`Vector model ${vectorModel} is invalid`); } @@ -124,52 +147,43 @@ }); // insert data to db - const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => { - try { - const results = await MongoDatasetTraining.insertMany( - dataList.map((item, i) => ({ - teamId, - tmbId, - datasetId, - collectionId, - billId, - mode: trainingMode, - prompt, - model, - q: item.q, - a: item.a, - chunkIndex: item.chunkIndex ?? 0, - weight: weight ?? 
0, - indexes: item.indexes - })) - ); - await delay(500); - return results.length; - } catch (error) { - if (retry > 0) { - await delay(500); - return insertData(dataList, retry - 1); - } - return Promise.reject(error); - } - }; + const insertLen = filterResult.success.length; + const failedDocuments: PushDatasetDataChunkProps[] = []; - let insertLen = 0; - const chunkSize = 50; - const chunkList = filterResult.success.reduce( - (acc, cur) => { - const lastChunk = acc[acc.length - 1]; - if (lastChunk.length < chunkSize) { - lastChunk.push(cur); - } else { - acc.push([cur]); + // Batch insert with insertMany + try { + await MongoDatasetTraining.insertMany( + filterResult.success.map((item) => ({ + teamId, + tmbId, + datasetId, + collectionId, + billId, + mode: trainingMode, + prompt, + model, + q: item.q, + a: item.a, + chunkIndex: item.chunkIndex ?? 0, + weight: weight ?? 0, + indexes: item.indexes + })), + { + session } - return acc; - }, - [[]] as PushDatasetDataChunkProps[][] - ); - for await (const chunks of chunkList) { - insertLen += await insertData(chunks); + ); + } catch (error: any) { + addLog.error(`Insert error`, error); + // If some writes failed, collect the failed documents + error.writeErrors.forEach((writeError: any) => { + failedDocuments.push(data[writeError.index]); + }); + console.log('failed', failedDocuments); + } + + // Retry the failed documents one by one + for await (const item of failedDocuments) { + await MongoDatasetTraining.create(item); + } delete filterResult.success; diff --git a/packages/service/core/dataset/training/utils.ts b/packages/service/core/dataset/training/utils.ts index fb579148a6c5..7a94b052697e 100644 --- a/packages/service/core/dataset/training/utils.ts +++ b/packages/service/core/dataset/training/utils.ts @@ -2,6 +2,7 @@ import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type'; import { addLog } from '../../../common/system/log'; import { getErrText } from '@fastgpt/global/common/error/utils'; import { MongoDatasetTraining } from './schema'; +import Papa from 'papaparse'; export const checkInvalidChunkAndLock = async ({ err, @@ -39,3 +40,18 @@ } return false; }; + +export const parseCsvTable2Chunks = (rawText: string) => { + const csvArr = Papa.parse(rawText).data as string[][]; + + const chunks = csvArr + .map((item) => ({ + q: item[0] || '', + a: item[1] || '' + })) + .filter((item) => item.q || item.a); + + return { + chunks + }; +}; diff --git a/packages/service/package.json b/packages/service/package.json index bd8178650091..344d767298af 100644 --- a/packages/service/package.json +++ b/packages/service/package.json @@ -4,27 +4,36 @@ "dependencies": { "@fastgpt/global": "workspace:*", "@node-rs/jieba": "1.10.0", + "@xmldom/xmldom": "^0.8.10", "axios": "^1.5.1", "cheerio": "1.0.0-rc.12", "cookie": "^0.5.0", "date-fns": "2.30.0", "dayjs": "^1.11.7", + "decompress": "^4.2.1", "encoding": "^0.1.13", + "file-type": "^19.0.0", "json5": "^2.2.3", "jsonwebtoken": "^9.0.2", + "mammoth": "^1.6.0", "mongoose": "^7.0.2", "multer": "1.4.5-lts.1", "next": "13.5.2", "nextjs-cors": "^2.1.2", "node-cron": "^3.0.3", + "node-xlsx": "^0.23.0", + "papaparse": "5.4.1", + "pdfjs-dist": "4.0.269", "pg": "^8.10.0", "tunnel": "^0.0.6" }, "devDependencies": { "@types/cookie": "^0.5.2", + "@types/decompress": "^4.2.7", "@types/jsonwebtoken": "^9.0.3", "@types/multer": "^1.4.10", "@types/node-cron": "^3.0.11", + "@types/papaparse": "5.3.7", "@types/pg": "^8.6.6", "@types/tunnel": "^0.0.4" } diff --git a/packages/service/support/permission/auth/file.ts 
b/packages/service/support/permission/auth/file.ts new file mode 100644 index 000000000000..345b9d5fad8d --- /dev/null +++ b/packages/service/support/permission/auth/file.ts @@ -0,0 +1,42 @@ +import { AuthResponseType } from '@fastgpt/global/support/permission/type'; +import { AuthModeType } from '../type'; +import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type'; +import { parseHeaderCert } from '../controller'; +import { getFileById } from '../../../common/file/gridfs/controller'; +import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; + +export async function authFile({ + fileId, + per = 'owner', + ...props +}: AuthModeType & { + fileId: string; +}): Promise< + AuthResponseType & { + file: DatasetFileSchema; + } +> { + const authRes = await parseHeaderCert(props); + const { teamId, tmbId } = authRes; + + const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId }); + + if (!file) { + return Promise.reject(CommonErrEnum.fileNotFound); + } + + if (file.metadata?.teamId !== teamId) { + return Promise.reject(CommonErrEnum.unAuthFile); + } + if (per === 'owner' && file.metadata?.tmbId !== tmbId) { + return Promise.reject(CommonErrEnum.unAuthFile); + } + + return { + ...authRes, + isOwner: per === 'owner', + canWrite: per === 'owner', + file + }; +} diff --git a/packages/web/common/file/read/csv.ts b/packages/web/common/file/read/csv.ts deleted file mode 100644 index 783a7309ff87..000000000000 --- a/packages/web/common/file/read/csv.ts +++ /dev/null @@ -1,40 +0,0 @@ -import Papa from 'papaparse'; -import { readFileRawText } from './rawText'; - -/** - * read csv to json - * @response { - * header: string[], - * data: string[][] - * } - */ -export const readCsvContent = async ({ file }: { file: File }) => { - try { - const { rawText: textArr } = await readFileRawText(file); - const csvArr = Papa.parse(textArr).data as string[][]; - if (csvArr.length === 0) { - throw new Error('csv 解析失败'); - } - - const header = csvArr.shift() as string[]; - - // add title to data - const rawText = csvArr - .map((item) => - item.map((value, index) => { - if (!header[index]) return value; - return `${header[index]}: ${value}`; - }) - ) - .flat() - .join('\n'); - - return { - rawText, - header, - data: csvArr.map((item) => item) - }; - } catch (error) { - return Promise.reject('解析 csv 文件失败'); - } -}; diff --git a/packages/web/common/file/read/html.ts b/packages/web/common/file/read/html.ts deleted file mode 100644 index 403d4a5d9992..000000000000 --- a/packages/web/common/file/read/html.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { htmlStr2Md } from '../../string/markdown'; -import { readFileRawText } from './rawText'; -import { markdownProcess } from '@fastgpt/global/common/string/markdown'; - -export const readHtmlFile = async ({ - file, - uploadImgController -}: { - file: File; - uploadImgController?: (base64: string) => Promise; -}) => { - const { rawText } = await readFileRawText(file); - const md = htmlStr2Md(rawText); - - const simpleMd = await markdownProcess({ - rawText: md, - uploadImgController - }); - - return { rawText: simpleMd }; -}; diff --git a/packages/web/common/file/read/index.ts b/packages/web/common/file/read/index.ts deleted file mode 100644 index d14342108504..000000000000 --- a/packages/web/common/file/read/index.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { loadFile2Buffer } from '../utils'; -import { readCsvContent } from './csv'; -import { readHtmlFile } from './html'; 
-import { readMdFile } from './md'; -import { readPdfFile } from './pdf'; -import { readFileRawText } from './rawText'; -import { readWordFile } from './word'; - -export const readFileRawContent = async ({ - file, - uploadBase64Controller -}: { - file: File; - uploadBase64Controller?: (base64: string) => Promise; -}): Promise<{ - rawText: string; -}> => { - const extension = file?.name?.split('.')?.pop()?.toLowerCase(); - - switch (extension) { - case 'txt': - return readFileRawText(file); - case 'md': - return readMdFile({ - file, - uploadImgController: uploadBase64Controller - }); - case 'html': - return readHtmlFile({ - file, - uploadImgController: uploadBase64Controller - }); - case 'csv': - return readCsvContent({ file }); - case 'pdf': - const pdf = await loadFile2Buffer({ file }); - return readPdfFile({ pdf }); - case 'docx': - return readWordFile({ - file, - uploadImgController: uploadBase64Controller - }); - - default: - return { - rawText: '' - }; - } -}; diff --git a/packages/web/common/file/read/md.ts b/packages/web/common/file/read/md.ts deleted file mode 100644 index 5df750c92c56..000000000000 --- a/packages/web/common/file/read/md.ts +++ /dev/null @@ -1,17 +0,0 @@ -import { markdownProcess } from '@fastgpt/global/common/string/markdown'; -import { readFileRawText } from './rawText'; - -export const readMdFile = async ({ - file, - uploadImgController -}: { - file: File; - uploadImgController?: (base64: string) => Promise; -}) => { - const { rawText: md } = await readFileRawText(file); - const simpleMd = await markdownProcess({ - rawText: md, - uploadImgController - }); - return { rawText: simpleMd }; -}; diff --git a/packages/web/common/file/read/rawText.ts b/packages/web/common/file/read/rawText.ts deleted file mode 100644 index ab9f7ce5ba5f..000000000000 --- a/packages/web/common/file/read/rawText.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { detectFileEncoding } from '@fastgpt/global/common/file/tools'; - -/** - * read file raw text - */ -export const readFileRawText = (file: File) => { - return new Promise<{ rawText: string }>((resolve, reject) => { - try { - const reader = new FileReader(); - reader.onload = () => { - //@ts-ignore - const encode = detectFileEncoding(reader.result); - - // 再次读取文件,这次使用检测到的编码 - const reader2 = new FileReader(); - reader2.onload = () => { - resolve({ - rawText: reader2.result as string - }); - }; - reader2.onerror = (err) => { - console.log('Error reading file with detected encoding:', err); - reject('Read file error with detected encoding'); - }; - reader2.readAsText(file, encode); - }; - reader.onerror = (err) => { - console.log('error txt read:', err); - reject('Read file error'); - }; - reader.readAsBinaryString(file); - } catch (error) { - reject(error); - } - }); -}; diff --git a/packages/web/common/file/read/word.ts b/packages/web/common/file/read/word.ts deleted file mode 100644 index 24f93c789a5d..000000000000 --- a/packages/web/common/file/read/word.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { markdownProcess } from '@fastgpt/global/common/string/markdown'; -import { htmlStr2Md } from '../../string/markdown'; -import { loadFile2Buffer } from '../utils'; -import mammoth from 'mammoth'; - -export const readWordFile = async ({ - file, - uploadImgController -}: { - file: File; - uploadImgController?: (base64: string) => Promise; -}) => { - const buffer = await loadFile2Buffer({ file }); - - const { value: html } = await mammoth.convertToHtml({ - arrayBuffer: buffer - }); - const md = htmlStr2Md(html); - - const rawText = await 
markdownProcess({ - rawText: md, - uploadImgController: uploadImgController - }); - - return { - rawText - }; -}; diff --git a/packages/web/components/common/Icon/constants.ts b/packages/web/components/common/Icon/constants.ts index 5d50fee76211..120356958d81 100644 --- a/packages/web/components/common/Icon/constants.ts +++ b/packages/web/components/common/Icon/constants.ts @@ -101,6 +101,7 @@ export const iconPaths = { 'core/dataset/mixedRecall': () => import('./icons/core/dataset/mixedRecall.svg'), 'core/dataset/modeEmbedding': () => import('./icons/core/dataset/modeEmbedding.svg'), 'core/dataset/rerank': () => import('./icons/core/dataset/rerank.svg'), + 'core/dataset/splitLight': () => import('./icons/core/dataset/splitLight.svg'), 'core/dataset/tableCollection': () => import('./icons/core/dataset/tableCollection.svg'), 'core/dataset/websiteDataset': () => import('./icons/core/dataset/websiteDataset.svg'), 'core/modules/basicNode': () => import('./icons/core/modules/basicNode.svg'), diff --git a/packages/web/components/common/Icon/icons/core/dataset/splitLight.svg b/packages/web/components/common/Icon/icons/core/dataset/splitLight.svg new file mode 100644 index 000000000000..621ced958690 --- /dev/null +++ b/packages/web/components/common/Icon/icons/core/dataset/splitLight.svg @@ -0,0 +1,6 @@ + + + \ No newline at end of file diff --git a/packages/web/components/common/MyDrawer/MyRightDrawer.tsx b/packages/web/components/common/MyDrawer/MyRightDrawer.tsx new file mode 100644 index 000000000000..88c139fc8738 --- /dev/null +++ b/packages/web/components/common/MyDrawer/MyRightDrawer.tsx @@ -0,0 +1,70 @@ +import React from 'react'; +import MyIcon from '../Icon'; +import { + Drawer, + DrawerBody, + DrawerHeader, + DrawerOverlay, + DrawerContent, + DrawerCloseButton, + DrawerContentProps, + Flex, + Image +} from '@chakra-ui/react'; +import { useLoading } from '../../../hooks/useLoading'; + +type Props = DrawerContentProps & { + onClose: () => void; + iconSrc?: string; + title?: any; + isLoading?: boolean; +}; + +const MyRightDrawer = ({ + onClose, + iconSrc, + title, + maxW = ['90vw', '30vw'], + children, + isLoading, + ...props +}: Props) => { + const { Loading } = useLoading(); + return ( + + + + + + + {iconSrc && ( + <> + {iconSrc.startsWith('/') ? ( + + ) : ( + + )} + + )} + {title} + + + + + + {children} + + + + + ); +}; + +export default MyRightDrawer; diff --git a/packages/web/components/common/Radio/LeftRadio.tsx b/packages/web/components/common/Radio/LeftRadio.tsx index 6d97ba84cb15..f80e951b5b0c 100644 --- a/packages/web/components/common/Radio/LeftRadio.tsx +++ b/packages/web/components/common/Radio/LeftRadio.tsx @@ -2,6 +2,8 @@ import React from 'react'; import { Box, Flex, useTheme, Grid, type GridProps } from '@chakra-ui/react'; import { useTranslation } from 'next-i18next'; import MyTooltip from '../MyTooltip'; +import { QuestionOutlineIcon } from '@chakra-ui/icons'; +import QuestionTip from '../MyTooltip/QuestionTip'; // @ts-ignore interface Props extends GridProps { @@ -36,58 +38,59 @@ const LeftRadio = ({ return ( {list.map((item) => ( - - onChange(item.value)} + })} + onClick={() => onChange(item.value)} + > + - - - - - - + bg={value === item.value ? 'primary.600' : 'transparent'} + > + + + + {typeof item.title === 'string' ? 
t(item.title) : item.title} - {!!item.desc && ( - - {t(item.desc)} - - )} - {item?.children} - - - + {!!item.tooltip && } + + {!!item.desc && ( + + {t(item.desc)} + + )} + {item?.children} + + ))} ); diff --git a/packages/web/package.json b/packages/web/package.json index 7b2d1ce66730..8c4eb2bc270b 100644 --- a/packages/web/package.json +++ b/packages/web/package.json @@ -12,31 +12,31 @@ "@emotion/styled": "^11.11.0", "@fastgpt/global": "workspace:*", "@fingerprintjs/fingerprintjs": "^4.2.1", + "@lexical/react": "0.12.6", + "@lexical/text": "0.12.6", + "@lexical/utils": "0.12.6", "@monaco-editor/react": "^4.6.0", - "mammoth": "^1.6.0", + "@tanstack/react-query": "^4.24.10", + "date-fns": "2.30.0", + "dayjs": "^1.11.7", "i18next": "23.10.0", "joplin-turndown-plugin-gfm": "^1.0.12", + "lexical": "0.12.6", + "lodash": "^4.17.21", + "mammoth": "^1.6.0", "next-i18next": "15.2.0", + "papaparse": "^5.4.1", "pdfjs-dist": "4.0.269", "react": "18.2.0", + "react-day-picker": "^8.7.1", "react-dom": "18.2.0", "react-i18next": "13.5.0", - "turndown": "^7.1.2", - "lexical": "0.12.6", - "@lexical/react": "0.12.6", - "papaparse": "^5.4.1", - "@lexical/utils": "0.12.6", - "@lexical/text": "0.12.6", - "date-fns": "2.30.0", - "react-day-picker": "^8.7.1", - "lodash": "^4.17.21", - "@tanstack/react-query": "^4.24.10", - "dayjs": "^1.11.7" + "turndown": "^7.1.2" }, "devDependencies": { "@types/lodash": "^4.14.191", - "@types/react": "18.2.0", "@types/papaparse": "^5.3.7", + "@types/react": "18.2.0", "@types/react-dom": "18.2.0", "@types/turndown": "^5.0.4" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 671e5cf81ca9..719e46967658 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -99,6 +99,9 @@ importers: '@node-rs/jieba': specifier: 1.10.0 version: 1.10.0 + '@xmldom/xmldom': + specifier: ^0.8.10 + version: 0.8.10 axios: specifier: ^1.5.1 version: 1.6.8 @@ -114,15 +117,24 @@ importers: dayjs: specifier: ^1.11.7 version: 1.11.10 + decompress: + specifier: ^4.2.1 + version: 4.2.1 encoding: specifier: ^0.1.13 version: 0.1.13 + file-type: + specifier: ^19.0.0 + version: 19.0.0 json5: specifier: ^2.2.3 version: 2.2.3 jsonwebtoken: specifier: ^9.0.2 version: 9.0.2 + mammoth: + specifier: ^1.6.0 + version: 1.7.0 mongoose: specifier: ^7.0.2 version: 7.6.10 @@ -138,6 +150,15 @@ importers: node-cron: specifier: ^3.0.3 version: 3.0.3 + node-xlsx: + specifier: ^0.23.0 + version: 0.23.0 + papaparse: + specifier: 5.4.1 + version: 5.4.1 + pdfjs-dist: + specifier: 4.0.269 + version: 4.0.269(encoding@0.1.13) pg: specifier: ^8.10.0 version: 8.11.3 @@ -148,6 +169,9 @@ importers: '@types/cookie': specifier: ^0.5.2 version: 0.5.4 + '@types/decompress': + specifier: ^4.2.7 + version: 4.2.7 '@types/jsonwebtoken': specifier: ^9.0.3 version: 9.0.6 @@ -157,6 +181,9 @@ importers: '@types/node-cron': specifier: ^3.0.11 version: 3.0.11 + '@types/papaparse': + specifier: 5.3.7 + version: 5.3.7 '@types/pg': specifier: ^8.6.6 version: 8.11.3 @@ -240,7 +267,7 @@ importers: version: 5.4.1 pdfjs-dist: specifier: 4.0.269 - version: 4.0.269 + version: 4.0.269(encoding@0.1.13) react: specifier: 18.2.0 version: 18.2.0 @@ -3789,10 +3816,9 @@ packages: yjs: 13.6.14 dev: false - /@mapbox/node-pre-gyp@1.0.11: + /@mapbox/node-pre-gyp@1.0.11(encoding@0.1.13): resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} hasBin: true - requiresBuild: true dependencies: detect-libc: 2.0.3 https-proxy-agent: 5.0.1 @@ -4522,6 +4548,10 @@ packages: use-sync-external-store: 1.2.0(react@18.2.0) 
dev: false + /@tokenizer/token@0.3.0: + resolution: {integrity: sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==} + dev: false + /@trysound/sax@0.2.0: resolution: {integrity: sha512-L7z9BgrNEcYyUYtF+HaEfiS5ebkh9jXqbszz7pC0hRBPaatV0XjSD3+eHrpqFemQfgwiFF0QPIarnIihIDn7OA==} engines: {node: '>=10.13.0'} @@ -4737,6 +4767,12 @@ packages: '@types/ms': 0.7.34 dev: false + /@types/decompress@4.2.7: + resolution: {integrity: sha512-9z+8yjKr5Wn73Pt17/ldnmQToaFHZxK0N1GHysuk/JIPT8RIdQeoInM01wWPgypRcvb6VH1drjuFpQ4zmY437g==} + dependencies: + '@types/node': 20.11.30 + dev: true + /@types/estree@1.0.5: resolution: {integrity: sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==} dev: true @@ -4876,6 +4912,12 @@ packages: '@types/node': 20.11.30 dev: true + /@types/papaparse@5.3.7: + resolution: {integrity: sha512-f2HKmlnPdCvS0WI33WtCs5GD7X1cxzzS/aduaxSu3I7TbhWlENjSPs6z5TaB9K0J+BH1jbmqTaM+ja5puis4wg==} + dependencies: + '@types/node': 20.11.30 + dev: true + /@types/parse-json@4.0.2: resolution: {integrity: sha512-dISoDXWWQwUquiKsyZ4Ng+HX2KsPL7LyHKHQwgGFEA3IaKac4Obd+h2a/a6waisAoepJlBcx9paWqjA8/HVjCw==} @@ -5550,6 +5592,13 @@ packages: resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==} engines: {node: '>=8'} + /bl@1.2.3: + resolution: {integrity: sha512-pvcNpa0UU69UT341rO6AYy4FVAIkUHuZXRIWbq+zHnsVcRzDDjIAhGuuYoi0d//cwIwtt4pkpKycWEfjdV+vww==} + dependencies: + readable-stream: 2.3.8 + safe-buffer: 5.2.1 + dev: false + /bluebird@3.4.7: resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} dev: false @@ -5610,10 +5659,29 @@ packages: engines: {node: '>=14.20.1'} dev: false + /buffer-alloc-unsafe@1.1.0: + resolution: {integrity: sha512-TEM2iMIEQdJ2yjPJoSIsldnleVaAk1oW3DBVUykyOLsEsFmEc9kn+SFFPz+gl54KQNxlDnAwCXosOS9Okx2xAg==} + dev: false + + /buffer-alloc@1.2.0: + resolution: {integrity: sha512-CFsHQgjtW1UChdXgbyJGtnm+O/uLQeZdtbDo8mfUgYXCHSM1wgrVxXm6bSyrUuErEb+4sYVGCzASBRot7zyrow==} + dependencies: + buffer-alloc-unsafe: 1.1.0 + buffer-fill: 1.0.0 + dev: false + + /buffer-crc32@0.2.13: + resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==} + dev: false + /buffer-equal-constant-time@1.0.1: resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} dev: false + /buffer-fill@1.0.0: + resolution: {integrity: sha512-T7zexNBwiiaCOGDg9xNX9PBmjrubblRkENuptryuI64URkXDFum9il/JGL8Lm8wYfAXpredVXXZz7eMHilimiQ==} + dev: false + /buffer-from@1.1.2: resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} dev: false @@ -5623,6 +5691,13 @@ packages: engines: {node: '>=4'} dev: false + /buffer@5.7.1: + resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==} + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + dev: false + /busboy@1.6.0: resolution: {integrity: sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==} engines: {node: '>=10.16.0'} @@ -5665,12 +5740,12 @@ packages: /caniuse-lite@1.0.30001599: resolution: {integrity: sha512-LRAQHZ4yT1+f9LemSMeqdMpMxZcc4RMWdj4tiFe3G8tNkWK+E58g+/tzotb5cU6TbcVJLr4fySiAW7XmxQvZQA==} - /canvas@2.11.2: + /canvas@2.11.2(encoding@0.1.13): resolution: {integrity: 
sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} engines: {node: '>=6'} requiresBuild: true dependencies: - '@mapbox/node-pre-gyp': 1.0.11 + '@mapbox/node-pre-gyp': 1.0.11(encoding@0.1.13) nan: 2.19.0 simple-get: 3.1.1 transitivePeerDependencies: @@ -5909,6 +5984,10 @@ packages: engines: {node: '>=16'} dev: true + /commander@2.20.3: + resolution: {integrity: sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==} + dev: false + /commander@7.2.0: resolution: {integrity: sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==} engines: {node: '>= 10'} @@ -6469,6 +6548,59 @@ packages: dev: false optional: true + /decompress-tar@4.1.1: + resolution: {integrity: sha512-JdJMaCrGpB5fESVyxwpCx4Jdj2AagLmv3y58Qy4GE6HMVjWz1FeVQk1Ct4Kye7PftcdOo/7U7UKzYBJgqnGeUQ==} + engines: {node: '>=4'} + dependencies: + file-type: 5.2.0 + is-stream: 1.1.0 + tar-stream: 1.6.2 + dev: false + + /decompress-tarbz2@4.1.1: + resolution: {integrity: sha512-s88xLzf1r81ICXLAVQVzaN6ZmX4A6U4z2nMbOwobxkLoIIfjVMBg7TeguTUXkKeXni795B6y5rnvDw7rxhAq9A==} + engines: {node: '>=4'} + dependencies: + decompress-tar: 4.1.1 + file-type: 6.2.0 + is-stream: 1.1.0 + seek-bzip: 1.0.6 + unbzip2-stream: 1.4.3 + dev: false + + /decompress-targz@4.1.1: + resolution: {integrity: sha512-4z81Znfr6chWnRDNfFNqLwPvm4db3WuZkqV+UgXQzSngG3CEKdBkw5jrv3axjjL96glyiiKjsxJG3X6WBZwX3w==} + engines: {node: '>=4'} + dependencies: + decompress-tar: 4.1.1 + file-type: 5.2.0 + is-stream: 1.1.0 + dev: false + + /decompress-unzip@4.0.1: + resolution: {integrity: sha512-1fqeluvxgnn86MOh66u8FjbtJpAFv5wgCT9Iw8rcBqQcCo5tO8eiJw7NNTrvt9n4CRBVq7CstiS922oPgyGLrw==} + engines: {node: '>=4'} + dependencies: + file-type: 3.9.0 + get-stream: 2.3.1 + pify: 2.3.0 + yauzl: 2.10.0 + dev: false + + /decompress@4.2.1: + resolution: {integrity: sha512-e48kc2IjU+2Zw8cTb6VZcJQ3lgVbS4uuB1TfCHbiZIP/haNXm+SVyhu+87jts5/3ROpd82GSVCoNs/z8l4ZOaQ==} + engines: {node: '>=4'} + dependencies: + decompress-tar: 4.1.1 + decompress-tarbz2: 4.1.1 + decompress-targz: 4.1.1 + decompress-unzip: 4.0.1 + graceful-fs: 4.2.11 + make-dir: 1.3.0 + pify: 2.3.0 + strip-dirs: 2.1.0 + dev: false + /deep-eql@4.1.3: resolution: {integrity: sha512-WaEtAOpRA1MQ0eohqZjpGD8zdI0Ovsm8mmFhaDN8dvDZzyoUMcYDnf5Y6iu7HTXxf8JDS23qWa4a+hKCDyOPzw==} engines: {node: '>=6'} @@ -6712,6 +6844,12 @@ packages: iconv-lite: 0.6.3 dev: false + /end-of-stream@1.4.4: + resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} + dependencies: + once: 1.4.0 + dev: false + /enhanced-resolve@5.16.0: resolution: {integrity: sha512-O+QWCviPNSSLAD9Ucn8Awv+poAkqn3T1XY5/N7kR7rQO9yfSGWkYZDwpJ+iKF7B8rxaQKWngSqACpgzeapSyoA==} engines: {node: '>=10.13.0'} @@ -7403,6 +7541,12 @@ packages: dependencies: format: 0.2.2 + /fd-slicer@1.1.0: + resolution: {integrity: sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==} + dependencies: + pend: 1.2.0 + dev: false + /file-entry-cache@6.0.1: resolution: {integrity: sha512-7Gps/XWymbLk2QLYK4NzpMOrYjMhdIxXuIvy2QBsLE6ljuodKvdkWs/cpyJJ3CVIVpH0Oi1Hvg1ovbMzLdFBBg==} engines: {node: ^10.12.0 || >=12.0.0} @@ -7410,6 +7554,30 @@ packages: flat-cache: 3.2.0 dev: true + /file-type@19.0.0: + resolution: {integrity: sha512-s7cxa7/leUWLiXO78DVVfBVse+milos9FitauDLG1pI7lNaJ2+5lzPnr2N24ym+84HVwJL6hVuGfgVE+ALvU8Q==} + engines: {node: '>=18'} + dependencies: + readable-web-to-node-stream: 3.0.2 + 
strtok3: 7.0.0 + token-types: 5.0.1 + dev: false + + /file-type@3.9.0: + resolution: {integrity: sha512-RLoqTXE8/vPmMuTI88DAzhMYC99I8BWv7zYP4A1puo5HIjEJ5EX48ighy4ZyKMG9EDXxBgW6e++cn7d1xuFghA==} + engines: {node: '>=0.10.0'} + dev: false + + /file-type@5.2.0: + resolution: {integrity: sha512-Iq1nJ6D2+yIO4c8HHg4fyVb8mAJieo1Oloy1mLLaB2PvezNedhBVm+QU7g0qM42aiMbRXTxKKwGD17rjKNJYVQ==} + engines: {node: '>=4'} + dev: false + + /file-type@6.2.0: + resolution: {integrity: sha512-YPcTBDV+2Tm0VqjybVd32MHdlEGAtuxS3VAYsumFokDSMG+ROT5wawGlnHDoz7bfMcMDt9hxuXvXwoKUx2fkOg==} + engines: {node: '>=4'} + dev: false + /fill-range@7.0.1: resolution: {integrity: sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==} engines: {node: '>=8'} @@ -7550,6 +7718,10 @@ packages: engines: {node: '>= 0.6'} dev: false + /fs-constants@1.0.0: + resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} + dev: false + /fs-minipass@2.1.0: resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} engines: {node: '>= 8'} @@ -7626,6 +7798,14 @@ packages: engines: {node: '>=6'} dev: false + /get-stream@2.3.1: + resolution: {integrity: sha512-AUGhbbemXxrZJRD5cDvKtQxLuYaIbNtDTK8YqupCI393Q2KSTreEsLUN3ZxAWFGiKTzL6nKuzfcIvieflUX9qA==} + engines: {node: '>=0.10.0'} + dependencies: + object-assign: 4.1.1 + pinkie-promise: 2.0.1 + dev: false + /get-stream@6.0.1: resolution: {integrity: sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==} engines: {node: '>=10'} @@ -7978,6 +8158,10 @@ packages: safer-buffer: 2.1.2 dev: false + /ieee754@1.2.1: + resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} + dev: false + /ignore@5.3.1: resolution: {integrity: sha512-5Fytz/IraMjqpwfd34ke28PTVMjZjJG2MPn5t7OE4eUCUNf8BAa7b5WUS9/Qvr6mwOQS7Mk6vdsMno5he+T8Xw==} engines: {node: '>= 4'} @@ -8178,6 +8362,10 @@ packages: engines: {node: '>= 0.4'} dev: true + /is-natural-number@4.0.1: + resolution: {integrity: sha512-Y4LTamMe0DDQIIAlaer9eKebAlDSV6huy+TWhJVPlzZh2o4tRP5SQWFlLn5N0To4mDD22/qdOq+veo1cSISLgQ==} + dev: false + /is-negative-zero@2.0.3: resolution: {integrity: sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==} engines: {node: '>= 0.4'} @@ -8229,6 +8417,11 @@ packages: call-bind: 1.0.7 dev: true + /is-stream@1.1.0: + resolution: {integrity: sha512-uQPm8kcs47jx38atAcWTVxyltQYoPT68y9aWYdV6yWXSyW8mzSat0TL6CiWdZeCdF3KrAvpVtnHbTv4RN+rqdQ==} + engines: {node: '>=0.10.0'} + dev: false + /is-stream@3.0.0: resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -8704,6 +8897,13 @@ packages: '@jridgewell/sourcemap-codec': 1.4.15 dev: true + /make-dir@1.3.0: + resolution: {integrity: sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==} + engines: {node: '>=4'} + dependencies: + pify: 3.0.0 + dev: false + /make-dir@3.1.0: resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} engines: {node: '>=8'} @@ -9547,6 +9747,14 @@ packages: /node-releases@2.0.14: resolution: {integrity: sha512-y10wOWt8yZpqXmOgRo77WaHEmhYQYGNA6y421PKsKYWEK8aW+cqAphborZDhqfyKrbZEN92CN1X2KbafY2s7Yw==} + /node-xlsx@0.23.0: + resolution: {integrity: 
sha512-r3KaSZSsSrK92rbPXnX/vDdxURmPPik0rjJ3A+Pybzpjyrk4G6WyGfj8JIz5dMMEpCmWVpmO4qoVPBxnpLv/8Q==} + engines: {node: '>=10.0.0'} + hasBin: true + dependencies: + xlsx: '@cdn.sheetjs.com/xlsx-0.19.3/xlsx-0.19.3.tgz' + dev: false + /non-layered-tidy-tree-layout@2.0.2: resolution: {integrity: sha512-gkXMxRzUH+PB0ax9dUN0yYF0S25BqeAYqhgMaLUFmpXLEk7Fcu8f4emJuOAY0V8kjDICxROIKsTAKsV/v355xw==} dev: false @@ -9875,17 +10083,26 @@ packages: resolution: {integrity: sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==} dev: true - /pdfjs-dist@4.0.269: + /pdfjs-dist@4.0.269(encoding@0.1.13): resolution: {integrity: sha512-jjWO56tcOjnmPqDf8PmXDeZ781AGvpHMYI3HhNtaFKTRXXPaD1ArSrhVe38/XsrIQJ0onISCND/vuXaWJkiDWw==} engines: {node: '>=18'} optionalDependencies: - canvas: 2.11.2 + canvas: 2.11.2(encoding@0.1.13) path2d-polyfill: 2.1.1 transitivePeerDependencies: - encoding - supports-color dev: false + /peek-readable@5.0.0: + resolution: {integrity: sha512-YtCKvLUOvwtMGmrniQPdO7MwPjgkFBtFIrmfSbYmYuq3tKDV/mcfAhBth1+C3ru7uXIZasc/pHnb+YDYNkkj4A==} + engines: {node: '>=14.16'} + dev: false + + /pend@1.2.0: + resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} + dev: false + /pg-cloudflare@1.1.1: resolution: {integrity: sha512-xWPagP/4B6BgFO+EKz3JONXv3YDgvkbVrGw2mTo3D6tVDQRh1e7cqVGvyR3BE+eQgAvx1XhW/iEASj4/jCWl3Q==} requiresBuild: true @@ -9979,6 +10196,28 @@ packages: hasBin: true dev: true + /pify@2.3.0: + resolution: {integrity: sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==} + engines: {node: '>=0.10.0'} + dev: false + + /pify@3.0.0: + resolution: {integrity: sha512-C3FsVNH1udSEX48gGX1xfvwTWfsYWj5U+8/uK15BGzIGrKoUpghX8hWZwa/OFnakBiiVNmBvemTJR5mcy7iPcg==} + engines: {node: '>=4'} + dev: false + + /pinkie-promise@2.0.1: + resolution: {integrity: sha512-0Gni6D4UcLTbv9c57DfxDGdr41XfgUjqWZu492f0cIGr16zDU06BWP/RAEvOuo7CQ0CNjHaLlM59YJJFm3NWlw==} + engines: {node: '>=0.10.0'} + dependencies: + pinkie: 2.0.4 + dev: false + + /pinkie@2.0.4: + resolution: {integrity: sha512-MnUuEycAemtSaeFSjXKW/aroV7akBbY+Sv+RkyqFjgAe73F+MR0TBWKBRDkmfWq/HiFmdavfZ1G7h4SPZXaCSg==} + engines: {node: '>=0.10.0'} + dev: false + /pkg-types@1.0.3: resolution: {integrity: sha512-nN7pYi0AQqJnoLPC9eHFQ8AcyaixBUOwvqc5TDnIKCMEE6I0y8P7OKA7fPexsXGCGxQDl/cmrLAp26LhcwxZ4A==} dependencies: @@ -10396,7 +10635,13 @@ packages: string_decoder: 1.3.0 util-deprecate: 1.0.2 dev: false - optional: true + + /readable-web-to-node-stream@3.0.2: + resolution: {integrity: sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw==} + engines: {node: '>=8'} + dependencies: + readable-stream: 3.6.2 + dev: false /readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} @@ -10715,6 +10960,13 @@ packages: dependencies: loose-envify: 1.4.0 + /seek-bzip@1.0.6: + resolution: {integrity: sha512-e1QtP3YL5tWww8uKaOCQ18UxIT2laNBXHjV/S2WYCiK4udiv8lkG89KRIoCjUagnAmCBurjF4zEVX2ByBbnCjQ==} + hasBin: true + dependencies: + commander: 2.20.3 + dev: false + /semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -11024,7 +11276,6 @@ packages: dependencies: safe-buffer: 5.2.1 dev: false - optional: true /strip-ansi@6.0.1: resolution: {integrity: 
sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==} @@ -11044,6 +11295,12 @@ packages: engines: {node: '>=4'} dev: true + /strip-dirs@2.1.0: + resolution: {integrity: sha512-JOCxOeKLm2CAS73y/U4ZeZPTkE+gNVCzKt7Eox84Iej1LT/2pTWYpZKJuxwQpvX1LiZb1xokNR7RLfuBAa7T3g==} + dependencies: + is-natural-number: 4.0.1 + dev: false + /strip-final-newline@3.0.0: resolution: {integrity: sha512-dOESqjYr96iWYylGObzd39EuNTa5VJxyvVAEm5Jnh7KGo75V43Hk1odPQkNDyXNmUR6k+gEiDVXnjB8HJ3crXw==} engines: {node: '>=12'} @@ -11060,6 +11317,14 @@ packages: js-tokens: 8.0.3 dev: true + /strtok3@7.0.0: + resolution: {integrity: sha512-pQ+V+nYQdC5H3Q7qBZAz/MO6lwGhoC2gOAjuouGf/VO0m7vQRh8QNMl2Uf6SwAtzZ9bOw3UIeBukEGNJl5dtXQ==} + engines: {node: '>=14.16'} + dependencies: + '@tokenizer/token': 0.3.0 + peek-readable: 5.0.0 + dev: false + /style-to-object@0.4.4: resolution: {integrity: sha512-HYNoHZa2GorYNyqiCaBgsxvcJIn7OHq6inEga+E6Ke3m5JkoqpQbnFssk4jwe+K7AhGa2fcha4wSOf1Kn01dMg==} dependencies: @@ -11131,6 +11396,19 @@ packages: engines: {node: '>=6'} dev: true + /tar-stream@1.6.2: + resolution: {integrity: sha512-rzS0heiNf8Xn7/mpdSVVSMAWAoy9bfb1WOTYC78Z0UQKeKa/CWS8FOq0lKGNa8DWKAn9gxjCvMLYc5PGXYlK2A==} + engines: {node: '>= 0.8.0'} + dependencies: + bl: 1.2.3 + buffer-alloc: 1.2.0 + end-of-stream: 1.4.4 + fs-constants: 1.0.0 + readable-stream: 2.3.8 + to-buffer: 1.1.1 + xtend: 4.0.2 + dev: false + /tar@6.2.0: resolution: {integrity: sha512-/Wo7DcT0u5HUV486xg675HtjNd3BXZ6xDbzsCUZPt5iw8bTQ63bP0Raut3mvro9u+CUyq7YQd8Cx55fsZXxqLQ==} engines: {node: '>=10'} @@ -11149,6 +11427,10 @@ packages: resolution: {integrity: sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==} dev: true + /through@2.3.8: + resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} + dev: false + /timezones-list@3.0.3: resolution: {integrity: sha512-C+Vdvvj2c1xB6pu81pOX8geo6mrk/QsudFVlTVQET7QQwu8WAIyhDNeCrK5grU7EMzmbKLWqz7uU6dN8fvQvPQ==} dev: false @@ -11171,6 +11453,10 @@ packages: engines: {node: '>=14.0.0'} dev: true + /to-buffer@1.1.1: + resolution: {integrity: sha512-lx9B5iv7msuFYE3dytT+KE5tap+rNYw+K4jVkb9R/asAb+pbBSM17jtunHplhBe6RRJdZx3Pn2Jph24O32mOVg==} + dev: false + /to-fast-properties@2.0.0: resolution: {integrity: sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==} engines: {node: '>=4'} @@ -11190,6 +11476,14 @@ packages: engines: {node: '>=0.6'} dev: false + /token-types@5.0.1: + resolution: {integrity: sha512-Y2fmSnZjQdDb9W4w4r1tswlMHylzWIeOKpx0aZH9BgGtACHhrk3OkT52AzwcuqTRBZtvvnTjDBh8eynMulu8Vg==} + engines: {node: '>=14.16'} + dependencies: + '@tokenizer/token': 0.3.0 + ieee754: 1.2.1 + dev: false + /tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} dev: false @@ -11369,6 +11663,13 @@ packages: which-boxed-primitive: 1.0.2 dev: true + /unbzip2-stream@1.4.3: + resolution: {integrity: sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==} + dependencies: + buffer: 5.7.1 + through: 2.3.8 + dev: false + /underscore@1.13.6: resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==} dev: false @@ -11943,6 +12244,13 @@ packages: engines: {node: '>= 14'} dev: true + /yauzl@2.10.0: + resolution: {integrity: 
sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==} + dependencies: + buffer-crc32: 0.2.13 + fd-slicer: 1.1.0 + dev: false + /yjs@13.6.14: resolution: {integrity: sha512-D+7KcUr0j+vBCUSKXXEWfA+bG4UQBviAwP3gYBhkstkgwy5+8diOPMx0iqLIOxNo/HxaREUimZRxqHGAHCL2BQ==} engines: {node: '>=16.0.0', npm: '>=8.0.0'} @@ -12029,3 +12337,11 @@ packages: /zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} dev: false + + '@cdn.sheetjs.com/xlsx-0.19.3/xlsx-0.19.3.tgz': + resolution: {tarball: https://cdn.sheetjs.com/xlsx-0.19.3/xlsx-0.19.3.tgz} + name: xlsx + version: 0.19.3 + engines: {node: '>=0.8'} + hasBin: true + dev: false diff --git a/projects/app/data/config.json b/projects/app/data/config.json index 39072bc52b95..6a36ab45fa79 100644 --- a/projects/app/data/config.json +++ b/projects/app/data/config.json @@ -82,7 +82,7 @@ "name": "Embedding-2", "avatar": "/imgs/model/openai.svg", "charsPointsPrice": 0, - "defaultToken": 700, + "defaultToken": 512, "maxToken": 3000, "weight": 100, "dbConfig": {}, diff --git a/projects/app/public/locales/en/common.json b/projects/app/public/locales/en/common.json index c01c85d552f4..e0fa3b5241d2 100644 --- a/projects/app/public/locales/en/common.json +++ b/projects/app/public/locales/en/common.json @@ -56,6 +56,7 @@ } }, "common": { + "Action": "Action", "Add": "Add", "Add New": "Add", "All": "All", @@ -79,6 +80,7 @@ "Create New": "Create", "Create Success": "Create Success", "Create Time": "Create time", + "Creating": "Creating", "Custom Title": "Custom Title", "Delete": "Delete", "Delete Failed": "Delete Failed", @@ -191,6 +193,7 @@ "Empty file tip": "The file content is empty. The file may be unreadable or contain only images.", "File Content": "File Content", "File Name": "File Name", + "File Size": "File Size", "File content can not be empty": "File content can not be empty", "Filename Can not Be Empty": "Filename Can not Be Empty", "Read File Error": "Read file error", @@ -198,6 +201,7 @@ "Select failed": "Select file failed", "Select file amount limit": "A maximum of {{max}} files can be selected", "Select file amount limit 100": "You can select a maximum of 100 files at a time", + "Some file count exceeds limit": "Files beyond the {{maxCount}} file limit have been dropped automatically", "Some file size exceeds limit": "Some files exceed {{maxSize}} and have been filtered out", "Support file type": "Support {{fileType}} files", "Support max count": "A maximum of {{maxCount}} files are supported.", @@ -620,7 +624,7 @@ "file": "File", "folder": "Folder", "import": { - "Auto mode Estimated Price Tips": "Enhanced processing calls the file processing model: {{price}} integral /1k Tokens", + "Auto mode Estimated Price Tips": "Calls the file processing model and consumes more tokens: {{price}} credits/1k Tokens", "Auto process": "Auto", "Auto process desc": "Automatically set segmentation and preprocessing rules", "CSV Import": "CSV QA Import", @@ -642,7 +646,7 @@ "Data file progress": "Data upload progress", "Data process params": "Data process params", "Down load csv template": "Download CSV template", - "Embedding Estimated Price Tips": "Index billing: {{price}}/1k Tokens", + "Embedding Estimated Price Tips": "Uses only the index model and consumes a small number of tokens: {{price}} credits/1k Tokens", "Estimated Price": "Estimated Price: {{amount}}{{unit}}", "Estimated Price Tips": "QA charges: {{charsPointsPrice}} 
points/1k Tokens", "Estimated points": "About {{points}} points", @@ -657,15 +661,19 @@ "Import Failed": "Import Failed", "Import Success Tip": "Successfully imported {{num}} data entries. Please wait for training.", "Import Tip": "This task cannot be terminated and takes some time to generate indexes. Please confirm the import. If the balance is insufficient, the unfinished task will be suspended and can continue after topping up.", + "Import success": "Import successful. Please wait for training", "Link name": "Link name", "Link name placeholder": "Only static links are supported\nOne per line, up to 10 links at a time", "Local file": "Local file", "Local file desc": "Upload files in PDF, TXT, DOCX and other formats", "Only Show First 50 Chunk": "Show only part", - "Preview chunks": "Chunks", - "Preview raw text": "Preview file text (max show 10000 words)", + "Predicted chunk": "Predicted chunk", + "Predicted chunk amount": "Predicted chunks: {{amount}}", + "Predicted total chars": "Predicted chars: {{total}}", + "Preview chunks": "Preview chunks (max 5)", + "Preview raw text": "Preview file text (up to 3000 chars)", "Process way": "Process way", - "QA Estimated Price Tips": "QA billing: {{price}}/1k Tokens (including input and output)", + "QA Estimated Price Tips": "Calls the file processing model and consumes more tokens: {{price}} credits/1k Tokens", "QA Import": "QA Split", "QA Import Tip": "The text is split into larger paragraphs according to set rules, and the AI generates a question-answer pair for each paragraph.", "Re Preview": "Re-preview", @@ -680,8 +688,8 @@ "Total tokens": "Tokens", "Training mode": "Training mode", "Upload data": "Upload data", - "Upload file progress": "File upload progress", - "Upload status": "Upload status", + "Upload file progress": "Upload progress", + "Upload status": "Status", "Upload success": "Upload success", "Web link": "Web link", "Web link desc": "Fetch static web content as a collection" @@ -1348,6 +1356,7 @@ "Pay error": "Pay error", "Pay success": "Pay success", "Plan expired time": "Plan expired time", + "Plan reset time": "Plan reset time", "Standard Plan Detail": "Standard Plan Detail", "To read plan": "Read plan", "bill": { diff --git a/projects/app/public/locales/zh/common.json b/projects/app/public/locales/zh/common.json index cba1a71a1411..47c492787b03 100644 --- a/projects/app/public/locales/zh/common.json +++ b/projects/app/public/locales/zh/common.json @@ -56,6 +56,7 @@ } }, "common": { + "Action": "操作", "Add": "添加", "Add New": "新增", "All": "全部", @@ -79,6 +80,7 @@ "Create New": "新建", "Create Success": "创建成功", "Create Time": "创建时间", + "Creating": "创建中", "Custom Title": "自定义标题", "Delete": "删除", "Delete Failed": "删除失败", @@ -191,6 +193,7 @@ "Empty file tip": "文件内容为空,可能该文件无法读取或为纯图片文件内容。", "File Content": "文件内容", "File Name": "文件名", + "File Size": "文件大小", "File content can not be empty": "文件内容不能为空", "Filename Can not Be Empty": "文件名不能为空", "Read File Error": "解析文件失败", @@ -198,6 +201,7 @@ "Select failed": "选择文件异常", "Select file amount limit": "最多选择 {{max}} 个文件", "Select file amount limit 100": "每次最多选择100个文件", + "Some file count exceeds limit": "超出{{maxCount}}个文件,已自动截取", "Some file size exceeds limit": "部分文件超出: {{maxSize}},已被过滤", "Support file type": "支持 {{fileType}} 类型文件", "Support max count": "最多支持 {{maxCount}} 个文件。", @@ -622,7 +626,7 @@ "file": "文件", "folder": "目录", "import": { - "Auto mode Estimated Price Tips": "增强处理需调用文件处理模型: {{price}}积分/1k Tokens", + "Auto mode Estimated Price Tips": 
"需调用文件处理模型,需要消耗较多Tokens: {{price}}积分/1k Tokens", "Auto process": "自动", "Auto process desc": "自动设置分割和预处理规则", "CSV Import": "CSV 导入", @@ -644,7 +648,7 @@ "Data file progress": "数据上传进度", "Data process params": "数据处理参数", "Down load csv template": "点击下载 CSV 模板", - "Embedding Estimated Price Tips": "索引计费: {{price}}积分/1k Tokens", + "Embedding Estimated Price Tips": "仅使用索引模型,消耗少量Tokens: {{price}}积分/1k Tokens", "Estimated Price": "预估价格: {{amount}}{{unit}}", "Estimated Price Tips": "QA计费为\n输入: {{charsPointsPrice}}积分/1k Tokens", "Estimated points": "预估消耗 {{points}} 积分", @@ -659,15 +663,19 @@ "Import Failed": "导入文件失败", "Import Success Tip": "共成功导入 {{num}} 组数据,请耐心等待训练.", "Import Tip": "该任务无法终止,需要一定时间生成索引,请确认导入。如果余额不足,未完成的任务会被暂停,充值后可继续进行。", + "Import success": "导入成功,请等待训练", "Link name": "网络链接", "Link name placeholder": "仅支持静态链接,如果上传后数据为空,可能该链接无法被读取\n每行一个,每次最多 10 个链接", "Local file": "本地文件", "Local file desc": "上传 PDF, TXT, DOCX 等格式的文件", "Only Show First 50 Chunk": "仅展示部分", - "Preview chunks": "分段预览", - "Preview raw text": "预览源文本(最多展示10000字)", + "Predicted chunk": "预估分段", + "Predicted chunk amount": "预估分段:{{amount}}", + "Predicted total chars": "预估字数: {{total}}", + "Preview chunks": "预览分段(最多5段)", + "Preview raw text": "预览源文本(最多3000字)", "Process way": "处理方式", - "QA Estimated Price Tips": "QA计费为: {{price}}积分/1k Tokens(包含输入和输出)", + "QA Estimated Price Tips": "需调用文件处理模型,需要消耗较多Tokens: {{price}}积分/1k Tokens", "QA Import": "QA拆分", "QA Import Tip": "根据一定规则,将文本拆成一段较大的段落,调用 AI 为该段落生成问答对。有非常高的检索精度,但是会丢失很多内容细节。", "Re Preview": "重新生成预览", @@ -683,7 +691,7 @@ "Training mode": "训练模式", "Upload data": "上传数据", "Upload file progress": "文件上传进度", - "Upload status": "上传状态", + "Upload status": "状态", "Upload success": "上传成功", "Web link": "网页链接", "Web link desc": "读取静态网页内容作为数据集" @@ -1350,6 +1358,7 @@ "Pay error": "支付失败", "Pay success": "支付成功", "Plan expired time": "套餐到期时间", + "Plan reset time": "套餐重置时间", "Standard Plan Detail": "套餐详情", "To read plan": "查看套餐", "bill": { @@ -1407,7 +1416,7 @@ "Standard update fail": "修改订阅套餐异常", "Standard update success": "变更订阅套餐成功!", "Sub plan": "订阅套餐", - "Sub plan tip": "免费使用 FastGPT 或升级更高的套餐", + "Sub plan tip": "免费使用 {{title}} 或升级更高的套餐", "Team plan and usage": "套餐与用量", "Training weight": "训练优先级: {{weight}}", "Update extra ai points": "额外AI积分", diff --git a/projects/app/src/global/core/dataset/api.d.ts b/projects/app/src/global/core/dataset/api.d.ts index 314a4fe77e88..d909078e443e 100644 --- a/projects/app/src/global/core/dataset/api.d.ts +++ b/projects/app/src/global/core/dataset/api.d.ts @@ -2,6 +2,7 @@ import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import { DatasetSearchModeEnum, DatasetTypeEnum, + ImportDataSourceEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { @@ -67,3 +68,24 @@ export type SearchTestResponse = { similarity: number; usingQueryExtension: boolean; }; + +/* =========== training =========== */ +export type PostPreviewFilesChunksProps = { + type: `${ImportDataSourceEnum}`; + sourceId: string; + chunkSize: number; + overlapRatio: number; + customSplitChar?: string; +}; + +export type PostPreviewFilesChunksResponse = { + fileId: string; + rawTextLength: number; + chunks: string[]; +}[]; +export type PostPreviewTableChunksResponse = { + fileId: string; + totalChunks: number; + chunks: { q: string; a: string; chunkIndex: number }[]; + errorText?: string; +}[]; diff --git a/projects/app/src/global/core/dataset/request.d.ts b/projects/app/src/global/core/dataset/request.d.ts deleted file mode 100644 index 
111c33d02985..000000000000 --- a/projects/app/src/global/core/dataset/request.d.ts +++ /dev/null @@ -1,5 +0,0 @@ -/* ================= dataset ===================== */ - -/* ================= collection ===================== */ - -/* ================= data ===================== */ diff --git a/projects/app/src/pages/account/components/Info.tsx b/projects/app/src/pages/account/components/Info.tsx index ed3eac4bd250..41e649c90347 100644 --- a/projects/app/src/pages/account/components/Info.tsx +++ b/projects/app/src/pages/account/components/Info.tsx @@ -397,14 +397,22 @@ const PlanUsage = () => { {t(planName)} - - {t('support.wallet.Plan expired time')}: - {formatTime2YMD(standardPlan?.expiredTime)} - - {isFreeTeam && ( - - 免费版用户30天无任何使用记录时,系统会自动清理账号知识库。 - + + {isFreeTeam ? ( + <> + + {t('support.wallet.Plan reset time')}: + {formatTime2YMD(standardPlan?.expiredTime)} + + + 免费版用户30天无任何使用记录时,系统会自动清理账号知识库。 + + + ) : ( + + {t('support.wallet.Plan expired time')}: + {formatTime2YMD(standardPlan?.expiredTime)} + )} + + {feConfigs?.show_pay && ( + + + {priceTip} + + )} + + - + + + {isOpenCustomPrompt && ( void; }) => { const { t } = useTranslation(); - const { sources, setSources } = useImportStore(); return ( - + diff --git a/projects/app/src/pages/dataset/detail/components/Import/commonProgress/Upload.tsx b/projects/app/src/pages/dataset/detail/components/Import/commonProgress/Upload.tsx index d0350a352040..be6e180960fc 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/commonProgress/Upload.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/commonProgress/Upload.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useState } from 'react'; +import React from 'react'; import { Box, TableContainer, @@ -8,164 +8,109 @@ import { Th, Td, Tbody, - Progress, Flex, Button } from '@chakra-ui/react'; import { useImportStore, type FormType } from '../Provider'; +import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; import { useTranslation } from 'next-i18next'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { useRequest } from '@fastgpt/web/hooks/useRequest'; -import { postCreateTrainingUsage } from '@/web/support/wallet/usage/api'; import { useDatasetStore } from '@/web/core/dataset/store/dataset'; -import { chunksUpload, fileCollectionCreate } from '@/web/core/dataset/utils'; -import { ImportSourceItemType } from '@/web/core/dataset/type'; -import { hashStr } from '@fastgpt/global/common/string/tools'; import { useToast } from '@fastgpt/web/hooks/useToast'; import { useRouter } from 'next/router'; import { TabEnum } from '../../../index'; -import { postCreateDatasetLinkCollection, postDatasetCollection } from '@/web/core/dataset/api'; -import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; -import { checkTeamDatasetSizeLimit } from '@/web/support/user/team/api'; - -const Upload = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { +import { + postCreateDatasetCsvTableCollection, + postCreateDatasetFileCollection, + postCreateDatasetLinkCollection, + postCreateDatasetTextCollection +} from '@/web/core/dataset/api'; +import { getErrText } from '@fastgpt/global/common/error/utils'; +import Tag from '@/components/Tag'; + +const Upload = () => { const { t } = useTranslation(); const { toast } = useToast(); const router = useRouter(); const { datasetDetail } = useDatasetStore(); - const { parentId, sources, processParamsForm, chunkSize, totalChunks, uploadRate } = + const { importSource, parentId, 
sources, setSources, processParamsForm, chunkSize } = useImportStore(); - const [uploadList, setUploadList] = useState< - (ImportSourceItemType & { - uploadedFileRate: number; - uploadedChunksRate: number; - })[] - >([]); const { handleSubmit } = processParamsForm; const { mutate: startUpload, isLoading } = useRequest({ mutationFn: async ({ mode, customSplitChar, qaPrompt, webSelector }: FormType) => { - if (uploadList.length === 0) return; - - await checkTeamDatasetSizeLimit(totalChunks); - - let totalInsertion = 0; + if (sources.length === 0) return; + const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting'); // Batch create collection and upload chunks - for await (const item of uploadList) { - // create collection - const collectionId = await (async () => { - const commonParams = { - parentId, - trainingType: mode, - datasetId: datasetDetail._id, - chunkSize, - chunkSplitter: customSplitChar, - qaPrompt, - - name: item.sourceName, - rawTextLength: item.rawText.length, - hashRawText: hashStr(item.rawText) - }; - if (item.file) { - return fileCollectionCreate({ - file: item.file, - data: { - ...commonParams, - collectionMetadata: { - relatedImgId: item.id + for await (const item of filterWaitingSources) { + setSources((state) => + state.map((source) => + source.id === item.id + ? { + ...source, + createStatus: 'creating' } - }, - percentListen: (e) => { - setUploadList((state) => - state.map((uploadItem) => - uploadItem.id === item.id - ? { - ...uploadItem, - uploadedFileRate: e - } - : uploadItem - ) - ); - } - }); - } else if (item.link) { - const { collectionId } = await postCreateDatasetLinkCollection({ - ...commonParams, - link: item.link, - metadata: { - webPageSelector: webSelector - } - }); - setUploadList((state) => - state.map((uploadItem) => - uploadItem.id === item.id - ? { - ...uploadItem, - uploadedFileRate: 100 - } - : uploadItem - ) - ); - return collectionId; - } else if (item.rawText) { - // manual collection - return postDatasetCollection({ - ...commonParams, - type: DatasetCollectionTypeEnum.virtual - }); - } - return ''; - })(); - - if (!collectionId) continue; - if (item.link) continue; + : source + ) + ); - const billId = await postCreateTrainingUsage({ - name: item.sourceName, - datasetId: datasetDetail._id - }); + // create collection + const commonParams = { + parentId, + trainingType: mode, + datasetId: datasetDetail._id, + chunkSize, + chunkSplitter: customSplitChar, + qaPrompt, + + name: item.sourceName + }; + if (importSource === ImportDataSourceEnum.fileLocal && item.dbFileId) { + await postCreateDatasetFileCollection({ + ...commonParams, + fileId: item.dbFileId + }); + } else if (importSource === ImportDataSourceEnum.fileLink && item.link) { + await postCreateDatasetLinkCollection({ + ...commonParams, + link: item.link, + metadata: { + webPageSelector: webSelector + } + }); + } else if (importSource === ImportDataSourceEnum.fileCustom && item.rawText) { + // manual collection + await postCreateDatasetTextCollection({ + ...commonParams, + text: item.rawText + }); + } else if (importSource === ImportDataSourceEnum.csvTable && item.dbFileId) { + await postCreateDatasetCsvTableCollection({ + ...commonParams, + fileId: item.dbFileId + }); + } - // upload chunks - const chunks = item.chunks; - const { insertLen } = await chunksUpload({ - collectionId, - billId, - trainingMode: mode, - chunks, - rate: uploadRate, - onUploading: (e) => { - setUploadList((state) => - state.map((uploadItem) => - uploadItem.id === item.id - ? 
{ - ...uploadItem, - uploadedChunksRate: e - } - : uploadItem - ) - ); - }, - prompt: qaPrompt - }); - totalInsertion += insertLen; + setSources((state) => + state.map((source) => + source.id === item.id + ? { + ...source, + createStatus: 'finish' + } + : source + ) + ); } - - return totalInsertion; }, - onSuccess(num) { - if (showPreviewChunks) { - toast({ - title: t('core.dataset.import.Import Success Tip', { num }), - status: 'success' - }); - } else { - toast({ - title: t('core.dataset.import.Upload success'), - status: 'success' - }); - } + onSuccess() { + toast({ + title: t('core.dataset.import.Import success'), + status: 'success' + }); // close import page router.replace({ @@ -175,21 +120,21 @@ const Upload = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { } }); }, + onError() { + setSources((state) => + state.map((source) => + source.createStatus === 'creating' + ? { + ...source, + createStatus: 'waiting' + } + : source + ) + ); + }, errorToast: t('common.file.Upload failed') }); - useEffect(() => { - setUploadList( - sources.map((item) => { - return { - ...item, - uploadedFileRate: item.file ? 0 : -1, - uploadedChunksRate: 0 - }; - }) - ); - }, []); - return ( @@ -199,85 +144,35 @@ const Upload = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { {t('core.dataset.import.Source name')} - {showPreviewChunks ? ( - <> - - {t('core.dataset.Chunk amount')} - - - {t('core.dataset.import.Upload file progress')} - - - {t('core.dataset.import.Data file progress')} - - - ) : ( - <> - - {t('core.dataset.import.Upload status')} - - - )} + + {t('core.dataset.import.Upload status')} + - {uploadList.map((item) => ( + {sources.map((item) => ( - - - {item.sourceName} + + + + + {item.sourceName} + + + + + + {item.createStatus === 'waiting' && ( + {t('common.Waiting')} + )} + {item.createStatus === 'creating' && ( + {t('common.Creating')} + )} + {item.createStatus === 'finish' && ( + {t('common.Finish')} + )} + - {showPreviewChunks ? ( - <> - {item.chunks.length} - - {item.uploadedFileRate === -1 ? ( - '-' - ) : ( - - - {`${item.uploadedFileRate}%`} - - )} - - - - - {`${item.uploadedChunksRate}%`} - - - - ) : ( - <> - - {item.uploadedFileRate === 100 ? 
t('common.Finish') : t('common.Waiting')} - - - )} ))} @@ -286,8 +181,8 @@ const Upload = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/FileSelector.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/FileSelector.tsx new file mode 100644 index 000000000000..dc709c5f9a07 --- /dev/null +++ b/projects/app/src/pages/dataset/detail/components/Import/components/FileSelector.tsx @@ -0,0 +1,296 @@ +import MyBox from '@/components/common/MyBox'; +import { useSelectFile } from '@/web/common/file/hooks/useSelectFile'; +import { useToast } from '@fastgpt/web/hooks/useToast'; +import { Box, FlexProps } from '@chakra-ui/react'; +import { formatFileSize } from '@fastgpt/global/common/file/tools'; +import MyIcon from '@fastgpt/web/components/common/Icon'; +import { useTranslation } from 'next-i18next'; +import React, { DragEvent, useCallback, useMemo, useState } from 'react'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import { useRequest } from '@fastgpt/web/hooks/useRequest'; +import { getFileIcon } from '@fastgpt/global/common/file/icon'; +import { useSystemStore } from '@/web/common/system/useSystemStore'; +import { uploadFile2DB } from '@/web/common/file/controller'; +import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { ImportSourceItemType } from '@/web/core/dataset/type'; + +export type SelectFileItemType = { + fileId: string; + folderPath: string; + file: File; +}; + +const FileSelector = ({ + fileType, + selectFiles, + setSelectFiles, + onStartSelect, + onFinishSelect, + ...props +}: { + fileType: string; + selectFiles: ImportSourceItemType[]; + setSelectFiles: React.Dispatch>; + onStartSelect: () => void; + onFinishSelect: () => void; +} & FlexProps) => { + const { t } = useTranslation(); + const { toast } = useToast(); + const { feConfigs } = useSystemStore(); + + const maxCount = feConfigs?.uploadFileMaxAmount || 1000; + const maxSize = (feConfigs?.uploadFileMaxSize || 1024) * 1024 * 1024; + + const { File, onOpen } = useSelectFile({ + fileType, + multiple: true, + maxCount + }); + const [isDragging, setIsDragging] = useState(false); + const isMaxSelected = useMemo( + () => selectFiles.length >= maxCount, + [maxCount, selectFiles.length] + ); + + const filterTypeReg = new RegExp( + `(${fileType + .split(',') + .map((item) => item.trim()) + .join('|')})$`, + 'i' + ); + + const { mutate: onSelectFile, isLoading } = useRequest({ + mutationFn: async (files: SelectFileItemType[]) => { + { + onStartSelect(); + setSelectFiles((state) => { + const formatFiles = files.map((selectFile) => { + const { fileId, file } = selectFile; + + return { + id: fileId, + createStatus: 'waiting', + file, + sourceName: file.name, + sourceSize: formatFileSize(file.size), + icon: getFileIcon(file.name), + isUploading: true, + uploadedFileRate: 0 + }; + }); + const results = formatFiles.concat(state).slice(0, maxCount); + return results; + }); + try { + // upload file + await Promise.all( + files.map(async ({ fileId, file }) => { + const uploadFileId = await uploadFile2DB({ + file, + bucketName: BucketNameEnum.dataset, + percentListen: (e) => { + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? { + ...item, + uploadedFileRate: e + } + : item + ) + ); + } + }); + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? 
{ + ...item, + dbFileId: uploadFileId, + isUploading: false + } + : item + ) + ); + }) + ); + } catch (error) { + console.log(error); + } + onFinishSelect(); + } + } + }); + + const selectFileCallback = useCallback( + (files: SelectFileItemType[]) => { + if (selectFiles.length + files.length > maxCount) { + files = files.slice(0, maxCount - selectFiles.length); + toast({ + status: 'warning', + title: t('common.file.Some file count exceeds limit', { maxCount }) + }); + } + // size check + if (!maxSize) { + return onSelectFile(files); + } + const filterFiles = files.filter((item) => item.file.size <= maxSize); + + if (filterFiles.length < files.length) { + toast({ + status: 'warning', + title: t('common.file.Some file size exceeds limit', { maxSize: formatFileSize(maxSize) }) + }); + } + + return onSelectFile(filterFiles); + }, + [maxCount, maxSize, onSelectFile, selectFiles.length, t, toast] + ); + + const handleDragEnter = (e: DragEvent) => { + e.preventDefault(); + setIsDragging(true); + }; + + const handleDragLeave = (e: DragEvent) => { + e.preventDefault(); + setIsDragging(false); + }; + + const handleDrop = async (e: DragEvent) => { + e.preventDefault(); + setIsDragging(false); + + const items = e.dataTransfer.items; + const fileList: SelectFileItemType[] = []; + + if (e.dataTransfer.items.length <= 1) { + const traverseFileTree = async (item: any) => { + return new Promise((resolve, reject) => { + if (item.isFile) { + item.file((file: File) => { + const folderPath = (item.fullPath || '').split('/').slice(2, -1).join('/'); + + if (filterTypeReg.test(file.name)) { + fileList.push({ + fileId: getNanoid(), + folderPath, + file + }); + } + resolve(); + }); + } else if (item.isDirectory) { + const dirReader = item.createReader(); + dirReader.readEntries(async (entries: any[]) => { + for (let i = 0; i < entries.length; i++) { + await traverseFileTree(entries[i]); + } + resolve(); + }); + } + }); + }; + + for await (const item of items) { + await traverseFileTree(item.webkitGetAsEntry()); + } + } else { + const files = Array.from(e.dataTransfer.files); + let isErr = files.some((item) => item.type === ''); + if (isErr) { + return toast({ + title: t('file.upload error description'), + status: 'error' + }); + } + + fileList.push( + ...files + .filter((item) => filterTypeReg.test(item.name)) + .map((file) => ({ + fileId: getNanoid(), + folderPath: '', + file + })) + ); + } + + selectFileCallback(fileList.slice(0, maxCount)); + }; + + return ( + e.preventDefault(), + onDragLeave: handleDragLeave, + onDrop: handleDrop, + onClick: onOpen + })} + {...props} + > + + {isMaxSelected ? ( + <> + + 已达到最大文件数量 + + + ) : ( + <> + + {isDragging + ? 
t('file.Release the mouse to upload the file') + : t('common.file.Select and drag file tip')} + + {/* file type */} + + {t('common.file.Support file type', { fileType })} + + + {/* max count */} + {maxCount && t('common.file.Support max count', { maxCount })} + {/* max size */} + {maxSize && t('common.file.Support max size', { maxSize: formatFileSize(maxSize) })} + + + + selectFileCallback( + files.map((file) => ({ + fileId: getNanoid(), + folderPath: '', + file + })) + ) + } + /> + + )} + + ); +}; + +export default React.memo(FileSelector); diff --git a/projects/app/src/pages/dataset/detail/components/Import/sourceSelector/FileSourceSelector.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/FileSourceSelector.tsx similarity index 96% rename from projects/app/src/pages/dataset/detail/components/Import/sourceSelector/FileSourceSelector.tsx rename to projects/app/src/pages/dataset/detail/components/Import/components/FileSourceSelector.tsx index e30bab2407e4..38af361d8a98 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/sourceSelector/FileSourceSelector.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/components/FileSourceSelector.tsx @@ -3,9 +3,9 @@ import MyModal from '@fastgpt/web/components/common/MyModal'; import { ModalBody, ModalFooter, Button } from '@chakra-ui/react'; import { useTranslation } from 'next-i18next'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; -import { ImportDataSourceEnum } from '..'; import { useRouter } from 'next/router'; import { TabEnum } from '../../..'; +import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; const FileModeSelector = ({ onClose }: { onClose: () => void }) => { const { t } = useTranslation(); diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/Preview.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/Preview.tsx index e6fbf630d8bb..97d7f0a15f60 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/components/Preview.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/components/Preview.tsx @@ -1,132 +1,94 @@ -import React, { useMemo, useState } from 'react'; -import { Box, Flex } from '@chakra-ui/react'; +import React, { useState } from 'react'; +import { Box, Flex, IconButton } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import { useTranslation } from 'next-i18next'; -import RowTabs from '@fastgpt/web/components/common/Tabs/RowTabs'; +import { useImportStore } from '../Provider'; +import MyMenu from '@/components/MyMenu'; import { ImportSourceItemType } from '@/web/core/dataset/type'; +import dynamic from 'next/dynamic'; +const PreviewRawText = dynamic(() => import('./PreviewRawText')); +const PreviewChunks = dynamic(() => import('./PreviewChunks')); -enum PreviewListEnum { - chunks = 'chunks', - sources = 'sources' -} - -const Preview = ({ - sources, - showPreviewChunks -}: { - sources: ImportSourceItemType[]; - showPreviewChunks: boolean; -}) => { +const Preview = ({ showPreviewChunks }: { showPreviewChunks: boolean }) => { const { t } = useTranslation(); - const [previewListType, setPreviewListType] = useState( - showPreviewChunks ? 
PreviewListEnum.chunks : PreviewListEnum.sources - ); - const chunks = useMemo(() => { - const oneSourceChunkLength = Math.max(4, Math.floor(50 / sources.length)); - return sources - .map((source) => - source.chunks.slice(0, oneSourceChunkLength).map((chunk, i) => ({ - ...chunk, - index: i + 1, - sourceName: source.sourceName, - sourceIcon: source.icon - })) - ) - .flat(); - }, [sources]); + const { sources } = useImportStore(); + const [previewRawTextSource, setPreviewRawTextSource] = useState(); + const [previewChunkSource, setPreviewChunkSource] = useState(); return ( - - - + + + {t('core.dataset.import.Sources list')} + + + {sources.map((source) => ( + + + + {source.sourceName} + + {showPreviewChunks && ( + + } + aria-label={''} + size={'sm'} + variant={'whitePrimary'} + /> } - ] - : []), - { - icon: 'core/dataset/fileCollection', - label: t('core.dataset.import.Sources list'), - value: PreviewListEnum.sources - } - ]} - value={previewListType} - onChange={(e) => setPreviewListType(e as PreviewListEnum)} - /> - - - {previewListType === PreviewListEnum.chunks ? ( - <> - {chunks.map((chunk, i) => ( - - - - # {chunk.index} - - - - {chunk.sourceName} - - - - {chunk.q} - {chunk.a} - + menuList={[ + { + label: ( + + + {t('core.dataset.import.Preview raw text')} + + ), + onClick: () => setPreviewRawTextSource(source) + }, + { + label: ( + + + {t('core.dataset.import.Preview chunks')} + + ), + onClick: () => setPreviewChunkSource(source) + } + ]} + /> - ))} - - ) : ( - <> - {sources.map((source) => ( - - - - {source.sourceName} - - {showPreviewChunks && ( - - {t('core.dataset.import.File chunk amount', { amount: source.chunks.length })} - - )} - - ))} - - )} + )} + + ))} + {!!previewRawTextSource && ( + setPreviewRawTextSource(undefined)} + /> + )} + {!!previewChunkSource && ( + setPreviewChunkSource(undefined)} + /> + )} ); }; diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx new file mode 100644 index 000000000000..e0c1a6b7e3a6 --- /dev/null +++ b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewChunks.tsx @@ -0,0 +1,95 @@ +import React, { useMemo } from 'react'; +import { Box } from '@chakra-ui/react'; +import { ImportSourceItemType } from '@/web/core/dataset/type'; +import { useQuery } from '@tanstack/react-query'; +import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer'; +import { getPreviewChunks } from '@/web/core/dataset/api'; +import { useImportStore } from '../Provider'; +import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; +import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { useToast } from '@fastgpt/web/hooks/useToast'; +import { getErrText } from '@fastgpt/global/common/error/utils'; + +const PreviewChunks = ({ + previewSource, + onClose +}: { + previewSource: ImportSourceItemType; + onClose: () => void; +}) => { + const { toast } = useToast(); + const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useImportStore(); + + const { data = [], isLoading } = useQuery( + ['previewSource'], + () => { + if ( + importSource === ImportDataSourceEnum.fileLocal || + importSource === ImportDataSourceEnum.csvTable || + importSource === ImportDataSourceEnum.fileLink + ) { + return getPreviewChunks({ + type: importSource, + sourceId: previewSource.dbFileId || previewSource.link || '', + chunkSize, + overlapRatio: chunkOverlapRatio, 
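+ // Note: these params mirror PostPreviewFilesChunksProps (global/core/dataset/api.d.ts); inferred from those request/response types, the preview endpoint re-reads the stored file or link on the server, splits it with the same chunkSize / overlapRatio / customSplitChar the import step will use, and returns the resulting chunks.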
+ customSplitChar: processParamsForm.getValues('customSplitChar') + }); + } else if (importSource === ImportDataSourceEnum.fileCustom) { + const customSplitChar = processParamsForm.getValues('customSplitChar'); + const { chunks } = splitText2Chunks({ + text: previewSource.rawText || '', + chunkLen: chunkSize, + overlapRatio: chunkOverlapRatio, + customReg: customSplitChar ? [customSplitChar] : [] + }); + return chunks.map((chunk) => ({ + q: chunk, + a: '' + })); + } + return []; + }, + { + onError(err) { + toast({ + status: 'warning', + title: getErrText(err) + }); + } + } + ); + + return ( + + {data.map((item, index) => ( + + {item.q} + {item.a} + + ))} + + ); +}; + +export default React.memo(PreviewChunks); diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx index d98c386e112b..b995d4ed83a1 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/components/PreviewRawText.tsx @@ -1,28 +1,73 @@ import React from 'react'; -import MyModal from '@fastgpt/web/components/common/MyModal'; -import { ModalBody } from '@chakra-ui/react'; - -export type PreviewRawTextProps = { - icon: string; - title: string; - rawText: string; -}; +import { Box } from '@chakra-ui/react'; +import { ImportSourceItemType } from '@/web/core/dataset/type'; +import { useQuery } from '@tanstack/react-query'; +import { getPreviewFileContent } from '@/web/common/file/api'; +import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer'; +import { useImportStore } from '../Provider'; +import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; +import { useToast } from '@fastgpt/web/hooks/useToast'; +import { getErrText } from '@fastgpt/global/common/error/utils'; const PreviewRawText = ({ - icon, - title, - rawText, + previewSource, onClose -}: PreviewRawTextProps & { +}: { + previewSource: ImportSourceItemType; onClose: () => void; }) => { + const { toast } = useToast(); + const { importSource } = useImportStore(); + + const { data, isLoading } = useQuery( + ['previewSource', previewSource?.dbFileId], + () => { + if (importSource === ImportDataSourceEnum.fileLocal && previewSource.dbFileId) { + return getPreviewFileContent({ + fileId: previewSource.dbFileId, + csvFormat: true + }); + } + if (importSource === ImportDataSourceEnum.csvTable && previewSource.dbFileId) { + return getPreviewFileContent({ + fileId: previewSource.dbFileId, + csvFormat: false + }); + } + if (importSource === ImportDataSourceEnum.fileCustom) { + return { + previewContent: (previewSource.rawText || '').slice(0, 3000) + }; + } + + return { + previewContent: '' + }; + }, + { + onError(err) { + toast({ + status: 'warning', + title: getErrText(err) + }); + } + } + ); + + const rawText = data?.previewContent || ''; + return ( - - + + {rawText} - - + + ); }; -export default PreviewRawText; +export default React.memo(PreviewRawText); diff --git a/projects/app/src/pages/dataset/detail/components/Import/components/RenderFiles.tsx b/projects/app/src/pages/dataset/detail/components/Import/components/RenderFiles.tsx new file mode 100644 index 000000000000..a6489715b2a5 --- /dev/null +++ b/projects/app/src/pages/dataset/detail/components/Import/components/RenderFiles.tsx @@ -0,0 +1,119 @@ +import React, { useState } from 'react'; +import { + Flex, + TableContainer, + Table, + Thead, + Tr, + 
Th, + Td, + Tbody, + Progress, + IconButton +} from '@chakra-ui/react'; +import { ImportSourceItemType } from '@/web/core/dataset/type.d'; +import MyIcon from '@fastgpt/web/components/common/Icon'; +import { useTranslation } from 'next-i18next'; +import MyTooltip from '@/components/MyTooltip'; +import dynamic from 'next/dynamic'; + +const PreviewRawText = dynamic(() => import('./PreviewRawText')); + +export const RenderUploadFiles = ({ + files, + setFiles, + showPreviewContent +}: { + files: ImportSourceItemType[]; + setFiles: React.Dispatch>; + showPreviewContent?: boolean; +}) => { + const { t } = useTranslation(); + const [previewFile, setPreviewFile] = useState(); + + return files.length > 0 ? ( + <> + + + + + + + + + + + + {files.map((item) => ( + + + + + + + ))} + +
+ {t('common.file.File Name')} + + {t('core.dataset.import.Upload file progress')} + + {t('common.file.File Size')} + + {t('common.Action')} +
+ + + {item.sourceName} + + + + = 100 ? 'green' : 'blue'} + bg="myGray.200" + hasStripe + isAnimated + mr={2} + /> + {`${item.uploadedFileRate}%`} + + {item.sourceSize} + {!item.isUploading && ( + + {showPreviewContent && ( + + } + aria-label={''} + onClick={() => setPreviewFile(item)} + /> + + )} + + } + aria-label={''} + onClick={() => { + setFiles((state) => state.filter((file) => file.id !== item.id)); + }} + /> + + )} +
+
+ {!!previewFile && ( + setPreviewFile(undefined)} /> + )} + + ) : null; +}; diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileCustomText.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileCustomText.tsx index 065922a2d7e7..5161376dcf21 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileCustomText.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileCustomText.tsx @@ -1,4 +1,4 @@ -import React, { useEffect } from 'react'; +import React, { useCallback, useEffect } from 'react'; import { ImportDataComponentProps } from '@/web/core/dataset/type.d'; import dynamic from 'next/dynamic'; @@ -19,7 +19,7 @@ const CustomTet = ({ activeStep, goToNext }: ImportDataComponentProps) => { <> {activeStep === 0 && } {activeStep === 1 && } - {activeStep === 2 && } + {activeStep === 2 && } ); }; @@ -36,6 +36,24 @@ const CustomTextInput = ({ goToNext }: { goToNext: () => void }) => { } }); + const onSubmit = useCallback( + (data: { name: string; value: string }) => { + const fileId = getNanoid(32); + + setSources([ + { + id: fileId, + createStatus: 'waiting', + rawText: data.value, + sourceName: data.name, + icon: 'file/fill/manual' + } + ]); + goToNext(); + }, + [goToNext, setSources] + ); + useEffect(() => { const source = sources[0]; if (source) { @@ -78,25 +96,7 @@ const CustomTextInput = ({ goToNext }: { goToNext: () => void }) => { />
- + ); diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLink.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLink.tsx index ebb2a198ffe4..c4b4d7a1209f 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLink.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLink.tsx @@ -23,7 +23,7 @@ const LinkCollection = ({ activeStep, goToNext }: ImportDataComponentProps) => { <> {activeStep === 0 && } {activeStep === 1 && } - {activeStep === 2 && } + {activeStep === 2 && } ); }; @@ -128,10 +128,8 @@ const CustomLinkImport = ({ goToNext }: { goToNext: () => void }) => { setSources( newLinkList.map((link) => ({ id: getNanoid(32), + createStatus: 'waiting', link, - rawText: '', - chunks: [], - chunkChars: 0, sourceName: link, icon: LinkCollectionIcon })) diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx index 69315f1e7aa6..efa1692cac7f 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx @@ -1,41 +1,27 @@ -import React, { useEffect, useMemo, useState } from 'react'; -import { ImportDataComponentProps } from '@/web/core/dataset/type.d'; -import { Box, Button, Flex } from '@chakra-ui/react'; -import { ImportSourceItemType } from '@/web/core/dataset/type.d'; -import FileSelector, { type SelectFileItemType } from '@/web/core/dataset/components/FileSelector'; -import { getFileIcon } from '@fastgpt/global/common/file/icon'; -import MyIcon from '@fastgpt/web/components/common/Icon'; -import { formatFileSize } from '@fastgpt/global/common/file/tools'; +import React, { useCallback, useEffect, useMemo, useState } from 'react'; +import { ImportDataComponentProps, ImportSourceItemType } from '@/web/core/dataset/type.d'; +import { Box, Button } from '@chakra-ui/react'; +import FileSelector from '../components/FileSelector'; import { useTranslation } from 'next-i18next'; -import { getNanoid } from '@fastgpt/global/common/string/tools'; -import { useRequest } from '@fastgpt/web/hooks/useRequest'; -import { readFileRawContent } from '@fastgpt/web/common/file/read'; -import { getUploadBase64ImgController } from '@/web/common/file/controller'; -import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants'; -import MyTooltip from '@/components/MyTooltip'; -import type { PreviewRawTextProps } from '../components/PreviewRawText'; import { useImportStore } from '../Provider'; -import { useSystemStore } from '@/web/common/system/useSystemStore'; import dynamic from 'next/dynamic'; import Loading from '@fastgpt/web/components/common/MyLoading'; +import { RenderUploadFiles } from '../components/RenderFiles'; const DataProcess = dynamic(() => import('../commonProgress/DataProcess'), { loading: () => }); const Upload = dynamic(() => import('../commonProgress/Upload')); -const PreviewRawText = dynamic(() => import('../components/PreviewRawText')); -type FileItemType = ImportSourceItemType & { file: File }; -const fileType = '.txt, .docx, .csv, .pdf, .md, .html'; -const maxSelectFileCount = 1000; +const fileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx'; const FileLocal = ({ activeStep, goToNext }: ImportDataComponentProps) => { return ( <> {activeStep === 0 && } {activeStep === 1 && } - {activeStep === 2 && } + {activeStep === 
2 && } ); }; @@ -44,135 +30,47 @@ export default React.memo(FileLocal); const SelectFile = React.memo(function SelectFile({ goToNext }: { goToNext: () => void }) { const { t } = useTranslation(); - const { feConfigs } = useSystemStore(); const { sources, setSources } = useImportStore(); - // @ts-ignore - const [selectFiles, setSelectFiles] = useState(sources); + const [selectFiles, setSelectFiles] = useState( + sources.map((source) => ({ + isUploading: false, + ...source + })) + ); + const [uploading, setUploading] = useState(false); const successFiles = useMemo(() => selectFiles.filter((item) => !item.errorMsg), [selectFiles]); - const [previewRaw, setPreviewRaw] = useState(); - useEffect(() => { setSources(successFiles); - }, [successFiles]); - - const { mutate: onSelectFile, isLoading } = useRequest({ - mutationFn: async (files: SelectFileItemType[]) => { - { - for await (const selectFile of files) { - const { file, folderPath } = selectFile; - const relatedId = getNanoid(32); + }, [setSources, successFiles]); - const { rawText } = await (() => { - try { - return readFileRawContent({ - file, - uploadBase64Controller: (base64Img) => - getUploadBase64ImgController({ - base64Img, - type: MongoImageTypeEnum.collectionImage, - metadata: { - relatedId - } - }) - }); - } catch (error) { - return { rawText: '' }; - } - })(); - - const item: FileItemType = { - id: relatedId, - file, - rawText, - chunks: [], - chunkChars: 0, - sourceFolderPath: folderPath, - sourceName: file.name, - sourceSize: formatFileSize(file.size), - icon: getFileIcon(file.name), - errorMsg: rawText.length === 0 ? t('common.file.Empty file tip') : '' - }; - - setSelectFiles((state) => { - const results = [item].concat(state).slice(0, maxSelectFileCount); - return results; - }); - } - } - } - }); + const onclickNext = useCallback(() => { + // filter uploaded files + setSelectFiles((state) => state.filter((item) => (item.uploadedFileRate || 0) >= 100)); + goToNext(); + }, [goToNext]); return ( setUploading(true)} + onFinishSelect={() => setUploading(false)} /> {/* render files */} - - {selectFiles.map((item) => ( - - - setPreviewRaw({ - icon: item.icon, - title: item.sourceName, - rawText: item.rawText.slice(0, 10000) - }) - } - > - - - {item.sourceName} - - - {item.sourceSize} - {item.rawText.length > 0 && ( - <>,{t('common.Number of words', { amount: item.rawText.length })} - )} - - {item.errorMsg && ( - - - - )} - { - e.stopPropagation(); - setSelectFiles((state) => state.filter((file) => file.id !== item.id)); - }} - /> - - - ))} - + - - - - {previewRaw && setPreviewRaw(undefined)} />} ); }); diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx index 0f90ef52a5a6..27e884e96d15 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx @@ -1,108 +1,62 @@ import React, { useEffect, useMemo, useState } from 'react'; -import { ImportDataComponentProps } from '@/web/core/dataset/type.d'; -import { Box, Button, Flex } from '@chakra-ui/react'; -import { ImportSourceItemType } from '@/web/core/dataset/type.d'; -import FileSelector, { type SelectFileItemType } from '@/web/core/dataset/components/FileSelector'; -import { getFileIcon } from '@fastgpt/global/common/file/icon'; -import MyIcon from '@fastgpt/web/components/common/Icon'; -import { formatFileSize } from 
'@fastgpt/global/common/file/tools'; +import { ImportDataComponentProps, ImportSourceItemType } from '@/web/core/dataset/type.d'; +import { Box, Button } from '@chakra-ui/react'; +import FileSelector from '../components/FileSelector'; import { useTranslation } from 'next-i18next'; -import { getNanoid } from '@fastgpt/global/common/string/tools'; -import { useRequest } from '@fastgpt/web/hooks/useRequest'; -import MyTooltip from '@/components/MyTooltip'; import { useImportStore } from '../Provider'; -import { useSystemStore } from '@/web/common/system/useSystemStore'; import dynamic from 'next/dynamic'; import { fileDownload } from '@/web/common/file/utils'; -import { readCsvContent } from '@fastgpt/web/common/file/read/csv'; +import { RenderUploadFiles } from '../components/RenderFiles'; const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); const Upload = dynamic(() => import('../commonProgress/Upload')); -type FileItemType = ImportSourceItemType & { file: File }; const fileType = '.csv'; -const maxSelectFileCount = 1000; const FileLocal = ({ activeStep, goToNext }: ImportDataComponentProps) => { return ( <> {activeStep === 0 && } {activeStep === 1 && } - {activeStep === 2 && } + {activeStep === 2 && } ); }; export default React.memo(FileLocal); -const csvTemplate = `index,content -"必填内容","可选内容。CSV 中请注意内容不能包含双引号,双引号是列分割符号" +const csvTemplate = `"第一列内容","第二列内容" +"必填列","可选列。CSV 中请注意内容不能包含双引号,双引号是列分割符号" +"只会将第一和第二列内容导入,其余列会被忽略","" "结合人工智能的演进历程,AIGC的发展大致可以分为三个阶段,即:早期萌芽阶段(20世纪50年代至90年代中期)、沉淀积累阶段(20世纪90年代中期至21世纪10年代中期),以及快速发展阶段(21世纪10年代中期至今)。","" "AIGC发展分为几个阶段?","早期萌芽阶段(20世纪50年代至90年代中期)、沉淀积累阶段(20世纪90年代中期至21世纪10年代中期)、快速发展阶段(21世纪10年代中期至今)"`; const SelectFile = React.memo(function SelectFile({ goToNext }: { goToNext: () => void }) { const { t } = useTranslation(); - const { feConfigs } = useSystemStore(); const { sources, setSources } = useImportStore(); - // @ts-ignore - const [selectFiles, setSelectFiles] = useState(sources); + const [selectFiles, setSelectFiles] = useState( + sources.map((source) => ({ + isUploading: false, + ...source + })) + ); + const [uploading, setUploading] = useState(false); + const successFiles = useMemo(() => selectFiles.filter((item) => !item.errorMsg), [selectFiles]); useEffect(() => { setSources(successFiles); }, [successFiles]); - const { mutate: onSelectFile, isLoading } = useRequest({ - mutationFn: async (files: SelectFileItemType[]) => { - { - for await (const selectFile of files) { - const { file, folderPath } = selectFile; - const { header, data } = await readCsvContent({ file }); - - const filterData: FileItemType['chunks'] = data - .filter((item) => item[0]) - .map((item) => ({ - q: item[0] || '', - a: item[1] || '', - chunkIndex: 0 - })); - - const item: FileItemType = { - id: getNanoid(32), - file, - rawText: '', - chunks: filterData, - chunkChars: 0, - sourceFolderPath: folderPath, - sourceName: file.name, - sourceSize: formatFileSize(file.size), - icon: getFileIcon(file.name), - errorMsg: - header[0] !== 'index' || header[1] !== 'content' || filterData.length === 0 - ? 
t('core.dataset.import.Csv format error') - : '' - }; - - setSelectFiles((state) => { - const results = [item].concat(state).slice(0, 10); - return results; - }); - } - } - }, - errorToast: t('common.file.Select failed') - }); - return ( setUploading(true)} + onFinishSelect={() => setUploading(false)} /> {/* render files */} - - {selectFiles.map((item) => ( - - - - {item.sourceName} - - - {item.sourceSize} - - {item.errorMsg && ( - - - - )} - { - setSelectFiles((state) => state.filter((file) => file.id !== item.id)); - }} - /> - - ))} - - - -