diff --git a/app/ui/package.json b/app/ui/package.json index 9a92f2bd..086e8463 100644 --- a/app/ui/package.json +++ b/app/ui/package.json @@ -1,7 +1,7 @@ { "name": "app", "private": true, - "version": "1.9.5", + "version": "1.10.0", "type": "module", "scripts": { "dev": "vite", diff --git a/app/ui/src/@types/bot.ts b/app/ui/src/@types/bot.ts index aac427e2..1c6b9f02 100644 --- a/app/ui/src/@types/bot.ts +++ b/app/ui/src/@types/bot.ts @@ -22,6 +22,7 @@ export type BotSettings = { inactivityTimeout: number; autoResetSession: boolean; autoSyncDataSources: boolean; + internetSearchEnabled: boolean; }; chatModel: { label: string; diff --git a/app/ui/src/components/Bot/DS/DsTable.tsx b/app/ui/src/components/Bot/DS/DsTable.tsx index e196164c..48f3ca26 100644 --- a/app/ui/src/components/Bot/DS/DsTable.tsx +++ b/app/ui/src/components/Bot/DS/DsTable.tsx @@ -18,6 +18,7 @@ import { YoutubeIcon } from "../../Icons/YoutubeIcon"; import { ApiIcon } from "../../Icons/ApiIcon"; export const DsTable = ({ data, + searchNode }: { data: { id: string; @@ -25,6 +26,7 @@ export const DsTable = ({ content: string; status: string; }[]; + searchNode: React.ReactNode; }) => { const statusColor = (status: string) => { switch (status.toLowerCase()) { @@ -134,6 +136,8 @@ export const DsTable = ({
+ {searchNode} +
{data.length === 0 && ( diff --git a/app/ui/src/components/Bot/Playground/Message.tsx b/app/ui/src/components/Bot/Playground/Message.tsx index 554e5cfc..1d4c6723 100644 --- a/app/ui/src/components/Bot/Playground/Message.tsx +++ b/app/ui/src/components/Bot/Playground/Message.tsx @@ -10,6 +10,7 @@ import React from "react"; import { removeUUID } from "../../../utils/filename"; import { useSpeechSynthesis } from "../../../hooks/useSpeechSynthesis"; import { useElevenLabsTTS } from "../../../hooks/useElevenLabsTTS"; +import { Collapse } from "antd"; type Props = Message & { onSourceClick(source: any): void; @@ -55,24 +56,41 @@ export const PlaygroundMessage = (props: Props) => {
- {props.isBot && ( -
- {props?.sources?.map((source, index) => ( - - ))} -
+ {props.isBot && props?.sources && props?.sources?.length > 0 && ( + + Bot sources +
+ ), + children: ( +
+ {props?.sources?.map((source, index) => ( + + ))} +
+ ), + }, + ]} + /> )}
diff --git a/app/ui/src/components/Bot/Settings/SettingsBody.tsx b/app/ui/src/components/Bot/Settings/SettingsBody.tsx index 2f2def2a..108db2ac 100644 --- a/app/ui/src/components/Bot/Settings/SettingsBody.tsx +++ b/app/ui/src/components/Bot/Settings/SettingsBody.tsx @@ -169,6 +169,7 @@ export const SettingsBody: React.FC = ({ autoResetSession: data.autoResetSession, inactivityTimeout: data.inactivityTimeout, autoSyncDataSources: data.autoSyncDataSources, + internetSearchEnabled: data.internetSearchEnabled, }} form={form} requiredMark={false} @@ -226,7 +227,24 @@ export const SettingsBody: React.FC = ({ options={chatModel} /> - + +

+ If you change the embedding method, make sure to + re-fetch the data source or choose a model with the same + dimensions +

+ + } + > + +
= ({ /> - (); const navigate = useNavigate(); const [page, setPage] = React.useState(1); const [limit, setLimit] = React.useState(10); + const [search, setSearch] = React.useState(undefined); + const [searchValue, setSearchValue] = React.useState( + undefined + ); const { data: botData, status } = useQuery( - ["getBotDS", param.id, page, limit], + ["getBotDS", param.id, page, limit, searchValue], async () => { const response = await api.get( - `/bot/${param.id}/source?page=${page}&limit=${limit}` + `/bot/${param.id}/source?page=${page}&limit=${limit}${ + searchValue && searchValue.trim().length > 0 + ? `&search=${searchValue}` + : "" + }` ); return response.data as { data: { @@ -47,7 +55,20 @@ export default function BotDSRoot() { {status === "loading" && } {status === "success" && (
- + + setSearchValue(value)} + onChange={(e) => setSearch(e.target.value)} + placeholder="Search" + style={{ width: 300 }} + /> +
+ } + /> {botData.total >= 10 && (
{ + const tsquerySpecialChars = /[()|&:*!]/g + const search = searchTerm + .trim() + .replace(tsquerySpecialChars, ' ') + .split(/\s+/) + .join(' | ') + return search +} export const getBotByIdEmbeddingsHandler = async ( request: FastifyRequest, reply: FastifyReply @@ -14,7 +22,7 @@ export const getBotByIdEmbeddingsHandler = async ( const bot = await prisma.bot.findFirst({ where: { id, - user_id: request.user?.is_admin ? undefined : request.user?.user_id + user_id: request.user?.is_admin ? undefined : request.user?.user_id, }, }); @@ -45,11 +53,12 @@ export const getDatasourceByBotId = async ( const id = request.params.id; const { limit, page } = request.query; const skip = (page - 1) * limit; + const search = request.query.search; const bot = await prisma.bot.findFirst({ where: { id, - user_id: request.user?.is_admin ? undefined : request.user?.user_id + user_id: request.user?.user_id, }, }); @@ -58,35 +67,74 @@ export const getDatasourceByBotId = async ( message: "Bot not found", }); } - - const sources = await prisma.botSource.findMany({ - where: { - botId: id, - type: { - notIn: ["crawl", "sitemap", "zip"], + const [sources, totalCount] = await Promise.all([ + prisma.botSource.findMany({ + where: { + botId: id, + type: { + notIn: ["crawl", "sitemap", "zip"], + }, + OR: search + ? [ + { + content: { + search: preprocessSearchTerms(search), + }, + }, + { + document: { + some: { + content: { + search: preprocessSearchTerms(search), + }, + }, + }, + }, + ] + : undefined, }, - }, - orderBy: { - createdAt: 'desc' - }, - skip, - take: limit, - }); - - const totalCount = await prisma.botSource.count({ - where: { - botId: id, - type: { - notIn: ["crawl", "sitemap", "zip"], + orderBy: { + createdAt: "desc", }, - }, - }); + skip, + take: limit, + include: { + document: true, + }, + }), + prisma.botSource.count({ + where: { + botId: id, + type: { + notIn: ["crawl", "sitemap", "zip"], + }, + OR: search + ? [ + { + content: { + search: preprocessSearchTerms(search), + }, + }, + { + document: { + some: { + content: { + search: preprocessSearchTerms(search), + }, + }, + }, + }, + ] + : undefined, + }, + }), + ]); return { data: sources, total: totalCount, next: page * limit < totalCount ? page + 1 : null, - prev: page > 1 ? page - 1 : null + prev: page > 1 ? page - 1 : null, }; }; @@ -100,7 +148,7 @@ export const getBotByIdHandler = async ( const bot = await prisma.bot.findFirst({ where: { id, - user_id: request.user?.is_admin ? undefined : request.user?.user_id + user_id: request.user?.is_admin ? undefined : request.user?.user_id, }, }); @@ -121,7 +169,7 @@ export const getAllBotsHandler = async ( const prisma = request.server.prisma; const bots = await prisma.bot.findMany({ where: { - user_id: request.user?.user_id + user_id: request.user?.user_id, }, orderBy: { createdAt: "desc", @@ -201,7 +249,7 @@ export const getCreateBotConfigHandler = async ( embeddingModel, defaultChatModel: settings?.defaultChatModel, defaultEmbeddingModel: settings?.defaultEmbeddingModel, - fileUploadSizeLimit: settings?.fileUploadSizeLimit + fileUploadSizeLimit: settings?.fileUploadSizeLimit, }; }; @@ -215,7 +263,7 @@ export const getBotByIdSettingsHandler = async ( const bot = await prisma.bot.findFirst({ where: { id, - user_id: request.user?.is_admin ? undefined : request.user?.user_id + user_id: request.user?.is_admin ? undefined : request.user?.user_id, }, }); if (!bot) { @@ -291,7 +339,7 @@ export const isBotReadyHandler = async ( const bot = await prisma.bot.findFirst({ where: { id, - user_id: request.user?.is_admin ? undefined : request.user?.user_id + user_id: request.user?.is_admin ? undefined : request.user?.user_id, }, }); diff --git a/server/src/handlers/api/v1/bot/bot/types.ts b/server/src/handlers/api/v1/bot/bot/types.ts index e4a1eca6..6b013dd0 100644 --- a/server/src/handlers/api/v1/bot/bot/types.ts +++ b/server/src/handlers/api/v1/bot/bot/types.ts @@ -150,5 +150,6 @@ export type GetDatasourceByBotId = { Querystring: { limit: number; page: number; + search?: string } } \ No newline at end of file diff --git a/server/src/integration/telegram.ts b/server/src/integration/telegram.ts index 43009005..7a2af602 100644 --- a/server/src/integration/telegram.ts +++ b/server/src/integration/telegram.ts @@ -9,6 +9,7 @@ import { convertTextToAudio } from "./handlers/utils/audio-to-text"; import { FileFlavor, hydrateFiles } from "@grammyjs/files"; import * as fs from "fs/promises"; import { convertOggToWave } from "../utils/ffmpeg"; +import { telegramFormat } from "../utils/telegram-format"; type DialoqBaseContext = FileFlavor; export default class TelegramBot { static get clients() { @@ -73,7 +74,14 @@ export default class TelegramBot { user_id ); - return await ctx.reply(message); + if (process.env.DB_TELEGEAM_PARSE_MODE === "normal") { + return await ctx.reply(message); + } + + return await ctx.reply(telegramFormat(message), + { + parse_mode: "HTML", + }); }); bot.on("message:voice", async (ctx) => { @@ -100,7 +108,15 @@ export default class TelegramBot { user_id ); - return await ctx.reply(message); + + if (process.env.DB_TELEGEAM_PARSE_MODE === "normal") { + return await ctx.reply(message); + } + + return await ctx.reply(telegramFormat(message), + { + parse_mode: "HTML", + }); } catch (error) { console.log(error); return await ctx.reply("Opps! Something went wrong"); diff --git a/server/src/internet/index.ts b/server/src/internet/index.ts new file mode 100644 index 00000000..bd08b33b --- /dev/null +++ b/server/src/internet/index.ts @@ -0,0 +1,166 @@ +import * as cheerio from "cheerio"; +import { Embeddings } from "@langchain/core/embeddings"; +import { Document } from "@langchain/core/documents"; +import * as ml_distance from "ml-distance" + +const SERACH_PROVIDER = process.env.DB_SEARCH_PROVIDER || "default"; +const TOTAL_RESULTS_LIMIT = process.env.DB_TOTAL_RESULTS_LIMIT ? parseInt(process.env.DB_TOTAL_RESULTS_LIMIT) : 5; + +export const duckduckgoSearchUnOffical = async (query: string) => { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const htmlString = await fetch( + "https://html.duckduckgo.com/html/?q=" + query, + { + signal: abortController.signal, + } + ) + .then((response) => response.text()) + .catch(); + + const $ = cheerio.load(htmlString); + + const searchResults = Array.from($("div.results_links_deep")).map( + (result) => { + const title = $(result).find("a.result__a").text(); + const link = $(result) + .find("a.result__snippet") + .attr("href") + .replace("//duckduckgo.com/l/?uddg=", "") + .replace(/&rut=.*/, ""); + + const content = $(result).find("a.result__snippet").text(); + const decodedLink = decodeURIComponent(link); + return { title, link: decodedLink, content }; + } + ); + + return searchResults; +}; + +export const googleSearchUnOffical = async (query: string) => { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const htmlString = await fetch( + "https://www.google.com/search?hl=en&q=" + query, + { + signal: abortController.signal, + } + ) + .then((response) => response.text()) + .catch(); + + const $ = cheerio.load(htmlString); + + const searchResults = $("div.g").map((_, result) => { + const title = $(result).find("h3").text(); + const link = $(result).find("a").attr("href"); + const content = $(result).find("span").map((_, span) => $(span).text()).get().join(" "); + return { title, link, content }; + }).get(); + + return searchResults; +}; + +export const searxngSearch = async (query: string) => { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const searxngUrl = process.env.DB_SEARXNG_URL; + + if (!searxngUrl) { + throw new Error("SEARXNG_URL is not set"); + } + const url = new URL(`${searxngUrl}/search`); + + url.searchParams.append("q", query); + url.searchParams.append("format", "json"); + const response = await fetch(url.toString(), { + method: "GET", + headers: { + Accept: "application/json", + }, + }); + + if (!response.ok) { + const err = await response.json(); + console.error(`Error: ${err}`); + throw new Error(`Error: ${response.status}`); + } + + const data = (await response.json()) as { + results: { + title: string; + url: string; + content: string; + }[]; + }; + + return data.results.map((result) => ({ + title: result.title, + link: result.url, + content: result.content, + })); +}; + +const searchProviders = { + duckduckgo: duckduckgoSearchUnOffical, + google: googleSearchUnOffical, + searxng: searxngSearch, + default: + process.env.IS_RAILWAY != "true" + ? searxngSearch + : duckduckgoSearchUnOffical, +}; + +export const searchInternet = async (embedding: Embeddings, { query }: { query: string }) => { + + if (process.env.DISABLE_INTERNET_SEARCH == "true") { + return []; + } + + const searchProvider = searchProviders[SERACH_PROVIDER]; + if (!searchProvider) { + throw new Error(`Search provider ${SERACH_PROVIDER} not found`); + } + const datat = await searchProvider(query); + + const data = datat.filter((doc) => doc?.content.length > 0); + + const results = data.slice(0, TOTAL_RESULTS_LIMIT) + + const [docEmbeddings, queryEmbedding] = await Promise.all([ + embedding.embedDocuments(results.map((doc) => doc.content)), + embedding.embedQuery(query), + ]); + + + const similarity = docEmbeddings.map((docEmbedding, i) => { + const sim = ml_distance.similarity.cosine(queryEmbedding, docEmbedding) + + return { + index: i, + similarity: sim + } + }) + + const sortedDocs = similarity + .sort((a, b) => b.similarity - a.similarity) + .filter((sim) => sim.similarity > 0.5) + .slice(0, 15) + .map((sim) => { + return [ + { + pageContent: results[sim.index]?.content || "", + metadata: { + source: results[sim.index]?.link || "", + } + } as Document, + sim.similarity + ] + }) + + return sortedDocs; +}; diff --git a/server/src/utils/hybrid.ts b/server/src/utils/hybrid.ts index 28956b0c..5dc028df 100644 --- a/server/src/utils/hybrid.ts +++ b/server/src/utils/hybrid.ts @@ -3,6 +3,7 @@ import { PrismaClient } from "@prisma/client"; import { Embeddings } from "langchain/embeddings/base"; import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers"; import { CallbackManagerForRetrieverRun, Callbacks } from "langchain/callbacks"; +import { searchInternet } from "../internet"; const prisma = new PrismaClient(); export interface DialoqbaseLibArgs extends BaseRetrieverInput { botId: string; @@ -47,7 +48,11 @@ export class DialoqbaseHybridRetrival extends BaseRetriever { const vector = `[${embeddedQuery.join(",")}]`; const bot_id = this.botId; - + const botInfo = await prisma.bot.findFirst({ + where: { + id: bot_id, + }, + }); const data = await prisma.$queryRaw` SELECT * FROM "similarity_search_v2"(query_embedding := ${vector}::vector, botId := ${bot_id}::text,match_count := ${k}::int) `; @@ -62,8 +67,17 @@ export class DialoqbaseHybridRetrival extends BaseRetriever { resp.similarity * 10, resp.id, ]); + let internetSearchResults = []; + if (botInfo.internetSearchEnabled) { + internetSearchResults = await searchInternet(this.embeddings, { + query: query, + }); + } - return result; + const combinedResults = [...result, ...internetSearchResults]; + combinedResults.sort((a, b) => b[1] - a[1]); + const topResults = combinedResults.slice(0, k); + return topResults; } catch (e) { console.log(e); return []; diff --git a/server/src/utils/store.ts b/server/src/utils/store.ts index 33bce9c3..84766a89 100644 --- a/server/src/utils/store.ts +++ b/server/src/utils/store.ts @@ -2,6 +2,8 @@ import { Document } from "@langchain/core/documents"; import { PrismaClient } from "@prisma/client"; import { Embeddings } from "@langchain/core/embeddings"; import { VectorStore } from "@langchain/core/vectorstores"; +import { Callbacks } from "langchain/callbacks"; +import { searchInternet } from "../internet"; const prisma = new PrismaClient(); export interface DialoqbaseLibArgs { botId: string; @@ -92,7 +94,8 @@ export class DialoqbaseVectorStore extends VectorStore { async similaritySearchVectorWithScore( query: number[], k: number, - filter?: this["FilterType"] | undefined + filter?: this["FilterType"] | undefined, + originalQuery?: string | undefined ): Promise<[Document>, number][]> { if (!query) { return []; @@ -114,10 +117,8 @@ export class DialoqbaseVectorStore extends VectorStore { const data = await prisma.$queryRaw` SELECT * FROM "similarity_search_v2"(query_embedding := ${vector}::vector, botId := ${bot_id}::text,match_count := ${match_count}::int) `; - - const result: [Document, number][] = ( - data as SearchEmbeddingsResponse[] - ).map((resp) => [ + + const result = (data as SearchEmbeddingsResponse[]).map((resp) => [ new Document({ metadata: resp.metadata, pageContent: resp.content, @@ -125,15 +126,42 @@ export class DialoqbaseVectorStore extends VectorStore { resp.similarity, ]); + let internetSearchResults = []; + if (botInfo.internetSearchEnabled) { + internetSearchResults = await searchInternet(this.embeddings, { + query: originalQuery, + }); + } + + const combinedResults = [...result, ...internetSearchResults]; + combinedResults.sort((a, b) => b[1] - a[1]); + + const topResults = combinedResults.slice(0, k); + if (semanticSearchSimilarityScore === "none") { - return result; + return topResults; } - const valueInFloat = parseFloat(semanticSearchSimilarityScore); - const filteredResult = result.filter( - ([, similarity]) => similarity >= valueInFloat + const similarityThreshold = parseFloat(semanticSearchSimilarityScore); + const filteredResults = topResults.filter( + ([, similarity]) => similarity >= similarityThreshold + ); + return filteredResults; + } + + async similaritySearch( + query: string, + k = 4, + filter: this["FilterType"] | undefined = undefined, + _callbacks: Callbacks | undefined = undefined // implement passing to embedQuery later + ): Promise { + const results = await this.similaritySearchVectorWithScore( + await this.embeddings.embedQuery(query), + k, + filter, + query ); - return filteredResult; + return results.map((result) => result[0]); } _vectorstoreType(): string { diff --git a/server/src/utils/telegram-format.ts b/server/src/utils/telegram-format.ts new file mode 100644 index 00000000..0e082425 --- /dev/null +++ b/server/src/utils/telegram-format.ts @@ -0,0 +1,131 @@ +// this code is a typescript conversion of the original python code from the repo: https://github.com/Latand/formatter-chatgpt-telegram + +function convertHtmlChars(text: string): string { + text = text.replace(/&/g, "&"); + text = text.replace(//g, ">"); + return text; +} + +function splitByTag(outText: string, mdTag: string, htmlTag: string): string { + const tagPattern = new RegExp( + `(?$1`); +} + +function escapeRegExp(string: string): string { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function ensureClosingDelimiters(text: string): string { + if ((text.match(/```/g) || []).length % 2 !== 0) { + text += "```"; + } + if ((text.match(/`/g) || []).length % 2 !== 0) { + text += "`"; + } + return text; +} + +function extractAndConvertCodeBlocks(text: string): [string, Record] { + text = ensureClosingDelimiters(text); + const placeholders: string[] = []; + const codeBlocks: Record = {}; + + const replacer = (match: RegExpMatchArray): [string, string] => { + const language = match[1] || ""; + const codeContent = match[3]; + const placeholder = `CODEBLOCKPLACEHOLDER${placeholders.length}`; + placeholders.push(placeholder); + const htmlCodeBlock = language + ? `
${codeContent}
` + : `
${codeContent}
`; + return [placeholder, htmlCodeBlock]; + }; + + let modifiedText = text; + const regex = /```(\w*)?(\n)?(.*?)```/gs; + let match: RegExpExecArray | null; + + while ((match = regex.exec(text)) !== null) { + const [placeholder, htmlCodeBlock] = replacer(match); + codeBlocks[placeholder] = htmlCodeBlock; + modifiedText = modifiedText.replace(match[0], placeholder); + } + + return [modifiedText, codeBlocks]; +} + +function reinsertCodeBlocks(text: string, codeBlocks: Record): string { + for (const [placeholder, htmlCodeBlock] of Object.entries(codeBlocks)) { + text = text.replace(placeholder, htmlCodeBlock); + } + return text; +} + +function combineBlockquotes(text: string): string { + const lines = text.split("\n"); + const combinedLines: string[] = []; + let blockquoteLines: string[] = []; + let inBlockquote = false; + + for (const line of lines) { + if (line.startsWith(">")) { + inBlockquote = true; + blockquoteLines.push(line.slice(1).trim()); + } else { + if (inBlockquote) { + combinedLines.push( + `
${blockquoteLines.join("\n")}
` + ); + blockquoteLines = []; + inBlockquote = false; + } + combinedLines.push(line); + } + } + + if (inBlockquote) { + combinedLines.push( + `
${blockquoteLines.join("\n")}
` + ); + } + + return combinedLines.join("\n"); +} + +function removeBlockquoteEscaping(output: string): string { + return output + .replace(/<blockquote>/g, "
") + .replace(/<\/blockquote>/g, "
"); +} + +export function telegramFormat(text: string): string { + text = combineBlockquotes(text); + text = convertHtmlChars(text); + + let [output, codeBlocks] = extractAndConvertCodeBlocks(text); + + output = output.replace(//g, ">"); + output = output.replace(/`(.*?)`/g, "$1"); + output = output.replace(/\*\*\*(.*?)\*\*\*/g, "$1"); + output = output.replace(/\_\_\_(.*?)\_\_\_/g, "$1"); + + output = splitByTag(output, "**", "b"); + output = splitByTag(output, "__", "u"); + output = splitByTag(output, "_", "i"); + output = splitByTag(output, "*", "i"); + output = splitByTag(output, "~~", "s"); + + output = output.replace(/【[^】]+】/g, ""); + output = output.replace(/!?\\[(.*?)\\]\\((.*?)\\)/g, '$1'); + output = output.replace(/^\s*#+ (.+)/gm, "$1"); + output = output.replace(/^(\s*)[\-\*] (.+)/gm, "$1• $2"); + + output = reinsertCodeBlocks(output, codeBlocks); + output = removeBlockquoteEscaping(output); + + return output; +} diff --git a/server/yarn.lock b/server/yarn.lock index 26437ebc..a86a3b33 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -5552,7 +5552,7 @@ ml-distance-euclidean@^2.0.0: resolved "https://registry.yarnpkg.com/ml-distance-euclidean/-/ml-distance-euclidean-2.0.0.tgz#3a668d236649d1b8fec96380b9435c6f42c9a817" integrity sha512-yC9/2o8QF0A3m/0IXqCTXCzz2pNEzvmcE/9HFKOZGnTjatvBbsn4lWYJkxENkA4Ug2fnYl7PXQxnPi21sgMy/Q== -ml-distance@^4.0.0: +ml-distance@^4.0.0, ml-distance@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/ml-distance/-/ml-distance-4.0.1.tgz#4741d17a1735888c5388823762271dfe604bd019" integrity sha512-feZ5ziXs01zhyFUUUeZV5hwc0f5JW0Sh0ckU1koZe/wdVkJdGxcP06KNQuF0WBTj8FttQUzcvQcpcrOp/XrlEw==