From ae510619f069ef2530bbba752a240d501dc2d047 Mon Sep 17 00:00:00 2001 From: Sean Hatfield Date: Tue, 17 Dec 2024 04:16:20 +0800 Subject: [PATCH] Purge cached docs and remove docs from all workspaces on vectorDB/embedder changes (#2819) * wip remove all docs clear vector db on embedder/vector db change * purge all cached docs and remove docs from workspaces on vectordb/embedder change * lint * remove unneeded console log * remove reset vector stores endpoint and move to server side updateENV with postUpdate check * reset embed module * remove unused import * simplify deletion process rescoped document deletion to be more general for speed, everything needs to be reset anyway fixed issue where unembedded docs not in any workspaces, but cached, were not removed * add back missing readme file update warning text modals --------- Co-authored-by: timothycarambat --- .../src/components/ChangeWarning/index.jsx | 74 +++++++++++-------- .../EmbeddingPreference/index.jsx | 2 +- .../GeneralSettings/VectorDatabase/index.jsx | 2 +- server/models/vectors.js | 34 ++++++--- server/utils/files/index.js | 11 +++ server/utils/helpers/index.js | 5 +- server/utils/helpers/updateENV.js | 23 ++++++ .../utils/vectorStore/resetAllVectorStores.js | 48 ++++++++++++ 8 files changed, 153 insertions(+), 46 deletions(-) create mode 100644 server/utils/vectorStore/resetAllVectorStores.js diff --git a/frontend/src/components/ChangeWarning/index.jsx b/frontend/src/components/ChangeWarning/index.jsx index 42b211baf4..2e0950a080 100644 --- a/frontend/src/components/ChangeWarning/index.jsx +++ b/frontend/src/components/ChangeWarning/index.jsx @@ -1,4 +1,4 @@ -import { Warning } from "@phosphor-icons/react"; +import { Warning, X } from "@phosphor-icons/react"; export default function ChangeWarningModal({ warningText = "", @@ -6,41 +6,55 @@ export default function ChangeWarningModal({ onConfirm, }) { return ( -
-
-
-
- -

Warning

-
+
+
+
+ +

+ WARNING - This action is irreversible +

-
-

- {warningText} + +

+
+
+

+ {warningText.split("\\n").map((line, index) => ( + + {line} +
+
+ ))}

Are you sure you want to proceed?

- -
- - -
+
+
+ +
); diff --git a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx index 893948472d..77853e0a99 100644 --- a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx +++ b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx @@ -361,7 +361,7 @@ export default function GeneralEmbeddingPreference() { )} diff --git a/frontend/src/pages/GeneralSettings/VectorDatabase/index.jsx b/frontend/src/pages/GeneralSettings/VectorDatabase/index.jsx index f7246de542..11e70d8fd7 100644 --- a/frontend/src/pages/GeneralSettings/VectorDatabase/index.jsx +++ b/frontend/src/pages/GeneralSettings/VectorDatabase/index.jsx @@ -308,7 +308,7 @@ export default function GeneralVectorDatabase() { )} diff --git a/server/models/vectors.js b/server/models/vectors.js index f6b79964a0..3653303da2 100644 --- a/server/models/vectors.js +++ b/server/models/vectors.js @@ -25,6 +25,19 @@ const DocumentVectors = { } }, + where: async function (clause = {}, limit) { + try { + const results = await prisma.document_vectors.findMany({ + where: clause, + take: limit || undefined, + }); + return results; + } catch (error) { + console.error("Where query failed", error); + return []; + } + }, + deleteForWorkspace: async function (workspaceId) { const documents = await Document.forWorkspace(workspaceId); const docIds = [...new Set(documents.map((doc) => doc.docId))]; @@ -40,27 +53,24 @@ const DocumentVectors = { } }, - where: async function (clause = {}, limit) { + deleteIds: async function (ids = []) { try { - const results = await prisma.document_vectors.findMany({ - where: clause, - take: limit || undefined, + await prisma.document_vectors.deleteMany({ + where: { id: { in: ids } }, }); - return results; + return true; } catch (error) { - console.error("Where query failed", error); - return []; + console.error("Delete IDs failed", error); + return false; } }, - deleteIds: async function (ids = []) { + delete: async function (clause = {}) { try { - await prisma.document_vectors.deleteMany({ - where: { id: { in: ids } }, - }); + await prisma.document_vectors.deleteMany({ where: clause }); return true; } catch (error) { - console.error("Delete IDs failed", error); + console.error("Delete failed", error); return false; } }, diff --git a/server/utils/files/index.js b/server/utils/files/index.js index 598884f999..625d8582cd 100644 --- a/server/utils/files/index.js +++ b/server/utils/files/index.js @@ -281,6 +281,16 @@ async function getWatchedDocumentFilenames(filenames = []) { }, {}); } +/** + * Purges the entire vector-cache folder and recreates it. + * @returns {void} + */ +function purgeEntireVectorCache() { + fs.rmSync(vectorCachePath, { recursive: true, force: true }); + fs.mkdirSync(vectorCachePath); + return; +} + module.exports = { findDocumentInDocuments, cachedVectorInformation, @@ -293,4 +303,5 @@ module.exports = { isWithin, documentsPath, hasVectorCachedFiles, + purgeEntireVectorCache, }; diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js index e599078b6a..748e4fb1b1 100644 --- a/server/utils/helpers/index.js +++ b/server/utils/helpers/index.js @@ -52,10 +52,11 @@ /** * Gets the systems current vector database provider. + * @param {('pinecone' | 'chroma' | 'lancedb' | 'weaviate' | 'qdrant' | 'milvus' | 'zilliz' | 'astra') | null} getExactly - If provided, this will return an explit provider. * @returns { BaseVectorDatabaseProvider} */ -function getVectorDbClass() { - const vectorSelection = process.env.VECTOR_DB || "lancedb"; +function getVectorDbClass(getExactly = null) { + const vectorSelection = getExactly ?? process.env.VECTOR_DB ?? "lancedb"; switch (vectorSelection) { case "pinecone": const { Pinecone } = require("../vectorDbProviders/pinecone"); diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index 3cfc13e6e1..948703dca2 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -1,3 +1,5 @@ +const { resetAllVectorStores } = require("../vectorStore/resetAllVectorStores"); + const KEY_MAPPING = { LLMProvider: { envKey: "LLM_PROVIDER", @@ -248,6 +250,7 @@ const KEY_MAPPING = { EmbeddingEngine: { envKey: "EMBEDDING_ENGINE", checks: [supportedEmbeddingModel], + postUpdate: [handleVectorStoreReset], }, EmbeddingBasePath: { envKey: "EMBEDDING_BASE_PATH", @@ -256,6 +259,7 @@ const KEY_MAPPING = { EmbeddingModelPref: { envKey: "EMBEDDING_MODEL_PREF", checks: [isNotEmpty], + postUpdate: [handleVectorStoreReset], }, EmbeddingModelMaxChunkLength: { envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH", @@ -276,6 +280,7 @@ const KEY_MAPPING = { VectorDB: { envKey: "VECTOR_DB", checks: [isNotEmpty, supportedVectorDB], + postUpdate: [handleVectorStoreReset], }, // Chroma Options @@ -878,6 +883,24 @@ function noRestrictedChars(input = "") { : null; } +async function handleVectorStoreReset(key, prevValue, nextValue) { + if (prevValue === nextValue) return; + if (key === "VectorDB") { + console.log( + `Vector configuration changed from ${prevValue} to ${nextValue} - resetting ${prevValue} namespaces` + ); + return await resetAllVectorStores({ vectorDbKey: prevValue }); + } + + if (key === "EmbeddingEngine" || key === "EmbeddingModelPref") { + console.log( + `${key} changed from ${prevValue} to ${nextValue} - resetting ${process.env.VECTOR_DB} namespaces` + ); + return await resetAllVectorStores({ vectorDbKey: process.env.VECTOR_DB }); + } + return false; +} + // This will force update .env variables which for any which reason were not able to be parsed or // read from an ENV file as this seems to be a complicating step for many so allowing people to write // to the process will at least alleviate that issue. It does not perform comprehensive validity checks or sanity checks diff --git a/server/utils/vectorStore/resetAllVectorStores.js b/server/utils/vectorStore/resetAllVectorStores.js new file mode 100644 index 0000000000..3bb9a5ec4c --- /dev/null +++ b/server/utils/vectorStore/resetAllVectorStores.js @@ -0,0 +1,48 @@ +const { Workspace } = require("../../models/workspace"); +const { Document } = require("../../models/documents"); +const { DocumentVectors } = require("../../models/vectors"); +const { EventLogs } = require("../../models/eventLogs"); +const { purgeEntireVectorCache } = require("../files"); +const { getVectorDbClass } = require("../helpers"); + +/** + * Resets all vector database and associated content: + * - Purges the entire vector-cache folder. + * - Deletes all document vectors from the database. + * - Deletes all documents from the database. + * - Deletes all vector db namespaces for each workspace. + * - Logs an event indicating the reset. + * @param {string} vectorDbKey - The _previous_ vector database provider name that we will be resetting. + * @returns {Promise} - True if successful, false otherwise. + */ +async function resetAllVectorStores({ vectorDbKey }) { + try { + const workspaces = await Workspace.where(); + purgeEntireVectorCache(); // Purges the entire vector-cache folder. + await DocumentVectors.delete(); // Deletes all document vectors from the database. + await Document.delete(); // Deletes all documents from the database. + await EventLogs.logEvent("workspace_vectors_reset", { + reason: "System vector configuration changed", + }); + + console.log( + "Resetting anythingllm managed vector namespaces for", + vectorDbKey + ); + const VectorDb = getVectorDbClass(vectorDbKey); + for (const workspace of workspaces) { + try { + await VectorDb["delete-namespace"]({ namespace: workspace.slug }); + } catch (e) { + console.error(e.message); + } + } + + return true; + } catch (error) { + console.error("Failed to reset vector stores:", error); + return false; + } +} + +module.exports = { resetAllVectorStores };