Skip to content

Commit

Permalink
Update package versions and add support for YouTube transcripts
Browse files Browse the repository at this point in the history
  • Loading branch information
n4ze3m committed Mar 7, 2024
1 parent 2fe7073 commit 5f09d52
Show file tree
Hide file tree
Showing 11 changed files with 315 additions and 47 deletions.
2 changes: 1 addition & 1 deletion app/ui/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "app",
"private": true,
"version": "1.7.5",
"version": "1.7.6",
"type": "module",
"scripts": {
"dev": "vite",
Expand Down
111 changes: 98 additions & 13 deletions app/ui/src/components/Common/BotForm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import { YoutubeIcon } from "../Icons/YoutubeIcon";
import { ApiIcon } from "../Icons/ApiIcon";
import { SitemapIcon } from "../Icons/SitemapIcon";
import { useCreateConfig } from "../../hooks/useCreateConfig";
import { useQuery } from "@tanstack/react-query";
import api from "../../services/api";

type Props = {
createBot: (values: any) => void;
Expand All @@ -48,6 +50,9 @@ export const BotForm = ({
}: Props) => {
const { data: botConfig, status: botConfigStatus } = useCreateConfig();

const youtubeMode = Form.useWatch(["options", "youtube_mode"], form);
const url = Form.useWatch(["content"], form);

const [availableSources] = React.useState([
{
id: 1,
Expand Down Expand Up @@ -331,21 +336,36 @@ export const BotForm = ({
},
]}
>
<Input type="url" placeholder="Enter the youtube URL" />
<Input
type="url"
size="large"
placeholder="Enter the youtube URL"
/>
</Form.Item>

<p className="text-sm text-gray-500">
If you find any issues, please report them on{" "}
<a
href="https://github.com/n4ze3m/dialoqbase/issues/new?title=Github%20issue&labels=bug"
target="_blank"
rel="noreferrer"
className="font-medium text-indigo-600 hover:text-indigo-500"
>
GitHub
</a>
.
</p>
<Form.Item
name={["options", "youtube_mode"]}
label="Youtube mode"
rules={[
{
required: true,
message: "Please select a youtube mode",
},
]}
>
<Select
options={[
{
label: "Transcribe using Whisper",
value: "whisper",
},
{
label: "Youtube Transcript",
value: "transcript",
},
]}
/>
</Form.Item>
</>
),
},
Expand Down Expand Up @@ -426,6 +446,31 @@ export const BotForm = ({
const [selectedSource, _setSelectedSource] = React.useState<any>(
showEmbeddingAndModels ? null : availableSources[0]
);
// Loads the caption tracks available for the entered YouTube URL so the
// "Select language" dropdown can be populated. The query only runs when the
// YouTube source is selected, transcript mode is chosen and a URL is present.
const { data: transcripts, isLoading: isFetchingTranscript } = useQuery({
  queryKey: [
    "fetchYoutubeTranscript",
    url,
    youtubeMode,
    selectedSource?.value,
  ],
  queryFn: async () => {
    if (Boolean(url) && youtubeMode === "transcript") {
      // A YouTube URL carries its own query string ("?v=...&t=..."). Without
      // percent-encoding, everything after its first "&" would be parsed as a
      // separate parameter of this request and dropped from `url`.
      const res = await api.get(
        "/yt/transcript?url=" + encodeURIComponent(url)
      );
      return res.data["data"] as {
        name: {
          simpleText: string;
        };
        languageCode: string;
      }[];
    }

    // Nothing to look up: return an empty option list.
    return [];
  },
  enabled:
    Boolean(url) &&
    youtubeMode === "transcript" &&
    selectedSource?.value === "youtube",
});

return (
<>
Expand All @@ -446,6 +491,7 @@ export const BotForm = ({
method: "get",
headers: "{}",
body: "{}",
youtube_mode: "whisper",
},
}}
>
Expand Down Expand Up @@ -533,6 +579,45 @@ export const BotForm = ({
</Row>
)}

{selectedSource &&
selectedSource.value === "youtube" &&
youtubeMode === "transcript" && (
<>
<Form.Item
name={["options", "language_code"]}
label="Select language"
rules={[
{
required: true,
message: "Please select a language",
},
]}
>
<Select
loading={isFetchingTranscript}
placeholder="Select language"
options={transcripts?.map((transcript) => ({
label: transcript.name.simpleText,
value: transcript.languageCode,
}))}
/>
</Form.Item>

<p className="text-sm text-gray-500">
If you find any issues, please report them on{" "}
<a
href="https://github.com/n4ze3m/dialoqbase/issues/new?title=Github%20issue&labels=bug"
target="_blank"
rel="noreferrer"
className="font-medium text-indigo-600 hover:text-indigo-500"
>
GitHub
</a>
.
</p>
</>
)}

<Form.Item hidden={!showEmbeddingAndModels} noStyle>
<Divider />
</Form.Item>
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "dialoqbase",
"version": "1.7.5",
"version": "1.7.6",
"description": "Create chatbots with ease",
"scripts": {
"ui:dev": "pnpm run --filter ui dev",
Expand Down
1 change: 1 addition & 0 deletions server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"ts-node": "^10.9.1",
"unique-names-generator": "^4.7.1",
"wavefile": "^11.0.0",
"yt-transcript": "^0.0.2",
"ytdl-core": "^4.11.5"
},
"devDependencies": {
Expand Down
22 changes: 22 additions & 0 deletions server/src/handlers/api/v1/yt/get.handler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { FastifyRequest, FastifyReply } from "fastify";
import { GetYoutubeTranscript } from "./types";
import { YtTranscript } from "yt-transcript"

/**
 * Lists the transcripts that `yt-transcript` reports for a video.
 *
 * @param url - Video URL handed to the `YtTranscript` constructor.
 * @returns Whatever `listAllTranscripts()` resolves to.
 */
const getYoutubeTranscript = async (url: string) =>
  new YtTranscript({ url }).listAllTranscripts();

/**
 * Fastify handler that returns the transcripts available for the video URL
 * given in the `url` query parameter.
 *
 * @param request - Request whose query string carries `url`.
 * @param reply - Fastify reply object (not used; the result is returned).
 * @returns An object of the form `{ data }` holding the transcript listing.
 */
export const getYoutubeTranscriptHandler = async (
  request: FastifyRequest<GetYoutubeTranscript>,
  reply: FastifyReply
) => {
  const videoUrl = request.query.url;
  const data = (await getYoutubeTranscript(videoUrl)) as any;

  return { data };
};
1 change: 1 addition & 0 deletions server/src/handlers/api/v1/yt/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from "./get.handler"
5 changes: 5 additions & 0 deletions server/src/handlers/api/v1/yt/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/** Fastify route generics for the YouTube transcript lookup request. */
export type GetYoutubeTranscript = {
  /** Query-string parameters of the request. */
  Querystring: { url: string };
};
51 changes: 51 additions & 0 deletions server/src/loader/youtube-transcript.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base";
import { Document } from "langchain/document";
import { YtTranscript } from "yt-transcript";

/** Constructor options for the YouTube transcript document loader. */
export interface YoutubeTranscriptParams {
  /** Video URL passed to `YtTranscript`. */
  url: string;
  /** Language code passed to `getTranscript` to pick the transcript. */
  language_code: string;
}

/**
 * Document loader that turns the transcript of a YouTube video, fetched
 * through `yt-transcript`, into a single document.
 */
export class DialoqbaseYoutubeTranscript
  extends BaseDocumentLoader
  implements YoutubeTranscriptParams
{
  language_code: string;
  url: string;

  constructor(params: YoutubeTranscriptParams) {
    super();
    const { url, language_code } = params;
    this.url = url;
    this.language_code = language_code;
  }

  /**
   * Fetches the transcript for `language_code` and joins its segments.
   *
   * @returns A one-element array. `pageContent` is every segment's text
   *   followed by a space; `metadata` holds the source URL and the raw
   *   segments under `audio.chunks`.
   * @throws Error when `getTranscript` yields a falsy result.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const segments = await new YtTranscript({ url: this.url }).getTranscript(
      this.language_code
    );

    if (!segments) {
      throw new Error("No script found");
    }

    // Each segment contributes its text plus one trailing space.
    const pageContent = segments.map((segment) => segment.text + " ").join("");

    const metadata = {
      source: this.url,
      audio: { chunks: segments },
    };

    return [{ metadata, pageContent }];
  }
}
114 changes: 82 additions & 32 deletions server/src/queue/controllers/youtube.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,44 +5,94 @@ import { DialoqbaseVectorStore } from "../../utils/store";
import { embeddings } from "../../utils/embeddings";
import { DialoqbaseYoutube } from "../../loader/youtube";
import { PrismaClient } from "@prisma/client";
import { DialoqbaseYoutubeTranscript } from "../../loader/youtube-transcript";

/**
 * Queue job for a YouTube source: loads the video's text, splits it into
 * chunks and stores the embedded chunks in the vector store.
 *
 * The text comes from `DialoqbaseYoutubeTranscript` when
 * `options.youtube_mode` is "transcript"; any other value (including a
 * missing one) uses the `DialoqbaseYoutube` loader, logged as whisper mode.
 *
 * @param source - Queued source; `content` is the video URL, `embedding` the
 *   embedding model id, `options` carries `youtube_mode` / `language_code`.
 * @param prisma - Prisma client used to look up the embedding model.
 * @throws Error when no visible, non-deleted embedding model matches
 *   `source.embedding`.
 */
export const youtubeQueueController = async (
  source: QSource,
  prisma: PrismaClient
) => {
  // NOTE(review): guards against `options` being null/undefined (e.g. sources
  // created without options) so destructuring cannot throw; such sources
  // fall through to whisper mode. Confirm against the QSource definition.
  const { language_code, youtube_mode } = (source.options ?? {}) as {
    language_code: string;
    youtube_mode: "whisper" | "transcript";
  };

  const useTranscript = youtube_mode === "transcript";
  console.log(
    useTranscript ? "Using Youtube Transcript Mode" : "Using Youtube Whisper Mode"
  );

  // Only the loader differs between the two modes; the rest is shared.
  const loader = useTranscript
    ? new DialoqbaseYoutubeTranscript({
        url: source.content!,
        language_code,
      })
    : new DialoqbaseYoutube({
        url: source.content!,
      });
  const docs = await loader.load();

  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });
  const chunks = await textSplitter.splitDocuments(docs);

  const embeddingInfo = await prisma.dialoqbaseModels.findFirst({
    where: {
      model_id: source.embedding,
      hide: false,
      deleted: false,
    },
  });

  if (!embeddingInfo) {
    throw new Error("Embedding not found. Please verify the embedding id");
  }

  await DialoqbaseVectorStore.fromDocuments(
    chunks,
    embeddings(
      embeddingInfo.model_provider!.toLowerCase(),
      embeddingInfo.model_id,
      embeddingInfo?.config
    ),
    {
      botId: source.botId,
      sourceId: source.id,
    }
  );
};
Loading

0 comments on commit 5f09d52

Please sign in to comment.