Update dependencies and add YouTube transcript support

This commit is contained in:
n4ze3m
2024-03-08 00:45:28 +05:30
parent 6d559eda2f
commit 7a72961562
6 changed files with 151 additions and 68 deletions

View File

@@ -38,7 +38,7 @@ export const EmptySidePanel = () => {
}
}, [ollamaInfo])
const { setSelectedModel, selectedModel, chatMode, setChatMode } =
const { setSelectedModel, selectedModel, chatMode, setChatMode, } =
useMessage()
return (

View File

@@ -20,9 +20,12 @@ import { getHtmlOfCurrentTab } from "~libs/get-html"
import { PageAssistHtmlLoader } from "~loader/html"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { createChatWithWebsiteChain, groupMessagesByConversation } from "~chain/chat-with-website"
import {
createChatWithWebsiteChain,
groupMessagesByConversation
} from "~chain/chat-with-website"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { chromeRunTime } from "~libs/runtime"
export type BotResponse = {
bot: {
text: string
@@ -134,11 +137,11 @@ export const useMessage = () => {
url
})
const docs = await loader.load()
const chunkSize = await defaultEmbeddingChunkSize();
const chunkOverlap = await defaultEmbeddingChunkOverlap();
const chunkSize = await defaultEmbeddingChunkSize()
const chunkOverlap = await defaultEmbeddingChunkOverlap()
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap,
chunkOverlap
})
const chunks = await textSplitter.splitDocuments(docs)
@@ -158,64 +161,65 @@ export const useMessage = () => {
}
const chatWithWebsiteMode = async (message: string) => {
const ollamaUrl = await getOllamaURL()
const { html, url } = await getHtmlOfCurrentTab()
const isAlreadyExistEmbedding = keepTrackOfEmbedding[url]
let newMessage: Message[] = [
...messages,
{
isBot: false,
name: "You",
message,
sources: []
},
{
isBot: true,
name: selectedModel,
message: "▋",
sources: []
}
]
const appendingIndex = newMessage.length - 1
setMessages(newMessage)
const embeddingModle = await defaultEmbeddingModelForRag()
const ollamaEmbedding = new OllamaEmbeddings({
model: embeddingModle || selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
const ollamaChat = new ChatOllama({
model: selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
let vectorstore: MemoryVectorStore
if (isAlreadyExistEmbedding) {
vectorstore = isAlreadyExistEmbedding
} else {
vectorstore = await memoryEmbedding(url, html, ollamaEmbedding)
}
const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
await promptForRag()
const sanitizedQuestion = message.trim().replaceAll("\n", " ")
const chain = createChatWithWebsiteChain({
llm: ollamaChat,
question_llm: ollamaChat,
question_template: questionPrompt,
response_template: systemPrompt,
retriever: vectorstore.asRetriever()
})
try {
let isAlreadyExistEmbedding: MemoryVectorStore
const { html, url } = await getHtmlOfCurrentTab()
isAlreadyExistEmbedding = keepTrackOfEmbedding[url]
let newMessage: Message[] = [
...messages,
{
isBot: false,
name: "You",
message,
sources: []
},
{
isBot: true,
name: selectedModel,
message: "▋",
sources: []
}
]
const appendingIndex = newMessage.length - 1
setMessages(newMessage)
const ollamaUrl = await getOllamaURL()
const embeddingModle = await defaultEmbeddingModelForRag()
const ollamaEmbedding = new OllamaEmbeddings({
model: embeddingModle || selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
const ollamaChat = new ChatOllama({
model: selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
let vectorstore: MemoryVectorStore
if (isAlreadyExistEmbedding) {
vectorstore = isAlreadyExistEmbedding
} else {
vectorstore = await memoryEmbedding(url, html, ollamaEmbedding)
}
const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
await promptForRag()
const sanitizedQuestion = message.trim().replaceAll("\n", " ")
const chain = createChatWithWebsiteChain({
llm: ollamaChat,
question_llm: ollamaChat,
question_template: questionPrompt,
response_template: systemPrompt,
retriever: vectorstore.asRetriever()
})
const chunks = await chain.stream({
question: sanitizedQuestion,
chat_history: groupMessagesByConversation(history),
chat_history: groupMessagesByConversation(history)
})
let count = 0
for await (const chunk of chunks) {
@@ -258,7 +262,8 @@ export const useMessage = () => {
{
isBot: true,
name: selectedModel,
message: `Something went wrong. Check out the following logs:
message: `Error in chat with website mode. Check out the following logs:
~~~
${e?.message}
~~~

View File

@@ -1,3 +1,4 @@
const _getHtml = () => {
const url = window.location.href
const html = Array.from(document.querySelectorAll("script")).reduce(
@@ -29,3 +30,4 @@ export const getHtmlOfCurrentTab = async () => {
return result
}

View File

@@ -2,6 +2,20 @@ import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import { compile } from "html-to-text"
import { chromeRunTime } from "~libs/runtime"
import { YtTranscript } from "yt-transcript"
/**
 * Matches URLs that point at a single YouTube video and captures the video ID
 * in group 1.
 *
 * The pattern is anchored at the start of the string so that look-alike hosts
 * (e.g. `notyoutube.com`) and URLs that merely embed a YouTube link inside a
 * query string are rejected. Only video-bearing paths are accepted
 * (`watch?v=`, `shorts/`, `embed/`, `live/`, `v/`, and `youtu.be/<id>`), so
 * non-video pages such as `/feed/...` or `/results?...` do not match.
 *
 * The regex deliberately has no `g`/`y` flag, which keeps `.test()` stateless.
 */
const YT_REGEX =
  /^(?:https?:\/\/)?(?:(?:www|m|music)\.)?(?:youtube\.com\/(?:watch\?(?:[^#]*&)?v=|shorts\/|embed\/|live\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/

/**
 * Reports whether `url` looks like a link to a YouTube video.
 *
 * @param url - The URL to inspect; the scheme and `www.` prefix are optional.
 * @returns `true` when `url` matches {@link YT_REGEX}, otherwise `false`.
 */
const isYoutubeLink = (url: string) => {
  return YT_REGEX.test(url)
}
/**
 * Fetches the transcript for the video at `url` via `YtTranscript`.
 *
 * @param url - URL handed to the `YtTranscript` constructor unchanged.
 * @returns Whatever `YtTranscript#getTranscript()` resolves to. Callers in
 *   this file treat a falsy result as "no transcript" and read `.text` from
 *   each entry — NOTE(review): exact entry shape is defined by the
 *   `yt-transcript` package; confirm there.
 */
const getTranscript = async (url: string) => {
  const transcript = await new YtTranscript({ url }).getTranscript()
  return transcript
}
export interface WebLoaderParams {
html: string
@@ -21,6 +35,29 @@ export class PageAssistHtmlLoader
}
async load(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
const htmlCompiler = compile({
wordwrap: false
})
@@ -30,6 +67,29 @@ export class PageAssistHtmlLoader
}
async loadByURL(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
await chromeRunTime(this.url)
const fetchHTML = await fetch(this.url)
const html = await fetchHTML.text()