Update dependencies and add YouTube transcript support

This commit is contained in:
n4ze3m 2024-03-08 00:45:28 +05:30
parent 6d559eda2f
commit 7a72961562
6 changed files with 151 additions and 68 deletions

View File

@ -1,7 +1,7 @@
{
"name": "pageassist",
"displayName": "Page Assist - A Web UI for Local AI Models",
"version": "1.0.7",
"version": "1.0.8",
"description": "Use your locally running AI models to assist you in your web browsing.",
"author": "n4ze3m",
"scripts": {
@ -26,7 +26,7 @@
"dayjs": "^1.11.10",
"html-to-text": "^9.0.5",
"langchain": "^0.1.9",
"lucide-react": "^0.340.0",
"lucide-react": "^0.350.0",
"plasmo": "0.84.1",
"property-information": "^6.4.1",
"react": "18.2.0",
@ -38,6 +38,7 @@
"rehype-mathjax": "4.0.3",
"remark-gfm": "3.0.1",
"remark-math": "5.1.1",
"yt-transcript": "^0.0.2",
"zustand": "^4.5.0"
},
"devDependencies": {

View File

@ -38,7 +38,7 @@ export const EmptySidePanel = () => {
}
}, [ollamaInfo])
const { setSelectedModel, selectedModel, chatMode, setChatMode } =
const { setSelectedModel, selectedModel, chatMode, setChatMode, } =
useMessage()
return (

View File

@ -20,9 +20,12 @@ import { getHtmlOfCurrentTab } from "~libs/get-html"
import { PageAssistHtmlLoader } from "~loader/html"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { createChatWithWebsiteChain, groupMessagesByConversation } from "~chain/chat-with-website"
import {
createChatWithWebsiteChain,
groupMessagesByConversation
} from "~chain/chat-with-website"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { chromeRunTime } from "~libs/runtime"
export type BotResponse = {
bot: {
text: string
@ -134,11 +137,11 @@ export const useMessage = () => {
url
})
const docs = await loader.load()
const chunkSize = await defaultEmbeddingChunkSize();
const chunkOverlap = await defaultEmbeddingChunkOverlap();
const chunkSize = await defaultEmbeddingChunkSize()
const chunkOverlap = await defaultEmbeddingChunkOverlap()
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap,
chunkOverlap
})
const chunks = await textSplitter.splitDocuments(docs)
@ -158,64 +161,65 @@ export const useMessage = () => {
}
const chatWithWebsiteMode = async (message: string) => {
const ollamaUrl = await getOllamaURL()
const { html, url } = await getHtmlOfCurrentTab()
const isAlreadyExistEmbedding = keepTrackOfEmbedding[url]
let newMessage: Message[] = [
...messages,
{
isBot: false,
name: "You",
message,
sources: []
},
{
isBot: true,
name: selectedModel,
message: "▋",
sources: []
}
]
const appendingIndex = newMessage.length - 1
setMessages(newMessage)
const embeddingModle = await defaultEmbeddingModelForRag()
const ollamaEmbedding = new OllamaEmbeddings({
model: embeddingModle || selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
const ollamaChat = new ChatOllama({
model: selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
let vectorstore: MemoryVectorStore
if (isAlreadyExistEmbedding) {
vectorstore = isAlreadyExistEmbedding
} else {
vectorstore = await memoryEmbedding(url, html, ollamaEmbedding)
}
const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
await promptForRag()
const sanitizedQuestion = message.trim().replaceAll("\n", " ")
const chain = createChatWithWebsiteChain({
llm: ollamaChat,
question_llm: ollamaChat,
question_template: questionPrompt,
response_template: systemPrompt,
retriever: vectorstore.asRetriever()
})
try {
let isAlreadyExistEmbedding: MemoryVectorStore
const { html, url } = await getHtmlOfCurrentTab()
isAlreadyExistEmbedding = keepTrackOfEmbedding[url]
let newMessage: Message[] = [
...messages,
{
isBot: false,
name: "You",
message,
sources: []
},
{
isBot: true,
name: selectedModel,
message: "▋",
sources: []
}
]
const appendingIndex = newMessage.length - 1
setMessages(newMessage)
const ollamaUrl = await getOllamaURL()
const embeddingModle = await defaultEmbeddingModelForRag()
const ollamaEmbedding = new OllamaEmbeddings({
model: embeddingModle || selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
const ollamaChat = new ChatOllama({
model: selectedModel,
baseUrl: cleanUrl(ollamaUrl)
})
let vectorstore: MemoryVectorStore
if (isAlreadyExistEmbedding) {
vectorstore = isAlreadyExistEmbedding
} else {
vectorstore = await memoryEmbedding(url, html, ollamaEmbedding)
}
const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
await promptForRag()
const sanitizedQuestion = message.trim().replaceAll("\n", " ")
const chain = createChatWithWebsiteChain({
llm: ollamaChat,
question_llm: ollamaChat,
question_template: questionPrompt,
response_template: systemPrompt,
retriever: vectorstore.asRetriever()
})
const chunks = await chain.stream({
question: sanitizedQuestion,
chat_history: groupMessagesByConversation(history),
chat_history: groupMessagesByConversation(history)
})
let count = 0
for await (const chunk of chunks) {
@ -258,7 +262,8 @@ export const useMessage = () => {
{
isBot: true,
name: selectedModel,
message: `Something went wrong. Check out the following logs:
message: `Error in chat with website mode. Check out the following logs:
~~~
${e?.message}
~~~

View File

@ -1,3 +1,4 @@
const _getHtml = () => {
const url = window.location.href
const html = Array.from(document.querySelectorAll("script")).reduce(
@ -29,3 +30,4 @@ export const getHtmlOfCurrentTab = async () => {
return result
}

View File

@ -2,6 +2,20 @@ import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import { compile } from "html-to-text"
import { chromeRunTime } from "~libs/runtime"
import { YtTranscript } from "yt-transcript"
/**
 * Matches URLs that point at a single YouTube video and captures the video id
 * in group 1.
 *
 * The pattern is anchored at the start of the string and only accepts known
 * video URL shapes (`watch?v=`, `embed/`, `shorts/`, `live/`, `v/` on
 * youtube.com, or a `youtu.be/` short link). An unanchored "anything after
 * youtube.com/" pattern would also match search results, feeds, playlists and
 * unrelated URLs that merely contain `youtube.com/...`.
 */
const YT_REGEX =
  /^(?:https?:\/\/)?(?:(?:www|m|music)\.)?(?:youtube\.com\/(?:watch\?(?:[^#\s]*&)?v=|(?:embed|shorts|live|v)\/)|youtu\.be\/)([a-zA-Z0-9_-]+)/

/**
 * Tells whether `url` points at a YouTube video page (as opposed to any other
 * page, including non-video pages on youtube.com).
 *
 * @param url - Absolute URL, with or without the scheme.
 * @returns `true` when the URL has a recognised video shape with a video id.
 */
const isYoutubeLink = (url: string): boolean => {
  return YT_REGEX.test(url)
}
/**
 * Fetches the transcript for the video at `url` via the `yt-transcript`
 * package.
 *
 * @param url - URL of the video, passed to `YtTranscript` unchanged.
 * @returns Whatever `YtTranscript#getTranscript` resolves to; callers in this
 *   file treat a falsy result as "no transcript available".
 */
const getTranscript = async (url: string) => {
  const client = new YtTranscript({ url })
  const transcript = await client.getTranscript()
  return transcript
}
export interface WebLoaderParams {
html: string
@ -21,6 +35,29 @@ export class PageAssistHtmlLoader
}
async load(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
const htmlCompiler = compile({
wordwrap: false
})
@ -30,6 +67,29 @@ export class PageAssistHtmlLoader
}
async loadByURL(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
await chromeRunTime(this.url)
const fetchHTML = await fetch(this.url)
const html = await fetchHTML.text()

View File

@ -4967,10 +4967,10 @@ lru-cache@^6.0.0:
resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-10.2.0.tgz#0bd445ca57363465900f4d1f9bd8db343a4d95c3"
integrity sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==
lucide-react@^0.340.0:
version "0.340.0"
resolved "https://registry.yarnpkg.com/lucide-react/-/lucide-react-0.340.0.tgz#67a6fac6a5e257f2036dffae0dd94d6ccb28ce8e"
integrity sha512-mWzYhbyy2d+qKuKHh+GWElPwa+kIquTnKbmSLGWOuZy+bjfZCkYD8DQWVFlqI4mQwc4HNxcqcOvtQ7ZS2PwURg==
lucide-react@^0.350.0:
version "0.350.0"
resolved "https://registry.yarnpkg.com/lucide-react/-/lucide-react-0.350.0.tgz#78b45342f4daff4535290e37b1ea7eb0961a3dab"
integrity sha512-5IZVKsxxG8Nn81gpsz4XLNgCAXkppCh0Y0P0GLO39h5iVD2WEaB9of6cPkLtzys1GuSfxJxmwsDh487y7LAf/g==
magic-string@^0.30.0:
version "0.30.6"
@ -7772,6 +7772,13 @@ ws@^8.11.0:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.16.0.tgz#d1cd774f36fbc07165066a60e40323eab6446fd4"
integrity sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==
xml-js@^1.6.11:
version "1.6.11"
resolved "https://registry.yarnpkg.com/xml-js/-/xml-js-1.6.11.tgz#927d2f6947f7f1c19a316dd8eea3614e8b18f8e9"
integrity sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g==
dependencies:
sax "^1.2.4"
xml-name-validator@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835"
@ -7817,6 +7824,14 @@ yaml@^2.2.1, yaml@^2.3.4:
resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.3.4.tgz#53fc1d514be80aabf386dc6001eb29bf3b7523b2"
integrity sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==
yt-transcript@^0.0.2:
version "0.0.2"
resolved "https://registry.yarnpkg.com/yt-transcript/-/yt-transcript-0.0.2.tgz#1c54aede89bb8a03bbca3ba58520dbbd9c828571"
integrity sha512-+cNRqW6tSQNDkQDVrWNT6hc6X2TnaQLvUJIepzn9r7XdEvPtUDkfsyhptW5+j0EPIEpnlsKyA/epCUrE4QKn2g==
dependencies:
axios "^1.6.7"
xml-js "^1.6.11"
zod-to-json-schema@^3.22.3:
version "3.22.4"
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.22.4.tgz#f8cc691f6043e9084375e85fb1f76ebafe253d70"