Refactor useMessage hook and remove unused code

n4ze3m 2024-03-25 23:17:43 +05:30
parent 2381588e72
commit 3904a74701
5 changed files with 132 additions and 121 deletions


@@ -1,89 +1,22 @@
 import React from "react"
 import { cleanUrl } from "~/libs/clean-url"
 import {
-  defaultEmbeddingChunkOverlap,
-  defaultEmbeddingChunkSize,
   defaultEmbeddingModelForRag,
   getOllamaURL,
   promptForRag,
   systemPromptForNonRag
 } from "~/services/ollama"
-import { useStoreMessage, type ChatHistory, type Message } from "~/store"
+import { useStoreMessage, type Message } from "~/store"
 import { ChatOllama } from "@langchain/community/chat_models/ollama"
-import {
-  HumanMessage,
-  AIMessage,
-  type MessageContent,
-  SystemMessage
-} from "@langchain/core/messages"
+import { HumanMessage, SystemMessage } from "@langchain/core/messages"
 import { getDataFromCurrentTab } from "~/libs/get-html"
-import { PageAssistHtmlLoader } from "~/loader/html"
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
 import {
   createChatWithWebsiteChain,
   groupMessagesByConversation
 } from "~/chain/chat-with-website"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"
-import { chromeRunTime } from "~/libs/runtime"
+import { memoryEmbedding } from "@/utils/memory-embeddings"
-export type BotResponse = {
-  bot: {
-    text: string
-    sourceDocuments: any[]
-  }
-  history: ChatHistory
-  history_id: string
-}
-const generateHistory = (
-  messages: {
-    role: "user" | "assistant" | "system"
-    content: string
-    image?: string
-  }[]
-) => {
-  let history = []
-  for (const message of messages) {
-    if (message.role === "user") {
-      let content: MessageContent = [
-        {
-          type: "text",
-          text: message.content
-        }
-      ]
-      if (message.image) {
-        content = [
-          {
-            type: "image_url",
-            image_url: message.image
-          },
-          {
-            type: "text",
-            text: message.content
-          }
-        ]
-      }
-      history.push(
-        new HumanMessage({
-          content: content
-        })
-      )
-    } else if (message.role === "assistant") {
-      history.push(
-        new AIMessage({
-          content: [
-            {
-              type: "text",
-              text: message.content
-            }
-          ]
-        })
-      )
-    }
-  }
-  return history
-}
 
 export const useMessage = () => {
   const {
@@ -129,47 +62,18 @@ export const useMessage = () => {
     setStreaming(false)
   }
-  const memoryEmbedding = async (
-    url: string,
-    html: string,
-    ollamaEmbedding: OllamaEmbeddings
-  ) => {
-    const loader = new PageAssistHtmlLoader({
-      html,
-      url
-    })
-    const docs = await loader.load()
-    const chunkSize = await defaultEmbeddingChunkSize()
-    const chunkOverlap = await defaultEmbeddingChunkOverlap()
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize,
-      chunkOverlap
-    })
-    const chunks = await textSplitter.splitDocuments(docs)
-    const store = new MemoryVectorStore(ollamaEmbedding)
-    setIsEmbedding(true)
-    await store.addDocuments(chunks)
-    setKeepTrackOfEmbedding({
-      ...keepTrackOfEmbedding,
-      [url]: store
-    })
-    setIsEmbedding(false)
-    return store
-  }
   const chatWithWebsiteMode = async (message: string) => {
     try {
       let isAlreadyExistEmbedding: MemoryVectorStore
-      let embedURL: string, embedHTML: string
+      let embedURL: string, embedHTML: string, embedType: string
+      let embedPDF: { content: string; page: number }[] = []
       if (messages.length === 0) {
-        const { content: html, url, type } = await getDataFromCurrentTab()
+        const { content: html, url, type, pdf } = await getDataFromCurrentTab()
         embedHTML = html
         embedURL = url
+        embedType = type
+        embedPDF = pdf
         setCurrentURL(url)
         isAlreadyExistEmbedding = keepTrackOfEmbedding[currentURL]
       } else {
@@ -212,11 +116,16 @@ export const useMessage = () => {
       if (isAlreadyExistEmbedding) {
         vectorstore = isAlreadyExistEmbedding
       } else {
-        vectorstore = await memoryEmbedding(
-          embedURL,
-          embedHTML,
-          ollamaEmbedding
-        )
+        vectorstore = await memoryEmbedding({
+          html: embedHTML,
+          keepTrackOfEmbedding: keepTrackOfEmbedding,
+          ollamaEmbedding: ollamaEmbedding,
+          pdf: embedPDF,
+          setIsEmbedding: setIsEmbedding,
+          setKeepTrackOfEmbedding: setKeepTrackOfEmbedding,
+          type: embedType,
+          url: embedURL
+        })
       }
       const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
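
Taken together, the hunks above reduce the call site in chatWithWebsiteMode to a cache-or-embed step keyed by the page URL. A minimal sketch of that flow, reusing the variable and setter names from the hunks (an illustration under those assumptions, not the literal hook code):

// Sketch: reuse a per-URL MemoryVectorStore when one already exists, otherwise
// build it through the shared utility, which also toggles the embedding flag
// and registers the new store under the URL.
const getVectorStoreForCurrentPage = async () => {
  const cached = keepTrackOfEmbedding[embedURL]
  if (cached) {
    return cached
  }
  return await memoryEmbedding({
    html: embedHTML,
    keepTrackOfEmbedding,
    ollamaEmbedding,
    pdf: embedPDF,
    setIsEmbedding,
    setKeepTrackOfEmbedding,
    type: embedType,
    url: embedURL
  })
}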


@@ -1,14 +1,11 @@
 import i18n from "i18next";
-import LanguageDetector from "i18next-browser-languagedetector";
 import { initReactI18next } from "react-i18next";
 import { en } from "./lang/en";
 import { ml } from "./lang/ml";
 
 i18n
-  .use(LanguageDetector)
   .use(initReactI18next)
   .init({
-    debug: true,
     resources: {
       en: en,
       ml: ml


@@ -26,10 +26,7 @@ export const getPdf = async (data: ArrayBuffer) => {
 const _getHtml = async () => {
   const url = window.location.href
-  // check the content type
   if (document.contentType === "application/pdf") {
     return { url, content: "", type: "pdf" }
   }
   const html = Array.from(document.querySelectorAll("script")).reduce(
@@ -40,6 +37,7 @@ const _getHtml = async () => {
   )
   return { url, content: html, type: "html" }
 }
+
 export const getDataFromCurrentTab = async () => {
   const result = new Promise((resolve) => {
     chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => {
@@ -66,7 +64,10 @@ export const getDataFromCurrentTab = async () => {
     if (type === "pdf") {
       const res = await fetch(url)
       const data = await res.arrayBuffer()
-      let pdfHtml: string[] = []
+      let pdfHtml: {
+        content: string
+        page: number
+      }[] = []
       const pdf = await getPdf(data)
       for (let i = 1; i <= pdf.numPages; i += 1) {
@@ -79,18 +80,22 @@ export const getDataFromCurrentTab = async () => {
         const text = content?.items.map((item: any) => item.str).join("\n")
           .replace(/\x00/g, "").trim();
-        pdfHtml.push(`<div class="pdf-page">${text}</div>`)
+        pdfHtml.push({
+          content: text,
+          page: i
+        })
       }
       return {
         url,
-        content: pdfHtml.join(""),
-        type: "html"
+        content: "",
+        pdf: pdfHtml,
+        type: "pdf"
       }
     }
-    return { url, content, type }
+    return { url, content, type, pdf: [] }
   }
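
After this change, getDataFromCurrentTab resolves PDF pages as structured objects instead of a single concatenated HTML string. The result shape it now appears to produce, inferred from the hunks above (the diff does not declare this type explicitly, so treat it as an assumption):

// Assumed result shape of getDataFromCurrentTab after this commit.
type CurrentTabData = {
  url: string
  content: string                            // page HTML, or "" when type is "pdf"
  type: "html" | "pdf"
  pdf: { content: string; page: number }[]   // one entry per PDF page, [] for HTML pages
}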

src/loader/pdf.ts (new file, +37 lines)

@@ -0,0 +1,37 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"

export interface WebLoaderParams {
  pdf: { content: string, page: number }[]
  url: string
}

export class PageAssistPDFLoader
  extends BaseDocumentLoader
  implements WebLoaderParams {
  pdf: { content: string, page: number }[]
  url: string

  constructor({ pdf, url }: WebLoaderParams) {
    super()
    this.pdf = pdf
    this.url = url
  }

  async load(): Promise<Document<Record<string, any>>[]> {
    const documents: Document[] = [];
    for (const page of this.pdf) {
      const metadata = { source: this.url, page: page.page }
      documents.push(new Document({ pageContent: page.content, metadata }))
    }
    return [
      new Document({
        pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
        metadata: documents.map((doc) => doc.metadata),
      }),
    ];
  }
}
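
A quick usage sketch for the new loader, with hypothetical values (in this commit it is only constructed through getLoader in the memory-embeddings utility below):

// Hypothetical example, not part of the commit.
const loader = new PageAssistPDFLoader({
  url: "https://example.com/paper.pdf",
  pdf: [
    { content: "Abstract ...", page: 1 },
    { content: "1. Introduction ...", page: 2 }
  ]
})
const docs = await loader.load()
// Note: load() returns a single Document whose pageContent joins all pages with
// "\n\n" and whose metadata is the array of per-page { source, page } objects.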


@@ -0,0 +1,63 @@
import { PageAssistHtmlLoader } from "~/loader/html"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize } from "@/services/ollama"
import { PageAssistPDFLoader } from "@/loader/pdf"

export const getLoader = ({ html, pdf, type, url }: {
  url: string,
  html: string,
  type: string,
  pdf: { content: string, page: number }[]
}) => {
  if (type === "pdf") {
    return new PageAssistPDFLoader({
      pdf,
      url
    })
  } else {
    return new PageAssistHtmlLoader({
      html,
      url
    })
  }
}

export const memoryEmbedding = async ({
  html,
  keepTrackOfEmbedding,
  ollamaEmbedding,
  pdf,
  setIsEmbedding,
  setKeepTrackOfEmbedding,
  type,
  url
}: {
  url: string,
  html: string,
  type: string,
  pdf: { content: string, page: number }[],
  keepTrackOfEmbedding: Record<string, MemoryVectorStore>,
  ollamaEmbedding: OllamaEmbeddings,
  setIsEmbedding: (value: boolean) => void,
  setKeepTrackOfEmbedding: (value: Record<string, MemoryVectorStore>) => void
}) => {
  setIsEmbedding(true)

  const loader = getLoader({ html, pdf, type, url })
  const docs = await loader.load()
  const chunkSize = await defaultEmbeddingChunkSize()
  const chunkOverlap = await defaultEmbeddingChunkOverlap()
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize,
    chunkOverlap
  })
  const chunks = await textSplitter.splitDocuments(docs)
  const store = new MemoryVectorStore(ollamaEmbedding)

  await store.addDocuments(chunks)
  setKeepTrackOfEmbedding({
    ...keepTrackOfEmbedding,
    [url]: store
  })
  setIsEmbedding(false)

  return store
}
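
For context, the store returned by memoryEmbedding is a regular LangChain MemoryVectorStore, so retrieval can be run against it directly. An illustrative sketch, assuming the hook state and variables shown earlier (the hook itself appears to feed the store into createChatWithWebsiteChain rather than querying it inline):

// Illustrative only: embed the current page, then fetch the chunks most
// relevant to the user's message.
const store = await memoryEmbedding({
  url: embedURL,
  html: embedHTML,
  type: embedType,
  pdf: embedPDF,
  keepTrackOfEmbedding,
  ollamaEmbedding,
  setIsEmbedding,
  setKeepTrackOfEmbedding
})
const relevantDocs = await store.similaritySearch(message, 4)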