diff --git a/src/hooks/useMessage.tsx b/src/hooks/useMessage.tsx index bd7622a..c501185 100644 --- a/src/hooks/useMessage.tsx +++ b/src/hooks/useMessage.tsx @@ -1,89 +1,22 @@ import React from "react" import { cleanUrl } from "~/libs/clean-url" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL, promptForRag, systemPromptForNonRag } from "~/services/ollama" -import { useStoreMessage, type ChatHistory, type Message } from "~/store" +import { useStoreMessage, type Message } from "~/store" import { ChatOllama } from "@langchain/community/chat_models/ollama" -import { - HumanMessage, - AIMessage, - type MessageContent, - SystemMessage -} from "@langchain/core/messages" +import { HumanMessage, SystemMessage } from "@langchain/core/messages" import { getDataFromCurrentTab } from "~/libs/get-html" -import { PageAssistHtmlLoader } from "~/loader/html" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" import { createChatWithWebsiteChain, groupMessagesByConversation } from "~/chain/chat-with-website" import { MemoryVectorStore } from "langchain/vectorstores/memory" -import { chromeRunTime } from "~/libs/runtime" -export type BotResponse = { - bot: { - text: string - sourceDocuments: any[] - } - history: ChatHistory - history_id: string -} - -const generateHistory = ( - messages: { - role: "user" | "assistant" | "system" - content: string - image?: string - }[] -) => { - let history = [] - for (const message of messages) { - if (message.role === "user") { - let content: MessageContent = [ - { - type: "text", - text: message.content - } - ] - - if (message.image) { - content = [ - { - type: "image_url", - image_url: message.image - }, - { - type: "text", - text: message.content - } - ] - } - history.push( - new HumanMessage({ - content: content - }) - ) - } else if (message.role === "assistant") { - history.push( - new 
AIMessage({ - content: [ - { - type: "text", - text: message.content - } - ] - }) - ) - } - } - return history -} +import { memoryEmbedding } from "@/utils/memory-embeddings" export const useMessage = () => { const { @@ -129,47 +62,18 @@ export const useMessage = () => { setStreaming(false) } - const memoryEmbedding = async ( - url: string, - html: string, - ollamaEmbedding: OllamaEmbeddings - ) => { - const loader = new PageAssistHtmlLoader({ - html, - url - }) - const docs = await loader.load() - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) - - const chunks = await textSplitter.splitDocuments(docs) - - const store = new MemoryVectorStore(ollamaEmbedding) - - setIsEmbedding(true) - - await store.addDocuments(chunks) - setKeepTrackOfEmbedding({ - ...keepTrackOfEmbedding, - [url]: store - }) - setIsEmbedding(false) - - return store - } - const chatWithWebsiteMode = async (message: string) => { try { let isAlreadyExistEmbedding: MemoryVectorStore - let embedURL: string, embedHTML: string + let embedURL: string, embedHTML: string, embedType: string + let embedPDF: { content: string; page: number }[] = [] + if (messages.length === 0) { - const { content: html, url, type } = await getDataFromCurrentTab() + const { content: html, url, type, pdf } = await getDataFromCurrentTab() embedHTML = html embedURL = url + embedType = type + embedPDF = pdf setCurrentURL(url) isAlreadyExistEmbedding = keepTrackOfEmbedding[currentURL] } else { @@ -212,11 +116,16 @@ export const useMessage = () => { if (isAlreadyExistEmbedding) { vectorstore = isAlreadyExistEmbedding } else { - vectorstore = await memoryEmbedding( - embedURL, - embedHTML, - ollamaEmbedding - ) + vectorstore = await memoryEmbedding({ + html: embedHTML, + keepTrackOfEmbedding: keepTrackOfEmbedding, + ollamaEmbedding: ollamaEmbedding, + pdf: embedPDF, + 
setIsEmbedding: setIsEmbedding, + setKeepTrackOfEmbedding: setKeepTrackOfEmbedding, + type: embedType, + url: embedURL + }) } const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } = diff --git a/src/i18n/index.ts b/src/i18n/index.ts index 626d1e7..2e7306f 100644 --- a/src/i18n/index.ts +++ b/src/i18n/index.ts @@ -1,14 +1,11 @@ import i18n from "i18next"; -import LanguageDetector from "i18next-browser-languagedetector"; import { initReactI18next } from "react-i18next"; import { en } from "./lang/en"; import { ml } from "./lang/ml"; i18n - .use(LanguageDetector) .use(initReactI18next) .init({ - debug: true, resources: { en: en, ml: ml diff --git a/src/libs/get-html.ts b/src/libs/get-html.ts index 97bb750..465e480 100644 --- a/src/libs/get-html.ts +++ b/src/libs/get-html.ts @@ -26,10 +26,7 @@ export const getPdf = async (data: ArrayBuffer) => { const _getHtml = async () => { const url = window.location.href - // check the content type if (document.contentType === "application/pdf") { - - return { url, content: "", type: "pdf" } } const html = Array.from(document.querySelectorAll("script")).reduce( @@ -40,6 +37,7 @@ const _getHtml = async () => { ) return { url, content: html, type: "html" } } + export const getDataFromCurrentTab = async () => { const result = new Promise((resolve) => { chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => { @@ -66,7 +64,10 @@ export const getDataFromCurrentTab = async () => { if (type === "pdf") { const res = await fetch(url) const data = await res.arrayBuffer() - let pdfHtml: string[] = [] + let pdfHtml: { + content: string + page: number + }[] = [] const pdf = await getPdf(data) for (let i = 1; i <= pdf.numPages; i += 1) { @@ -79,18 +80,22 @@ export const getDataFromCurrentTab = async () => { const text = content?.items.map((item: any) => item.str).join("\n") .replace(/\x00/g, "").trim(); - pdfHtml.push(`
${text}
`) + pdfHtml.push({ + content: text, + page: i + }) } return { url, - content: pdfHtml.join(""), - type: "html" + content: "", + pdf: pdfHtml, + type: "pdf" } } - return { url, content, type } + return { url, content, type, pdf: [] } } diff --git a/src/loader/pdf.ts b/src/loader/pdf.ts new file mode 100644 index 0000000..097460b --- /dev/null +++ b/src/loader/pdf.ts @@ -0,0 +1,37 @@ +import { BaseDocumentLoader } from "langchain/document_loaders/base" +import { Document } from "@langchain/core/documents" +export interface WebLoaderParams { + pdf: { content: string, page: number }[] + url: string +} + +export class PageAssistPDFLoader + extends BaseDocumentLoader + implements WebLoaderParams { + pdf: { content: string, page: number }[] + url: string + + constructor({ pdf, url }: WebLoaderParams) { + super() + this.pdf = pdf + this.url = url + } + + async load(): Promise<Document<Record<string, any>>[]> { + const documents: Document[] = []; + + for (const page of this.pdf) { + const metadata = { source: this.url, page: page.page } + documents.push(new Document({ pageContent: page.content, metadata })) + } + + return [ + new Document({ + pageContent: documents.map((doc) => doc.pageContent).join("\n\n"), + metadata: documents.map((doc) => doc.metadata), + }), + ]; + + + } +} diff --git a/src/utils/memory-embeddings.ts b/src/utils/memory-embeddings.ts new file mode 100644 index 0000000..e3572d1 --- /dev/null +++ b/src/utils/memory-embeddings.ts @@ -0,0 +1,63 @@ +import { PageAssistHtmlLoader } from "~/loader/html" +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" +import { MemoryVectorStore } from "langchain/vectorstores/memory" +import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" +import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize } from "@/services/ollama" +import { PageAssistPDFLoader } from "@/loader/pdf" + + +export const getLoader = ({ html, pdf, type, url }: { + url: string, + html: string, + type: string, + pdf: { content: string, 
page: number }[]
}) => { + if (type === "pdf") { + return new PageAssistPDFLoader({ + pdf, + url + }) + } else { + return new PageAssistHtmlLoader({ + html, + url + }) + } +} + +export const memoryEmbedding = async ( + { html, + keepTrackOfEmbedding, ollamaEmbedding, pdf, setIsEmbedding, setKeepTrackOfEmbedding, type, url }: { + url: string, + html: string, + type: string, + pdf: { content: string, page: number }[], + keepTrackOfEmbedding: Record<string, MemoryVectorStore>, + ollamaEmbedding: OllamaEmbeddings, + setIsEmbedding: (value: boolean) => void, + setKeepTrackOfEmbedding: (value: Record<string, MemoryVectorStore>) => void + } +) => { + setIsEmbedding(true) + + const loader = getLoader({ html, pdf, type, url }) + const docs = await loader.load() + const chunkSize = await defaultEmbeddingChunkSize() + const chunkOverlap = await defaultEmbeddingChunkOverlap() + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize, + chunkOverlap + }) + + const chunks = await textSplitter.splitDocuments(docs) + + const store = new MemoryVectorStore(ollamaEmbedding) + + await store.addDocuments(chunks) + setKeepTrackOfEmbedding({ + ...keepTrackOfEmbedding, + [url]: store + }) + setIsEmbedding(false) + return store +} \ No newline at end of file