diff --git a/src/hooks/useMessage.tsx b/src/hooks/useMessage.tsx
index bd7622a..c501185 100644
--- a/src/hooks/useMessage.tsx
+++ b/src/hooks/useMessage.tsx
@@ -1,89 +1,22 @@
 import React from "react"
 import { cleanUrl } from "~/libs/clean-url"
 import {
-  defaultEmbeddingChunkOverlap,
-  defaultEmbeddingChunkSize,
   defaultEmbeddingModelForRag,
   getOllamaURL,
   promptForRag,
   systemPromptForNonRag
 } from "~/services/ollama"
-import { useStoreMessage, type ChatHistory, type Message } from "~/store"
+import { useStoreMessage, type Message } from "~/store"
 import { ChatOllama } from "@langchain/community/chat_models/ollama"
-import {
-  HumanMessage,
-  AIMessage,
-  type MessageContent,
-  SystemMessage
-} from "@langchain/core/messages"
+import { HumanMessage, SystemMessage } from "@langchain/core/messages"
 import { getDataFromCurrentTab } from "~/libs/get-html"
-import { PageAssistHtmlLoader } from "~/loader/html"
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
 import {
   createChatWithWebsiteChain,
   groupMessagesByConversation
 } from "~/chain/chat-with-website"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"
-import { chromeRunTime } from "~/libs/runtime"
-export type BotResponse = {
-  bot: {
-    text: string
-    sourceDocuments: any[]
-  }
-  history: ChatHistory
-  history_id: string
-}
-
-const generateHistory = (
-  messages: {
-    role: "user" | "assistant" | "system"
-    content: string
-    image?: string
-  }[]
-) => {
-  let history = []
-  for (const message of messages) {
-    if (message.role === "user") {
-      let content: MessageContent = [
-        {
-          type: "text",
-          text: message.content
-        }
-      ]
-
-      if (message.image) {
-        content = [
-          {
-            type: "image_url",
-            image_url: message.image
-          },
-          {
-            type: "text",
-            text: message.content
-          }
-        ]
-      }
-      history.push(
-        new HumanMessage({
-          content: content
-        })
-      )
-    } else if (message.role === "assistant") {
-      history.push(
-        new AIMessage({
-          content: [
-            {
-              type: "text",
-              text: message.content
-            }
-          ]
-        })
-      )
-    }
-  }
-  return history
-}
+import { memoryEmbedding } from "@/utils/memory-embeddings"
 
 export const useMessage = () => {
   const {
@@ -129,47 +62,18 @@ export const useMessage = () => {
     setStreaming(false)
   }
 
-  const memoryEmbedding = async (
-    url: string,
-    html: string,
-    ollamaEmbedding: OllamaEmbeddings
-  ) => {
-    const loader = new PageAssistHtmlLoader({
-      html,
-      url
-    })
-    const docs = await loader.load()
-    const chunkSize = await defaultEmbeddingChunkSize()
-    const chunkOverlap = await defaultEmbeddingChunkOverlap()
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize,
-      chunkOverlap
-    })
-
-    const chunks = await textSplitter.splitDocuments(docs)
-
-    const store = new MemoryVectorStore(ollamaEmbedding)
-
-    setIsEmbedding(true)
-
-    await store.addDocuments(chunks)
-    setKeepTrackOfEmbedding({
-      ...keepTrackOfEmbedding,
-      [url]: store
-    })
-    setIsEmbedding(false)
-
-    return store
-  }
-
   const chatWithWebsiteMode = async (message: string) => {
     try {
       let isAlreadyExistEmbedding: MemoryVectorStore
-      let embedURL: string, embedHTML: string
+      let embedURL: string, embedHTML: string, embedType: string
+      let embedPDF: { content: string; page: number }[] = []
+
       if (messages.length === 0) {
-        const { content: html, url, type } = await getDataFromCurrentTab()
+        const { content: html, url, type, pdf } = await getDataFromCurrentTab()
         embedHTML = html
         embedURL = url
+        embedType = type
+        embedPDF = pdf
         setCurrentURL(url)
         isAlreadyExistEmbedding = keepTrackOfEmbedding[currentURL]
       } else {
@@ -212,11 +116,16 @@ export const useMessage = () => {
       if (isAlreadyExistEmbedding) {
         vectorstore = isAlreadyExistEmbedding
       } else {
-        vectorstore = await memoryEmbedding(
-          embedURL,
-          embedHTML,
-          ollamaEmbedding
-        )
+        vectorstore = await memoryEmbedding({
+          html: embedHTML,
+          keepTrackOfEmbedding: keepTrackOfEmbedding,
+          ollamaEmbedding: ollamaEmbedding,
+          pdf: embedPDF,
+          setIsEmbedding: setIsEmbedding,
+          setKeepTrackOfEmbedding: setKeepTrackOfEmbedding,
+          type: embedType,
+          url: embedURL
+        })
       }
 
       const { ragPrompt: systemPrompt, ragQuestionPrompt: questionPrompt } =
diff --git a/src/i18n/index.ts b/src/i18n/index.ts
index 626d1e7..2e7306f 100644
--- a/src/i18n/index.ts
+++ b/src/i18n/index.ts
@@ -1,14 +1,11 @@
 import i18n from "i18next";
-import LanguageDetector from "i18next-browser-languagedetector";
 import { initReactI18next } from "react-i18next";
 import { en } from "./lang/en";
 import { ml } from "./lang/ml";
 
 i18n
-  .use(LanguageDetector)
   .use(initReactI18next)
   .init({
-    debug: true,
     resources: {
       en: en,
       ml: ml
diff --git a/src/libs/get-html.ts b/src/libs/get-html.ts
index 97bb750..465e480 100644
--- a/src/libs/get-html.ts
+++ b/src/libs/get-html.ts
@@ -26,10 +26,7 @@ export const getPdf = async (data: ArrayBuffer) => {
 const _getHtml = async () => {
   const url = window.location.href
 
-  // check the content type
   if (document.contentType === "application/pdf") {
-
-
     return { url, content: "", type: "pdf" }
   }
   const html = Array.from(document.querySelectorAll("script")).reduce(
@@ -40,6 +37,7 @@ const _getHtml = async () => {
   )
   return { url, content: html, type: "html" }
 }
+
 export const getDataFromCurrentTab = async () => {
   const result = new Promise((resolve) => {
     chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => {
@@ -66,7 +64,10 @@ export const getDataFromCurrentTab = async () => {
     if (type === "pdf") {
       const res = await fetch(url)
       const data = await res.arrayBuffer()
-      let pdfHtml: string[] = []
+      let pdfHtml: {
+        content: string
+        page: number
+      }[] = []
       const pdf = await getPdf(data)
 
       for (let i = 1; i <= pdf.numPages; i += 1) {
@@ -79,18 +80,22 @@ export const getDataFromCurrentTab = async () => {
         const text = content?.items.map((item: any) => item.str).join("\n")
          .replace(/\x00/g, "").trim();
-        pdfHtml.push(`
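
Note on the refactor: the inline `memoryEmbedding` helper removed from `useMessage.tsx` now lives in `@/utils/memory-embeddings`, but the new module is not part of this patch. The sketch below is a hypothetical reconstruction, inferred from the removed implementation and the options-object call site above; in particular, the PDF branch (wrapping each `{ content, page }` entry in a LangChain `Document`) and the exact argument types are assumptions, not code from the repository.

```typescript
// Hypothetical src/utils/memory-embeddings.ts, reconstructed from the removed
// inline helper and the new call site; the PDF handling is an assumption.
import { Document } from "@langchain/core/documents"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { PageAssistHtmlLoader } from "~/loader/html"
import {
  defaultEmbeddingChunkOverlap,
  defaultEmbeddingChunkSize
} from "~/services/ollama"

type MemoryEmbeddingArgs = {
  url: string
  html: string
  type: string
  pdf: { content: string; page: number }[]
  ollamaEmbedding: OllamaEmbeddings
  keepTrackOfEmbedding: Record<string, MemoryVectorStore>
  setIsEmbedding: (value: boolean) => void
  setKeepTrackOfEmbedding: (value: Record<string, MemoryVectorStore>) => void
}

export const memoryEmbedding = async ({
  url,
  html,
  type,
  pdf,
  ollamaEmbedding,
  keepTrackOfEmbedding,
  setIsEmbedding,
  setKeepTrackOfEmbedding
}: MemoryEmbeddingArgs) => {
  setIsEmbedding(true)

  // HTML pages go through the existing loader; PDF pages are wrapped into
  // Documents directly (assumed behaviour).
  const docs =
    type === "pdf"
      ? pdf.map(
          (page) =>
            new Document({
              pageContent: page.content,
              metadata: { url, page: page.page }
            })
        )
      : await new PageAssistHtmlLoader({ html, url }).load()

  const chunkSize = await defaultEmbeddingChunkSize()
  const chunkOverlap = await defaultEmbeddingChunkOverlap()
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize,
    chunkOverlap
  })
  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  // Cache the store per URL so repeat questions on the same page skip re-embedding.
  setKeepTrackOfEmbedding({
    ...keepTrackOfEmbedding,
    [url]: store
  })
  setIsEmbedding(false)

  return store
}
```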
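
With this change, `getDataFromCurrentTab` also resolves a `pdf` field alongside `url`, `content`, and `type` (see the new destructuring in `chatWithWebsiteMode`). A rough shape of the resolved value, inferred from that call site and the new `pdfHtml` type; the union for `type` is an assumption based on the two branches visible in `get-html.ts`:

```typescript
// Assumed return shape of getDataFromCurrentTab() after this patch.
type TabData = {
  url: string
  content: string
  type: "html" | "pdf" // assumption: only these two branches appear in _getHtml
  pdf: { content: string; page: number }[]
}
```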