diff --git a/bun.lockb b/bun.lockb
index 1212713..c2f8f9b 100644
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/package.json b/package.json
index 5d64161..44daa54 100644
--- a/package.json
+++ b/package.json
@@ -21,6 +21,7 @@
     "@langchain/community": "^0.0.41",
     "@mantine/form": "^7.5.0",
     "@mantine/hooks": "^7.5.3",
+    "@mozilla/readability": "^0.5.0",
     "@plasmohq/storage": "^1.9.0",
     "@tailwindcss/forms": "^0.5.7",
     "@tailwindcss/typography": "^0.5.10",
diff --git a/src/components/Option/Playground/PlaygroundForm.tsx b/src/components/Option/Playground/PlaygroundForm.tsx
index c3e97ec..8d248f8 100644
--- a/src/components/Option/Playground/PlaygroundForm.tsx
+++ b/src/components/Option/Playground/PlaygroundForm.tsx
@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
 import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
 import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
 import { PiGlobe } from "react-icons/pi"
+import { extractReadabilityContent } from "@/parser/reader"
 
 type Props = {
   dropedFile: File | undefined
diff --git a/src/parser/reader.ts b/src/parser/reader.ts
new file mode 100644
index 0000000..23b22bf
--- /dev/null
+++ b/src/parser/reader.ts
@@ -0,0 +1,23 @@
+import { Readability } from "@mozilla/readability"
+import { defaultExtractContent } from "./default"
+export const extractReadabilityContent = async (url: string) => {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}`)
+  }
+
+  const html = await response.text()
+
+  // parse the fetched HTML into a detached document for Readability
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  const reader = new Readability(doc)
+  const article = reader.parse()
+  // parse() returns null when no readable article can be extracted
+  if (!article?.content) {
+    throw new Error(`Failed to parse ${url}`)
+  }
+
+  // convert the extracted article HTML to markdown
+  const markdown = defaultExtractContent(article.content)
+  return markdown
+}
\ No newline at end of file
diff --git a/src/web/web.ts b/src/web/web.ts
index e9c1765..8661489 100644
--- a/src/web/web.ts
+++ b/src/web/web.ts
@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
 import { getSearchProvider } from "@/services/search"
 import { webSogouSearch } from "./search-engines/sogou"
 import { webBraveSearch } from "./search-engines/brave"
+import { getWebsiteFromQuery, processSingleWebsite } from "./website"
 
 const getHostName = (url: string) => {
   try {
@@ -29,7 +30,21 @@ const searchWeb = (provider: string, query: string) => {
 
 export const getSystemPromptForWeb = async (query: string) => {
   try {
-    const searchProvider = await getSearchProvider()
-    const search = await searchWeb(searchProvider, query)
+    const websiteVisit = getWebsiteFromQuery(query)
+    let search: {
+      url: any
+      content: string
+    }[] = []
+
+    if (websiteVisit.hasUrl) {
+      // the query names a page: visit it directly instead of searching
+      const url = websiteVisit.url
+      const queryWithoutUrl = websiteVisit.queryWithoutUrls
+      search = await processSingleWebsite(url, queryWithoutUrl)
+    } else {
+      const searchProvider = await getSearchProvider()
+      search = await searchWeb(searchProvider, query)
+    }
+
     const search_results = search
       .map(
diff --git a/src/web/website/index.ts b/src/web/website/index.ts
new file mode 100644
index 0000000..021db16
--- /dev/null
+++ b/src/web/website/index.ts
@@ -0,0 +1,93 @@
+import { cleanUrl } from "@/libs/clean-url"
+import { extractReadabilityContent } from "@/parser/reader"
+import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
+import { getIsSimpleInternetSearch } from "@/services/search"
+import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
+import type { Document } from "@langchain/core/documents"
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory"
+
+export const processSingleWebsite = async (url: string, query: string) => {
+  const content = await extractReadabilityContent(url)
+
+  const isSimpleMode = await getIsSimpleInternetSearch()
+
+  if (isSimpleMode) {
+    // simple mode: skip embeddings and return the page content directly
+    return [
+      {
+        url,
+        content: content.length > 5000 ? content.slice(0, 5000) : content
+      }
+    ]
+  }
+
+  const docs: Document[] = [
+    {
+      metadata: {
+        url
+      },
+      pageContent: content
+    }
+  ]
+
+  const ollamaUrl = await getOllamaURL()
+
+  const embeddingModel = await defaultEmbeddingModelForRag()
+  const ollamaEmbedding = new OllamaEmbeddings({
+    model: embeddingModel || "",
+    baseUrl: cleanUrl(ollamaUrl)
+  })
+
+  const chunkSize = await defaultEmbeddingChunkSize()
+  const chunkOverlap = await defaultEmbeddingChunkOverlap()
+  const textSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize,
+    chunkOverlap
+  })
+
+  const chunks = await textSplitter.splitDocuments(docs)
+
+  const store = new MemoryVectorStore(ollamaEmbedding)
+
+  await store.addDocuments(chunks)
+
+  // keep only the three chunks most relevant to the query
+  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
+
+  const searchResult = resultsWithEmbeddings.map((result) => {
+    return {
+      url: result.metadata.url,
+      content: result.pageContent
+    }
+  })
+
+  return searchResult
+}
+
+export const getWebsiteFromQuery = (query: string): {
+  queryWithoutUrls: string
+  url: string
+  hasUrl: boolean
+} => {
+  const urlRegex = /https?:\/\/[^\s]+/g
+  const urls = query.match(urlRegex)
+
+  if (!urls) {
+    return {
+      queryWithoutUrls: query,
+      url: "",
+      hasUrl: false
+    }
+  }
+
+  // only the first URL in the query is visited
+  const url = urls[0]
+  const queryWithoutUrls = query.replace(url, "")
+
+  return {
+    queryWithoutUrls,
+    url,
+    hasUrl: true
+  }
+}
\ No newline at end of file
diff --git a/wxt.config.ts b/wxt.config.ts
index 8d94b9f..be6459d 100644
--- a/wxt.config.ts
+++ b/wxt.config.ts
@@ -48,7 +48,7 @@ export default defineConfig({
   outDir: "build",
   manifest: {
-    version: "1.1.12",
+    version: "1.1.13",
     name:
       process.env.TARGET === "firefox"
         ? "Page Assist - A Web UI for Local AI Models"