feat: Add @mozilla/readability dependency for extracting content from web pages
commit d23b70b979
parent 56cea30058
package.json
@@ -21,6 +21,7 @@
     "@langchain/community": "^0.0.41",
     "@mantine/form": "^7.5.0",
     "@mantine/hooks": "^7.5.3",
+    "@mozilla/readability": "^0.5.0",
     "@plasmohq/storage": "^1.9.0",
     "@tailwindcss/forms": "^0.5.7",
     "@tailwindcss/typography": "^0.5.10",
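@mozilla/readability is the standalone article-extraction library behind Firefox's Reader View. For orientation, a minimal sketch of its API (the sample HTML is illustrative, not from this commit):

```ts
import { Readability } from "@mozilla/readability"

// Readability consumes a Document and returns the extracted article,
// or null when no readable content can be found.
const doc = new DOMParser().parseFromString(
  "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>",
  "text/html"
)
const article = new Readability(doc).parse()
if (article) {
  console.log(article.title)       // extracted title
  console.log(article.content)     // cleaned HTML body
  console.log(article.textContent) // plain-text body
}
```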
@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
 import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
 import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
 import { PiGlobe } from "react-icons/pi"
+import { extractReadabilityContent } from "@/parser/reader"

 type Props = {
   dropedFile: File | undefined
src/parser/reader.ts (new file)
@@ -0,0 +1,19 @@
+import { Readability } from "@mozilla/readability"
+import { defaultExtractContent } from "./default"
+export const extractReadabilityContent = async (url: string) => {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}`)
+  }
+
+  const html = await response.text()
+
+  // create a fake dom for Readability
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  const reader = new Readability(doc)
+  const article = reader.parse()
+
+  // convert the article to markdown
+  const markdown = defaultExtractContent(article.content)
+  return markdown
+}
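For context, a hedged sketch of how a caller might wrap this new helper. One caveat worth noting: Readability's parse() returns null when it cannot extract an article, so the article.content access above can throw on unparseable pages. The guard below is an illustration, not part of this commit:

```ts
import { extractReadabilityContent } from "@/parser/reader"

// Hypothetical wrapper: resolve to null on any failure
// (network error, non-OK status, or an unparseable document).
const safeExtract = async (url: string): Promise<string | null> => {
  try {
    return await extractReadabilityContent(url)
  } catch (e) {
    console.error(`extraction failed for ${url}`, e)
    return null
  }
}
```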
@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
 import { getSearchProvider } from "@/services/search"
 import { webSogouSearch } from "./search-engines/sogou"
 import { webBraveSearch } from "./search-engines/brave"
+import { getWebsiteFromQuery, processSingleWebsite } from "./website"

 const getHostName = (url: string) => {
   try {
@@ -29,8 +30,25 @@ const searchWeb = (provider: string, query: string) => {

 export const getSystemPromptForWeb = async (query: string) => {
   try {
-    const searchProvider = await getSearchProvider()
-    const search = await searchWeb(searchProvider, query)
+    const websiteVisit = getWebsiteFromQuery(query)
+    let search: {
+      url: any;
+      content: string;
+    }[] = []
+
+    if (websiteVisit.hasUrl) {
+      const url = websiteVisit.url
+      const queryWithoutUrl = websiteVisit.queryWithouUrls
+      search = await processSingleWebsite(url, queryWithoutUrl)
+    } else {
+      const searchProvider = await getSearchProvider()
+      search = await searchWeb(searchProvider, query)
+    }
+

     const search_results = search
       .map(
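The effect of this change: queries that contain a URL are answered from that page directly, while everything else still goes through the configured search provider. A small illustration of the branch, using getWebsiteFromQuery from the new src/web/website/index.ts below; the sample queries are made up:

```ts
import { getWebsiteFromQuery } from "./website"

// Query with a URL: the page itself becomes the source.
getWebsiteFromQuery("summarize https://example.com/post please")
// → { queryWithouUrls: "summarize  please", url: "https://example.com/post", hasUrl: true }

// Query without a URL: falls through to the search provider.
getWebsiteFromQuery("latest ollama release notes")
// → { queryWithouUrls: "latest ollama release notes", url: "", hasUrl: false }
```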
src/web/website/index.ts (new file)
@@ -0,0 +1,94 @@
+import { cleanUrl } from "@/libs/clean-url"
+import { extractReadabilityContent } from "@/parser/reader"
+import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
+import { getIsSimpleInternetSearch } from "@/services/search"
+import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
+import type { Document } from "@langchain/core/documents"
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory"
+
+export const processSingleWebsite = async (url: string, query: string) => {
+  let content = await extractReadabilityContent(url)
+
+  const isSimpleMode = await getIsSimpleInternetSearch()
+
+  if (isSimpleMode) {
+    return [
+      {
+        url,
+        content: content.length > 5000 ? content.slice(0, 5000) : content
+      }
+    ]
+  }
+
+  const docs: Document<Record<string, any>>[] = [
+    {
+      metadata: {
+        url
+      },
+      pageContent: content
+    }
+  ]
+
+  const ollamaUrl = await getOllamaURL()
+
+  const embeddingModle = await defaultEmbeddingModelForRag()
+  const ollamaEmbedding = new OllamaEmbeddings({
+    model: embeddingModle || "",
+    baseUrl: cleanUrl(ollamaUrl)
+  })
+
+  const chunkSize = await defaultEmbeddingChunkSize()
+  const chunkOverlap = await defaultEmbeddingChunkOverlap()
+  const textSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize,
+    chunkOverlap
+  })
+
+  const chunks = await textSplitter.splitDocuments(docs)
+
+  const store = new MemoryVectorStore(ollamaEmbedding)
+
+  await store.addDocuments(chunks)
+
+  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
+
+  const searchResult = resultsWithEmbeddings.map((result) => {
+    return {
+      url: result.metadata.url,
+      content: result.pageContent
+    }
+  })
+
+  return searchResult
+}
+
+
+export const getWebsiteFromQuery = (query: string): {
+  queryWithouUrls: string,
+  url: string,
+  hasUrl: boolean
+} => {
+
+  const urlRegex = /https?:\/\/[^\s]+/g
+
+  const urls = query.match(urlRegex)
+
+  if (!urls) {
+    return {
+      queryWithouUrls: query,
+      url: "",
+      hasUrl: false
+    }
+  }
+
+  const url = urls[0]
+
+  const queryWithouUrls = query.replace(url, "")
+
+  return {
+    queryWithouUrls,
+    url,
+    hasUrl: true
+  }
+}
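processSingleWebsite takes two paths: in simple search mode it returns the first 5,000 characters of the extracted page; otherwise it splits the page into chunks, embeds them with Ollama, and retrieves the three chunks most similar to the query from an in-memory vector store. A minimal usage sketch, where the import path, URL, and query are assumptions for illustration:

```ts
// Path assumed from the repo's "@/..." alias; adjust to the caller's location.
import { processSingleWebsite } from "@/web/website"

const groundPromptInPage = async () => {
  // Yields at most three { url, content } snippets to splice into the prompt.
  const snippets = await processSingleWebsite(
    "https://example.com/long-article",
    "what methodology did the authors use"
  )
  return snippets
}
```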
@@ -48,7 +48,7 @@ export default defineConfig({
   outDir: "build",

   manifest: {
-    version: "1.1.12",
+    version: "1.1.13",
     name:
       process.env.TARGET === "firefox"
         ? "Page Assist - A Web UI for Local AI Models"