feat: Add @mozilla/readability dependency for extracting content from web pages

n4ze3m 2024-06-22 00:25:12 +05:30
parent 56cea30058
commit d23b70b979
7 changed files with 136 additions and 3 deletions

bun.lockb (binary file not shown)

package.json
@@ -21,6 +21,7 @@
     "@langchain/community": "^0.0.41",
     "@mantine/form": "^7.5.0",
     "@mantine/hooks": "^7.5.3",
+    "@mozilla/readability": "^0.5.0",
     "@plasmohq/storage": "^1.9.0",
     "@tailwindcss/forms": "^0.5.7",
     "@tailwindcss/typography": "^0.5.10",

(modified file, path not shown)

@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
 import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
 import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
 import { PiGlobe } from "react-icons/pi"
+import { extractReadabilityContent } from "@/parser/reader"

 type Props = {
   dropedFile: File | undefined

src/parser/reader.ts (new file)

@@ -0,0 +1,19 @@
+import { Readability } from "@mozilla/readability"
+import { defaultExtractContent } from "./default"
+
+export const extractReadabilityContent = async (url: string) => {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}`)
+  }
+  const html = await response.text()
+
+  // parse the HTML into a detached document for Readability
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  const reader = new Readability(doc)
+  // parse() returns null when no readable article can be extracted
+  const article = reader.parse()
+  // convert the extracted article HTML to markdown
+  const markdown = defaultExtractContent(article?.content ?? "")
+  return markdown
+}
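For context, a minimal usage sketch of the new helper (not part of this commit; the URL is a placeholder, and a DOM environment providing DOMParser, such as the extension context, is assumed):

// Usage sketch only, not part of this commit; example.com is a placeholder.
const markdown = await extractReadabilityContent("https://example.com/article")
console.log(markdown.slice(0, 200)) // preview the markdown conversion

Since fetch runs from the extension here, the usual extension host permissions govern which URLs can actually be fetched.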

src/web/index.ts

@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
 import { getSearchProvider } from "@/services/search"
 import { webSogouSearch } from "./search-engines/sogou"
 import { webBraveSearch } from "./search-engines/brave"
+import { getWebsiteFromQuery, processSingleWebsite } from "./website"

 const getHostName = (url: string) => {
   try {
@@ -29,8 +30,25 @@ const searchWeb = (provider: string, query: string) => {
 export const getSystemPromptForWeb = async (query: string) => {
   try {
-    const searchProvider = await getSearchProvider()
-    const search = await searchWeb(searchProvider, query)
+    const websiteVisit = getWebsiteFromQuery(query)
+    let search: {
+      url: any
+      content: string
+    }[] = []
+
+    if (websiteVisit.hasUrl) {
+      const url = websiteVisit.url
+      const queryWithoutUrl = websiteVisit.queryWithoutUrls
+      search = await processSingleWebsite(url, queryWithoutUrl)
+    } else {
+      const searchProvider = await getSearchProvider()
+      search = await searchWeb(searchProvider, query)
+    }

     const search_results = search
       .map(

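To make the new routing concrete, a hedged illustration (not part of this commit) of what getWebsiteFromQuery, defined in the new src/web/website/index.ts below, returns for two sample queries:

// Illustration only, not part of this commit; the queries are made up.
getWebsiteFromQuery("summarize https://example.com/post for me")
// -> { queryWithoutUrls: "summarize  for me", url: "https://example.com/post", hasUrl: true }
getWebsiteFromQuery("what is retrieval augmented generation?")
// -> { queryWithoutUrls: "what is retrieval augmented generation?", url: "", hasUrl: false }

When hasUrl is true, the prompt is answered from that page alone; otherwise the configured search provider is used as before.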
src/web/website/index.ts (new file)

@@ -0,0 +1,94 @@
+import { cleanUrl } from "@/libs/clean-url"
+import { extractReadabilityContent } from "@/parser/reader"
+import {
+  defaultEmbeddingChunkOverlap,
+  defaultEmbeddingChunkSize,
+  defaultEmbeddingModelForRag,
+  getOllamaURL
+} from "@/services/ollama"
+import { getIsSimpleInternetSearch } from "@/services/search"
+import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
+import type { Document } from "@langchain/core/documents"
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory"
+
+export const processSingleWebsite = async (url: string, query: string) => {
+  const content = await extractReadabilityContent(url)
+
+  const isSimpleMode = await getIsSimpleInternetSearch()
+  if (isSimpleMode) {
+    // simple mode: return the page content directly, capped at 5,000 characters
+    return [
+      {
+        url,
+        content: content.length > 5000 ? content.slice(0, 5000) : content
+      }
+    ]
+  }
+
+  const docs: Document<Record<string, any>>[] = [
+    {
+      metadata: {
+        url
+      },
+      pageContent: content
+    }
+  ]
+
+  const ollamaUrl = await getOllamaURL()
+  const embeddingModel = await defaultEmbeddingModelForRag()
+  const ollamaEmbedding = new OllamaEmbeddings({
+    model: embeddingModel || "",
+    baseUrl: cleanUrl(ollamaUrl)
+  })
+
+  const chunkSize = await defaultEmbeddingChunkSize()
+  const chunkOverlap = await defaultEmbeddingChunkOverlap()
+  const textSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize,
+    chunkOverlap
+  })
+
+  // chunk the page, embed it into an in-memory store, and keep the
+  // three chunks most similar to the query
+  const chunks = await textSplitter.splitDocuments(docs)
+  const store = new MemoryVectorStore(ollamaEmbedding)
+  await store.addDocuments(chunks)
+
+  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
+  const searchResult = resultsWithEmbeddings.map((result) => {
+    return {
+      url: result.metadata.url,
+      content: result.pageContent
+    }
+  })
+
+  return searchResult
+}
+
+export const getWebsiteFromQuery = (
+  query: string
+): {
+  queryWithoutUrls: string
+  url: string
+  hasUrl: boolean
+} => {
+  const urlRegex = /https?:\/\/[^\s]+/g
+  const urls = query.match(urlRegex)
+  if (!urls) {
+    return {
+      queryWithoutUrls: query,
+      url: "",
+      hasUrl: false
+    }
+  }
+  // use the first URL found and strip it from the query text
+  const url = urls[0]
+  const queryWithoutUrls = query.replace(url, "")
+  return {
+    queryWithoutUrls,
+    url,
+    hasUrl: true
+  }
+}
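And a sketch of calling processSingleWebsite end to end (illustrative only, not part of this commit; the URL and query are placeholders):

// Usage sketch only, not part of this commit.
const results = await processSingleWebsite(
  "https://example.com/long-article",
  "what are the key findings?"
)
// simple mode: a single entry capped at 5,000 characters of page text;
// RAG mode: up to three chunks ranked by embedding similarity to the query
for (const r of results) {
  console.log(r.url, r.content.length)
}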

wxt.config.ts

@@ -48,7 +48,7 @@ export default defineConfig({
   outDir: "build",
   manifest: {
-    version: "1.1.12",
+    version: "1.1.13",
     name:
       process.env.TARGET === "firefox"
         ? "Page Assist - A Web UI for Local AI Models"