feat: Add @mozilla/readability dependency for extracting content from web pages
commit d23b70b979
parent 56cea30058
package.json
@@ -21,6 +21,7 @@
     "@langchain/community": "^0.0.41",
     "@mantine/form": "^7.5.0",
     "@mantine/hooks": "^7.5.3",
+    "@mozilla/readability": "^0.5.0",
     "@plasmohq/storage": "^1.9.0",
     "@tailwindcss/forms": "^0.5.7",
     "@tailwindcss/typography": "^0.5.10",
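@mozilla/readability is the standalone article-extraction library behind Firefox's Reader View. For orientation, a minimal sketch of its API (the sample HTML is illustrative, not from this commit):

```ts
import { Readability } from "@mozilla/readability"

// Readability consumes a Document and returns the extracted article,
// or null when no readable content can be found.
const doc = new DOMParser().parseFromString(
  "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>",
  "text/html"
)
const article = new Readability(doc).parse()
if (article) {
  console.log(article.title)       // extracted title
  console.log(article.content)     // cleaned HTML body
  console.log(article.textContent) // plain-text body
}
```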
@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
 import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
 import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
 import { PiGlobe } from "react-icons/pi"
+import { extractReadabilityContent } from "@/parser/reader"

 type Props = {
   dropedFile: File | undefined
src/parser/reader.ts (new file)
@@ -0,0 +1,19 @@
+import { Readability } from "@mozilla/readability"
+import { defaultExtractContent } from "./default"
+export const extractReadabilityContent = async (url: string) => {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}`)
+  }
+
+  const html = await response.text()
+
+  // create a fake dom for Readability
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  const reader = new Readability(doc)
+  const article = reader.parse()
+
+  // convert the article to markdown
+  const markdown = defaultExtractContent(article.content)
+  return markdown
+}
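For context, a hedged sketch of how a caller might wrap this new helper. One caveat worth noting: Readability's parse() returns null when it cannot extract an article, so the article.content access above can throw on unparseable pages. The guard below is an illustration, not part of this commit:

```ts
import { extractReadabilityContent } from "@/parser/reader"

// Hypothetical wrapper: resolve to null on any failure
// (network error, non-OK status, or an unparseable document).
const safeExtract = async (url: string): Promise<string | null> => {
  try {
    return await extractReadabilityContent(url)
  } catch (e) {
    console.error(`extraction failed for ${url}`, e)
    return null
  }
}
```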
@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
 import { getSearchProvider } from "@/services/search"
 import { webSogouSearch } from "./search-engines/sogou"
 import { webBraveSearch } from "./search-engines/brave"
+import { getWebsiteFromQuery, processSingleWebsite } from "./website"

 const getHostName = (url: string) => {
   try {
@@ -29,8 +30,25 @@ const searchWeb = (provider: string, query: string) => {

 export const getSystemPromptForWeb = async (query: string) => {
   try {
-    const searchProvider = await getSearchProvider()
-    const search = await searchWeb(searchProvider, query)
+    const websiteVisit = getWebsiteFromQuery(query)
+    let search: {
+      url: any;
+      content: string;
+    }[] = []
+
+    if (websiteVisit.hasUrl) {
+      const url = websiteVisit.url
+      const queryWithoutUrl = websiteVisit.queryWithouUrls
+      search = await processSingleWebsite(url, queryWithoutUrl)
+    } else {
+      const searchProvider = await getSearchProvider()
+      search = await searchWeb(searchProvider, query)
+    }
+

     const search_results = search
       .map(
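The effect of this change: queries that contain a URL are answered from that page directly, while everything else still goes through the configured search provider. A small illustration of the branch, using getWebsiteFromQuery from the new src/web/website/index.ts below; the sample queries are made up:

```ts
import { getWebsiteFromQuery } from "./website"

// Query with a URL: the page itself becomes the source.
getWebsiteFromQuery("summarize https://example.com/post please")
// → { queryWithouUrls: "summarize  please", url: "https://example.com/post", hasUrl: true }

// Query without a URL: falls through to the search provider.
getWebsiteFromQuery("latest ollama release notes")
// → { queryWithouUrls: "latest ollama release notes", url: "", hasUrl: false }
```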
src/web/website/index.ts (new file)
@@ -0,0 +1,94 @@
+import { cleanUrl } from "@/libs/clean-url"
+import { extractReadabilityContent } from "@/parser/reader"
+import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
+import { getIsSimpleInternetSearch } from "@/services/search"
+import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
+import type { Document } from "@langchain/core/documents"
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory"
+
+export const processSingleWebsite = async (url: string, query: string) => {
+  let content = await extractReadabilityContent(url)
+
+  const isSimpleMode = await getIsSimpleInternetSearch()
+
+  if (isSimpleMode) {
+    return [
+      {
+        url,
+        content: content.length > 5000 ? content.slice(0, 5000) : content
+      }
+    ]
+  }
+
+  const docs: Document<Record<string, any>>[] = [
+    {
+      metadata: {
+        url
+      },
+      pageContent: content
+    }
+  ]
+
+  const ollamaUrl = await getOllamaURL()
+
+  const embeddingModle = await defaultEmbeddingModelForRag()
+  const ollamaEmbedding = new OllamaEmbeddings({
+    model: embeddingModle || "",
+    baseUrl: cleanUrl(ollamaUrl)
+  })
+
+  const chunkSize = await defaultEmbeddingChunkSize()
+  const chunkOverlap = await defaultEmbeddingChunkOverlap()
+  const textSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize,
+    chunkOverlap
+  })
+
+  const chunks = await textSplitter.splitDocuments(docs)
+
+  const store = new MemoryVectorStore(ollamaEmbedding)
+
+  await store.addDocuments(chunks)
+
+  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
+
+  const searchResult = resultsWithEmbeddings.map((result) => {
+    return {
+      url: result.metadata.url,
+      content: result.pageContent
+    }
+  })
+
+  return searchResult
+}
+
+
+export const getWebsiteFromQuery = (query: string): {
+  queryWithouUrls: string,
+  url: string,
+  hasUrl: boolean
+} => {
+
+  const urlRegex = /https?:\/\/[^\s]+/g
+
+  const urls = query.match(urlRegex)
+
+  if (!urls) {
+    return {
+      queryWithouUrls: query,
+      url: "",
+      hasUrl: false
+    }
+  }
+
+  const url = urls[0]
+
+  const queryWithouUrls = query.replace(url, "")
+
+  return {
+    queryWithouUrls,
+    url,
+    hasUrl: true
+  }
+}
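processSingleWebsite takes two paths: in simple search mode it returns the first 5,000 characters of the extracted page; otherwise it splits the page into chunks, embeds them with Ollama, and retrieves the three chunks most similar to the query from an in-memory vector store. A minimal usage sketch, where the import path, URL, and query are assumptions for illustration:

```ts
// Path assumed from the repo's "@/..." alias; adjust to the caller's location.
import { processSingleWebsite } from "@/web/website"

const groundPromptInPage = async () => {
  // Yields at most three { url, content } snippets to splice into the prompt.
  const snippets = await processSingleWebsite(
    "https://example.com/long-article",
    "what methodology did the authors use"
  )
  return snippets
}
```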
@@ -48,7 +48,7 @@ export default defineConfig({
   outDir: "build",

   manifest: {
-    version: "1.1.12",
+    version: "1.1.13",
     name:
       process.env.TARGET === "firefox"
         ? "Page Assist - A Web UI for Local AI Models"