feat: Add @mozilla/readability dependency for extracting content from web pages
parent 56cea30058
commit d23b70b979
@@ -21,6 +21,7 @@
     "@langchain/community": "^0.0.41",
     "@mantine/form": "^7.5.0",
     "@mantine/hooks": "^7.5.3",
+    "@mozilla/readability": "^0.5.0",
     "@plasmohq/storage": "^1.9.0",
     "@tailwindcss/forms": "^0.5.7",
     "@tailwindcss/typography": "^0.5.10",
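For reference, @mozilla/readability is the only new runtime dependency. A minimal sketch of the API surface the rest of this commit relies on (the wrapper function below is hypothetical, not part of the commit): the Readability constructor takes a Document, and parse() returns an article object with content (HTML) and textContent (plain text), or null when no article can be extracted.

import { Readability } from "@mozilla/readability"

// Hypothetical helper, for illustration only.
const parseArticleHtml = (html: string): string | null => {
  // Build a detached Document so Readability can analyse the markup.
  const doc = new DOMParser().parseFromString(html, "text/html")
  const article = new Readability(doc).parse()
  // parse() returns null when Readability cannot identify article content.
  return article ? article.content : null
}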
@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
 import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
 import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
 import { PiGlobe } from "react-icons/pi"
+import { extractReadabilityContent } from "@/parser/reader"
 
 type Props = {
   dropedFile: File | undefined
src/parser/reader.ts (new file, 19 lines)
@@ -0,0 +1,19 @@
+import { Readability } from "@mozilla/readability"
+import { defaultExtractContent } from "./default"
+export const extractReadabilityContent = async (url: string) => {
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to fetch ${url}`)
+  }
+
+  const html = await response.text()
+
+  // create a fake dom for Readability
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  const reader = new Readability(doc)
+  const article = reader.parse()
+
+  // convert the article to markdown
+  const markdown = defaultExtractContent(article.content)
+  return markdown
+}
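One caveat worth noting: Readability's parse() returns null for pages it cannot extract, so article.content above can throw at runtime, and callers also need to handle the error thrown on a failed fetch. A hypothetical call site (not part of this commit; the URL is an example) might look like:

// Hypothetical usage sketch.
try {
  const markdown = await extractReadabilityContent("https://example.com/post")
  console.log(markdown.slice(0, 200))
} catch (e) {
  console.error("Could not fetch or extract the page:", e)
}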
@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
 import { getSearchProvider } from "@/services/search"
 import { webSogouSearch } from "./search-engines/sogou"
 import { webBraveSearch } from "./search-engines/brave"
+import { getWebsiteFromQuery, processSingleWebsite } from "./website"
 
 const getHostName = (url: string) => {
   try {
@@ -29,8 +30,25 @@ const searchWeb = (provider: string, query: string) => {
 
 export const getSystemPromptForWeb = async (query: string) => {
   try {
-    const searchProvider = await getSearchProvider()
-    const search = await searchWeb(searchProvider, query)
+    const websiteVisit = getWebsiteFromQuery(query)
+    let search: {
+      url: any;
+      content: string;
+    }[] = []
+
+    if (websiteVisit.hasUrl) {
+
+      const url = websiteVisit.url
+      const queryWithoutUrl = websiteVisit.queryWithouUrls
+      search = await processSingleWebsite(url, queryWithoutUrl)
+
+    } else {
+      const searchProvider = await getSearchProvider()
+      search = await searchWeb(searchProvider, query)
+    }
+
 
     const search_results = search
       .map(
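The change above reroutes getSystemPromptForWeb: when the query contains a URL, that page is fetched and processed via processSingleWebsite; otherwise the configured search provider is used as before. For illustration (the values follow from the regex and return shape of getWebsiteFromQuery in src/web/website/index.ts below; the query string is an example):

// Hypothetical illustration of the URL-detection helper.
const { hasUrl, url, queryWithouUrls } = getWebsiteFromQuery(
  "summarize https://example.com/article for me"
)
// hasUrl          -> true
// url             -> "https://example.com/article"
// queryWithouUrls -> "summarize  for me" (the URL is removed, surrounding whitespace is kept)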
src/web/website/index.ts (new file, 94 lines)
@@ -0,0 +1,94 @@
+import { cleanUrl } from "@/libs/clean-url"
+import { extractReadabilityContent } from "@/parser/reader"
+import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
+import { getIsSimpleInternetSearch } from "@/services/search"
+import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
+import type { Document } from "@langchain/core/documents"
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
+import { MemoryVectorStore } from "langchain/vectorstores/memory"
+
+export const processSingleWebsite = async (url: string, query: string) => {
+  let content = await extractReadabilityContent(url)
+
+  const isSimpleMode = await getIsSimpleInternetSearch()
+
+  if (isSimpleMode) {
+    return [
+      {
+        url,
+        content: content.length > 5000 ? content.slice(0, 5000) : content
+      }
+    ]
+  }
+
+  const docs: Document<Record<string, any>>[] = [
+    {
+      metadata: {
+        url
+      },
+      pageContent: content
+    }
+  ]
+
+  const ollamaUrl = await getOllamaURL()
+
+  const embeddingModle = await defaultEmbeddingModelForRag()
+  const ollamaEmbedding = new OllamaEmbeddings({
+    model: embeddingModle || "",
+    baseUrl: cleanUrl(ollamaUrl)
+  })
+
+  const chunkSize = await defaultEmbeddingChunkSize()
+  const chunkOverlap = await defaultEmbeddingChunkOverlap()
+  const textSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize,
+    chunkOverlap
+  })
+
+  const chunks = await textSplitter.splitDocuments(docs)
+
+  const store = new MemoryVectorStore(ollamaEmbedding)
+
+  await store.addDocuments(chunks)
+
+  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
+
+  const searchResult = resultsWithEmbeddings.map((result) => {
+    return {
+      url: result.metadata.url,
+      content: result.pageContent
+    }
+  })
+
+  return searchResult
+}
+
+
+export const getWebsiteFromQuery = (query: string): {
+  queryWithouUrls: string,
+  url: string,
+  hasUrl: boolean
+} => {
+
+  const urlRegex = /https?:\/\/[^\s]+/g
+
+  const urls = query.match(urlRegex)
+
+  if (!urls) {
+    return {
+      queryWithouUrls: query,
+      url: "",
+      hasUrl: false
+    }
+  }
+
+  const url = urls[0]
+
+  const queryWithouUrls = query.replace(url, "")
+
+  return {
+    queryWithouUrls,
+    url,
+    hasUrl: true
+  }
+}
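processSingleWebsite has two paths: in simple internet-search mode it returns the first 5,000 characters of the extracted page; otherwise it splits the page into chunks, embeds them with the configured Ollama embedding model, stores them in an in-memory vector store, and returns the three chunks most similar to the query. A hypothetical usage sketch (URL and query are examples):

// Hypothetical usage sketch.
const results = await processSingleWebsite(
  "https://example.com/blog/post",
  "what does the author conclude?"
)
// Simple mode: one entry of up to 5,000 characters of page text.
// Otherwise: up to three chunks ranked by embedding similarity to the query.
for (const { url, content } of results) {
  console.log(url, content.length)
}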
@@ -48,7 +48,7 @@ export default defineConfig({
   outDir: "build",
 
   manifest: {
-    version: "1.1.12",
+    version: "1.1.13",
     name:
       process.env.TARGET === "firefox"
         ? "Page Assist - A Web UI for Local AI Models"