feat: Add @mozilla/readability dependency for extracting content from web pages
parent 56cea30058
commit d23b70b979
package.json

@@ -21,6 +21,7 @@
    "@langchain/community": "^0.0.41",
    "@mantine/form": "^7.5.0",
    "@mantine/hooks": "^7.5.3",
    "@mozilla/readability": "^0.5.0",
    "@plasmohq/storage": "^1.9.0",
    "@tailwindcss/forms": "^0.5.7",
    "@tailwindcss/typography": "^0.5.10",
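For context, @mozilla/readability is the standalone version of Firefox's Reader View parser: given a DOM Document, it extracts the main article content with navigation and other page chrome stripped. A minimal, illustrative sketch of typical usage (not part of this commit; someHtml is a placeholder):

import { Readability } from "@mozilla/readability"

// Placeholder HTML; in the extension this would come from a fetched page.
const someHtml =
  "<html><body><article><h1>Title</h1><p>Body text</p></article></body></html>"

// Readability needs a Document, so parse the HTML string first.
const doc = new DOMParser().parseFromString(someHtml, "text/html")
const article = new Readability(doc).parse()

if (article) {
  console.log(article.title)       // extracted title
  console.log(article.content)     // cleaned article HTML
  console.log(article.textContent) // plain-text version
}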
@@ -14,6 +14,7 @@ import { useTranslation } from "react-i18next"
import { KnowledgeSelect } from "../Knowledge/KnowledgeSelect"
import { useSpeechRecognition } from "@/hooks/useSpeechRecognition"
import { PiGlobe } from "react-icons/pi"
import { extractReadabilityContent } from "@/parser/reader"

type Props = {
  dropedFile: File | undefined
src/parser/reader.ts (new file, 19 lines)
@@ -0,0 +1,19 @@
import { Readability } from "@mozilla/readability"
import { defaultExtractContent } from "./default"
export const extractReadabilityContent = async (url: string) => {
    const response = await fetch(url)
    if (!response.ok) {
        throw new Error(`Failed to fetch ${url}`)
    }

    const html = await response.text()

    // create a fake dom for Readability
    const doc = new DOMParser().parseFromString(html, "text/html")
    const reader = new Readability(doc)
    const article = reader.parse()

    // convert the article to markdown
    const markdown = defaultExtractContent(article.content)
    return markdown
}
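A hypothetical caller of the new helper might look like the following; the URL is a placeholder, and the try/catch reflects that the function throws when the fetch fails:

import { extractReadabilityContent } from "@/parser/reader"

const summarizePage = async () => {
  try {
    // Resolves to the readable page content converted to markdown.
    const markdown = await extractReadabilityContent("https://example.com/post")
    return markdown.slice(0, 200)
  } catch (e) {
    // Fetch failures surface as thrown errors.
    console.error("Could not extract page content:", e)
    return ""
  }
}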
@@ -4,6 +4,7 @@ import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
import { getSearchProvider } from "@/services/search"
import { webSogouSearch } from "./search-engines/sogou"
import { webBraveSearch } from "./search-engines/brave"
import { getWebsiteFromQuery, processSingleWebsite } from "./website"

const getHostName = (url: string) => {
  try {
@@ -29,8 +30,25 @@ const searchWeb = (provider: string, query: string) => {

export const getSystemPromptForWeb = async (query: string) => {
  try {

    const websiteVisit = getWebsiteFromQuery(query)
    let search: {
      url: any;
      content: string;
    }[] = []

    if (websiteVisit.hasUrl) {

      const url = websiteVisit.url
      const queryWithoutUrl = websiteVisit.queryWithouUrls
      search = await processSingleWebsite(url, queryWithoutUrl)

    } else {
      const searchProvider = await getSearchProvider()
    const search = await searchWeb(searchProvider, query)
      search = await searchWeb(searchProvider, query)
    }




    const search_results = search
      .map(
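To make the new branching concrete, here is an illustrative sketch (the query string and expected values are assumptions, not output from the repository) of how a prompt containing a URL bypasses the search engines:

import { getWebsiteFromQuery, processSingleWebsite } from "./website"

const demo = async () => {
  const websiteVisit = getWebsiteFromQuery("summarize https://example.com/post for me")

  // Expected shape: hasUrl -> true, url -> "https://example.com/post",
  // queryWithouUrls -> "summarize  for me" (the URL is simply removed).
  if (websiteVisit.hasUrl) {
    // Visit the page directly instead of calling a search engine.
    return await processSingleWebsite(websiteVisit.url, websiteVisit.queryWithouUrls)
  }
  return []
}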
							
								
								
									
src/web/website/index.ts (new file, 94 lines)
@@ -0,0 +1,94 @@
import { cleanUrl } from "@/libs/clean-url"
import { extractReadabilityContent } from "@/parser/reader"
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import { getIsSimpleInternetSearch } from "@/services/search"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import type { Document } from "@langchain/core/documents"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"

export const processSingleWebsite = async (url: string, query: string) => {
    let content = await extractReadabilityContent(url)

    const isSimpleMode = await getIsSimpleInternetSearch()

    if (isSimpleMode) {
        return [
            {
                url,
                content: content.length > 5000 ? content.slice(0, 5000) : content
            }
        ]
    }

    const docs: Document<Record<string, any>>[] = [
        {
            metadata: {
                url
            },
            pageContent: content
        }
    ]

    const ollamaUrl = await getOllamaURL()

    const embeddingModle = await defaultEmbeddingModelForRag()
    const ollamaEmbedding = new OllamaEmbeddings({
        model: embeddingModle || "",
        baseUrl: cleanUrl(ollamaUrl)
    })

    const chunkSize = await defaultEmbeddingChunkSize()
    const chunkOverlap = await defaultEmbeddingChunkOverlap()
    const textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize,
        chunkOverlap
    })

    const chunks = await textSplitter.splitDocuments(docs)

    const store = new MemoryVectorStore(ollamaEmbedding)

    await store.addDocuments(chunks)

    const resultsWithEmbeddings = await store.similaritySearch(query, 3)

    const searchResult = resultsWithEmbeddings.map((result) => {
        return {
            url: result.metadata.url,
            content: result.pageContent
        }
    })

    return searchResult
}


export const getWebsiteFromQuery = (query: string): {
    queryWithouUrls: string,
    url: string,
    hasUrl: boolean
} => {

    const urlRegex = /https?:\/\/[^\s]+/g

    const urls = query.match(urlRegex)

    if (!urls) {
        return {
            queryWithouUrls: query,
            url: "",
            hasUrl: false
        }
    }

    const url = urls[0]

    const queryWithouUrls = query.replace(url, "")

    return {
        queryWithouUrls,
        url,
        hasUrl: true
    }
}
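Putting it together, a hypothetical call to processSingleWebsite (the import path, URL, and question are assumptions for illustration):

import { processSingleWebsite } from "@/web/website"

const askAboutPage = async () => {
  const results = await processSingleWebsite(
    "https://example.com/blog/local-llms",
    "what models does the post recommend?"
  )
  // Simple-search mode returns one entry holding the first ~5000 characters.
  // Otherwise the page is chunked, embedded through Ollama, and the three most
  // similar chunks come back as { url, content } pairs.
  return results
}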
@@ -48,7 +48,7 @@ export default defineConfig({
  outDir: "build",

  manifest: {
    version: "1.1.12",
    version: "1.1.13",
    name:
      process.env.TARGET === "firefox"
        ? "Page Assist - A Web UI for Local AI Models"