refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content
This commit is contained in:
		
							parent
							
								
									4363a4b0de
								
							
						
					
					
						commit
						1e9b66d823
					
				| @ -1,9 +1,9 @@ | ||||
| import { BaseDocumentLoader } from "langchain/document_loaders/base" | ||||
| import { Document } from "@langchain/core/documents" | ||||
| import { compile } from "html-to-text" | ||||
| import { urlRewriteRuntime } from "~/libs/runtime" | ||||
| import { YtTranscript } from "yt-transcript" | ||||
| import { isWikipedia, parseWikipedia } from "@/parser/wiki" | ||||
| import { extractReadabilityContent } from "@/parser/reader" | ||||
| 
 | ||||
| const YT_REGEX = | ||||
|   /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/ | ||||
| @ -24,8 +24,7 @@ export interface WebLoaderParams { | ||||
| 
 | ||||
| export class PageAssistHtmlLoader | ||||
|   extends BaseDocumentLoader | ||||
|   implements WebLoaderParams | ||||
| { | ||||
|   implements WebLoaderParams { | ||||
|   html: string | ||||
|   url: string | ||||
| 
 | ||||
| @ -52,30 +51,14 @@ export class PageAssistHtmlLoader | ||||
|         { | ||||
|           metadata: { | ||||
|             source: this.url, | ||||
|             url: this.url, | ||||
|             audio: { chunks: transcript } | ||||
|           }, | ||||
|           pageContent: text | ||||
|         } | ||||
|       ] | ||||
|     } | ||||
| 
 | ||||
|     // let html = this.html
 | ||||
| 
 | ||||
|     // if (isWikipedia(this.url)) {
 | ||||
|     //   console.log("Wikipedia URL detected")
 | ||||
|     //   html = parseWikipedia(html)
 | ||||
|     // }
 | ||||
| 
 | ||||
|     // // else if (isTwitter(this.url)) {
 | ||||
|     // //   console.log("Twitter URL detected")
 | ||||
|     // //   html = parseTweet(html, this.url)
 | ||||
|     // // }
 | ||||
| 
 | ||||
|     // const htmlCompiler = compile({
 | ||||
|     //   wordwrap: false
 | ||||
|     // })
 | ||||
|     // const text = htmlCompiler(html)
 | ||||
|     const metadata = { source: this.url } | ||||
|     const metadata = { source: this.url, url: this.url, } | ||||
|     return [new Document({ pageContent: this.html, metadata })] | ||||
|   } | ||||
| 
 | ||||
| @ -95,6 +78,7 @@ export class PageAssistHtmlLoader | ||||
|       return [ | ||||
|         { | ||||
|           metadata: { | ||||
|             url: this.url, | ||||
|             source: this.url, | ||||
|             audio: { chunks: transcript } | ||||
|           }, | ||||
| @ -103,22 +87,15 @@ export class PageAssistHtmlLoader | ||||
|       ] | ||||
|     } | ||||
|     await urlRewriteRuntime(this.url, "web") | ||||
|     const fetchHTML = await fetch(this.url) | ||||
|     let html = await fetchHTML.text() | ||||
| 
 | ||||
|     let text = ""; | ||||
|     if (isWikipedia(this.url)) { | ||||
|       console.log("Wikipedia URL detected") | ||||
|       html = parseWikipedia(await fetchHTML.text()) | ||||
|       const fetchHTML = await fetch(this.url) | ||||
|       text = parseWikipedia(await fetchHTML.text()) | ||||
|     } else { | ||||
|       text = await extractReadabilityContent(this.url) | ||||
|     } | ||||
| 
 | ||||
|     const htmlCompiler = compile({ | ||||
|       wordwrap: false, | ||||
|       selectors: [ | ||||
|         { selector: "img", format: "skip" }, | ||||
|         { selector: "script", format: "skip" } | ||||
|       ] | ||||
|     }) | ||||
|     const text = htmlCompiler(html) | ||||
|     const metadata = { url: this.url } | ||||
|     return [new Document({ pageContent: text, metadata })] | ||||
|   } | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| import * as cheerio from "cheerio" | ||||
| import { defaultExtractContent } from "./default" | ||||
| 
 | ||||
| export const isWikipedia = (url: string) => { | ||||
|   const WIKI_REGEX = /wikipedia\.org\/wiki\//g | ||||
| @ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => { | ||||
|   content?.find("div.toc")?.remove() | ||||
|   const newHtml = content?.html() | ||||
| 
 | ||||
|   return `<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>` | ||||
|   return defaultExtractContent(`<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`) | ||||
| } | ||||
|  | ||||
| @ -18,7 +18,7 @@ export const getIsSimpleInternetSearch = async () => { | ||||
| export const getIsVisitSpecificWebsite = async () => { | ||||
|   const isVisitSpecificWebsite = await storage.get("isVisitSpecificWebsite") | ||||
|   if (!isVisitSpecificWebsite || isVisitSpecificWebsite.length === 0) { | ||||
|     return false | ||||
|     return true | ||||
|   } | ||||
|   return isVisitSpecificWebsite === "true" | ||||
| } | ||||
|  | ||||
| @ -1,34 +1,16 @@ | ||||
| import { cleanUrl } from "@/libs/clean-url" | ||||
| import { extractReadabilityContent } from "@/parser/reader" | ||||
| import { PageAssistHtmlLoader } from "@/loader/html" | ||||
| import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" | ||||
| import { getIsSimpleInternetSearch } from "@/services/search" | ||||
| import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" | ||||
| import type { Document } from "@langchain/core/documents" | ||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" | ||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||
| 
 | ||||
| export const processSingleWebsite = async (url: string, query: string) => { | ||||
|     let content = await extractReadabilityContent(url) | ||||
| 
 | ||||
|     // const isSimpleMode = await getIsSimpleInternetSearch()
 | ||||
| 
 | ||||
|     // if (isSimpleMode) {
 | ||||
|     //     return [
 | ||||
|     //         {
 | ||||
|     //             url,
 | ||||
|     //             content: content.length > 5000 ? content.slice(0, 5000) : content
 | ||||
|     //         }
 | ||||
|     //     ]
 | ||||
|     // }
 | ||||
| 
 | ||||
|     const docs: Document<Record<string, any>>[] = [ | ||||
|         { | ||||
|             metadata: { | ||||
|                 url | ||||
|             }, | ||||
|             pageContent: content | ||||
|         } | ||||
|     ] | ||||
|     const loader = new PageAssistHtmlLoader({ | ||||
|         html: "", | ||||
|         url | ||||
|     }) | ||||
|     const docs = await loader.loadByURL() | ||||
| 
 | ||||
|     const ollamaUrl = await getOllamaURL() | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user