refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content
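In short: `PageAssistHtmlLoader.loadByURL()` stops fetching pages and flattening them with html-to-text, and instead delegates everything except Wikipedia to the `extractReadabilityContent` helper in "@/parser/reader". That module is not part of this diff; the sketch below shows what a helper with this name typically looks like, assuming it wraps Mozilla's Readability (an assumption based on the name, not on code shown here):

```ts
// Hypothetical sketch only: the real "@/parser/reader" is not shown in this commit.
import { Readability } from "@mozilla/readability"

export const extractReadabilityContent = async (url: string): Promise<string> => {
  const response = await fetch(url)
  const html = await response.text()
  // DOMParser is available in the browser/extension context the loader runs in.
  const doc = new DOMParser().parseFromString(html, "text/html")
  const article = new Readability(doc).parse()
  // Fall back to the raw HTML when Readability cannot find an article body.
  return article?.textContent ?? html
}
```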
This commit is contained in:
  parent 4363a4b0de
  commit 1e9b66d823
@/loader/html (PageAssistHtmlLoader):

@@ -1,9 +1,9 @@
 import { BaseDocumentLoader } from "langchain/document_loaders/base"
 import { Document } from "@langchain/core/documents"
-import { compile } from "html-to-text"
 import { urlRewriteRuntime } from "~/libs/runtime"
 import { YtTranscript } from "yt-transcript"
 import { isWikipedia, parseWikipedia } from "@/parser/wiki"
+import { extractReadabilityContent } from "@/parser/reader"
 
 const YT_REGEX =
   /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/
@@ -24,8 +24,7 @@ export interface WebLoaderParams
 
 export class PageAssistHtmlLoader
   extends BaseDocumentLoader
-  implements WebLoaderParams
-{
+  implements WebLoaderParams {
   html: string
   url: string
 
@@ -52,30 +51,14 @@ export class PageAssistHtmlLoader
         {
           metadata: {
             source: this.url,
+            url: this.url,
             audio: { chunks: transcript }
           },
           pageContent: text
         }
       ]
     }
 
-    // let html = this.html
-
-    // if (isWikipedia(this.url)) {
-    //   console.log("Wikipedia URL detected")
-    //   html = parseWikipedia(html)
-    // }
-
-    // // else if (isTwitter(this.url)) {
-    // //   console.log("Twitter URL detected")
-    // //   html = parseTweet(html, this.url)
-    // // }
-
-    // const htmlCompiler = compile({
-    //   wordwrap: false
-    // })
-    // const text = htmlCompiler(html)
-    const metadata = { source: this.url }
+    const metadata = { source: this.url, url: this.url, }
     return [new Document({ pageContent: this.html, metadata })]
   }
 
@@ -95,6 +78,7 @@ export class PageAssistHtmlLoader
       return [
         {
           metadata: {
+            url: this.url,
             source: this.url,
             audio: { chunks: transcript }
           },
@@ -103,22 +87,15 @@ export class PageAssistHtmlLoader
       ]
     }
     await urlRewriteRuntime(this.url, "web")
-    const fetchHTML = await fetch(this.url)
-    let html = await fetchHTML.text()
-
+    let text = "";
     if (isWikipedia(this.url)) {
       console.log("Wikipedia URL detected")
-      html = parseWikipedia(await fetchHTML.text())
+      const fetchHTML = await fetch(this.url)
+      text = parseWikipedia(await fetchHTML.text())
+    } else {
+      text = await extractReadabilityContent(this.url)
     }
 
-    const htmlCompiler = compile({
-      wordwrap: false,
-      selectors: [
-        { selector: "img", format: "skip" },
-        { selector: "script", format: "skip" }
-      ]
-    })
-    const text = htmlCompiler(html)
     const metadata = { url: this.url }
     return [new Document({ pageContent: text, metadata })]
   }
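Net effect of these hunks: `loadByURL()` no longer does its own fetch-and-flatten pass. Wikipedia URLs keep the dedicated parser, and every other URL goes through the readability extractor. The call pattern stays the same, as the website processor further down in this commit shows:

```ts
// Unchanged caller-side usage; only the extraction backend differs.
// The URL here is illustrative.
const loader = new PageAssistHtmlLoader({ html: "", url: "https://example.com/article" })
const [doc] = await loader.loadByURL()
// doc.metadata is { url }, and doc.pageContent is readability-extracted text
// rather than html-to-text output.
```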
@/parser/wiki:

@@ -1,4 +1,5 @@
 import * as cheerio from "cheerio"
+import { defaultExtractContent } from "./default"
 
 export const isWikipedia = (url: string) => {
   const WIKI_REGEX = /wikipedia\.org\/wiki\//g
@@ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => {
   content?.find("div.toc")?.remove()
   const newHtml = content?.html()
 
-  return `<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`
+  return defaultExtractContent(`<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`)
 }
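parseWikipedia still assembles a TITLE line plus the trimmed article HTML, but the result is now routed through `defaultExtractContent` from "./default", which this diff does not include. A plausible sketch, assuming it carries over the html-to-text options the loader just dropped:

```ts
// Hypothetical sketch of "./default": not shown in this commit. It mirrors the
// compile options removed from the loader (no wordwrap, skip images and scripts).
import { convert } from "html-to-text"

export const defaultExtractContent = (html: string): string =>
  convert(html, {
    wordwrap: false,
    selectors: [
      { selector: "img", format: "skip" },
      { selector: "script", format: "skip" }
    ]
  })
```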
@/services/search:

@@ -18,7 +18,7 @@ export const getIsSimpleInternetSearch = async () => {
 export const getIsVisitSpecificWebsite = async () => {
   const isVisitSpecificWebsite = await storage.get("isVisitSpecificWebsite")
   if (!isVisitSpecificWebsite || isVisitSpecificWebsite.length === 0) {
-    return false
+    return true
   }
   return isVisitSpecificWebsite === "true"
 }
Website processor (processSingleWebsite):

@@ -1,34 +1,16 @@
 import { cleanUrl } from "@/libs/clean-url"
-import { extractReadabilityContent } from "@/parser/reader"
+import { PageAssistHtmlLoader } from "@/loader/html"
 import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
-import { getIsSimpleInternetSearch } from "@/services/search"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
-import type { Document } from "@langchain/core/documents"
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"
 
 export const processSingleWebsite = async (url: string, query: string) => {
-    let content = await extractReadabilityContent(url)
-
-    // const isSimpleMode = await getIsSimpleInternetSearch()
-
-    // if (isSimpleMode) {
-    //     return [
-    //         {
-    //             url,
-    //             content: content.length > 5000 ? content.slice(0, 5000) : content
-    //         }
-    //     ]
-    // }
-
-    const docs: Document<Record<string, any>>[] = [
-        {
-            metadata: {
-                url
-            },
-            pageContent: content
-        }
-    ]
+    const loader = new PageAssistHtmlLoader({
+        html: "",
+        url
+    })
+    const docs = await loader.loadByURL()
 
     const ollamaUrl = await getOllamaURL()
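The hunk is cut off after `getOllamaURL()`, but the unchanged imports indicate the rest of processSingleWebsite still runs the usual split, embed, and search pipeline over the loader's documents. Purely illustrative, with the settings helpers assumed to be async:

```ts
// Illustrative continuation only: the remainder of processSingleWebsite is
// outside this diff. The names come from the file's imports; the exact wiring
// and parameters are assumptions.
const embeddings = new OllamaEmbeddings({
  model: await defaultEmbeddingModelForRag(), // assumed async settings helper
  baseUrl: cleanUrl(ollamaUrl)
})
const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: await defaultEmbeddingChunkSize(), // assumed async settings helper
  chunkOverlap: await defaultEmbeddingChunkOverlap() // assumed async settings helper
})
const chunks = await textSplitter.splitDocuments(docs)
const store = await MemoryVectorStore.fromDocuments(chunks, embeddings)
const results = await store.similaritySearch(query, 4)
```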