refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content

n4ze3m 2024-06-23 20:34:43 +05:30
parent 4363a4b0de
commit 1e9b66d823
4 changed files with 19 additions and 59 deletions

File 1 of 4 — the PageAssistHtmlLoader module (imported elsewhere in this commit as "@/loader/html"):

@@ -1,9 +1,9 @@
 import { BaseDocumentLoader } from "langchain/document_loaders/base"
 import { Document } from "@langchain/core/documents"
-import { compile } from "html-to-text"
 import { urlRewriteRuntime } from "~/libs/runtime"
 import { YtTranscript } from "yt-transcript"
 import { isWikipedia, parseWikipedia } from "@/parser/wiki"
+import { extractReadabilityContent } from "@/parser/reader"
 
 const YT_REGEX =
   /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/
@@ -24,8 +24,7 @@ export interface WebLoaderParams {
 export class PageAssistHtmlLoader
   extends BaseDocumentLoader
-  implements WebLoaderParams
-{
+  implements WebLoaderParams {
   html: string
   url: string
@@ -52,30 +51,14 @@ export class PageAssistHtmlLoader
       {
         metadata: {
           source: this.url,
+          url: this.url,
           audio: { chunks: transcript }
         },
         pageContent: text
       }
     ]
   }
-    // let html = this.html
-    // if (isWikipedia(this.url)) {
-    //   console.log("Wikipedia URL detected")
-    //   html = parseWikipedia(html)
-    // }
-    // // else if (isTwitter(this.url)) {
-    // //   console.log("Twitter URL detected")
-    // //   html = parseTweet(html, this.url)
-    // // }
-    // const htmlCompiler = compile({
-    //   wordwrap: false
-    // })
-    // const text = htmlCompiler(html)
-    const metadata = { source: this.url }
+    const metadata = { source: this.url, url: this.url, }
     return [new Document({ pageContent: this.html, metadata })]
   }
@@ -95,6 +78,7 @@ export class PageAssistHtmlLoader
     return [
       {
         metadata: {
+          url: this.url,
           source: this.url,
           audio: { chunks: transcript }
         },
@@ -103,22 +87,15 @@ export class PageAssistHtmlLoader
       ]
     }
     await urlRewriteRuntime(this.url, "web")
-    const fetchHTML = await fetch(this.url)
-    let html = await fetchHTML.text()
+    let text = "";
     if (isWikipedia(this.url)) {
       console.log("Wikipedia URL detected")
-      html = parseWikipedia(await fetchHTML.text())
+      const fetchHTML = await fetch(this.url)
+      text = parseWikipedia(await fetchHTML.text())
+    } else {
+      text = await extractReadabilityContent(this.url)
     }
-    const htmlCompiler = compile({
-      wordwrap: false,
-      selectors: [
-        { selector: "img", format: "skip" },
-        { selector: "script", format: "skip" }
-      ]
-    })
-    const text = htmlCompiler(html)
     const metadata = { url: this.url }
     return [new Document({ pageContent: text, metadata })]
   }
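The reader module referenced above ("@/parser/reader") is not part of this diff, so its implementation is not visible here. A minimal sketch of what extractReadabilityContent plausibly looks like, assuming it fetches the page and runs Mozilla's Readability over the parsed DOM — the body below is a guess under those assumptions, not the project's actual code:

import { Readability } from "@mozilla/readability"

// Hypothetical sketch: fetch a URL and distill it to readable article text.
// Assumes a DOM environment (DOMParser is available in extension pages).
export const extractReadabilityContent = async (url: string): Promise<string> => {
  const res = await fetch(url)
  const html = await res.text()
  const doc = new DOMParser().parseFromString(html, "text/html")
  // Readability.parse() returns null when no article body can be found;
  // fall back to the raw HTML in that case.
  const article = new Readability(doc).parse()
  return article?.textContent ?? html
}

Whatever its exact shape, the effect visible in the hunk above is that loadByURL() no longer compiles raw HTML with html-to-text; non-Wikipedia pages now go through the reader instead, and only Wikipedia pages are fetched and parsed in the loader itself.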

File 2 of 4 — the Wikipedia parser module ("@/parser/wiki"):

@@ -1,4 +1,5 @@
 import * as cheerio from "cheerio"
+import { defaultExtractContent } from "./default"
 
 export const isWikipedia = (url: string) => {
   const WIKI_REGEX = /wikipedia\.org\/wiki\//g
@@ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => {
   content?.find("div.toc")?.remove()
   const newHtml = content?.html()
-  return `<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`
+  return defaultExtractContent(`<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`)
 }
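defaultExtractContent from "./default" is likewise not shown in this diff. Given that the commit deletes an html-to-text compile(...) call (with img and script elements skipped) from the loader, a reasonable guess — and only a guess — is that the helper hosts that same conversion:

import { compile } from "html-to-text"

// Hypothetical sketch, reusing the options removed from PageAssistHtmlLoader:
// convert HTML to plain text, skipping images and scripts.
const htmlCompiler = compile({
  wordwrap: false,
  selectors: [
    { selector: "img", format: "skip" },
    { selector: "script", format: "skip" }
  ]
})

export const defaultExtractContent = (html: string): string => htmlCompiler(html)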

File 3 of 4 — the search settings service ("@/services/search"):

@@ -18,7 +18,7 @@ export const getIsSimpleInternetSearch = async () => {
 export const getIsVisitSpecificWebsite = async () => {
   const isVisitSpecificWebsite = await storage.get("isVisitSpecificWebsite")
   if (!isVisitSpecificWebsite || isVisitSpecificWebsite.length === 0) {
-    return false
+    return true
   }
   return isVisitSpecificWebsite === "true"
 }
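This one-line change flips the default: when the "isVisitSpecificWebsite" key has never been written to storage, the feature now reads as enabled rather than disabled. Explicitly stored values behave as before:

stored value   before   after
(unset/empty)  false    true
"true"         true     true
"false"        false    false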

File 4 of 4 — the website-processing module that defines processSingleWebsite (its path is not shown in this extract):

@@ -1,34 +1,16 @@
 import { cleanUrl } from "@/libs/clean-url"
-import { extractReadabilityContent } from "@/parser/reader"
+import { PageAssistHtmlLoader } from "@/loader/html"
 import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
-import { getIsSimpleInternetSearch } from "@/services/search"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
-import type { Document } from "@langchain/core/documents"
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"
 
 export const processSingleWebsite = async (url: string, query: string) => {
-  let content = await extractReadabilityContent(url)
-
-  // const isSimpleMode = await getIsSimpleInternetSearch()
-
-  // if (isSimpleMode) {
-  //   return [
-  //     {
-  //       url,
-  //       content: content.length > 5000 ? content.slice(0, 5000) : content
-  //     }
-  //   ]
-  // }
-
-  const docs: Document<Record<string, any>>[] = [
-    {
-      metadata: {
-        url
-      },
-      pageContent: content
-    }
-  ]
+  const loader = new PageAssistHtmlLoader({
+    html: "",
+    url
+  })
+  const docs = await loader.loadByURL()
   const ollamaUrl = await getOllamaURL()
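With this last change, processSingleWebsite delegates extraction to the loader instead of calling the reader directly, so Wikipedia handling and readability extraction live in one place. A sketch of the new path as a caller sees it — the URL is illustrative only, and the rest of the function (splitting and embedding, truncated above) is unchanged by this commit:

import { PageAssistHtmlLoader } from "@/loader/html"

// Illustrative only: load one page through the readability-backed loader.
const docs = await new PageAssistHtmlLoader({
  html: "", // ignored by loadByURL(); content is fetched from the URL
  url: "https://en.wikipedia.org/wiki/Readability"
}).loadByURL()
// docs[0].pageContent: the extracted article text
// docs[0].metadata.url: the source URL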