refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content
parent 4363a4b0de
commit 1e9b66d823
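The diff below appears to span four files: the HTML loader ("@/loader/html"), the Wikipedia parser ("@/parser/wiki"), the search settings service, and the single-website processing module. The common thread is that generic pages are now reduced to readable text via extractReadabilityContent from "@/parser/reader" instead of being compiled from raw HTML with html-to-text. That reader module is not part of this diff; as an illustration only, an extractor of this shape could be built on @mozilla/readability (the function name matches the import, but the real implementation may differ):

// Illustration only: "@/parser/reader" is not shown in this diff and the real
// implementation may differ (e.g. it may return markdown). This sketch assumes
// the page is fetched here and parsed with DOMParser in an extension context.
import { Readability } from "@mozilla/readability"

export const extractReadabilityContent = async (url: string): Promise<string> => {
  const res = await fetch(url)
  const html = await res.text()
  const doc = new DOMParser().parseFromString(html, "text/html")
  const article = new Readability(doc).parse()
  // Fall back to plain body text when Readability cannot identify an article.
  return article?.textContent ?? doc.body?.textContent ?? ""
}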
@@ -1,9 +1,9 @@
 import { BaseDocumentLoader } from "langchain/document_loaders/base"
 import { Document } from "@langchain/core/documents"
-import { compile } from "html-to-text"
 import { urlRewriteRuntime } from "~/libs/runtime"
 import { YtTranscript } from "yt-transcript"
 import { isWikipedia, parseWikipedia } from "@/parser/wiki"
+import { extractReadabilityContent } from "@/parser/reader"

 const YT_REGEX =
   /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/
@@ -24,8 +24,7 @@ export interface WebLoaderParams {

 export class PageAssistHtmlLoader
   extends BaseDocumentLoader
-  implements WebLoaderParams
-{
+  implements WebLoaderParams {
   html: string
   url: string

@@ -52,30 +51,14 @@ export class PageAssistHtmlLoader
        {
          metadata: {
            source: this.url,
+           url: this.url,
            audio: { chunks: transcript }
          },
          pageContent: text
        }
      ]
    }
-   // let html = this.html
-
-   // if (isWikipedia(this.url)) {
-   //   console.log("Wikipedia URL detected")
-   //   html = parseWikipedia(html)
-   // }
-
-   // // else if (isTwitter(this.url)) {
-   // //   console.log("Twitter URL detected")
-   // //   html = parseTweet(html, this.url)
-   // // }
-
-   // const htmlCompiler = compile({
-   //   wordwrap: false
-   // })
-   // const text = htmlCompiler(html)
-   const metadata = { source: this.url }
+   const metadata = { source: this.url, url: this.url, }
    return [new Document({ pageContent: this.html, metadata })]
  }

@@ -95,6 +78,7 @@ export class PageAssistHtmlLoader
      return [
        {
          metadata: {
+           url: this.url,
            source: this.url,
            audio: { chunks: transcript }
          },
@@ -103,22 +87,15 @@ export class PageAssistHtmlLoader
      ]
    }
    await urlRewriteRuntime(this.url, "web")
-   const fetchHTML = await fetch(this.url)
-   let html = await fetchHTML.text()
+   let text = "";

    if (isWikipedia(this.url)) {
      console.log("Wikipedia URL detected")
-     html = parseWikipedia(await fetchHTML.text())
+     const fetchHTML = await fetch(this.url)
+     text = parseWikipedia(await fetchHTML.text())
+   } else {
+     text = await extractReadabilityContent(this.url)
    }

-   const htmlCompiler = compile({
-     wordwrap: false,
-     selectors: [
-       { selector: "img", format: "skip" },
-       { selector: "script", format: "skip" }
-     ]
-   })
-   const text = htmlCompiler(html)
    const metadata = { url: this.url }
    return [new Document({ pageContent: text, metadata })]
  }
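For reference, a minimal usage sketch of the updated loader, assuming it is exported from "@/loader/html" and that loadByURL() is the URL-based entry point shown above; the async wrapper is added only for illustration:

// Usage sketch; the constructor shape { html, url } and loadByURL() are taken
// from this diff, everything else here is illustrative.
import { PageAssistHtmlLoader } from "@/loader/html"

const loadPage = async (url: string) => {
  const loader = new PageAssistHtmlLoader({ html: "", url })
  // Wikipedia URLs still go through parseWikipedia; every other URL is now
  // reduced to readable text via extractReadabilityContent instead of
  // html-to-text's compile().
  const docs = await loader.loadByURL()
  return docs[0]?.pageContent ?? ""
}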
@@ -1,4 +1,5 @@
 import * as cheerio from "cheerio"
+import { defaultExtractContent } from "./default"

 export const isWikipedia = (url: string) => {
   const WIKI_REGEX = /wikipedia\.org\/wiki\//g
@@ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => {
   content?.find("div.toc")?.remove()
   const newHtml = content?.html()

-  return `<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`
+  return defaultExtractContent(`<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`)
 }
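parseWikipedia now passes its assembled HTML through defaultExtractContent from "./default", which is not included in this diff. As an illustration only, a helper of that shape could flatten the markup to text with cheerio; the real module may behave differently:

// Illustration only: "./default" is not part of this diff.
import * as cheerio from "cheerio"

export const defaultExtractContent = (html: string): string => {
  const $ = cheerio.load(html)
  // Drop elements that contribute no readable text.
  $("script, style, noscript").remove()
  return $("body").text().replace(/\s+/g, " ").trim()
}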
@@ -18,7 +18,7 @@ export const getIsSimpleInternetSearch = async () => {
 export const getIsVisitSpecificWebsite = async () => {
   const isVisitSpecificWebsite = await storage.get("isVisitSpecificWebsite")
   if (!isVisitSpecificWebsite || isVisitSpecificWebsite.length === 0) {
-    return false
+    return true
   }
   return isVisitSpecificWebsite === "true"
 }
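This flips the default of getIsVisitSpecificWebsite when the setting has never been saved: visiting a specific website is now enabled unless the user explicitly stores "false". A behavioural sketch, assuming the stored value is a string as the comparison above suggests:

// Sketch of the resulting truth table; the storage backend is unchanged.
//   key unset  -> true   (changed by this commit; previously false)
//   "true"     -> true
//   "false"    -> false
const canVisitWebsite = await getIsVisitSpecificWebsite()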
@@ -1,34 +1,16 @@
 import { cleanUrl } from "@/libs/clean-url"
-import { extractReadabilityContent } from "@/parser/reader"
+import { PageAssistHtmlLoader } from "@/loader/html"
 import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
-import { getIsSimpleInternetSearch } from "@/services/search"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
-import type { Document } from "@langchain/core/documents"
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"

 export const processSingleWebsite = async (url: string, query: string) => {
-  let content = await extractReadabilityContent(url)
-
-  // const isSimpleMode = await getIsSimpleInternetSearch()
-
-  // if (isSimpleMode) {
-  //   return [
-  //     {
-  //       url,
-  //       content: content.length > 5000 ? content.slice(0, 5000) : content
-  //     }
-  //   ]
-  // }
-
-  const docs: Document<Record<string, any>>[] = [
-    {
-      metadata: {
+  const loader = new PageAssistHtmlLoader({
+    html: "",
     url
-      },
-      pageContent: content
-    }
-  ]
+  })
+  const docs = await loader.loadByURL()

   const ollamaUrl = await getOllamaURL()

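processSingleWebsite now builds its documents through PageAssistHtmlLoader.loadByURL() instead of calling extractReadabilityContent directly, so the Readability/Wikipedia handling lives in one place. The retained imports suggest the rest of the function splits and embeds those documents; that code is outside this hunk, so the helper below is only a sketch of that flow, with the function name and option wiring assumed rather than taken from the repo:

// Hypothetical continuation of processSingleWebsite, pulled into a helper for
// illustration; rankWebsiteChunks and the exact options are assumptions.
import type { Document } from "@langchain/core/documents"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { cleanUrl } from "@/libs/clean-url"
import {
  defaultEmbeddingChunkOverlap,
  defaultEmbeddingChunkSize,
  defaultEmbeddingModelForRag,
  getOllamaURL
} from "@/services/ollama"

const rankWebsiteChunks = async (docs: Document[], query: string) => {
  const embeddings = new OllamaEmbeddings({
    model: await defaultEmbeddingModelForRag(), // assumed async settings helpers
    baseUrl: cleanUrl(await getOllamaURL())
  })
  // Split the loaded page into chunks sized by the user's embedding settings.
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: await defaultEmbeddingChunkSize(),
    chunkOverlap: await defaultEmbeddingChunkOverlap()
  })
  const chunks = await splitter.splitDocuments(docs)
  // Embed into an in-memory store and return the chunks closest to the query.
  const store = await MemoryVectorStore.fromDocuments(chunks, embeddings)
  return store.similaritySearch(query, 4)
}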