refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content

This commit is contained in:
n4ze3m
2024-06-23 20:34:43 +05:30
parent 4363a4b0de
commit 1e9b66d823
4 changed files with 19 additions and 59 deletions

View File

@@ -1,4 +1,5 @@
import * as cheerio from "cheerio"
import { defaultExtractContent } from "./default"
export const isWikipedia = (url: string) => {
const WIKI_REGEX = /wikipedia\.org\/wiki\//g
@@ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => {
content?.find("div.toc")?.remove()
const newHtml = content?.html()
return `<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`
return defaultExtractContent(`<div>TITLE: ${title?.text()}</div><div>${newHtml}</div>`)
}