feat: Improve content extraction with readability library

This commit is contained in:
n4ze3m 2024-07-22 00:36:20 +05:30
parent af09c8eed6
commit 498f4a02d2
2 changed files with 37 additions and 4 deletions

View File

@ -1,10 +1,43 @@
import * as cheerio from "cheerio" import * as cheerio from "cheerio"
import TurndownService from "turndown" import TurndownService from "turndown"
let turndownService = new TurndownService() import { Readability, isProbablyReaderable } from "@mozilla/readability"
/**
 * Builds a Turndown converter with the Markdown flavour used by this module:
 * ATX (`#`) headings and fenced code blocks. Shared by both extraction paths
 * so their output stays consistent.
 */
const createTurndownService = () =>
  new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
  })

/**
 * Converts an HTML document into trimmed Markdown.
 *
 * Strategy:
 *  1. If Readability considers the page reader-friendly, convert the article
 *     body it extracts.
 *  2. Otherwise (or if Readability yields no article content), strip
 *     non-content nodes and attributes with cheerio and convert the main
 *     region of the page.
 *
 * @param html Raw HTML source of the page.
 * @returns Markdown for the extracted content, with surrounding whitespace
 *          removed; an empty string when no content region is found.
 */
export const defaultExtractContent = (html: string) => {
  const doc = new DOMParser().parseFromString(html, "text/html")
  if (isProbablyReaderable(doc)) {
    const article = new Readability(doc).parse()
    // parse() may return null even after isProbablyReaderable() passed;
    // in that case fall through to the cheerio path instead of throwing.
    if (article?.content) {
      return createTurndownService().turndown(article.content).trim()
    }
  }

  // Fallback path: work from the original HTML string, not `doc`, which
  // Readability may already have modified.
  const $ = cheerio.load(html)
  $('script, style, link, svg, [src^="data:image/"]').remove()
  $('*').each((_, element) => {
    if ('attribs' in element) {
      // Iterate over a snapshot of the names: removeAttr mutates `attribs`.
      for (const attr of Object.keys(element.attribs)) {
        // Keep only link and media targets; everything else is markup noise.
        if (attr !== 'href' && attr !== 'src') {
          $(element).removeAttr(attr)
        }
      }
    }
  })
  const mainContent = $('[role="main"]').html() || $("main").html() || $("body").html() || ""
  return createTurndownService().turndown(mainContent).trim()
}

View File

@ -1,4 +1,4 @@
import { Readability } from "@mozilla/readability" import { Readability, } from "@mozilla/readability"
import { defaultExtractContent } from "./default" import { defaultExtractContent } from "./default"
export const extractReadabilityContent = async (url: string) => { export const extractReadabilityContent = async (url: string) => {
const response = await fetch(url) const response = await fetch(url)