feat: Improve content extraction with readability library
This commit is contained in:
parent
af09c8eed6
commit
498f4a02d2
@ -1,10 +1,43 @@
|
||||
import * as cheerio from "cheerio"
|
||||
import TurndownService from "turndown"
|
||||
let turndownService = new TurndownService()
|
||||
import { Readability, isProbablyReaderable } from "@mozilla/readability"
|
||||
|
||||
/**
 * Converts an HTML document into trimmed Markdown.
 *
 * Strategy:
 * 1. If the document looks like an article (`isProbablyReaderable`), run
 *    Mozilla Readability on it and convert the extracted article HTML.
 * 2. Otherwise — or when Readability yields no content — fall back to a
 *    cheerio pass: drop non-content elements, strip every attribute except
 *    `href`/`src`, and convert the `[role="main"]`, `<main>` or `<body>`
 *    markup (first one that is non-empty).
 *
 * NOTE(review): relies on a global `DOMParser`, so this presumably runs in a
 * browser-like environment — confirm before calling it from plain Node.
 *
 * @param html - Raw HTML source of the page.
 * @returns Markdown with ATX headings and fenced code blocks, trimmed of
 *          leading/trailing whitespace; an empty string when nothing is found.
 */
export const defaultExtractContent = (html: string) => {
  // One converter serves both the Readability path and the cheerio fallback.
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
  })

  const doc = new DOMParser().parseFromString(html, "text/html")
  if (isProbablyReaderable(doc)) {
    // `parse()` returns null when no article can be extracted; in that case
    // (or when the article body is empty) fall through to the generic path
    // instead of dereferencing a null article.
    const article = new Readability(doc).parse()
    if (article?.content) {
      return turndownService.turndown(article.content).trim()
    }
  }

  // Fallback works from the original HTML string, not from `doc`, because
  // Readability mutates the document it is given.
  const $ = cheerio.load(html)

  // Remove elements that never carry readable content, plus inline data-URI images.
  $('script, style, link, svg, [src^="data:image/"]').remove()

  // Strip presentational/behavioural attributes; only link and media targets
  // are meaningful in the Markdown output.
  $('*').each((_, element) => {
    if ('attribs' in element) {
      // Snapshot the names so removal does not disturb the iteration.
      for (const attr of Object.keys(element.attribs)) {
        if (attr !== 'href' && attr !== 'src') {
          $(element).removeAttr(attr)
        }
      }
    }
  })

  // `||` is deliberate: an empty-string result from `.html()` should also
  // fall through to the next candidate container.
  const mainContent = $('[role="main"]').html() || $("main").html() || $("body").html() || ""

  return turndownService.turndown(mainContent).trim()
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { Readability } from "@mozilla/readability"
|
||||
import { Readability, } from "@mozilla/readability"
|
||||
import { defaultExtractContent } from "./default"
|
||||
export const extractReadabilityContent = async (url: string) => {
|
||||
const response = await fetch(url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user