feat: Improve content extraction with readability library
This commit is contained in:
parent
af09c8eed6
commit
498f4a02d2
@ -1,10 +1,43 @@
|
|||||||
import * as cheerio from "cheerio"
|
import * as cheerio from "cheerio"
|
||||||
import TurndownService from "turndown"
|
import TurndownService from "turndown"
|
||||||
let turndownService = new TurndownService()
|
import { Readability, isProbablyReaderable } from "@mozilla/readability"
|
||||||
|
|
||||||
/**
 * Converts an HTML document into trimmed Markdown.
 *
 * Strategy:
 *  1. If the document looks like an article (`isProbablyReaderable`), run
 *     Readability over it and convert the extracted article body.
 *  2. Otherwise — or when Readability yields no article content — strip
 *     non-content nodes and all attributes except `href`/`src` with cheerio,
 *     then convert the `[role="main"]`, `<main>` or `<body>` markup
 *     (first non-empty one wins).
 *
 * @param html - Raw HTML source of the page.
 * @returns The Markdown rendering of the extracted content, trimmed;
 *          an empty string when nothing could be extracted.
 */
export const defaultExtractContent = (html: string) => {
  // One converter shared by both extraction paths so their output style matches.
  const turndownService = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
  })

  const doc = new DOMParser().parseFromString(html, "text/html")
  if (isProbablyReaderable(doc)) {
    // parse() may return null (or an article without content) when
    // extraction fails; fall through to the cheerio path instead of throwing.
    const article = new Readability(doc).parse()
    if (article?.content) {
      return turndownService.turndown(article.content).trim()
    }
  }

  // Fallback: work from the original HTML string, since Readability may have
  // modified `doc` while parsing.
  const $ = cheerio.load(html)

  // Drop nodes that never carry readable content, plus inline data-URI images.
  $('script, style, link, svg, [src^="data:image/"]').remove()

  // Keep only link/media targets; every other attribute is noise for Markdown.
  $("*").each((_, element) => {
    if ("attribs" in element) {
      // Iterate over a snapshot of the names: removeAttr mutates `attribs`.
      for (const attr of Object.keys(element.attribs)) {
        if (attr !== "href" && attr !== "src") {
          $(element).removeAttr(attr)
        }
      }
    }
  })

  const mainContent =
    $('[role="main"]').html() || $("main").html() || $("body").html() || ""

  return turndownService.turndown(mainContent).trim()
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { Readability } from "@mozilla/readability"
|
import { Readability, } from "@mozilla/readability"
|
||||||
import { defaultExtractContent } from "./default"
|
import { defaultExtractContent } from "./default"
|
||||||
export const extractReadabilityContent = async (url: string) => {
|
export const extractReadabilityContent = async (url: string) => {
|
||||||
const response = await fetch(url)
|
const response = await fetch(url)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user