diff --git a/src/parser/default.ts b/src/parser/default.ts
index e15c9ad..4675649 100644
--- a/src/parser/default.ts
+++ b/src/parser/default.ts
@@ -1,10 +1,57 @@
 import * as cheerio from "cheerio"
 import TurndownService from "turndown"
-let turndownService = new TurndownService()
+import { Readability, isProbablyReaderable } from "@mozilla/readability"
 
+/**
+ * Converts an HTML string to trimmed Markdown.
+ *
+ * When the document looks like an article, Mozilla Readability extracts it;
+ * otherwise, or when Readability yields no content, the page is cleaned with
+ * cheerio and its main region is converted instead.
+ *
+ * @param html - Raw HTML source of the page.
+ * @returns Markdown with leading and trailing whitespace removed.
+ */
 export const defaultExtractContent = (html: string) => {
+  const doc = new DOMParser().parseFromString(html, "text/html")
+  if (isProbablyReaderable(doc)) {
+    const reader = new Readability(doc)
+    const article = reader.parse()
+    // parse() returns null when no article can be extracted; guard so we
+    // fall through to the cheerio path instead of dereferencing null.
+    if (article && article.content) {
+      const turndownService = new TurndownService({
+        headingStyle: 'atx',
+        codeBlockStyle: 'fenced'
+      })
+      return turndownService.turndown(article.content).trim()
+    }
+  }
+
   const $ = cheerio.load(html)
-  const mainContent = $('[role="main"]').html() || $("main").html() || $.html()
+
+  // Drop non-content elements and inline data-URI images before conversion.
+  $('script, style, link, svg, [src^="data:image/"]').remove()
+
+  // Strip every attribute except href/src so only link and media targets remain.
+  $('*').each((_, element) => {
+    if ('attribs' in element) {
+      const attributes = element.attribs
+      for (const attr in attributes) {
+        if (attr !== 'href' && attr !== 'src') {
+          $(element).removeAttr(attr)
+        }
+      }
+    }
+  })
+
+  const mainContent = $('[role="main"]').html() || $("main").html() || $("body").html() || ""
+
+  const turndownService = new TurndownService({
+    headingStyle: 'atx',
+    codeBlockStyle: 'fenced'
+  })
   const markdown = turndownService.turndown(mainContent)
-  return markdown
+
+  return markdown.trim()
 }
diff --git a/src/parser/reader.ts b/src/parser/reader.ts
index 23b22bf..c0bbbf3 100644
--- a/src/parser/reader.ts
+++ b/src/parser/reader.ts
@@ -1,4 +1,4 @@
-import { Readability } from "@mozilla/readability"
+import { Readability, } from "@mozilla/readability"
 import { defaultExtractContent } from "./default"
 export const extractReadabilityContent = async (url: string) => {
   const response = await fetch(url)