feat: Add @mozilla/readability dependency for extracting content from web pages
This commit is contained in:
19
src/parser/reader.ts
Normal file
19
src/parser/reader.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
import { Readability } from "@mozilla/readability"
|
||||
import { defaultExtractContent } from "./default"
|
||||
/**
 * Fetches a web page and extracts its main article content with Readability.
 *
 * The page HTML is parsed into a detached document via `DOMParser`, passed
 * through `Readability`, and the resulting article HTML is handed to
 * `defaultExtractContent` (which, per the original comment, converts it to
 * markdown).
 *
 * @param url - Address of the page to fetch.
 * @returns Whatever `defaultExtractContent` produces for the article HTML.
 * @throws Error if the HTTP response is not OK, or if Readability cannot
 *   extract an article from the page.
 */
export const extractReadabilityContent = async (url: string) => {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}`)
  }

  const html = await response.text()

  // Readability needs a DOM document to work on, so build a detached one.
  const doc = new DOMParser().parseFromString(html, "text/html")
  const article = new Readability(doc).parse()

  // parse() returns null when no article could be extracted; fail with a
  // clear message instead of a TypeError on `article.content`.
  if (article == null || article.content == null) {
    throw new Error(`Failed to extract readable content from ${url}`)
  }

  // convert the article to markdown
  return defaultExtractContent(article.content)
}
|
||||
Reference in New Issue
Block a user