import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import { compile } from "html-to-text"
import { urlRewriteRuntime } from "~/libs/runtime"
import { YtTranscript } from "yt-transcript"
import { isWikipedia, parseWikipedia } from "@/parser/wiki"

/**
 * Matches youtube.com / youtu.be URLs (scheme and "www." optional) and
 * captures the path segment or `v` query value that follows the host.
 */
const YT_REGEX =
  /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/

/** Returns true when `url` matches {@link YT_REGEX}. */
const isYoutubeLink = (url: string) => {
  return YT_REGEX.test(url)
}

/** Fetches the transcript for the video at `url` via `YtTranscript`. */
const getTranscript = async (url: string) => {
  const ytTranscript = new YtTranscript({ url })
  return await ytTranscript.getTranscript()
}

/**
 * Builds the single-document result used for YouTube URLs.
 *
 * The page content is every transcript item's `text` joined with a trailing
 * space after each item; the raw transcript items are preserved under
 * `metadata.audio.chunks`.
 *
 * @param url - The YouTube URL to fetch the transcript for.
 * @returns A one-element array holding the transcript document.
 * @throws Error when no transcript is returned for the video.
 */
const loadYoutubeTranscript = async (
  url: string
): Promise<Document<Record<string, any>>[]> => {
  const transcript = await getTranscript(url)
  if (!transcript) {
    throw new Error("Transcript not found for this video.")
  }

  // Each item contributes its text followed by one space (including the last).
  const text = transcript.map((item) => item.text + " ").join("")

  return [
    {
      metadata: {
        source: url,
        audio: {
          chunks: transcript
        }
      },
      pageContent: text
    }
  ]
}

/** Constructor arguments for {@link PageAssistHtmlLoader}. */
export interface WebLoaderParams {
  html: string
  url: string
}

/**
 * Document loader that turns a web page into LangChain documents.
 *
 * YouTube URLs are loaded from the video transcript; every other URL is
 * loaded either from the HTML supplied at construction time ({@link load})
 * or from HTML fetched over the network ({@link loadByURL}).
 */
export class PageAssistHtmlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  html: string
  url: string

  /**
   * @param params.html - HTML of the page, used verbatim by {@link load}.
   * @param params.url - URL of the page.
   */
  constructor({ html, url }: WebLoaderParams) {
    super()
    this.html = html
    this.url = url
  }

  /**
   * Loads the page from the HTML supplied to the constructor.
   *
   * @returns For YouTube URLs, the transcript document; otherwise a single
   *   document whose content is the unmodified `html` string and whose
   *   metadata is `{ source: url }`.
   * @throws Error when the URL is a YouTube link and no transcript is found.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    if (isYoutubeLink(this.url)) {
      return loadYoutubeTranscript(this.url)
    }

    // let html = this.html
    // if (isWikipedia(this.url)) {
    //   console.log("Wikipedia URL detected")
    //   html = parseWikipedia(html)
    // }
    // // else if (isTwitter(this.url)) {
    // //   console.log("Twitter URL detected")
    // //   html = parseTweet(html, this.url)
    // // }
    // const htmlCompiler = compile({
    //   wordwrap: false
    // })
    // const text = htmlCompiler(html)

    const metadata = { source: this.url }
    return [new Document({ pageContent: this.html, metadata })]
  }

  /**
   * Loads the page by fetching `url` and converting the HTML to plain text.
   *
   * @returns For YouTube URLs, the transcript document; otherwise a single
   *   document containing the html-to-text output (images and scripts
   *   skipped, no word wrapping) with metadata `{ url }`.
   * @throws Error when the URL is a YouTube link and no transcript is found.
   */
  async loadByURL(): Promise<Document<Record<string, any>>[]> {
    if (isYoutubeLink(this.url)) {
      return loadYoutubeTranscript(this.url)
    }

    // NOTE(review): presumably prepares request rewriting for this URL before
    // the fetch below — confirm against ~/libs/runtime.
    await urlRewriteRuntime(this.url, "web")

    const fetchHTML = await fetch(this.url)
    let html = await fetchHTML.text()

    if (isWikipedia(this.url)) {
      console.log("Wikipedia URL detected")
      // A Response body can only be consumed once, so reuse the text already
      // read above instead of calling fetchHTML.text() a second time (which
      // would reject with a "body already read" TypeError).
      html = parseWikipedia(html)
    }

    const htmlCompiler = compile({
      wordwrap: false,
      selectors: [
        { selector: "img", format: "skip" },
        { selector: "script", format: "skip" }
      ]
    })
    const text = htmlCompiler(html)

    // NOTE(review): load() keys the page address as `source` while this method
    // uses `url`; left unchanged because callers may depend on either key.
    const metadata = { url: this.url }
    return [new Document({ pageContent: text, metadata })]
  }
}