Update dependencies and add YouTube transcript support
This commit is contained in:
@@ -2,6 +2,20 @@ import { BaseDocumentLoader } from "langchain/document_loaders/base"
|
||||
import { Document } from "@langchain/core/documents"
|
||||
import { compile } from "html-to-text"
|
||||
import { chromeRunTime } from "~libs/runtime"
|
||||
import { YtTranscript } from "yt-transcript"
|
||||
|
||||
/**
 * Matches a YouTube video URL and captures the first path token after the
 * host (the video id for `watch?v=<id>` and `youtu.be/<id>` forms).
 *
 * The pattern is anchored at the start of the string, and the host may only
 * be preceded by an optional `http(s)://` scheme and dot-separated subdomain
 * labels (e.g. `www.`, `m.`). Without the anchor, any URL that merely
 * *contains* `youtube.com/...` — a look-alike domain such as
 * `notyoutube.com`, or a query string on an unrelated site — would be
 * classified as a YouTube link.
 */
const YT_REGEX =
  /^(?:https?:\/\/)?(?:[a-zA-Z0-9-]+\.)*(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/

/**
 * Reports whether `url` points at YouTube (`youtube.com`, one of its
 * subdomains, or `youtu.be`) and has a non-empty first path token.
 *
 * @param url - The URL to classify; the scheme is optional.
 * @returns `true` when the URL's host is a YouTube host and a path token
 *   follows it, `false` otherwise.
 */
const isYoutubeLink = (url: string): boolean => {
  return YT_REGEX.test(url)
}
|
||||
|
||||
/**
 * Fetches the transcript for a video URL through the `yt-transcript` client.
 *
 * @param url - The video URL handed to the `YtTranscript` constructor.
 * @returns A promise for whatever `YtTranscript#getTranscript` resolves to.
 *   The loader methods in this file treat a falsy result as "no transcript"
 *   and otherwise iterate it as a list of items with a `text` field.
 */
const getTranscript = async (url: string) => {
  const client = new YtTranscript({ url })
  const transcript = await client.getTranscript()
  return transcript
}
|
||||
|
||||
|
||||
export interface WebLoaderParams {
|
||||
html: string
|
||||
@@ -21,6 +35,29 @@ export class PageAssistHtmlLoader
|
||||
}
|
||||
|
||||
async load(): Promise<Document<Record<string, any>>[]> {
|
||||
if (isYoutubeLink(this.url)) {
|
||||
const transcript = await getTranscript(this.url)
|
||||
if (!transcript) {
|
||||
throw new Error("Transcript not found for this video.")
|
||||
}
|
||||
|
||||
let text = ""
|
||||
|
||||
transcript.forEach((item) => {
|
||||
text += item.text + " "
|
||||
})
|
||||
|
||||
|
||||
return [
|
||||
{
|
||||
metadata: {
|
||||
source: this.url,
|
||||
audio: { chunks: transcript }
|
||||
},
|
||||
pageContent: text
|
||||
}
|
||||
]
|
||||
}
|
||||
const htmlCompiler = compile({
|
||||
wordwrap: false
|
||||
})
|
||||
@@ -30,6 +67,29 @@ export class PageAssistHtmlLoader
|
||||
}
|
||||
|
||||
async loadByURL(): Promise<Document<Record<string, any>>[]> {
|
||||
if (isYoutubeLink(this.url)) {
|
||||
const transcript = await getTranscript(this.url)
|
||||
if (!transcript) {
|
||||
throw new Error("Transcript not found for this video.")
|
||||
}
|
||||
|
||||
let text = ""
|
||||
|
||||
transcript.forEach((item) => {
|
||||
text += item.text + " "
|
||||
})
|
||||
|
||||
|
||||
return [
|
||||
{
|
||||
metadata: {
|
||||
source: this.url,
|
||||
audio: { chunks: transcript }
|
||||
},
|
||||
pageContent: text
|
||||
}
|
||||
]
|
||||
}
|
||||
await chromeRunTime(this.url)
|
||||
const fetchHTML = await fetch(this.url)
|
||||
const html = await fetchHTML.text()
|
||||
|
||||
Reference in New Issue
Block a user