Update dependencies and add YouTube transcript support

This commit is contained in:
n4ze3m
2024-03-08 00:45:28 +05:30
parent 6d559eda2f
commit 7a72961562
6 changed files with 151 additions and 68 deletions

View File

@@ -2,6 +2,20 @@ import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import { compile } from "html-to-text"
import { chromeRunTime } from "~libs/runtime"
import { YtTranscript } from "yt-transcript"
/**
 * Matches URLs whose host is `youtube.com` or `youtu.be`, optionally preceded
 * by an `http(s)://` scheme and any number of dot-separated subdomains
 * (`www.`, `m.`, `music.`, ...).
 *
 * The pattern is anchored at the start of the string so that look-alike hosts
 * (e.g. `notyoutube.com`) and URLs that merely contain a YouTube address in
 * their path or query string are rejected. Capture group 1 holds the value
 * following `watch?v=` when present, otherwise the first path segment.
 */
const YT_REGEX =
  /^(?:https?:\/\/)?(?:[\w-]+\.)*(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/i

/**
 * Reports whether `url` points at a YouTube host.
 *
 * @param url - Candidate URL; the scheme is optional and surrounding
 *   whitespace is ignored.
 * @returns `true` when the URL's host is `youtube.com`/`youtu.be` (or a
 *   subdomain of either) and at least one path character follows the host.
 */
const isYoutubeLink = (url: string): boolean => {
  // Trim first: the pattern is start-anchored, so stray leading whitespace
  // would otherwise turn a valid link into a non-match.
  return YT_REGEX.test(url.trim())
}
/**
 * Retrieves the transcript for the video at `url` through `YtTranscript`.
 *
 * @param url - Video URL handed to the `YtTranscript` constructor.
 * @returns Whatever `YtTranscript#getTranscript()` resolves with; callers in
 *   this file treat a falsy result as "no transcript available".
 */
const getTranscript = async (url: string) => {
  const client = new YtTranscript({ url })
  const transcript = await client.getTranscript()
  return transcript
}
export interface WebLoaderParams {
html: string
@@ -21,6 +35,29 @@ export class PageAssistHtmlLoader
}
async load(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
const htmlCompiler = compile({
wordwrap: false
})
@@ -30,6 +67,29 @@ export class PageAssistHtmlLoader
}
async loadByURL(): Promise<Document<Record<string, any>>[]> {
if (isYoutubeLink(this.url)) {
const transcript = await getTranscript(this.url)
if (!transcript) {
throw new Error("Transcript not found for this video.")
}
let text = ""
transcript.forEach((item) => {
text += item.text + " "
})
return [
{
metadata: {
source: this.url,
audio: { chunks: transcript }
},
pageContent: text
}
]
}
await chromeRunTime(this.url)
const fetchHTML = await fetch(this.url)
const html = await fetchHTML.text()