This commit is contained in:
n4ze3m
2024-02-02 22:01:16 +05:30
parent 23e488770d
commit 28361c47e6
9 changed files with 487 additions and 17 deletions

31
src/loader/html.ts Normal file
View File

@@ -0,0 +1,31 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "langchain/document"
import { compile } from "html-to-text"
/**
 * Constructor input for an HTML-based document loader: the markup to
 * convert and the address it is attributed to.
 */
export interface WebLoaderParams {
  /** HTML markup that will be converted to plain text. */
  html: string
  /** URL recorded as the `source` of the resulting document. */
  url: string
}
/**
 * Document loader that turns an in-memory HTML string into a single
 * LangChain `Document` whose content is the plain-text rendering of the
 * markup and whose metadata records the originating URL.
 */
export class PageAssistHtmlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /** HTML markup handed to the loader at construction time. */
  html: string
  /** URL stored in the produced document's `source` metadata field. */
  url: string

  /**
   * @param params - The HTML markup and the URL to attribute it to.
   */
  constructor(params: WebLoaderParams) {
    super()
    this.html = params.html
    this.url = params.url
  }

  /**
   * Converts the stored HTML to text and wraps it in a document.
   *
   * @returns A one-element array containing the converted document, with
   *   `metadata.source` set to this loader's `url`.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    // Build the html-to-text converter with `wordwrap: false`.
    const toPlainText = compile({ wordwrap: false })

    const pageDocument = new Document({
      pageContent: toPlainText(this.html),
      metadata: { source: this.url }
    })

    return [pageDocument]
  }
}