Update dependencies and fix import paths

This commit is contained in:
n4ze3m
2024-04-05 20:28:29 +05:30
parent d91d4c4761
commit ac347a3970
43 changed files with 1142 additions and 99 deletions

49
src/loader/pdf-url.ts Normal file
View File

@@ -0,0 +1,49 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import { processPdf } from "@/libs/pdf"
/**
 * Constructor options for the URL-based PDF loader in this file.
 */
export interface WebLoaderParams {
  /** Label written into each produced document's `metadata.source`. */
  name: string
  /** Location of the PDF; handed unchanged to `processPdf`. */
  url: string
}
/**
 * Document loader that opens a PDF via `processPdf(url)` and emits one
 * `Document` per page that contains text.
 *
 * Each document carries `metadata.source` (the `name` given to the
 * constructor) and `metadata.page` (the 1-based page number).
 */
export class PageAssistPDFUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /**
   * Never assigned by this class; it stays `undefined` unless a caller sets
   * it. Kept so the class shape remains unchanged for existing users.
   */
  pdf: { content: string; page: number }[]
  /** Location of the PDF, passed unchanged to `processPdf`. */
  url: string
  /** Label stored as `metadata.source` on every produced document. */
  name: string

  /**
   * @param params - `url` of the PDF to read and the `name` to report as the
   *   source of the extracted documents.
   */
  constructor({ url, name }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
  }

  /**
   * Reads the PDF page by page and collects the text of each page.
   *
   * Pages that yield no text content, no text items, or only whitespace are
   * skipped, so no document with empty or undefined `pageContent` is returned.
   *
   * @returns One `Document` per non-empty page, in page order.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const documents: Document[] = []
    const data = await processPdf(this.url)

    // Pages are requested by 1-based index, from 1 through `numPages`.
    for (let i = 1; i <= data.numPages; i += 1) {
      const page = await data.getPage(i)
      const content = await page.getTextContent()

      // Treat missing content the same as a page without text items instead
      // of letting an `undefined` text value reach the output.
      const items: any[] = content?.items ?? []

      const text = items
        // Items without a `str` value contribute an empty string when joined.
        .map((item: any) => item.str)
        .join("\n")
        // Strip NUL characters from the extracted text.
        .replace(/\x00/g, "")
        .trim()

      if (text.length === 0) {
        continue
      }

      // Build real Document instances, matching the sibling PDF loader.
      documents.push(
        new Document({
          pageContent: text,
          metadata: { source: this.name, page: i }
        })
      )
    }

    return documents
  }
}

View File

@@ -1,37 +1,36 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
export interface WebLoaderParams {
pdf: { content: string, page: number }[]
url: string
pdf: { content: string; page: number }[]
url: string
}
export class PageAssistPDFLoader
extends BaseDocumentLoader
implements WebLoaderParams {
pdf: { content: string, page: number }[]
url: string
extends BaseDocumentLoader
implements WebLoaderParams
{
pdf: { content: string; page: number }[]
url: string
constructor({ pdf, url }: WebLoaderParams) {
super()
this.pdf = pdf
this.url = url
constructor({ pdf, url }: WebLoaderParams) {
super()
this.pdf = pdf
this.url = url
}
async load(): Promise<Document<Record<string, any>>[]> {
const documents: Document[] = []
for (const page of this.pdf) {
const metadata = { source: this.url, page: page.page }
documents.push(new Document({ pageContent: page.content, metadata }))
}
async load(): Promise<Document<Record<string, any>>[]> {
const documents: Document[] = [];
for (const page of this.pdf) {
const metadata = { source: this.url, page: page.page }
documents.push(new Document({ pageContent: page.content, metadata }))
}
return [
new Document({
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
metadata: documents.map((doc) => doc.metadata),
}),
];
}
return [
new Document({
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
metadata: documents.map((doc) => doc.metadata)
})
]
}
}