Update dependencies and fix import paths
This commit is contained in:
49
src/loader/pdf-url.ts
Normal file
49
src/loader/pdf-url.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
import { BaseDocumentLoader } from "langchain/document_loaders/base"
|
||||
import { Document } from "@langchain/core/documents"
|
||||
import { processPdf } from "@/libs/pdf"
|
||||
/**
 * Constructor arguments for the URL-based PDF loader defined in this file.
 */
export interface WebLoaderParams {
  /** Location of the PDF; handed unchanged to `processPdf`. */
  url: string
  /** Label recorded as `metadata.source` on every produced document. */
  name: string
}
|
||||
|
||||
/**
 * Document loader that reads a PDF through `processPdf` and emits one
 * `Document` per page that contains text.
 *
 * Each document carries `metadata.source` (the loader's `name`) and
 * `metadata.page` (the 1-based page number).
 */
export class PageAssistPDFUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /**
   * Not assigned or read anywhere in this class; retained only so the
   * public shape of the loader stays unchanged.
   */
  pdf: { content: string; page: number }[]
  /** Location of the PDF, passed as-is to `processPdf`. */
  url: string
  /** Label stored as `metadata.source` on every produced document. */
  name: string

  /**
   * @param params.url  Location of the PDF to load.
   * @param params.name Label used as the `source` of each document.
   */
  constructor({ url, name }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
  }

  /**
   * Extracts the text of every page and wraps it in a `Document`.
   *
   * Pages that yield no text (no text content, no items, or only
   * whitespace / NUL characters) are skipped instead of producing empty
   * documents.
   *
   * @returns One `Document` per non-empty page, in page order.
   * @throws Whatever `processPdf`, `getPage` or `getTextContent` reject with;
   *   errors are not caught here.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const documents: Document[] = []

    // NOTE(review): `data` looks like a pdf.js document proxy
    // (`numPages` / `getPage`) — confirm against `@/libs/pdf`.
    const data = await processPdf(this.url)

    // Pages are addressed from 1 through `numPages` inclusive.
    for (let i = 1; i <= data.numPages; i += 1) {
      const page = await data.getPage(i)
      const content = await page.getTextContent()

      // Tolerate a missing text-content object as well as an empty item list.
      const items: any[] = content?.items ?? []
      if (items.length === 0) {
        continue
      }

      const text = items
        .map((item: any) => item.str)
        .join("\n")
        .replace(/\x00/g, "") // strip NUL characters from the extracted text
        .trim()

      // A page made only of whitespace/NULs carries nothing worth indexing.
      if (text.length === 0) {
        continue
      }

      // Build real Document instances, matching the sibling PDF loader.
      documents.push(
        new Document({
          pageContent: text,
          metadata: { source: this.name, page: i }
        })
      )
    }

    return documents
  }
}
|
||||
@@ -1,37 +1,36 @@
|
||||
import { BaseDocumentLoader } from "langchain/document_loaders/base"
|
||||
import { Document } from "@langchain/core/documents"
|
||||
export interface WebLoaderParams {
|
||||
pdf: { content: string, page: number }[]
|
||||
url: string
|
||||
pdf: { content: string; page: number }[]
|
||||
url: string
|
||||
}
|
||||
|
||||
export class PageAssistPDFLoader
|
||||
extends BaseDocumentLoader
|
||||
implements WebLoaderParams {
|
||||
pdf: { content: string, page: number }[]
|
||||
url: string
|
||||
extends BaseDocumentLoader
|
||||
implements WebLoaderParams
|
||||
{
|
||||
pdf: { content: string; page: number }[]
|
||||
url: string
|
||||
|
||||
constructor({ pdf, url }: WebLoaderParams) {
|
||||
super()
|
||||
this.pdf = pdf
|
||||
this.url = url
|
||||
constructor({ pdf, url }: WebLoaderParams) {
|
||||
super()
|
||||
this.pdf = pdf
|
||||
this.url = url
|
||||
}
|
||||
|
||||
async load(): Promise<Document<Record<string, any>>[]> {
|
||||
const documents: Document[] = []
|
||||
|
||||
for (const page of this.pdf) {
|
||||
const metadata = { source: this.url, page: page.page }
|
||||
documents.push(new Document({ pageContent: page.content, metadata }))
|
||||
}
|
||||
|
||||
async load(): Promise<Document<Record<string, any>>[]> {
|
||||
const documents: Document[] = [];
|
||||
|
||||
for (const page of this.pdf) {
|
||||
const metadata = { source: this.url, page: page.page }
|
||||
documents.push(new Document({ pageContent: page.content, metadata }))
|
||||
}
|
||||
|
||||
return [
|
||||
new Document({
|
||||
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
|
||||
metadata: documents.map((doc) => doc.metadata),
|
||||
}),
|
||||
];
|
||||
|
||||
|
||||
}
|
||||
return [
|
||||
new Document({
|
||||
pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
|
||||
metadata: documents.map((doc) => doc.metadata)
|
||||
})
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user