feat: Add support for Mammoth library for docx file uploads

This commit is contained in:
n4ze3m
2024-05-24 18:26:28 +05:30
parent 845b725970
commit 8899a42331
13 changed files with 130 additions and 20 deletions

View File

@@ -9,6 +9,18 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { PageAssistVectorStore } from "./PageAssistVectorStore"
import { PageAssisCSVUrlLoader } from "@/loader/csv"
import { PageAssisTXTUrlLoader } from "@/loader/txt"
import { PageAssistDocxLoader } from "@/loader/docx"
const readAsArrayBuffer = (file: File): Promise<ArrayBuffer> => {
return new Promise((resolve, reject) => {
const reader = new FileReader()
reader.onload = () => {
resolve(reader.result as ArrayBuffer)
}
reader.onerror = reject
reader.readAsArrayBuffer(file)
})
}
export const processKnowledge = async (msg: any, id: string): Promise<void> => {
console.log(`Processing knowledge with id: ${id}`)
@@ -58,6 +70,26 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => {
knownledge_id: knowledge.id,
file_id: doc.source_id
})
} else if (doc.type === "docx" || doc.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") {
try {
const loader = new PageAssistDocxLoader({
fileName: doc.filename,
buffer: await toArrayBufferFromBase64(
doc.content
)
})
let docs = await loader.load()
const chunks = await textSplitter.splitDocuments(docs)
await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
knownledge_id: knowledge.id,
file_id: doc.source_id
})
} catch (error) {
console.error(`Error processing knowledge with id: ${id}`, error)
}
} else {
const loader = new PageAssisTXTUrlLoader({
name: doc.filename,