feat: Add support for Mammoth library for docx file uploads

This commit is contained in:
n4ze3m 2024-05-24 18:26:28 +05:30
parent 845b725970
commit 8899a42331
13 changed files with 130 additions and 20 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -36,6 +36,7 @@
"i18next-browser-languagedetector": "^7.2.0",
"langchain": "^0.1.28",
"lucide-react": "^0.350.0",
"mammoth": "^1.7.2",
"ml-distance": "^4.0.1",
"pdfjs-dist": "4.0.379",
"property-information": "^6.4.1",

View File

@ -32,7 +32,7 @@
"uploadFile": {
"label": "Upload File",
"uploadText": "Drag and drop a file here or click to upload",
"uploadHint": "Supported file types: .pdf, .csv, .txt, .md",
"uploadHint": "Supported file types: .pdf, .csv, .txt, .md, .docx",
"required": "File is required"
},
"submit": "Submit",

View File

@ -32,7 +32,7 @@
"uploadFile": {
"label": "ഫയല്‍ അപ്‌ലോഡ് ചെയ്യുക",
"uploadText": "ഇവിടെ ഒരു ഫയല്‍ എടുത്തിടുക അല്ലെങ്കില്‍ അപ്‌ലോഡ് ചെയ്യാന്‍ ക്ലിക്ക് ചെയ്യുക",
"uploadHint": "പിന്തുണയുള്ള ഫയല്‍ തരങ്ങള്‍: .pdf, .csv, .txt, .md",
"uploadHint": "പിന്തുണയുള്ള ഫയല്‍ തരങ്ങള്‍: .pdf, .csv, .txt, .md, .docx",
"required": "ഫയല്‍ ആവശ്യമാണ്"
},
"submit": "സമര്‍പ്പിക്കുക",

View File

@ -32,7 +32,7 @@
"uploadFile": {
"label": "Загрузить файл",
"uploadText": "Перетащите файл сюда или нажмите, чтобы загрузить",
"uploadHint": "Поддерживаемые типы файлов: .pdf, .csv, .txt, .md",
"uploadHint": "Поддерживаемые типы файлов: .pdf, .csv, .txt, .md, .docx",
"required": "Файл обязателен"
},
"submit": "Отправить",

View File

@ -90,15 +90,16 @@ export const AddKnowledge = ({ open, setOpen }: Props) => {
return e?.fileList
}}>
<Upload.Dragger
accept={".pdf, .csv, .txt, .md"}
accept={".pdf, .csv, .txt, .md, .docx"}
multiple={true}
maxCount={10}
beforeUpload={(file) => {
const allowedTypes = [
"application/pdf",
// "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/csv",
"text/plain"
"text/plain",
"text/markdown",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
.map((type) => type.toLowerCase())
.join(", ")

View File

@ -7,6 +7,7 @@ type HistoryInfo = {
id: string
title: string
is_rag: boolean
message_source?: "copilot" | "web-ui"
createdAt: number
}
@ -224,10 +225,10 @@ export const generateID = () => {
})
}
export const saveHistory = async (title: string, is_rag?: boolean) => {
export const saveHistory = async (title: string, is_rag?: boolean, message_source?: "copilot" | "web-ui") => {
const id = generateID()
const createdAt = Date.now()
const history = { id, title, createdAt, is_rag }
const history = { id, title, createdAt, is_rag, message_source }
const db = new PageAssitDatabase()
await db.addChatHistory(history)
return history
@ -465,3 +466,17 @@ export const importPrompts = async (prompts: Prompts) => {
await db.addPrompt(prompt)
}
}
/**
 * Finds the first stored chat history whose `message_source` is "copilot"
 * and loads the messages that belong to it.
 *
 * "First" means first in whatever order `getChatHistories()` returns;
 * NOTE(review): presumably that is newest-first — confirm against the DB layer.
 *
 * @returns `{ history, messages }` for the matching chat, or `null` when no
 *          history is stored or none of them came from the copilot.
 */
export const getRecentChatFromCopilot = async () => {
  const database = new PageAssitDatabase()
  const storedHistories = await database.getChatHistories()

  for (const entry of storedHistories) {
    if (entry.message_source === "copilot") {
      // Only the first copilot entry is used; its messages are fetched lazily.
      const messages = await database.getChatHistory(entry.id)
      return { history: entry, messages }
    }
  }

  // Covers both "nothing stored at all" and "no copilot chat among them".
  return null
}

View File

@ -11,7 +11,8 @@ export const saveMessageOnError = async ({
historyId,
selectedModel,
setHistoryId,
isRegenerating
isRegenerating,
message_source = "web-ui"
}: {
e: any
setHistory: (history: ChatHistory) => void
@ -22,7 +23,8 @@ export const saveMessageOnError = async ({
historyId: string | null
selectedModel: string
setHistoryId: (historyId: string) => void
isRegenerating: boolean
isRegenerating: boolean,
message_source?: "copilot" | "web-ui"
}) => {
if (
e?.name === "AbortError" ||
@ -65,7 +67,7 @@ export const saveMessageOnError = async ({
2
)
} else {
const newHistoryId = await saveHistory(userMessage)
const newHistoryId = await saveHistory(userMessage, false, message_source)
if (!isRegenerating) {
await saveMessage(
newHistoryId.id,
@ -103,7 +105,8 @@ export const saveMessageOnSuccess = async ({
message,
image,
fullText,
source
source,
message_source = "web-ui"
}: {
historyId: string | null
setHistoryId: (historyId: string) => void
@ -112,7 +115,8 @@ export const saveMessageOnSuccess = async ({
message: string
image: string
fullText: string
source: any[]
source: any[],
message_source?: "copilot" | "web-ui"
}) => {
if (historyId) {
if (!isRegenerate) {
@ -136,7 +140,7 @@ export const saveMessageOnSuccess = async ({
2
)
} else {
const newHistoryId = await saveHistory(message)
const newHistoryId = await saveHistory(message, false, message_source)
await saveMessage(
newHistoryId.id,
selectedModel,

View File

@ -329,7 +329,8 @@ export const useMessage = () => {
message,
image,
fullText,
source
source,
message_source: "copilot"
})
setIsProcessing(false)
@ -345,7 +346,8 @@ export const useMessage = () => {
setHistory,
setHistoryId,
userMessage: message,
isRegenerating: isRegenerate
isRegenerating: isRegenerate,
message_source: "copilot"
})
if (!errorSave) {
@ -535,7 +537,8 @@ export const useMessage = () => {
message,
image,
fullText,
source: []
source: [],
message_source: "copilot"
})
setIsProcessing(false)
@ -551,7 +554,8 @@ export const useMessage = () => {
setHistory,
setHistoryId,
userMessage: message,
isRegenerating: isRegenerate
isRegenerating: isRegenerate,
message_source: "copilot"
})
if (!errorSave) {

View File

@ -9,6 +9,18 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { PageAssistVectorStore } from "./PageAssistVectorStore"
import { PageAssisCSVUrlLoader } from "@/loader/csv"
import { PageAssisTXTUrlLoader } from "@/loader/txt"
import { PageAssistDocxLoader } from "@/loader/docx"
/**
 * Reads the complete contents of a file into an ArrayBuffer.
 *
 * Delegates to the standard `Blob.arrayBuffer()` method instead of wrapping a
 * FileReader by hand. This removes the unchecked `reader.result as ArrayBuffer`
 * cast and makes read failures reject with a real error object rather than a
 * raw ProgressEvent.
 *
 * @param file - The file whose bytes should be read.
 * @returns A promise that resolves with the file's bytes.
 */
const readAsArrayBuffer = (file: File): Promise<ArrayBuffer> => {
  return file.arrayBuffer()
}
export const processKnowledge = async (msg: any, id: string): Promise<void> => {
console.log(`Processing knowledge with id: ${id}`)
@ -58,6 +70,26 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => {
knownledge_id: knowledge.id,
file_id: doc.source_id
})
} else if (doc.type === "docx" || doc.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") {
try {
const loader = new PageAssistDocxLoader({
fileName: doc.filename,
buffer: await toArrayBufferFromBase64(
doc.content
)
})
let docs = await loader.load()
const chunks = await textSplitter.splitDocuments(docs)
await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
knownledge_id: knowledge.id,
file_id: doc.source_id
})
} catch (error) {
console.error(`Error processing knowledge with id: ${id}`, error)
}
} else {
const loader = new PageAssisTXTUrlLoader({
name: doc.filename,

33
src/loader/docx.ts Normal file
View File

@ -0,0 +1,33 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import * as mammoth from "mammoth"
/**
 * Constructor arguments for the docx loader defined in this file.
 */
export interface WebLoaderParams {
  /** Name of the uploaded file; the loader records it as the document's `source` metadata. */
  fileName: string

  /** Raw bytes of the .docx file, handed to mammoth for text extraction. */
  buffer: ArrayBuffer
}
/**
 * Document loader that turns an in-memory .docx file into LangChain documents
 * by extracting its raw text with mammoth.
 */
export class PageAssistDocxLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /** File name reported as the `source` of the produced document. */
  fileName: string

  /** Bytes of the .docx file to extract text from. */
  buffer: ArrayBuffer

  /**
   * @param params - File name and raw bytes of the .docx file.
   */
  constructor(params: WebLoaderParams) {
    super()
    this.fileName = params.fileName
    this.buffer = params.buffer
  }

  /**
   * Extracts the raw text of the buffered file.
   *
   * @returns A single-element array holding the extracted text with
   *          `{ source: fileName }` metadata, or an empty array when the
   *          extraction yields no text.
   */
  public async load(): Promise<Document[]> {
    const { value: rawText } = await mammoth.extractRawText({
      arrayBuffer: this.buffer
    })

    // An empty extraction produces no document at all rather than a blank one.
    if (!rawText) {
      return []
    }

    const metadata = { source: this.fileName }
    return [new Document({ pageContent: rawText, metadata })]
  }
}

View File

@ -1,3 +1,8 @@
import {
formatToChatHistory,
formatToMessage,
getRecentChatFromCopilot
} from "@/db"
import React from "react"
import { SidePanelBody } from "~/components/Sidepanel/Chat/body"
import { SidepanelForm } from "~/components/Sidepanel/Chat/form"
@ -10,7 +15,20 @@ import { useMessage } from "~/hooks/useMessage"
const [dropState, setDropState] = React.useState<
"idle" | "dragging" | "error"
>("idle")
const {chatMode} = useMessage()
const { chatMode, messages, setHistory, setHistoryId, setMessages } =
useMessage()
/**
 * Restores the stored copilot conversation into the chat state, but only when
 * no messages are currently loaded. Does nothing if messages already exist or
 * if no copilot conversation is found.
 */
const setRecentMessagesOnLoad = async () => {
  // Never overwrite a conversation that is already on screen.
  if (messages.length !== 0) {
    return
  }

  const recentChat = await getRecentChatFromCopilot()
  if (!recentChat) {
    return
  }

  const { history: storedHistory, messages: storedMessages } = recentChat
  setHistoryId(storedHistory.id)
  setHistory(formatToChatHistory(storedMessages))
  setMessages(formatToMessage(storedMessages))
}
React.useEffect(() => {
if (!drop.current) {
return
@ -67,6 +85,7 @@ import { useMessage } from "~/hooks/useMessage"
}
}
}, [])
return (
<div
ref={drop}

View File

@ -10,6 +10,7 @@ export const toBase64 = (file: File | Blob): Promise<string> => {
})
}
export const toArrayBufferFromBase64 = async (base64: string) => {
const res = await fetch(base64)
const blob = await res.blob()