feat: Add support for Mammoth library for docx file uploads

This commit is contained in:
n4ze3m 2024-05-24 18:26:28 +05:30
parent 845b725970
commit 8899a42331
13 changed files with 130 additions and 20 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -36,6 +36,7 @@
"i18next-browser-languagedetector": "^7.2.0", "i18next-browser-languagedetector": "^7.2.0",
"langchain": "^0.1.28", "langchain": "^0.1.28",
"lucide-react": "^0.350.0", "lucide-react": "^0.350.0",
"mammoth": "^1.7.2",
"ml-distance": "^4.0.1", "ml-distance": "^4.0.1",
"pdfjs-dist": "4.0.379", "pdfjs-dist": "4.0.379",
"property-information": "^6.4.1", "property-information": "^6.4.1",

View File

@ -32,7 +32,7 @@
"uploadFile": { "uploadFile": {
"label": "Upload File", "label": "Upload File",
"uploadText": "Drag and drop a file here or click to upload", "uploadText": "Drag and drop a file here or click to upload",
"uploadHint": "Supported file types: .pdf, .csv, .txt, .md", "uploadHint": "Supported file types: .pdf, .csv, .txt, .md, .docx",
"required": "File is required" "required": "File is required"
}, },
"submit": "Submit", "submit": "Submit",

View File

@ -32,7 +32,7 @@
"uploadFile": { "uploadFile": {
"label": "ഫയല്‍ അപ്‌ലോഡ് ചെയ്യുക", "label": "ഫയല്‍ അപ്‌ലോഡ് ചെയ്യുക",
"uploadText": "ഇവിടെ ഒരു ഫയല്‍ എടുത്തിടുക അല്ലെങ്കില്‍ അപ്‌ലോഡ് ചെയ്യാന്‍ ക്ലിക്ക് ചെയ്യുക", "uploadText": "ഇവിടെ ഒരു ഫയല്‍ എടുത്തിടുക അല്ലെങ്കില്‍ അപ്‌ലോഡ് ചെയ്യാന്‍ ക്ലിക്ക് ചെയ്യുക",
"uploadHint": "പിന്തുണയുള്ള ഫയല്‍ തരങ്ങള്‍: .pdf, .csv, .txt, .md", "uploadHint": "പിന്തുണയുള്ള ഫയല്‍ തരങ്ങള്‍: .pdf, .csv, .txt, .md,.docx",
"required": "ഫയല്‍ ആവശ്യമാണ്" "required": "ഫയല്‍ ആവശ്യമാണ്"
}, },
"submit": "സമര്‍പ്പിക്കുക", "submit": "സമര്‍പ്പിക്കുക",

View File

@ -32,7 +32,7 @@
"uploadFile": { "uploadFile": {
"label": "Загрузить файл", "label": "Загрузить файл",
"uploadText": "Перетащите файл сюда или нажмите, чтобы загрузить", "uploadText": "Перетащите файл сюда или нажмите, чтобы загрузить",
"uploadHint": "Поддерживаемые типы файлов: .pdf, .csv, .txt, .md", "uploadHint": "Поддерживаемые типы файлов: .pdf, .csv, .txt, .md,.docx",
"required": "Файл обязателен" "required": "Файл обязателен"
}, },
"submit": "Отправить", "submit": "Отправить",

View File

@ -90,15 +90,16 @@ export const AddKnowledge = ({ open, setOpen }: Props) => {
return e?.fileList return e?.fileList
}}> }}>
<Upload.Dragger <Upload.Dragger
accept={".pdf, .csv, .txt, .md"} accept={".pdf, .csv, .txt, .md, .docx"}
multiple={true} multiple={true}
maxCount={10} maxCount={10}
beforeUpload={(file) => { beforeUpload={(file) => {
const allowedTypes = [ const allowedTypes = [
"application/pdf", "application/pdf",
// "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/csv", "text/csv",
"text/plain" "text/plain",
"text/markdown",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
] ]
.map((type) => type.toLowerCase()) .map((type) => type.toLowerCase())
.join(", ") .join(", ")

View File

@ -7,6 +7,7 @@ type HistoryInfo = {
id: string id: string
title: string title: string
is_rag: boolean is_rag: boolean
message_source?: "copilot" | "web-ui"
createdAt: number createdAt: number
} }
@ -224,10 +225,10 @@ export const generateID = () => {
}) })
} }
export const saveHistory = async (title: string, is_rag?: boolean) => { export const saveHistory = async (title: string, is_rag?: boolean, message_source?: "copilot" | "web-ui") => {
const id = generateID() const id = generateID()
const createdAt = Date.now() const createdAt = Date.now()
const history = { id, title, createdAt, is_rag } const history = { id, title, createdAt, is_rag, message_source }
const db = new PageAssitDatabase() const db = new PageAssitDatabase()
await db.addChatHistory(history) await db.addChatHistory(history)
return history return history
@ -465,3 +466,17 @@ export const importPrompts = async (prompts: Prompts) => {
await db.addPrompt(prompt) await db.addPrompt(prompt)
} }
} }
/**
 * Finds a chat history that was created from the copilot and loads its
 * messages.
 *
 * The history chosen is the first entry, in the order returned by
 * `getChatHistories()`, whose `message_source` is `"copilot"`.
 * NOTE(review): presumably that order is newest-first, which is what makes
 * this the "recent" chat; confirm against `getChatHistories()`.
 *
 * @returns `{ history, messages }` for the matched history, or `null` when
 * no histories are stored or none of them came from the copilot.
 */
export const getRecentChatFromCopilot = async () => {
  const database = new PageAssitDatabase()
  const storedHistories = await database.getChatHistories()

  const copilotHistory =
    storedHistories.length === 0
      ? undefined
      : storedHistories.find((entry) => entry.message_source === "copilot")
  if (!copilotHistory) {
    return null
  }

  const messages = await database.getChatHistory(copilotHistory.id)
  return { history: copilotHistory, messages }
}

View File

@ -11,7 +11,8 @@ export const saveMessageOnError = async ({
historyId, historyId,
selectedModel, selectedModel,
setHistoryId, setHistoryId,
isRegenerating isRegenerating,
message_source = "web-ui"
}: { }: {
e: any e: any
setHistory: (history: ChatHistory) => void setHistory: (history: ChatHistory) => void
@ -22,7 +23,8 @@ export const saveMessageOnError = async ({
historyId: string | null historyId: string | null
selectedModel: string selectedModel: string
setHistoryId: (historyId: string) => void setHistoryId: (historyId: string) => void
isRegenerating: boolean isRegenerating: boolean,
message_source?: "copilot" | "web-ui"
}) => { }) => {
if ( if (
e?.name === "AbortError" || e?.name === "AbortError" ||
@ -65,7 +67,7 @@ export const saveMessageOnError = async ({
2 2
) )
} else { } else {
const newHistoryId = await saveHistory(userMessage) const newHistoryId = await saveHistory(userMessage, false, message_source)
if (!isRegenerating) { if (!isRegenerating) {
await saveMessage( await saveMessage(
newHistoryId.id, newHistoryId.id,
@ -103,7 +105,8 @@ export const saveMessageOnSuccess = async ({
message, message,
image, image,
fullText, fullText,
source source,
message_source = "web-ui"
}: { }: {
historyId: string | null historyId: string | null
setHistoryId: (historyId: string) => void setHistoryId: (historyId: string) => void
@ -112,7 +115,8 @@ export const saveMessageOnSuccess = async ({
message: string message: string
image: string image: string
fullText: string fullText: string
source: any[] source: any[],
message_source?: "copilot" | "web-ui"
}) => { }) => {
if (historyId) { if (historyId) {
if (!isRegenerate) { if (!isRegenerate) {
@ -136,7 +140,7 @@ export const saveMessageOnSuccess = async ({
2 2
) )
} else { } else {
const newHistoryId = await saveHistory(message) const newHistoryId = await saveHistory(message, false, message_source)
await saveMessage( await saveMessage(
newHistoryId.id, newHistoryId.id,
selectedModel, selectedModel,

View File

@ -329,7 +329,8 @@ export const useMessage = () => {
message, message,
image, image,
fullText, fullText,
source source,
message_source: "copilot"
}) })
setIsProcessing(false) setIsProcessing(false)
@ -345,7 +346,8 @@ export const useMessage = () => {
setHistory, setHistory,
setHistoryId, setHistoryId,
userMessage: message, userMessage: message,
isRegenerating: isRegenerate isRegenerating: isRegenerate,
message_source: "copilot"
}) })
if (!errorSave) { if (!errorSave) {
@ -535,7 +537,8 @@ export const useMessage = () => {
message, message,
image, image,
fullText, fullText,
source: [] source: [],
message_source: "copilot"
}) })
setIsProcessing(false) setIsProcessing(false)
@ -551,7 +554,8 @@ export const useMessage = () => {
setHistory, setHistory,
setHistoryId, setHistoryId,
userMessage: message, userMessage: message,
isRegenerating: isRegenerate isRegenerating: isRegenerate,
message_source: "copilot"
}) })
if (!errorSave) { if (!errorSave) {

View File

@ -9,6 +9,18 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { PageAssistVectorStore } from "./PageAssistVectorStore" import { PageAssistVectorStore } from "./PageAssistVectorStore"
import { PageAssisCSVUrlLoader } from "@/loader/csv" import { PageAssisCSVUrlLoader } from "@/loader/csv"
import { PageAssisTXTUrlLoader } from "@/loader/txt" import { PageAssisTXTUrlLoader } from "@/loader/txt"
import { PageAssistDocxLoader } from "@/loader/docx"
/**
 * Reads a `File` into memory with a `FileReader`.
 *
 * @param file - The file whose bytes should be read.
 * @returns A promise resolving to the file contents as an `ArrayBuffer`.
 * @throws Rejects with `reader.error` when the read fails, or with a generic
 * `Error` if the reader reports no error object. Rejecting with the error
 * (rather than the raw `ProgressEvent` passed to `onerror`) keeps the actual
 * failure reason visible to callers and in logs.
 */
const readAsArrayBuffer = (file: File): Promise<ArrayBuffer> => {
  return new Promise<ArrayBuffer>((resolve, reject) => {
    const reader = new FileReader()
    reader.onload = () => {
      // `readAsArrayBuffer` always populates `result` with an ArrayBuffer.
      resolve(reader.result as ArrayBuffer)
    }
    reader.onerror = () => {
      reject(reader.error ?? new Error("Failed to read file as ArrayBuffer"))
    }
    reader.readAsArrayBuffer(file)
  })
}
export const processKnowledge = async (msg: any, id: string): Promise<void> => { export const processKnowledge = async (msg: any, id: string): Promise<void> => {
console.log(`Processing knowledge with id: ${id}`) console.log(`Processing knowledge with id: ${id}`)
@ -58,6 +70,26 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => {
knownledge_id: knowledge.id, knownledge_id: knowledge.id,
file_id: doc.source_id file_id: doc.source_id
}) })
} else if (doc.type === "docx" || doc.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") {
try {
const loader = new PageAssistDocxLoader({
fileName: doc.filename,
buffer: await toArrayBufferFromBase64(
doc.content
)
})
let docs = await loader.load()
const chunks = await textSplitter.splitDocuments(docs)
await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
knownledge_id: knowledge.id,
file_id: doc.source_id
})
} catch (error) {
console.error(`Error processing knowledge with id: ${id}`, error)
}
} else { } else {
const loader = new PageAssisTXTUrlLoader({ const loader = new PageAssisTXTUrlLoader({
name: doc.filename, name: doc.filename,

33
src/loader/docx.ts Normal file
View File

@ -0,0 +1,33 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
import * as mammoth from "mammoth"
/**
 * Constructor arguments for `PageAssistDocxLoader`.
 */
export interface WebLoaderParams {
/** Name of the file; `load()` records it as the `source` metadata entry. */
fileName: string
/** Raw file bytes; `load()` hands them to mammoth's `extractRawText`. */
buffer: ArrayBuffer
}
/**
 * Document loader that turns an in-memory .docx file into a single
 * `Document` by extracting its raw text with mammoth.
 */
export class PageAssistDocxLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  /** File name, stored as the `source` metadata of the produced document. */
  fileName: string
  /** Bytes of the .docx file to extract text from. */
  buffer: ArrayBuffer

  constructor(params: WebLoaderParams) {
    super()
    this.fileName = params.fileName
    this.buffer = params.buffer
  }

  /**
   * Extracts the raw text of the buffered .docx file.
   *
   * @returns A one-element array holding the extracted text and
   * `{ source: fileName }` metadata, or an empty array when mammoth yields
   * no text.
   */
  public async load(): Promise<Document[]> {
    const { value: rawText } = await mammoth.extractRawText({
      arrayBuffer: this.buffer
    })
    if (!rawText) {
      return []
    }

    const metadata = { source: this.fileName }
    return [new Document({ pageContent: rawText, metadata })]
  }
}

View File

@ -1,16 +1,34 @@
import {
formatToChatHistory,
formatToMessage,
getRecentChatFromCopilot
} from "@/db"
import React from "react" import React from "react"
import { SidePanelBody } from "~/components/Sidepanel/Chat/body" import { SidePanelBody } from "~/components/Sidepanel/Chat/body"
import { SidepanelForm } from "~/components/Sidepanel/Chat/form" import { SidepanelForm } from "~/components/Sidepanel/Chat/form"
import { SidepanelHeader } from "~/components/Sidepanel/Chat/header" import { SidepanelHeader } from "~/components/Sidepanel/Chat/header"
import { useMessage } from "~/hooks/useMessage" import { useMessage } from "~/hooks/useMessage"
const SidepanelChat = () => { const SidepanelChat = () => {
const drop = React.useRef<HTMLDivElement>(null) const drop = React.useRef<HTMLDivElement>(null)
const [dropedFile, setDropedFile] = React.useState<File | undefined>() const [dropedFile, setDropedFile] = React.useState<File | undefined>()
const [dropState, setDropState] = React.useState< const [dropState, setDropState] = React.useState<
"idle" | "dragging" | "error" "idle" | "dragging" | "error"
>("idle") >("idle")
const {chatMode} = useMessage() const { chatMode, messages, setHistory, setHistoryId, setMessages } =
useMessage()
// When no messages are loaded yet, pulls the chat returned by
// getRecentChatFromCopilot() and pushes its id, history and messages into
// the useMessage() state setters. Does nothing if messages already exist or
// no copilot chat is found.
const setRecentMessagesOnLoad = async () => {
  if (messages.length !== 0) {
    return
  }

  const recentChat = await getRecentChatFromCopilot()
  if (!recentChat) {
    return
  }

  const { history: recentHistory, messages: storedMessages } = recentChat
  setHistoryId(recentHistory.id)
  setHistory(formatToChatHistory(storedMessages))
  setMessages(formatToMessage(storedMessages))
}
React.useEffect(() => { React.useEffect(() => {
if (!drop.current) { if (!drop.current) {
return return
@ -67,6 +85,7 @@ import { useMessage } from "~/hooks/useMessage"
} }
} }
}, []) }, [])
return ( return (
<div <div
ref={drop} ref={drop}

View File

@ -10,6 +10,7 @@ export const toBase64 = (file: File | Blob): Promise<string> => {
}) })
} }
export const toArrayBufferFromBase64 = async (base64: string) => { export const toArrayBufferFromBase64 = async (base64: string) => {
const res = await fetch(base64) const res = await fetch(base64)
const blob = await res.blob() const blob = await res.blob()