// Source listing metadata: 112 lines, 3.7 KiB, TypeScript
import { getKnowledgeById, updateKnowledgeStatus } from "@/db/knowledge"
|
|
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
|
|
import {
|
|
defaultEmbeddingChunkOverlap,
|
|
defaultEmbeddingChunkSize,
|
|
getOllamaURL
|
|
} from "@/services/ollama"
|
|
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
|
|
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
|
|
import { PageAssistVectorStore } from "./PageAssistVectorStore"
|
|
import { PageAssisCSVUrlLoader } from "@/loader/csv"
|
|
import { PageAssisTXTUrlLoader } from "@/loader/txt"
|
|
import { PageAssistDocxLoader } from "@/loader/docx"
|
|
import { cleanUrl } from "./clean-url"
|
|
|
|
|
|
/**
 * Embeds every source file attached to a knowledge entry and records the outcome.
 *
 * Flow: look the entry up by id, mark it "processing", build an Ollama
 * embedding client and a recursive text splitter from the configured defaults,
 * then for each source pick a loader by `doc.type`, split the loaded documents
 * into chunks and pass them to `PageAssistVectorStore.fromDocuments`.
 * Afterwards the entry is marked "finished"; if anything inside the outer
 * `try` throws, the error is logged and the entry is marked "failed".
 *
 * If no entry exists for `id`, an error is logged and the function returns
 * without touching the status.
 *
 * @param msg - Not read by this function; kept so the call signature is unchanged.
 * @param id - Identifier of the knowledge entry to process.
 * @returns A promise that settles after the final status update and log line.
 */
export const processKnowledge = async (msg: unknown, id: string): Promise<void> => {
  console.log(`Processing knowledge with id: ${id}`)
  try {
    const knowledge = await getKnowledgeById(id)
    if (!knowledge) {
      console.error(`Knowledge with id ${id} not found`)
      return
    }

    // Only resolve settings once we know there is something to process.
    const ollamaUrl = await getOllamaURL()

    await updateKnowledgeStatus(id, "processing")

    const ollamaEmbedding = new OllamaEmbeddings({
      baseUrl: cleanUrl(ollamaUrl),
      model: knowledge.embedding_model
    })
    const chunkSize = await defaultEmbeddingChunkSize()
    const chunkOverlap = await defaultEmbeddingChunkOverlap()
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap
    })

    for (const doc of knowledge.source) {
      // Shared tail of every branch below: chunk the loaded documents and
      // store their embeddings, tagged with the owning knowledge and file ids.
      const indexDocuments = async (
        loaded: Parameters<typeof textSplitter.splitDocuments>[0]
      ): Promise<void> => {
        const chunks = await textSplitter.splitDocuments(loaded)
        await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
          // The key spelling "knownledge_id" is what this call site has always
          // passed; presumably the vector store reads it under that name, so
          // verify against PageAssistVectorStore before renaming.
          knownledge_id: knowledge.id,
          file_id: doc.source_id
        })
      }

      if (doc.type === "pdf" || doc.type === "application/pdf") {
        const loader = new PageAssistPDFUrlLoader({
          name: doc.filename,
          url: doc.content
        })
        await indexDocuments(await loader.load())
      } else if (doc.type === "csv" || doc.type === "text/csv") {
        const loader = new PageAssisCSVUrlLoader({
          name: doc.filename,
          url: doc.content,
          options: {}
        })
        await indexDocuments(await loader.load())
      } else if (
        doc.type === "docx" ||
        doc.type ===
          "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
      ) {
        // Docx is handled best-effort: a failure here is logged and the loop
        // moves on, so the entry can still end up "finished" even though this
        // file was not indexed.
        try {
          // NOTE(review): toArrayBufferFromBase64 is not among the imports
          // visible at the top of this file; confirm it is in scope, otherwise
          // every docx source will land in the catch below.
          const loader = new PageAssistDocxLoader({
            fileName: doc.filename,
            buffer: await toArrayBufferFromBase64(doc.content)
          })
          await indexDocuments(await loader.load())
        } catch (error) {
          console.error(`Error processing knowledge with id: ${id}`, error)
        }
      } else {
        // Any other type falls back to the plain-text loader.
        const loader = new PageAssisTXTUrlLoader({
          name: doc.filename,
          url: doc.content
        })
        await indexDocuments(await loader.load())
      }
    }

    await updateKnowledgeStatus(id, "finished")
  } catch (error) {
    console.error(`Error processing knowledge with id: ${id}`, error)
    await updateKnowledgeStatus(id, "failed")
  } finally {
    console.log(`Finished processing knowledge with id: ${id}`)
  }
}
|