page-assist/src/utils/memory-embeddings.ts

63 lines
1.9 KiB
TypeScript

import { PageAssistHtmlLoader } from "~/loader/html"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize } from "@/services/ollama"
import { PageAssistPDFLoader } from "@/loader/pdf"
export const getLoader = ({ html, pdf, type, url }: {
url: string,
html: string,
type: string,
pdf: { content: string, page: number }[]
}) => {
if (type === "pdf") {
return new PageAssistPDFLoader({
pdf,
url
})
} else {
return new PageAssistHtmlLoader({
html,
url
})
}
}
export const memoryEmbedding = async (
{ html,
keepTrackOfEmbedding, ollamaEmbedding, pdf, setIsEmbedding, setKeepTrackOfEmbedding, type, url }: {
url: string,
html: string,
type: string,
pdf: { content: string, page: number }[],
keepTrackOfEmbedding: Record<string, MemoryVectorStore>,
ollamaEmbedding: OllamaEmbeddings,
setIsEmbedding: (value: boolean) => void,
setKeepTrackOfEmbedding: (value: Record<string, MemoryVectorStore>) => void
}
) => {
setIsEmbedding(true)
const loader = getLoader({ html, pdf, type, url })
const docs = await loader.load()
const chunkSize = await defaultEmbeddingChunkSize()
const chunkOverlap = await defaultEmbeddingChunkOverlap()
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap
})
const chunks = await textSplitter.splitDocuments(docs)
const store = new MemoryVectorStore(ollamaEmbedding)
await store.addDocuments(chunks)
setKeepTrackOfEmbedding({
...keepTrackOfEmbedding,
[url]: store
})
setIsEmbedding(false)
return store
}