import { PageAssistHtmlLoader } from "~/loader/html"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import {
defaultEmbeddingChunkOverlap,
defaultEmbeddingChunkSize
} from "@/services/ollama"
import { PageAssistPDFLoader } from "@/loader/pdf"
/**
 * Selects the document loader for a page based on its content type.
 *
 * @param html - Raw HTML of the page; forwarded only to the HTML loader.
 * @param pdf - Extracted PDF pages; forwarded only to the PDF loader.
 * @param type - Content type tag. Exactly `"pdf"` selects the PDF loader;
 *   every other value falls back to the HTML loader.
 * @param url - Source URL, forwarded to whichever loader is created.
 * @returns A `PageAssistPDFLoader` or a `PageAssistHtmlLoader`.
 */
export const getLoader = ({
  html,
  pdf,
  type,
  url
}: {
  url: string
  html: string
  type: string
  pdf: { content: string; page: number }[]
}) => {
  // Anything that is not explicitly tagged as PDF is treated as HTML.
  if (type !== "pdf") {
    return new PageAssistHtmlLoader({ html, url })
  }

  return new PageAssistPDFLoader({ pdf, url })
}
/**
 * Builds an in-memory vector store for a page and registers it under the
 * page's URL.
 *
 * The page content is loaded through `getLoader`, split into chunks with a
 * `RecursiveCharacterTextSplitter` configured from the default chunk size and
 * overlap settings, and added to a `MemoryVectorStore` backed by
 * `ollamaEmbedding`.
 *
 * @param html - Raw HTML of the page (used when `type` is not `"pdf"`).
 * @param pdf - Extracted PDF pages (used when `type` is `"pdf"`).
 * @param type - Content type tag passed to `getLoader`.
 * @param url - Source URL; also the key under which the store is registered.
 * @param keepTrackOfEmbedding - Current map of URL to vector store. It is
 *   copied, never mutated.
 * @param ollamaEmbedding - Embeddings instance handed to the vector store.
 * @param setIsEmbedding - Called with `true` before work starts and with
 *   `false` once it ends, whether it succeeded or failed.
 * @param setKeepTrackOfEmbedding - Receives a new map containing the previous
 *   entries plus the store for `url`. Only called on success.
 * @returns The populated vector store.
 * @throws Rethrows any error raised while loading, splitting, or embedding.
 */
export const memoryEmbedding = async ({
  html,
  keepTrackOfEmbedding,
  ollamaEmbedding,
  pdf,
  setIsEmbedding,
  setKeepTrackOfEmbedding,
  type,
  url
}: {
  url: string
  html: string
  type: string
  pdf: { content: string; page: number }[]
  keepTrackOfEmbedding: Record<string, MemoryVectorStore>
  ollamaEmbedding: OllamaEmbeddings
  setIsEmbedding: (value: boolean) => void
  setKeepTrackOfEmbedding: (value: Record<string, MemoryVectorStore>) => void
}): Promise<MemoryVectorStore> => {
  setIsEmbedding(true)

  try {
    const loader = getLoader({ html, pdf, type, url })
    const docs = await loader.load()

    // The two settings lookups do not depend on each other, so fetch them together.
    const [chunkSize, chunkOverlap] = await Promise.all([
      defaultEmbeddingChunkSize(),
      defaultEmbeddingChunkOverlap()
    ])
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap
    })
    const chunks = await textSplitter.splitDocuments(docs)

    const store = new MemoryVectorStore(ollamaEmbedding)
    await store.addDocuments(chunks)

    // NOTE(review): this spreads the map as it was when the call started; if
    // several embeddings can run concurrently, a later write may drop entries
    // added in the meantime. Confirm against the caller.
    setKeepTrackOfEmbedding({
      ...keepTrackOfEmbedding,
      [url]: store
    })

    return store
  } finally {
    // Always clear the in-progress flag, otherwise a failed load or embedding
    // request would leave the caller stuck in the "embedding" state.
    setIsEmbedding(false)
  }
}