refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content
This commit is contained in:
@@ -1,34 +1,16 @@
|
||||
import { cleanUrl } from "@/libs/clean-url"
|
||||
import { extractReadabilityContent } from "@/parser/reader"
|
||||
import { PageAssistHtmlLoader } from "@/loader/html"
|
||||
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
||||
import { getIsSimpleInternetSearch } from "@/services/search"
|
||||
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
|
||||
import type { Document } from "@langchain/core/documents"
|
||||
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
|
||||
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||
|
||||
export const processSingleWebsite = async (url: string, query: string) => {
|
||||
let content = await extractReadabilityContent(url)
|
||||
|
||||
// const isSimpleMode = await getIsSimpleInternetSearch()
|
||||
|
||||
// if (isSimpleMode) {
|
||||
// return [
|
||||
// {
|
||||
// url,
|
||||
// content: content.length > 5000 ? content.slice(0, 5000) : content
|
||||
// }
|
||||
// ]
|
||||
// }
|
||||
|
||||
const docs: Document<Record<string, any>>[] = [
|
||||
{
|
||||
metadata: {
|
||||
url
|
||||
},
|
||||
pageContent: content
|
||||
}
|
||||
]
|
||||
const loader = new PageAssistHtmlLoader({
|
||||
html: "",
|
||||
url
|
||||
})
|
||||
const docs = await loader.loadByURL()
|
||||
|
||||
const ollamaUrl = await getOllamaURL()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user