refactor: Update PageAssistHtmlLoader to use extractReadabilityContent for parsing web page content

This commit is contained in:
n4ze3m
2024-06-23 20:34:43 +05:30
parent 4363a4b0de
commit 1e9b66d823
4 changed files with 19 additions and 59 deletions

View File

@@ -1,34 +1,16 @@
import { cleanUrl } from "@/libs/clean-url"
import { extractReadabilityContent } from "@/parser/reader"
import { PageAssistHtmlLoader } from "@/loader/html"
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import { getIsSimpleInternetSearch } from "@/services/search"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import type { Document } from "@langchain/core/documents"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
export const processSingleWebsite = async (url: string, query: string) => {
let content = await extractReadabilityContent(url)
// const isSimpleMode = await getIsSimpleInternetSearch()
// if (isSimpleMode) {
// return [
// {
// url,
// content: content.length > 5000 ? content.slice(0, 5000) : content
// }
// ]
// }
const docs: Document<Record<string, any>>[] = [
{
metadata: {
url
},
pageContent: content
}
]
const loader = new PageAssistHtmlLoader({
html: "",
url
})
const docs = await loader.loadByURL()
const ollamaUrl = await getOllamaURL()