feat: Add Baidu search engine

This commit is contained in:
n4ze3m
2025-02-01 11:22:12 +05:30
parent 2d1e465582
commit 342d544e30
7 changed files with 114 additions and 10 deletions

View File

@@ -0,0 +1,105 @@
import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import {
defaultEmbeddingModelForRag,
getOllamaURL
} from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import type { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
/**
 * Queries Baidu's JSON search endpoint (`tn=json`) and normalises the hits.
 *
 * The configured total from `totalSearchResults()` is sent as the `rn` query
 * parameter. The request (including reading the body) is aborted after
 * 10 seconds. Any network, abort, or JSON parsing failure is logged and
 * treated as "no results" instead of being thrown to the caller.
 *
 * @param query - Raw search text; it is URL-encoded before being sent.
 * @returns One `{ title, link, content }` object per entry of `feed.entry`
 *   that has a truthy `url`. `title` and `content` (taken from `abs`) fall
 *   back to an empty string when missing.
 */
export const localBaiduSearch = async (query: string) => {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()

  const abortController = new AbortController()
  const timeoutId = setTimeout(() => abortController.abort(), 10000)

  let entries: unknown = []
  try {
    const response = await fetch(
      "https://www.baidu.com/s?wd=" +
        encodeURIComponent(query) +
        "&tn=json&rn=" +
        TOTAL_SEARCH_RESULTS,
      {
        signal: abortController.signal
      }
    )
    const jsonRes = await response.json()
    entries = jsonRes?.feed?.entry
  } catch (e) {
    // Best-effort search: a failed request yields an empty result list.
    console.log(e)
  } finally {
    // Without this the timer stays pending for the full 10 s after every
    // request, even when the response has already been consumed.
    clearTimeout(timeoutId)
  }

  // Guard against a missing or non-array `entry`; calling `.map` on an
  // arbitrary truthy value would throw outside the error handling above.
  const data: Array<{ title?: string; url?: string; abs?: string } | null> =
    Array.isArray(entries) ? entries : []

  const searchResults = data.map((result) => {
    const title = result?.title || ""
    const link = result?.url
    const content = result?.abs || ""
    return { title, link, content }
  })

  // Entries without a link cannot be opened or cited, so drop them.
  return searchResults.filter((result) => result?.link)
}
/**
 * Runs a Baidu search and returns `{ url, content }` snippets for the query.
 *
 * When simple internet search is enabled, the snippets are the search hits
 * themselves (link plus abstract). Otherwise every hit is loaded through
 * `PageAssistHtmlLoader`, the loaded documents are split into chunks, the
 * chunks are embedded into an in-memory vector store, and the three chunks
 * most similar to the query are returned.
 *
 * @param query - Search text; also used as the similarity-search query.
 * @returns A list of `{ url, content }` objects.
 */
export const webBaiduSearch = async (query: string) => {
  const hits = await localBaiduSearch(query)
  const simpleOnly = await getIsSimpleInternetSearch()

  if (simpleOnly) {
    // Return value is unused here; presumably invoked for a side effect —
    // TODO confirm against getOllamaURL.
    await getOllamaURL()
    return hits.map((hit) => ({
      url: hit.link,
      content: hit.content
    }))
  }

  // Load the pages one after another, collecting every produced document.
  const loadedDocs: Document<Record<string, any>>[] = []
  for (const hit of hits) {
    const pageLoader = new PageAssistHtmlLoader({
      html: "",
      url: hit.link
    })
    const pageDocs = await pageLoader.loadByURL()
    for (const pageDoc of pageDocs) {
      loadedDocs.push(pageDoc)
    }
  }

  const baseUrl = await getOllamaURL()
  const embeddingModel = await defaultEmbeddingModelForRag()
  const embeddings = await pageAssistEmbeddingModel({
    model: embeddingModel || "",
    baseUrl: cleanUrl(baseUrl)
  })

  const splitter = await getPageAssistTextSplitter()
  const pieces = await splitter.splitDocuments(loadedDocs)

  const vectorStore = new MemoryVectorStore(embeddings)
  await vectorStore.addDocuments(pieces)

  // Keep only the three chunks closest to the query.
  const nearest = await vectorStore.similaritySearch(query, 3)
  return nearest.map((match) => ({
    url: match.metadata.url,
    content: match.pageContent
  }))
}

View File

@@ -1,5 +1,4 @@
import { cleanUrl } from "@/libs/clean-url"
import { urlRewriteRuntime } from "@/libs/runtime"
import { PageAssistHtmlLoader } from "@/loader/html"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import {
@@ -16,7 +15,6 @@ import * as cheerio from "cheerio"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
export const localDuckDuckGoSearch = async (query: string) => {
await urlRewriteRuntime(cleanUrl("https://html.duckduckgo.com/html/?q=" + query), "duckduckgo")
const abortController = new AbortController()
setTimeout(() => abortController.abort(), 10000)

View File

@@ -8,7 +8,6 @@ import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import type { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import { cleanUrl } from "~/libs/clean-url"
import { urlRewriteRuntime } from "~/libs/runtime"
import { PageAssistHtmlLoader } from "~/loader/html"
import {
defaultEmbeddingModelForRag,
@@ -18,10 +17,6 @@ import {
export const localGoogleSearch = async (query: string) => {
const baseGoogleDomain = await getGoogleDomain()
await urlRewriteRuntime(
cleanUrl(`https://www.${baseGoogleDomain}/search?hl=en&q=` + query),
"google"
)
const abortController = new AbortController()
setTimeout(() => abortController.abort(), 10000)

View File

@@ -7,6 +7,7 @@ import { webBraveSearch } from "./search-engines/brave"
import { getWebsiteFromQuery, processSingleWebsite } from "./website"
import { searxngSearch } from "./search-engines/searxng"
import { braveAPISearch } from "./search-engines/brave-api"
import { webBaiduSearch } from "./search-engines/baidu"
const getHostName = (url: string) => {
try {
@@ -29,6 +30,8 @@ const searchWeb = (provider: string, query: string) => {
return searxngSearch(query)
case "brave-api":
return braveAPISearch(query)
case "baidu":
return webBaiduSearch(query)
default:
return webGoogleSearch(query)
}