Add Sogu as search engine

This commit is contained in:
n4ze3m 2024-04-28 00:36:33 +05:30
parent ca1a6e9b1a
commit ad5601a038
5 changed files with 137 additions and 3 deletions

View File

@ -6,5 +6,9 @@ export const SUPPORTED_SERACH_PROVIDERS = [
{ {
label: "DuckDuckGo", label: "DuckDuckGo",
value: "duckduckgo" value: "duckduckgo"
},
{
label: "Sogou",
value: "sogou"
} }
] ]

View File

@ -95,7 +95,7 @@ export const webDuckDuckGoSearch = async (query: string) => {
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize, chunkSize,
chunkOverlap chunkOverlap
}) })
const chunks = await textSplitter.splitDocuments(docs) const chunks = await textSplitter.splitDocuments(docs)

View File

@ -0,0 +1,127 @@
import { cleanUrl } from "@/libs/clean-url"
import { chromeRunTime } from "@/libs/runtime"
import { PageAssistHtmlLoader } from "@/loader/html"
import {
defaultEmbeddingChunkOverlap,
defaultEmbeddingChunkSize,
defaultEmbeddingModelForRag,
getOllamaURL
} from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import type { Document } from "@langchain/core/documents"
import * as cheerio from "cheerio"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
const getCorrectTargeUrl = async (url: string) => {
if (!url) return ""
const res = await fetch(url)
const $ = cheerio.load(await res.text())
const link = $("script").text()
const matches = link.match(/"(.*?)"/)
return matches?.[1] || ""
}
export const localSogouSearch = async (query: string) => {
await chromeRunTime(cleanUrl("https://www.sogou.com/web?query=" + query))
const abortController = new AbortController()
setTimeout(() => abortController.abort(), 10000)
const htmlString = await fetch("https://www.sogou.com/web?query=" + query, {
signal: abortController.signal
})
.then((response) => response.text())
.catch()
const $ = cheerio.load(htmlString)
const $result = $("#main .results")
const nodes = $result.children().map(async (i, el) => {
const $el = $(el)
const title = $el.find(".vr-title").text().replace(/\n/g, "").trim()
let link = $el.find(".vr-title > a").get(0)?.attribs.href
const content = [".star-wiki", ".fz-mid", ".attribute-centent"]
.map((selector) => {
;[".text-lightgray", ".zan-box", ".tag-website"].forEach((cls) => {
$el.find(cls).remove()
})
return $el.find(selector).text().trim() ?? ""
})
.join(" ")
if (link?.startsWith("/")) {
link = await getCorrectTargeUrl(`https://www.sogou.com${link}`)
}
return { title, link, content }
})
const searchResults = await Promise.all(nodes)
return searchResults.filter(
(result) => result.link && result.title && result.content
)
}
export const webSogouSearch = async (query: string) => {
const results = await localSogouSearch(query)
const TOTAL_SEARCH_RESULTS = await totalSearchResults()
const searchResults = results.slice(0, TOTAL_SEARCH_RESULTS)
const isSimpleMode = await getIsSimpleInternetSearch()
if (isSimpleMode) {
await getOllamaURL()
return searchResults.map((result) => {
return {
url: result.link,
content: result.content
}
})
}
const docs: Document<Record<string, any>>[] = []
for (const result of searchResults) {
const loader = new PageAssistHtmlLoader({
html: "",
url: result.link
})
const documents = await loader.loadByURL()
documents.forEach((doc) => {
docs.push(doc)
})
}
const ollamaUrl = await getOllamaURL()
const embeddingModle = await defaultEmbeddingModelForRag()
const ollamaEmbedding = new OllamaEmbeddings({
model: embeddingModle || "",
baseUrl: cleanUrl(ollamaUrl)
})
const chunkSize = await defaultEmbeddingChunkSize()
const chunkOverlap = await defaultEmbeddingChunkOverlap()
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap
})
const chunks = await textSplitter.splitDocuments(docs)
const store = new MemoryVectorStore(ollamaEmbedding)
await store.addDocuments(chunks)
const resultsWithEmbeddings = await store.similaritySearch(query, 3)
const searchResult = resultsWithEmbeddings.map((result) => {
return {
url: result.metadata.url,
content: result.pageContent
}
})
return searchResult
}

View File

@ -1,7 +1,8 @@
import { getWebSearchPrompt } from "~/services/ollama" import { getWebSearchPrompt } from "~/services/ollama"
import { webGoogleSearch } from "./local-google" import { webGoogleSearch } from "./search-engines/google"
import { webDuckDuckGoSearch } from "./local-duckduckgo" import { webDuckDuckGoSearch } from "./search-engines/duckduckgo"
import { getSearchProvider } from "@/services/search" import { getSearchProvider } from "@/services/search"
import { webSogouSearch } from "./search-engines/sogou"
const getHostName = (url: string) => { const getHostName = (url: string) => {
try { try {
@ -16,6 +17,8 @@ const searchWeb = (provider: string, query: string) => {
switch (provider) { switch (provider) {
case "duckduckgo": case "duckduckgo":
return webDuckDuckGoSearch(query) return webDuckDuckGoSearch(query)
case "sogou":
return webSogouSearch(query)
default: default:
return webGoogleSearch(query) return webGoogleSearch(query)
} }