From ad5601a0385b2b7d40592de1d366753ef951fd1f Mon Sep 17 00:00:00 2001 From: n4ze3m Date: Sun, 28 Apr 2024 00:36:33 +0530 Subject: [PATCH] Add Sogu as search engine --- src/utils/search-provider.ts | 4 + .../duckduckgo.ts} | 2 +- .../google.ts} | 0 src/web/search-engines/sogou.ts | 127 ++++++++++++++++++ src/web/web.ts | 7 +- 5 files changed, 137 insertions(+), 3 deletions(-) rename src/web/{local-duckduckgo.ts => search-engines/duckduckgo.ts} (99%) rename src/web/{local-google.ts => search-engines/google.ts} (100%) create mode 100644 src/web/search-engines/sogou.ts diff --git a/src/utils/search-provider.ts b/src/utils/search-provider.ts index f23fb8f..5fb35e6 100644 --- a/src/utils/search-provider.ts +++ b/src/utils/search-provider.ts @@ -6,5 +6,9 @@ export const SUPPORTED_SERACH_PROVIDERS = [ { label: "DuckDuckGo", value: "duckduckgo" + }, + { + label: "Sogou", + value: "sogou" } ] \ No newline at end of file diff --git a/src/web/local-duckduckgo.ts b/src/web/search-engines/duckduckgo.ts similarity index 99% rename from src/web/local-duckduckgo.ts rename to src/web/search-engines/duckduckgo.ts index 3a9adfc..089e571 100644 --- a/src/web/local-duckduckgo.ts +++ b/src/web/search-engines/duckduckgo.ts @@ -95,7 +95,7 @@ export const webDuckDuckGoSearch = async (query: string) => { const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap - }) + }) const chunks = await textSplitter.splitDocuments(docs) diff --git a/src/web/local-google.ts b/src/web/search-engines/google.ts similarity index 100% rename from src/web/local-google.ts rename to src/web/search-engines/google.ts diff --git a/src/web/search-engines/sogou.ts b/src/web/search-engines/sogou.ts new file mode 100644 index 0000000..ffa7e6f --- /dev/null +++ b/src/web/search-engines/sogou.ts @@ -0,0 +1,127 @@ +import { cleanUrl } from "@/libs/clean-url" +import { chromeRunTime } from "@/libs/runtime" +import { PageAssistHtmlLoader } from "@/loader/html" +import { + defaultEmbeddingChunkOverlap, + defaultEmbeddingChunkSize, + defaultEmbeddingModelForRag, + getOllamaURL +} from "@/services/ollama" +import { + getIsSimpleInternetSearch, + totalSearchResults +} from "@/services/search" +import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" +import type { Document } from "@langchain/core/documents" +import * as cheerio from "cheerio" +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" +import { MemoryVectorStore } from "langchain/vectorstores/memory" +const getCorrectTargeUrl = async (url: string) => { + if (!url) return "" + const res = await fetch(url) + const $ = cheerio.load(await res.text()) + const link = $("script").text() + const matches = link.match(/"(.*?)"/) + return matches?.[1] || "" +} +export const localSogouSearch = async (query: string) => { + await chromeRunTime(cleanUrl("https://www.sogou.com/web?query=" + query)) + + const abortController = new AbortController() + + setTimeout(() => abortController.abort(), 10000) + + const htmlString = await fetch("https://www.sogou.com/web?query=" + query, { + signal: abortController.signal + }) + .then((response) => response.text()) + .catch() + + const $ = cheerio.load(htmlString) + const $result = $("#main .results") + const nodes = $result.children().map(async (i, el) => { + const $el = $(el) + const title = $el.find(".vr-title").text().replace(/\n/g, "").trim() + let link = $el.find(".vr-title > a").get(0)?.attribs.href + const content = [".star-wiki", ".fz-mid", ".attribute-centent"] + .map((selector) => { + ;[".text-lightgray", ".zan-box", ".tag-website"].forEach((cls) => { + $el.find(cls).remove() + }) + return $el.find(selector).text().trim() ?? "" + }) + .join(" ") + if (link?.startsWith("/")) { + link = await getCorrectTargeUrl(`https://www.sogou.com${link}`) + } + return { title, link, content } + }) + + const searchResults = await Promise.all(nodes) + return searchResults.filter( + (result) => result.link && result.title && result.content + ) +} + +export const webSogouSearch = async (query: string) => { + const results = await localSogouSearch(query) + const TOTAL_SEARCH_RESULTS = await totalSearchResults() + const searchResults = results.slice(0, TOTAL_SEARCH_RESULTS) + + const isSimpleMode = await getIsSimpleInternetSearch() + + if (isSimpleMode) { + await getOllamaURL() + return searchResults.map((result) => { + return { + url: result.link, + content: result.content + } + }) + } + + const docs: Document>[] = [] + for (const result of searchResults) { + const loader = new PageAssistHtmlLoader({ + html: "", + url: result.link + }) + + const documents = await loader.loadByURL() + + documents.forEach((doc) => { + docs.push(doc) + }) + } + const ollamaUrl = await getOllamaURL() + + const embeddingModle = await defaultEmbeddingModelForRag() + const ollamaEmbedding = new OllamaEmbeddings({ + model: embeddingModle || "", + baseUrl: cleanUrl(ollamaUrl) + }) + + const chunkSize = await defaultEmbeddingChunkSize() + const chunkOverlap = await defaultEmbeddingChunkOverlap() + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize, + chunkOverlap + }) + + const chunks = await textSplitter.splitDocuments(docs) + + const store = new MemoryVectorStore(ollamaEmbedding) + + await store.addDocuments(chunks) + + const resultsWithEmbeddings = await store.similaritySearch(query, 3) + + const searchResult = resultsWithEmbeddings.map((result) => { + return { + url: result.metadata.url, + content: result.pageContent + } + }) + + return searchResult +} diff --git a/src/web/web.ts b/src/web/web.ts index 83675e7..cfadf09 100644 --- a/src/web/web.ts +++ b/src/web/web.ts @@ -1,7 +1,8 @@ import { getWebSearchPrompt } from "~/services/ollama" -import { webGoogleSearch } from "./local-google" -import { webDuckDuckGoSearch } from "./local-duckduckgo" +import { webGoogleSearch } from "./search-engines/google" +import { webDuckDuckGoSearch } from "./search-engines/duckduckgo" import { getSearchProvider } from "@/services/search" +import { webSogouSearch } from "./search-engines/sogou" const getHostName = (url: string) => { try { @@ -16,6 +17,8 @@ const searchWeb = (provider: string, query: string) => { switch (provider) { case "duckduckgo": return webDuckDuckGoSearch(query) + case "sogou": + return webSogouSearch(query) default: return webGoogleSearch(query) }