Added DuckDuckGo web search
This commit is contained in:
114
src/web/local-duckduckgo.ts
Normal file
114
src/web/local-duckduckgo.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
import { cleanUrl } from "@/libs/clean-url"
|
||||
import { chromeRunTime } from "@/libs/runtime"
|
||||
import { PageAssistHtmlLoader } from "@/loader/html"
|
||||
import {
|
||||
defaultEmbeddingChunkOverlap,
|
||||
defaultEmbeddingChunkSize,
|
||||
defaultEmbeddingModelForRag,
|
||||
getOllamaURL
|
||||
} from "@/services/ollama"
|
||||
import {
|
||||
getIsSimpleInternetSearch,
|
||||
totalSearchResults
|
||||
} from "@/services/search"
|
||||
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
|
||||
import type { Document } from "@langchain/core/documents"
|
||||
import * as cheerio from "cheerio"
|
||||
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
|
||||
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||
|
||||
/**
 * Resolves the `href` of a DuckDuckGo HTML result anchor to the target page URL.
 *
 * Result anchors point at a redirect of the form
 * `//duckduckgo.com/l/?uddg=<encoded target>&...`. Reading the `uddg` query
 * parameter yields the decoded target without any trailing redirect
 * parameters. When there is no `uddg` parameter the resolved absolute href is
 * returned, and when the href cannot be parsed at all it is returned unchanged.
 */
const resolveDuckDuckGoLink = (href: string): string => {
  try {
    // The base URL is needed because the redirect hrefs are protocol-relative.
    const parsed = new URL(href, "https://duckduckgo.com")
    return parsed.searchParams.get("uddg") ?? parsed.href
  } catch {
    return href
  }
}

/**
 * Fetches the DuckDuckGo HTML results page for `query` and scrapes the
 * result entries out of it.
 *
 * @param query - Raw user search text; it is URL-encoded before being sent.
 * @returns One `{ title, link, content }` object per result that has a link,
 *   in page order. `link` is the target page URL, `content` the snippet text.
 * @throws Propagates any `fetch` failure, including the abort raised when the
 *   request exceeds the 10 second timeout.
 */
export const localDuckDuckGoSearch = async (query: string) => {
  // Encode the query so characters such as `&`, `#` or `+` cannot alter the URL.
  const searchUrl =
    "https://html.duckduckgo.com/html/?q=" + encodeURIComponent(query)

  // NOTE(review): chromeRunTime/cleanUrl are project helpers not visible here;
  // they are invoked with the search URL before the request, as before.
  await chromeRunTime(cleanUrl(searchUrl))

  const abortController = new AbortController()
  const timeoutId = setTimeout(() => abortController.abort(), 10000)

  let htmlString: string
  try {
    const response = await fetch(searchUrl, {
      signal: abortController.signal
    })
    htmlString = await response.text()
  } finally {
    // Do not leave the timer pending once the request has settled.
    clearTimeout(timeoutId)
  }

  const $ = cheerio.load(htmlString)

  const searchResults: { title: string; link: string; content: string }[] = []

  for (const result of Array.from($("div.results_links_deep"))) {
    const entry = $(result)
    const titleAnchor = entry.find("a.result__a")
    const snippetAnchor = entry.find("a.result__snippet")

    // Prefer the snippet href (original behaviour); fall back to the title
    // anchor so an entry without a snippet is still usable.
    const href = snippetAnchor.attr("href") ?? titleAnchor.attr("href")
    if (!href) {
      // Without a link there is nothing to cite or load; skip instead of throwing.
      continue
    }

    searchResults.push({
      title: titleAnchor.text(),
      link: resolveDuckDuckGoLink(href),
      content: snippetAnchor.text()
    })
  }

  return searchResults
}
|
||||
|
||||
/**
 * Runs a DuckDuckGo search for `query` and returns `{ url, content }` entries
 * to be used as web context.
 *
 * In simple mode the search-page snippets are returned directly. Otherwise each
 * result page is loaded, split into chunks, embedded with the configured
 * Ollama embedding model, and the 3 chunks most similar to the query are
 * returned.
 *
 * @param query - The user's search text.
 * @returns An array of `{ url, content }` objects; empty when nothing could be
 *   found or loaded.
 */
export const webDuckDuckGoSearch = async (query: string) => {
  const results = await localDuckDuckGoSearch(query)
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()
  const searchResults = results.slice(0, TOTAL_SEARCH_RESULTS)

  const isSimpleMode = await getIsSimpleInternetSearch()

  if (isSimpleMode) {
    // NOTE(review): the result of this call is unused; kept in case
    // getOllamaURL has an effect callers rely on — confirm and remove if not.
    await getOllamaURL()
    return searchResults.map((result) => ({
      url: result.link,
      content: result.content
    }))
  }

  // Load all result pages concurrently. A page that fails to load is skipped
  // rather than aborting the whole search; input order is preserved.
  const settledLoads = await Promise.allSettled(
    searchResults.map((result) =>
      new PageAssistHtmlLoader({
        html: "",
        url: result.link
      }).loadByURL()
    )
  )

  const docs: Document<Record<string, any>>[] = []
  for (const outcome of settledLoads) {
    if (outcome.status === "fulfilled") {
      docs.push(...outcome.value)
    } else {
      console.error(outcome.reason)
    }
  }

  if (docs.length === 0) {
    // Nothing to embed; avoid calling the embedding backend for no input.
    return []
  }

  const ollamaUrl = await getOllamaURL()

  const embeddingModel = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = new OllamaEmbeddings({
    model: embeddingModel || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const chunkSize = await defaultEmbeddingChunkSize()
  const chunkOverlap = await defaultEmbeddingChunkOverlap()
  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize,
    chunkOverlap
  })

  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)

  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  const searchResult = resultsWithEmbeddings.map((result) => {
    return {
      url: result.metadata.url,
      content: result.pageContent
    }
  })

  return searchResult
}
|
||||
@@ -1,3 +1,7 @@
|
||||
import {
|
||||
getIsSimpleInternetSearch,
|
||||
totalSearchResults
|
||||
} from "@/services/search"
|
||||
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
|
||||
import type { Document } from "@langchain/core/documents"
|
||||
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
|
||||
@@ -5,16 +9,13 @@ import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||
import { cleanUrl } from "~/libs/clean-url"
|
||||
import { chromeRunTime } from "~/libs/runtime"
|
||||
import { PageAssistHtmlLoader } from "~/loader/html"
|
||||
import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getIsSimpleInternetSearch, getOllamaURL } from "~/services/ollama"
|
||||
import {
|
||||
defaultEmbeddingChunkOverlap,
|
||||
defaultEmbeddingChunkSize,
|
||||
defaultEmbeddingModelForRag,
|
||||
getOllamaURL
|
||||
} from "~/services/ollama"
|
||||
|
||||
const BLOCKED_HOSTS = [
|
||||
"google.com",
|
||||
"youtube.com",
|
||||
"twitter.com",
|
||||
"linkedin.com",
|
||||
]
|
||||
|
||||
const TOTAL_SEARCH_RESULTS = 2
|
||||
|
||||
export const localGoogleSearch = async (query: string) => {
|
||||
await chromeRunTime(
|
||||
@@ -40,23 +41,18 @@ export const localGoogleSearch = async (query: string) => {
|
||||
(result) => {
|
||||
const title = result.querySelector("h3")?.textContent
|
||||
const link = result.querySelector("a")?.getAttribute("href")
|
||||
const content = Array.from(result.querySelectorAll("span")).map((span) => span.textContent).join(" ")
|
||||
const content = Array.from(result.querySelectorAll("span"))
|
||||
.map((span) => span.textContent)
|
||||
.join(" ")
|
||||
return { title, link, content }
|
||||
}
|
||||
)
|
||||
const filteredSearchResults = searchResults
|
||||
.filter(
|
||||
(result) =>
|
||||
!result.link ||
|
||||
!BLOCKED_HOSTS.some((host) => result.link.includes(host))
|
||||
)
|
||||
.filter((result) => result.title && result.link)
|
||||
return filteredSearchResults
|
||||
return searchResults
|
||||
}
|
||||
|
||||
|
||||
export const webSearch = async (query: string) => {
|
||||
export const webGoogleSearch = async (query: string) => {
|
||||
const results = await localGoogleSearch(query)
|
||||
const TOTAL_SEARCH_RESULTS = await totalSearchResults()
|
||||
const searchResults = results.slice(0, TOTAL_SEARCH_RESULTS)
|
||||
|
||||
const isSimpleMode = await getIsSimpleInternetSearch()
|
||||
@@ -71,7 +67,7 @@ export const webSearch = async (query: string) => {
|
||||
})
|
||||
}
|
||||
|
||||
const docs: Document<Record<string, any>>[] = [];
|
||||
const docs: Document<Record<string, any>>[] = []
|
||||
for (const result of searchResults) {
|
||||
const loader = new PageAssistHtmlLoader({
|
||||
html: "",
|
||||
@@ -89,14 +85,14 @@ export const webSearch = async (query: string) => {
|
||||
const embeddingModle = await defaultEmbeddingModelForRag()
|
||||
const ollamaEmbedding = new OllamaEmbeddings({
|
||||
model: embeddingModle || "",
|
||||
baseUrl: cleanUrl(ollamaUrl),
|
||||
baseUrl: cleanUrl(ollamaUrl)
|
||||
})
|
||||
|
||||
const chunkSize = await defaultEmbeddingChunkSize();
|
||||
const chunkOverlap = await defaultEmbeddingChunkOverlap();
|
||||
const chunkSize = await defaultEmbeddingChunkSize()
|
||||
const chunkOverlap = await defaultEmbeddingChunkOverlap()
|
||||
const textSplitter = new RecursiveCharacterTextSplitter({
|
||||
chunkSize,
|
||||
chunkOverlap,
|
||||
chunkOverlap
|
||||
})
|
||||
|
||||
const chunks = await textSplitter.splitDocuments(docs)
|
||||
@@ -105,7 +101,6 @@ export const webSearch = async (query: string) => {
|
||||
|
||||
await store.addDocuments(chunks)
|
||||
|
||||
|
||||
const resultsWithEmbeddings = await store.similaritySearch(query, 3)
|
||||
|
||||
const searchResult = resultsWithEmbeddings.map((result) => {
|
||||
@@ -116,4 +111,4 @@ export const webSearch = async (query: string) => {
|
||||
})
|
||||
|
||||
return searchResult
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,42 +1,61 @@
|
||||
import { getWebSearchPrompt } from "~/services/ollama"
|
||||
import { webSearch } from "./local-google"
|
||||
import { webGoogleSearch } from "./local-google"
|
||||
import { webDuckDuckGoSearch } from "./local-duckduckgo"
|
||||
import { getSearchProvider } from "@/services/search"
|
||||
|
||||
/**
 * Extracts the hostname from a URL string.
 *
 * @param url - The URL to parse.
 * @returns The hostname, or an empty string when `url` cannot be parsed.
 */
const getHostName = (url: string) => {
  try {
    return new URL(url).hostname
  } catch (e) {
    // `new URL` throws on relative or malformed input; treat that as "no host".
    return ""
  }
}
|
||||
|
||||
/**
 * Dispatches a web search to the implementation for the given provider.
 *
 * @param provider - Provider identifier; "duckduckgo" selects DuckDuckGo and
 *   any other value falls back to Google.
 * @param query - The search text forwarded to the provider.
 * @returns The promise returned by the selected provider's search function.
 */
const searchWeb = (provider: string, query: string) => {
  if (provider === "duckduckgo") {
    return webDuckDuckGoSearch(query)
  }

  // Google is the default for every other provider value.
  return webGoogleSearch(query)
}
|
||||
|
||||
/**
 * Builds the web-search system prompt for `query`.
 *
 * Searches with the configured provider, renders each hit as a
 * `<result source="..." id="...">` element, and substitutes the rendered
 * results and the current local date/time into the web search prompt template.
 *
 * @param query - The user's search text.
 * @returns `{ prompt, source }` where `source` lists each hit's URL, hostname
 *   and the type "url". On any failure the error is logged and
 *   `{ prompt: "", source: [] }` is returned.
 */
export const getSystemPromptForWeb = async (query: string) => {
  try {
    const searchProvider = await getSearchProvider()
    const search = await searchWeb(searchProvider, query)

    const search_results = search
      .map(
        (result, idx) =>
          `<result source="${result.url}" id="${idx}">${result.content}</result>`
      )
      .join("\n")

    const current_date_time = new Date().toLocaleString()

    const system = await getWebSearchPrompt()

    // Replacer functions are used so that `$&`, `$'`, `$$` and similar
    // sequences in scraped page content are inserted literally instead of
    // being interpreted as replacement patterns.
    const prompt = system
      .replace("{current_date_time}", () => current_date_time)
      .replace("{search_results}", () => search_results)

    return {
      prompt,
      source: search.map((result) => {
        return {
          url: result.url,
          name: getHostName(result.url),
          type: "url"
        }
      })
    }
  } catch (e) {
    console.error(e)
    return {
      prompt: "",
      source: []
    }
  }
}
|
||||
|
||||
Reference in New Issue
Block a user