feat: add IoD search

This commit is contained in:
Nex Zhu
2025-02-14 18:17:12 +08:00
parent 691575e449
commit e8471f1802
33 changed files with 524 additions and 104 deletions

148
src/web/iod.ts Normal file
View File

@@ -0,0 +1,148 @@
import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import type { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
/**
 * Builds the JSON payload for one "Search" request sent through the
 * `BDBrowser` contract.
 *
 * @param count   Value placed in `attributes.count` (the offset is always 0).
 * @param keyword Text that the `description` field MUST match.
 * @returns The request object, ready to be serialized with `JSON.stringify`.
 */
const makeRegSearchParams = (count: number, keyword: string) => {
  // Only entries whose data_type is "paper" are requested.
  const dataTypeClause = {
    key: "data_type",
    type: "MUST",
    value: "paper"
  }

  // The keyword is matched against the description; matching on "title"
  // was tried in the original code and is currently disabled.
  const descriptionClause = {
    key: "description",
    type: "MUST",
    value: keyword
  }

  const attributes = {
    offset: 0,
    count,
    bodyBase64Encoded: false,
    searchMode: [dataTypeClause, descriptionClause]
  }

  const arg = {
    id: "670E241C9937B3537047C87053E3AA36",
    doipUrl: "tcp://reg01.public.internetofdata.cn:21037",
    op: "Search",
    attributes,
    body: ""
  }

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg
  }
}
/**
 * Sends one search request per keyword to the IoD search endpoint (all
 * requests run in parallel) and returns the combined list of hits.
 *
 * Each request is aborted after 10 seconds. A request that fails, times out,
 * or returns a non-success status is logged and contributes no hits, so one
 * bad keyword never rejects the whole search.
 *
 * Only hits carrying a `url` or a `pdf_url` are kept; when `url` is missing it
 * is filled in from `pdf_url`.
 *
 * @param query    Currently unused; kept so the signature stays stable for callers.
 * @param keywords Keywords to search for, one request each.
 * @returns The flattened hits of all keywords, in keyword order.
 */
export const localIodSearch = async (query: string, keywords: string[]) => {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()
  const results = (
    await Promise.all(
      keywords.map(async (keyword) => {
        const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keyword)
        const abortController = new AbortController()
        const timeoutId = setTimeout(() => abortController.abort(), 10000)
        try {
          const response = await fetch(
            "http://47.93.156.31:21033/SCIDE/SCManager",
            {
              method: "POST",
              body: JSON.stringify(params),
              signal: abortController.signal
            }
          )
          const res = await response.json()
          if (res.status !== "Success") {
            console.log(res)
            return []
          }
          // The actual search response is a JSON string nested in the envelope.
          const body = JSON.parse(res.result.body)
          if (body.code !== 0) {
            console.log(body)
            return []
          }
          const hits =
            body.data?.results?.filter(
              (r: { url?: string; pdf_url?: string }) => r.url || r.pdf_url
            ) || []
          hits.forEach((r: { url?: string; pdf_url?: string }) => {
            r.url = r.url || r.pdf_url
          })
          return hits
        } catch (e) {
          console.log(e)
          return []
        } finally {
          // Release the timer once the request has settled, so it neither
          // lingers for the full 10 s nor aborts an already finished request.
          clearTimeout(timeoutId)
        }
      })
    )
  ).flat()
  return results
}
// Matches HTTPS URLs on the arxiv.org host. The dot is escaped so that only
// the literal host name matches, not any character in that position.
const ARXIV_URL = /^https:\/\/arxiv\.org\//
/**
 * Searches IoD for the given keywords and returns material for answering
 * `query`.
 *
 * - In simple internet-search mode the raw hits from `localIodSearch` are
 *   returned unchanged.
 * - Otherwise every hit's page is loaded, split into chunks, embedded into an
 *   in-memory vector store, and the 3 chunks most similar to `query` are
 *   returned as `{ url, content }` objects.
 *
 * @param query    The user question, used for the similarity search.
 * @param keywords Keywords forwarded to `localIodSearch`.
 * @returns Raw hits (simple mode) or up to 3 `{ url, content }` chunks.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()
  if (isSimpleMode) {
    // NOTE(review): the returned URL is not used here; the call is kept as in
    // the original in case it has a needed side effect — confirm.
    await getOllamaURL()
    return searchResults
  }

  // Nothing to load or embed: skip the embedding setup entirely.
  if (searchResults.length === 0) {
    return []
  }

  const docs: Document<Record<string, any>>[] = []
  for (const result of searchResults) {
    let url = result.url
    if (ARXIV_URL.test(result.url)) {
      // Load the arXiv abstract page instead of the PDF link.
      url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "")
    }

    const loader = new PageAssistHtmlLoader({
      html: "",
      url
    })
    const documents = await loader.loadByURL()
    docs.push(...documents)
  }

  const ollamaUrl = await getOllamaURL()
  const embeddingModel = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModel || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
  return resultsWithEmbeddings.map((result) => ({
    url: result.metadata.url,
    content: result.pageContent
  }))
}

View File

@@ -8,6 +8,7 @@ import { getWebsiteFromQuery, processSingleWebsite } from "./website"
import { searxngSearch } from "./search-engines/searxng"
import { braveAPISearch } from "./search-engines/brave-api"
import { webBaiduSearch } from "./search-engines/baidu"
import { searchIod } from "./iod"
const getHostName = (url: string) => {
try {
@@ -37,30 +38,66 @@ const searchWeb = (provider: string, query: string) => {
}
}
export const getSystemPromptForWeb = async (query: string) => {
export const getSystemPromptForWeb = async (
query: string,
keywords: string[] = [],
webSearch = true,
iodSearch = false
) => {
try {
const websiteVisit = getWebsiteFromQuery(query)
let search: {
url: any;
content: string;
let webSearchResults: {
url: any
content: string
}[] = []
// let search_results_web = ""
const isVisitSpecificWebsite = await getIsVisitSpecificWebsite()
if (webSearch) {
const isVisitSpecificWebsite = await getIsVisitSpecificWebsite()
if (isVisitSpecificWebsite && websiteVisit.hasUrl) {
if (isVisitSpecificWebsite && websiteVisit.hasUrl) {
const url = websiteVisit.url
const queryWithoutUrl = websiteVisit.queryWithouUrls
webSearchResults = await processSingleWebsite(url, queryWithoutUrl)
} else {
const searchProvider = await getSearchProvider()
webSearchResults = await searchWeb(searchProvider, query)
}
const url = websiteVisit.url
const queryWithoutUrl = websiteVisit.queryWithouUrls
search = await processSingleWebsite(url, queryWithoutUrl)
} else {
const searchProvider = await getSearchProvider()
search = await searchWeb(searchProvider, query)
// search_results_web = webSearchResults
// .map(
// (result, idx) =>
// `<result source="${result.url}" id="${idx}">${result.content}</result>`
// )
// .join("\n")
}
let iodSearchResults: {
doId: string
name: string
url?: string
// pdf_url?: string
description: string
}[] = []
// let search_results_iod = ""
const search_results = search
if (iodSearch) {
iodSearchResults = await searchIod(query, keywords)
// search_results_iod = iodSearchResults
// .map(
// (result, idx) =>
// `<result source="${result.url}" id="${idx}">${result.content}</result>`
// )
// .join("\n")
}
const search_results = iodSearchResults.map((res) => ({
url: `${res.doId}: ${res.name}`,
content: res.description
}))
.concat(
webSearchResults
)
.map(
(result, idx) =>
`<result source="${result.url}" id="${idx}">${result.content}</result>`
@@ -77,13 +114,14 @@ export const getSystemPromptForWeb = async (query: string) => {
return {
prompt,
source: search.map((result) => {
webSources: webSearchResults.map((result) => {
return {
url: result.url,
name: getHostName(result.url),
type: "url"
}
})
}),
iodSources: iodSearchResults,
}
} catch (e) {
console.error(e)