feat: add IoD search
This commit is contained in:
148
src/web/iod.ts
Normal file
148
src/web/iod.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import { cleanUrl } from "@/libs/clean-url"
|
||||
import { PageAssistHtmlLoader } from "@/loader/html"
|
||||
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
||||
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
||||
import {
|
||||
getIsSimpleInternetSearch,
|
||||
totalSearchResults
|
||||
} from "@/services/search"
|
||||
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
||||
import type { Document } from "@langchain/core/documents"
|
||||
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||
|
||||
/**
 * Builds the JSON payload for a DOIP "Search" operation that is relayed
 * through the `BDBrowser` contract's `sendRequestDirectly` operation.
 *
 * Two `MUST` conditions are sent: `data_type` must be "paper" and
 * `description` must match `keyword`. A `MUST` condition on `title` was
 * tried at some point and is currently disabled, so it is not part of the
 * payload.
 *
 * @param count   Value sent as `attributes.count` (presumably the maximum
 *                number of results — confirm against the registry API).
 * @param keyword Term placed in the `description` condition.
 * @param offset  Value sent as `attributes.offset`; defaults to 0, which is
 *                what every request used before this parameter existed.
 * @returns A plain object suitable for `JSON.stringify`.
 */
const makeRegSearchParams = (count: number, keyword: string, offset = 0) => ({
  action: "executeContract",
  contractID: "BDBrowser",
  operation: "sendRequestDirectly",
  arg: {
    id: "670E241C9937B3537047C87053E3AA36",
    doipUrl: "tcp://reg01.public.internetofdata.cn:21037",
    op: "Search",
    attributes: {
      offset,
      count,
      bodyBase64Encoded: false,
      searchMode: [
        {
          key: "data_type",
          type: "MUST",
          value: "paper"
        },
        {
          key: "description",
          type: "MUST",
          value: keyword
        }
      ]
    },
    body: ""
  }
})
|
||||
|
||||
/**
 * Sends one registry search request per keyword (all in parallel) and
 * returns every usable hit in a single flat array, in keyword order.
 *
 * Each request is aborted after 10 seconds. The search is best-effort per
 * keyword: a network error, an abort, a non-"Success" envelope, an
 * unparsable body or a non-zero `code` is logged with `console.log` and
 * contributes an empty list instead of rejecting the whole call.
 *
 * Only records that carry a `url` or a `pdf_url` are kept; on the returned
 * records `url` is always set (falling back to `pdf_url`).
 *
 * @param query    Not used by this function; kept for interface stability.
 * @param keywords Keywords to search for, one request each.
 * @returns The concatenated hits of all keywords (duplicates are not removed).
 */
export const localIodSearch = async (query: string, keywords: string[]) => {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()

  // Runs the search for a single keyword and never rejects.
  const searchByKeyword = async (keyword: string): Promise<any[]> => {
    const abortController = new AbortController()
    const abortTimer = setTimeout(() => abortController.abort(), 10000)

    try {
      const response = await fetch(
        "http://47.93.156.31:21033/SCIDE/SCManager",
        {
          method: "POST",
          body: JSON.stringify(
            makeRegSearchParams(TOTAL_SEARCH_RESULTS, keyword)
          ),
          signal: abortController.signal
        }
      )
      const res = await response.json()

      if (res.status !== "Success") {
        console.log(res)
        return []
      }

      // The actual search response is a JSON string nested in the envelope.
      const body = JSON.parse(res.result.body)
      if (body.code !== 0) {
        console.log(body)
        return []
      }

      const records = body.data?.results
      if (!Array.isArray(records)) {
        return []
      }

      return records
        .filter((r) => r.url || r.pdf_url)
        .map((r) => ({ ...r, url: r.url || r.pdf_url }))
    } catch (e) {
      console.log(e)
      return []
    } finally {
      // Without this the 10 s timer stays pending after every request,
      // including the ones that already finished.
      clearTimeout(abortTimer)
    }
  }

  const perKeyword = await Promise.all(
    keywords.map((keyword) => searchByKeyword(keyword))
  )
  return perKeyword.flat()
}
|
||||
|
||||
// Matches URLs hosted on https://arxiv.org/. The dot is escaped so that only
// the literal host "arxiv.org" matches (an unescaped "." matches any character).
const ARXIV_URL = /^https:\/\/arxiv\.org\//
|
||||
|
||||
/**
 * Searches the IoD registry and, unless simple search mode is enabled,
 * narrows the hits down to the page chunks most similar to `query`.
 *
 * Simple mode: the registry records from `localIodSearch` are returned
 * unchanged.
 *
 * Otherwise: every distinct hit URL is loaded (arXiv PDF links are rewritten
 * to their `/abs/` page first), the loaded documents are split into chunks,
 * embedded into an in-memory vector store, and the 3 chunks closest to
 * `query` are returned as `{ url, content }`. When the chunk's
 * `metadata.url` equals the URL that was loaded for a registry record, that
 * record's fields (`doId`, `name`, `description`, ...) are included as well,
 * so consumers that read those fields work in both modes.
 *
 * Page loading is best-effort: a URL that fails to load is logged and
 * skipped. If nothing could be loaded, `[]` is returned without touching the
 * embedding model.
 *
 * @param query    Text the chunks are ranked against.
 * @param keywords Keywords forwarded to `localIodSearch`.
 * @returns Registry records (simple mode) or up to 3 ranked chunks.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()

  if (isSimpleMode) {
    // NOTE(review): the resolved value is discarded; the call is kept because
    // side effects of getOllamaURL cannot be ruled out from this file.
    await getOllamaURL()
    return searchResults
  }

  const docs: Document<Record<string, any>>[] = []
  // Loaded page URL -> registry record it came from. Also used to skip URLs
  // that several keywords returned, so a page is fetched and embedded once.
  const hitByPageUrl = new Map<string, any>()

  for (const hit of searchResults) {
    let url = hit.url
    if (ARXIV_URL.test(hit.url)) {
      // Load the HTML abstract page instead of the PDF.
      url = hit.url.replace("/pdf/", "/abs/").replace(".pdf", "")
    }

    if (hitByPageUrl.has(url)) {
      continue
    }
    hitByPageUrl.set(url, hit)

    try {
      const loader = new PageAssistHtmlLoader({
        html: "",
        url
      })
      docs.push(...(await loader.loadByURL()))
    } catch (e) {
      // One unreachable page must not discard every other result.
      console.log(e)
    }
  }

  if (docs.length === 0) {
    return []
  }

  const ollamaUrl = await getOllamaURL()

  const embeddingModel = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModel || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  return resultsWithEmbeddings.map((chunk) => ({
    // Assumes the loader stores the URL it was given in metadata.url; if it
    // does not, the lookup misses and only url/content are returned.
    ...hitByPageUrl.get(chunk.metadata.url),
    url: chunk.metadata.url,
    content: chunk.pageContent
  }))
}
|
||||
@@ -8,6 +8,7 @@ import { getWebsiteFromQuery, processSingleWebsite } from "./website"
|
||||
import { searxngSearch } from "./search-engines/searxng"
|
||||
import { braveAPISearch } from "./search-engines/brave-api"
|
||||
import { webBaiduSearch } from "./search-engines/baidu"
|
||||
import { searchIod } from "./iod"
|
||||
|
||||
const getHostName = (url: string) => {
|
||||
try {
|
||||
@@ -37,30 +38,66 @@ const searchWeb = (provider: string, query: string) => {
|
||||
}
|
||||
}
|
||||
|
||||
export const getSystemPromptForWeb = async (query: string) => {
|
||||
export const getSystemPromptForWeb = async (
|
||||
query: string,
|
||||
keywords: string[] = [],
|
||||
webSearch = true,
|
||||
iodSearch = false
|
||||
) => {
|
||||
try {
|
||||
|
||||
const websiteVisit = getWebsiteFromQuery(query)
|
||||
let search: {
|
||||
url: any;
|
||||
content: string;
|
||||
let webSearchResults: {
|
||||
url: any
|
||||
content: string
|
||||
}[] = []
|
||||
// let search_results_web = ""
|
||||
|
||||
const isVisitSpecificWebsite = await getIsVisitSpecificWebsite()
|
||||
if (webSearch) {
|
||||
const isVisitSpecificWebsite = await getIsVisitSpecificWebsite()
|
||||
|
||||
if (isVisitSpecificWebsite && websiteVisit.hasUrl) {
|
||||
if (isVisitSpecificWebsite && websiteVisit.hasUrl) {
|
||||
const url = websiteVisit.url
|
||||
const queryWithoutUrl = websiteVisit.queryWithouUrls
|
||||
webSearchResults = await processSingleWebsite(url, queryWithoutUrl)
|
||||
} else {
|
||||
const searchProvider = await getSearchProvider()
|
||||
webSearchResults = await searchWeb(searchProvider, query)
|
||||
}
|
||||
|
||||
const url = websiteVisit.url
|
||||
const queryWithoutUrl = websiteVisit.queryWithouUrls
|
||||
search = await processSingleWebsite(url, queryWithoutUrl)
|
||||
|
||||
} else {
|
||||
const searchProvider = await getSearchProvider()
|
||||
search = await searchWeb(searchProvider, query)
|
||||
// search_results_web = webSearchResults
|
||||
// .map(
|
||||
// (result, idx) =>
|
||||
// `<result source="${result.url}" id="${idx}">${result.content}</result>`
|
||||
// )
|
||||
// .join("\n")
|
||||
}
|
||||
|
||||
let iodSearchResults: {
|
||||
doId: string
|
||||
name: string
|
||||
url?: string
|
||||
// pdf_url?: string
|
||||
description: string
|
||||
}[] = []
|
||||
// let search_results_iod = ""
|
||||
|
||||
const search_results = search
|
||||
if (iodSearch) {
|
||||
iodSearchResults = await searchIod(query, keywords)
|
||||
// search_results_iod = iodSearchResults
|
||||
// .map(
|
||||
// (result, idx) =>
|
||||
// `<result source="${result.url}" id="${idx}">${result.content}</result>`
|
||||
// )
|
||||
// .join("\n")
|
||||
}
|
||||
|
||||
const search_results = iodSearchResults.map((res) => ({
|
||||
url: `${res.doId}: ${res.name}`,
|
||||
content: res.description
|
||||
}))
|
||||
.concat(
|
||||
webSearchResults
|
||||
)
|
||||
.map(
|
||||
(result, idx) =>
|
||||
`<result source="${result.url}" id="${idx}">${result.content}</result>`
|
||||
@@ -77,13 +114,14 @@ export const getSystemPromptForWeb = async (query: string) => {
|
||||
|
||||
return {
|
||||
prompt,
|
||||
source: search.map((result) => {
|
||||
webSources: webSearchResults.map((result) => {
|
||||
return {
|
||||
url: result.url,
|
||||
name: getHostName(result.url),
|
||||
type: "url"
|
||||
}
|
||||
})
|
||||
}),
|
||||
iodSources: iodSearchResults,
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(e)
|
||||
|
||||
Reference in New Issue
Block a user