206 lines
5.3 KiB
TypeScript
206 lines
5.3 KiB
TypeScript
import { cleanUrl } from "@/libs/clean-url"
|
|
import { PageAssistHtmlLoader } from "@/loader/html"
|
|
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
|
|
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
|
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
|
import {
|
|
getIsSimpleInternetSearch,
|
|
totalSearchResults
|
|
} from "@/services/search"
|
|
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
|
import type { Document } from "@langchain/core/documents"
|
|
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
|
import type { IodRegistryEntry } from "~/types/iod"
|
|
|
|
/**
 * Builds the JSON payload that `localIodSearch` posts to the registry
 * endpoint for a single keyword.
 *
 * @param count   Value sent as `attributes.count`.
 * @param keyword Value the `description` clause must match.
 * @returns A plain object ready to be passed to `JSON.stringify`.
 */
const makeRegSearchParams = (count: number, keyword: string) => {
  // Both clauses are marked "MUST". No clause on "title" is sent; only the
  // data type and the description are constrained.
  const searchMode = [
    { key: "data_type", type: "MUST", value: "paper" },
    { key: "description", type: "MUST", value: keyword }
  ]

  const attributes = {
    offset: 0,
    count,
    bodyBase64Encoded: false,
    searchMode
  }

  const arg = {
    id: "670E241C9937B3537047C87053E3AA36",
    doipUrl: "tcp://reg01.public.internetofdata.cn:21037",
    op: "Search",
    attributes,
    body: ""
  }

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg
  }
}
|
|
|
|
/**
 * Queries the registry once per keyword, in parallel, and returns all hits
 * concatenated in keyword order.
 *
 * Each request is aborted after 10 seconds. Any failure for a keyword
 * (network error, abort, non-"Success" status, non-zero body code, invalid
 * JSON) is logged and contributes an empty list, so this function does not
 * reject because of a single bad request.
 *
 * @param query    Not used by this function; kept for interface compatibility.
 * @param keywords Keywords to search for; one request is sent per keyword.
 * @returns Entries that carry a `url` or `pdf_url`. When `url` is missing it
 *          is filled in from `pdf_url`.
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<IodRegistryEntry[]> {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()

  // Runs the search for one keyword; never rejects.
  const searchKeyword = async (
    keyword: string
  ): Promise<IodRegistryEntry[]> => {
    const abortController = new AbortController()
    const timer = setTimeout(() => abortController.abort(), 10000)

    try {
      const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keyword)

      const response = await fetch("http://47.93.156.31:21033/SCIDE/SCManager", {
        method: "POST",
        body: JSON.stringify(params),
        signal: abortController.signal
      })
      const res = await response.json()

      if (res.status !== "Success") {
        console.log(res)
        return []
      }

      // The payload of interest arrives as a JSON string inside the envelope.
      const body = JSON.parse(res.result.body)
      if (body.code !== 0) {
        console.log(body)
        return []
      }

      const results: IodRegistryEntry[] =
        body.data?.results?.filter((r) => r.url || r.pdf_url) || []
      for (const r of results) {
        r.url = r.url || r.pdf_url
      }
      return results
    } catch (e) {
      console.log(e)
      return []
    } finally {
      // Without this the timer stays pending (and later calls abort()) even
      // though the request has already finished.
      clearTimeout(timer)
    }
  }

  const perKeyword = await Promise.all(
    keywords.map((keyword) => searchKeyword(keyword))
  )

  return perKeyword.flat()
}
|
|
|
|
/** Matches URLs that start with `http://arxiv.org/` or `https://arxiv.org/`. */
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//

/**
 * Marker text looked for in a loaded arXiv HTML page; `searchIod` treats a
 * page containing it as having no usable HTML version.
 */
const ARXIV_NO_HTM = "No HTML for"
|
|
|
|
/**
 * Searches the registry for `keywords` and, unless simple search mode is on,
 * ranks the content of the hits against `query`.
 *
 * Simple mode: returns the registry entries from `localIodSearch` unchanged.
 *
 * Full mode: loads every hit's page (for arXiv URLs the HTML rendition is
 * tried first, then the PDF / original URL), splits the loaded documents into
 * chunks, embeds them into an in-memory vector store and returns the 3 chunks
 * most similar to `query`. Each returned item is the originating registry
 * entry (when it can be identified) plus a `content` field holding the chunk
 * text. Load failures for individual hits are logged and skipped.
 *
 * @param query    Text the loaded chunks are ranked against.
 * @param keywords Keywords forwarded to `localIodSearch`.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()

  if (isSimpleMode) {
    // Kept from the original implementation; the returned URL is not used here.
    await getOllamaURL()
    return searchResults
  }

  const docs: Document<Record<string, any>>[] = []
  // Maps whatever a loaded document may report in `metadata.url` /
  // `metadata.source` back to the registry entry it came from.
  const resMap = new Map<string, IodRegistryEntry>()

  for (const result of searchResults) {
    const url = result.url
    if (!url) continue

    // arXiv PDF links have an HTML counterpart at /html/<id>.
    const htmlUrl = ARXIV_URL_PATTERN.test(url)
      ? url.replace("/pdf/", "/html/").replace(".pdf", "")
      : ""

    resMap.set(url, result)
    if (htmlUrl) resMap.set(htmlUrl, result)
    // NOTE(review): the PDF loader is given `name`, and the lookup below falls
    // back to `metadata.source` for PDFs — presumably that is the name;
    // confirm against PageAssistPDFUrlLoader.
    if (typeof result.name === "string" && result.name && !resMap.has(result.name)) {
      resMap.set(result.name, result)
    }

    let noHtml = htmlUrl === ""
    if (!noHtml) {
      const loader = new PageAssistHtmlLoader({
        html: "",
        url: htmlUrl
      })

      try {
        const documents = await loader.loadByURL()
        if (documents.some((doc) => doc.pageContent.includes(ARXIV_NO_HTM))) {
          // Placeholder page: keep none of it and fall back below. (Previously
          // a `return` here ended the whole search with `undefined`.)
          noHtml = true
        } else {
          docs.push(...documents)
        }
      } catch (e) {
        console.log(e)
        noHtml = true
      }
    }

    if (noHtml) {
      try {
        if (url.endsWith(".pdf")) {
          const loader = new PageAssistPDFUrlLoader({
            name: result.name,
            url
          })
          const documents = await loader.load()
          for (const doc of documents) {
            docs.push(doc)
          }
        } else {
          const loader = new PageAssistHtmlLoader({
            html: "",
            url
          })
          const documents = await loader.loadByURL()
          for (const doc of documents) {
            docs.push(doc)
          }
        }
      } catch (e) {
        // Best effort: one unreachable hit must not fail the whole search.
        console.log(e)
      }
    }
  }

  const ollamaUrl = await getOllamaURL()

  const embeddingModel = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModel || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)

  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  const searchResult = resultsWithEmbeddings
    .map((result) => {
      // `source` for PDF type
      const key = result.metadata.url || result.metadata.source
      if (!key) return null
      // Map entries must be read with get(); bracket access is always undefined.
      const fullRes = resMap.get(key)
      return {
        ...fullRes,
        content: result.pageContent
      }
    })
    .filter((r) => r)

  return searchResult
}
|