import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import { getIsSimpleInternetSearch, totalSearchResults } from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import type { IodRegistryEntry } from "~/types/iod"
import { PageAssitDatabase } from "@/db"
import exp from "constants"
import { Segment, useDefault, cnPOSTag, enPOSTag} from 'segmentit';

// NOTE(review): the text this file was recovered from had every `<...>` span
// stripped. Generic arguments below (`Promise<...>`, `Set<string>`,
// `Document[]`) were re-added from how the values are used — confirm against
// version control. `updateDialog` lost real code; see the note inside it.

const segment = useDefault(new Segment());

/**
 * Splits `input` into words with the `segmentit` segmenter.
 *
 * Logs every segmented word together with its English part-of-speech tag, then
 * returns only the words longer than one character.
 *
 * @param input Text to segment.
 * @returns The segmented words whose length is greater than 1.
 */
export const tokenizeInput = function (input: string): string[] {
  const words = segment.doSegment(input, { simple: false });
  console.log(words.map(function(word){return {w:word.w, p:enPOSTag(word.p)}}) );
  return words.filter(word =>( word.w.length > 1)).map(word=>word.w);
}

//doipUrl = tcp://reg01.public.internetofdata.cn:21037
/**
 * Endpoints and object identifiers used by the request builders in this file:
 * `gatewayUrl` is sent as `doipUrl`, `registry` is the target of searches,
 * `localRepository` is the target of `Update`, and `doBrowser` is the HTTP URL
 * every request is POSTed to.
 */
export const iodConfig = {
  "gatewayUrl": "tcp://reg01.public.internetofdata.cn:21037",
  "registry":"data/Registry",
  "localRepository":"data/Repository",
  "doBrowser":"http://021.node.internetapi.cn:21030/SCIDE/SCManager"
}

/**
 * Same shape as `iodConfig` but pointing at 127.0.0.1. Not referenced by the
 * code in this file.
 */
export const iodConfigLocal = {
  "gatewayUrl": "tcp://127.0.0.1:21036",
  "registry":"bdware/Registry",
  "localRepository":"bdtest.local/myrepo1",
  "doBrowser":"http://127.0.0.1:21030/SCIDE/SCManager"
}

/** Keywords that are dropped from array-form searches. */
const GREP_LIST: ReadonlySet<string> = new Set(
  "什么|问题|需要|合适|设计|考虑|合作|精度|传感器|最新|研究|药物".split("|")
)

/**
 * Tells whether `str` is one of the filtered keywords.
 *
 * Uses exact membership. The previous `indexOf` on the `|`-joined string also
 * matched fragments of an entry (e.g. "传感"), the separator itself and the
 * empty string.
 */
function inGrepList(str: string): boolean {
  return GREP_LIST.has(str)
}

/** One entry of the `searchMode` list sent with a registry search. */
type SearchClause = { key: string; type: "MUST" | "SHOULD"; value: string }

/**
 * Builds the `description` clauses for a search.
 *
 * A single string becomes one MUST clause. An array becomes one SHOULD clause
 * per element, skipping elements found in the grep list.
 */
function makeDescriptionClauses(keyword: string | string[]): SearchClause[] {
  if (typeof keyword === "string") {
    return [{ key: "description", type: "MUST", value: keyword }]
  }
  if (Array.isArray(keyword)) {
    return keyword
      .filter((str) => !inGrepList(str))
      .map((str): SearchClause => ({ key: "description", type: "SHOULD", value: str }))
  }
  return []
}

/** Wraps a `searchMode` list in the `Search` request sent to the registry. */
function makeSearchRequest(count: number, searchMode: SearchClause[]) {
  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: iodConfig.registry,
      //doipUrl:"tcp://127.0.0.1:21039",
      doipUrl: iodConfig.gatewayUrl,
      op: "Search",
      vars: {
        timeout: 15000
      },
      attributes: {
        offset: 0,
        count,
        bodyBase64Encoded: false,
        searchMode: searchMode
      },
      body: ""
    }
  }
}

/**
 * Builds a registry search restricted to one `data_type`.
 *
 * @param count Value sent as `attributes.count`.
 * @param keyword A single string (one MUST clause) or a list of keywords (one
 *   SHOULD clause each, grep-list entries skipped).
 * @param dataType Value of the leading `data_type` MUST clause.
 * @returns The request payload to POST to `iodConfig.doBrowser`.
 */
export const makeSearchParamsWithDataType = function(count: number, keyword: string| string[], dataType: string){
  const searchMode: SearchClause[] = [
    { key: "data_type", type: "MUST", value: dataType },
    ...makeDescriptionClauses(keyword)
  ]
  return makeSearchRequest(count, searchMode)
}

/**
 * Builds a registry search over `description` only (no `data_type` clause).
 *
 * @param count Value sent as `attributes.count`.
 * @param keyword A single string (one MUST clause) or a list of keywords (one
 *   SHOULD clause each, grep-list entries skipped).
 * @returns The request payload to POST to `iodConfig.doBrowser`.
 */
export const makeRegSearchParams = function(count: number, keyword: string| string[]){
  return makeSearchRequest(count, makeDescriptionClauses(keyword))
}

/**
 * Builds a generic DOIP request payload.
 *
 * @param doId Identifier placed in `arg.id`.
 * @param op Operation name placed in `arg.op` (this file uses "Retrieve" and
 *   "Update").
 * @param attributes Object placed in `arg.attributes`.
 * @param requestBody String placed in `arg.body`.
 */
export const makeDOIPParams = (doId:string, op:string, attributes:Object, requestBody: string) => ({
  action: "executeContract",
  contractID: "BDBrowser",
  operation: "sendRequestDirectly",
  arg: {
    id: doId,
    doipUrl: iodConfig.gatewayUrl,
    op: op,
    attributes: attributes,
    body: requestBody
  }
})

/**
 * Retrieves one object by id and returns it as a document.
 *
 * The request is aborted after 10 seconds. `pageContent` is `result.body` of
 * the response and `metadata.traceId` is read from
 * `result.header.attributes`.
 *
 * @param doId Identifier of the object to retrieve.
 * @returns The retrieved document.
 * @throws If the request fails or is aborted, the response is not JSON, or the
 *   response has no `result.header`.
 */
export const retrieveDoc = async function(doId: string) : Promise<Document> {
  console.log("retriveDoc:"+doId)
  const params = makeDOIPParams(doId,"Retrieve",{
    bodyBase64Encoded: false
  }, "");
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    console.log("responseIn retrieveDoc:");
    console.log(response);
    const res = await response.json()
    console.log("res:");
    console.log(res.result.body);
    //TODO
    return {
      metadata:{traceId:res.result.header.attributes?.traceId},
      pageContent:res.result.body
    }
  } finally {
    // The timer is only needed while the request is in flight.
    clearTimeout(abortTimer)
  }
}

/**
 * Sends an `Update` for one dialog to `iodConfig.localRepository`.
 *
 * The request is aborted after 10 seconds.
 *
 * @param historyId Sent as the `aiDialogID` attribute.
 * @param requestBody Serialised with `JSON.stringify` and sent as the body.
 * @returns The top-level `body` field of the JSON response.
 *   NOTE(review): the other readers in this file use `res.result.body` —
 *   confirm that `res.body` is the intended field.
 */
export const updateInLocalRepo = async function(historyId: string, requestBody: Object) : Promise<any> {
  const params = makeDOIPParams(iodConfig.localRepository,"Update",{
    "aiDialogID": historyId,
    bodyBase64Encoded: false
  }, JSON.stringify(requestBody));
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  try {
    const response = await fetch(iodConfig.doBrowser, {
      method: "POST",
      body: JSON.stringify(params),
      signal: abortController.signal
    })
    const res = await response.json()
    console.log("update dialog:"+JSON.stringify(res))
    return res.body;
  } finally {
    clearTimeout(abortTimer)
  }
}

/**
 * Builds the update body for a bot message and stores it with
 * `updateInLocalRepo`.
 *
 * @param histroyId Chat history id; used to load the history and as the dialog
 *   id of the update.
 * @param botMessage Bot message whose sources are recorded.
 * @returns Whatever `updateInLocalRepo` resolves to.
 */
export const updateDialog = async function(histroyId : string, botMessage: any): Promise<any> {
  //TODO @Nex confused by Message/MessageType in ./db/index.ts!
  const db = new PageAssitDatabase()
  const chatHistory = await db.getChatHistory(histroyId)
  let userMessage = null;

  // NOTE(review): SOURCE IS DAMAGED HERE. Everything between `for (var i=0;i`
  // and the first `.map((r) =>` was lost. That span held the loop over
  // `chatHistory` (presumably selecting `userMessage`), the declaration of
  // `updateBody`, and any fields set before the source lists. None of it can
  // be recovered from the visible text — restore it from version control.
  // Only the statements that are still visible are reproduced below, which
  // leaves `chatHistory` and `userMessage` unused.
  const updateBody: any = {}

  // NOTE(review): the property name and the `botMessage` field of this first
  // list were part of the lost span. `webSources` is an assumption based on
  // the parallel `IoDSources` / `iodSources` pair — confirm.
  updateBody.webSources = botMessage.webSources?.map((r: any) => ({
    url: r.url,
    tokenCount: r.url.length,
    content: r.url,
    traceId: r?.traceId
  })) ?? [];

  updateBody.IoDSources = botMessage.iodSources?.map((r: any) => {
    const text = r.content || r.description
    return {
      id: r.doId,
      tokenCount: text ? calculateTokenCount(text) : 0,
      content: text,
      traceId: r?.traceId
    }
  }) ?? [];

  console.log("updateBody:");
  console.log(updateBody)
  return updateInLocalRepo(histroyId,updateBody)
}

/**
 * Searches the registry for `keywords` in three categories ("data",
 * "scenario", "organization") and merges the hits.
 *
 * All three requests share one 10-second abort timer. A category whose
 * request fails, whose status is not "Success", whose body is not JSON, or
 * whose `code` is not 0 is skipped; the other categories are still returned.
 * Each entry gets `url` defaulted from `pdf_url` and `doId` from `doid`.
 * Entries without a `doId` and repeated `doId`s are dropped.
 *
 * @param query Unused here; kept for the existing call signature.
 * @param keywords Keywords passed to the search builders.
 * @returns De-duplicated entries in response order, or `[]` on an unexpected
 *   error.
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<IodRegistryEntry[]> {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()

  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), 10000);

  const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keywords);
  const dataParams = makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS,keywords, "data");
  const scenarioParams = makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS,keywords, "scenario");
  const orgParams = makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS,keywords, "organization");

  try {
    console.log('params------->',params)

    const post = async (payload: unknown): Promise<any> => {
      const response = await fetch(iodConfig.doBrowser, {
        method: "POST",
        body: JSON.stringify(payload),
        signal: abortController.signal
      })
      return response.json()
    }

    //TODO @Zhaoweijie, these three are the search requests for data, scenarios and teams respectively.
    // allSettled: one failing category must not discard the other two.
    const outcomes = await Promise.allSettled([
      post(dataParams),
      post(scenarioParams),
      post(orgParams)
    ]);

    const seenDoIds = new Set<string>();
    const prunedResults: IodRegistryEntry[] = [];

    for (const outcome of outcomes) {
      if (outcome.status === "rejected") {
        console.log(outcome.reason);
        continue;
      }
      const res = outcome.value;

      // Check the top-level status; skip failed requests.
      if (res?.status !== "Success") {
        continue;
      }

      let body;
      try {
        body = JSON.parse(res.result.body);
      } catch (e) {
        console.warn("Failed to parse result.body as JSON", e);
        continue;
      }

      if (body.code !== 0) {
        continue;
      }

      const entries: IodRegistryEntry[] = body.data?.results || [];
      for (const r of entries) {
        // Data cleaning: fill in url and doId from their alternate fields.
        r.url = r.url || r.pdf_url;
        r.doId = r.doId || r.doid;

        // Keep the first entry for each doId.
        if (r.doId && !seenDoIds.has(r.doId)) {
          seenDoIds.add(r.doId);
          prunedResults.push(r);
        }
      }
    }

    return prunedResults;
  } catch (e) {
    console.log(e);
    return [];
  } finally {
    clearTimeout(abortTimer);
  }
}

const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
const ARXIV_NO_HTM = "No HTML for"

/**
 * Runs the registry search and, unless simple search mode is on, loads the
 * content behind every hit.
 *
 * For a hit with a `doId` the object is retrieved, and the hit's `description`
 * and `traceId` are overwritten from the retrieved document. For other hits
 * the `url` is loaded: arXiv links are tried as HTML first and fall back to
 * the PDF or plain HTML loader. Loader failures are logged and the hit is
 * kept.
 *
 * NOTE(review): the loaded `docs` are only consumed by the embedding pipeline
 * that is commented out at the end, so today the only visible effect of the
 * loop is the `description` / `traceId` update.
 *
 * @param query Forwarded to `localIodSearch`.
 * @param keywords Forwarded to `localIodSearch`.
 * @returns The search hits (mutated in place as described above).
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)

  const isSimpleMode = await getIsSimpleInternetSearch()
  console.log("searchMode:"+isSimpleMode+"\n kw:"+JSON.stringify(keywords)+"\n"+" ->searchResult:\n"+JSON.stringify(searchResults))
  console.log("pruned Search Result:"+JSON.stringify(searchResults.map(r=>r.doId+" "+r.name)))

  if (isSimpleMode) {
    await getOllamaURL()
    return searchResults
  }

  const docs: Document[] = []
  for (const result of searchResults) {
    const url = result.url
    if (result.doId){
      //TODO !!!!@Nex traceId should be the id of history/question!
      try {
        const docFromRetrieve = await retrieveDoc(result.doId);
        console.log("doc from Retrieve:"+result.doId+" -->"+JSON.stringify(docFromRetrieve))
        docs.push(docFromRetrieve)
        result.description = docFromRetrieve.pageContent;
        result.traceId = docFromRetrieve.metadata?.traceId;
      } catch (e) {
        // Same policy as the URL loaders below: log, keep the hit, move on,
        // so one unreachable object does not reject the whole search.
        console.log(e)
      }
      continue;
    }
    if (!url) {
      continue;
    }
    let htmlUrl = ""
    if (ARXIV_URL_PATTERN.test(url)) {
      htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
    }

    let noHtml = htmlUrl === ""
    if (!noHtml) {
      const loader = new PageAssistHtmlLoader({
        html: "",
        url: htmlUrl
      })

      try {
        const documents = await loader.loadByURL()
        for (const doc of documents) {
          if (doc.pageContent.includes(ARXIV_NO_HTM)) {
            noHtml = true
            // `break`, not `return`: a bare return left searchIod with
            // `undefined` instead of falling back to the PDF loader.
            break
          }
          docs.push(doc)
        }
      } catch (e) {
        console.log(e)
        noHtml = true
      }
    }

    if (noHtml) {
      if (url.endsWith(".pdf")) {
        const loader = new PageAssistPDFUrlLoader({
          name: result.name,
          url
        })

        try {
          const documents = await loader.load()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      } else {
        const loader = new PageAssistHtmlLoader({
          html: "",
          url
        })

        try {
          const documents = await loader.loadByURL()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      }
    }
  }

  return searchResults

  // Retired embedding pipeline, kept for reference. It expects a `resMap`
  // lookup from URL to search hit, which this function no longer builds.
  /*
  const ollamaUrl = await getOllamaURL()
  const embeddingModle = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
    model: embeddingModle || "",
    baseUrl: cleanUrl(ollamaUrl)
  })

  const textSplitter = await getPageAssistTextSplitter()

  const chunks = await textSplitter.splitDocuments(docs)
  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)

  const resultsWithEmbeddings = await store.similaritySearch(query, 3)

  const searchResult = resultsWithEmbeddings.map((result) => {
    // `source` for PDF type
    const key = result.metadata.url || result.metadata.source
    if (!key) return null
    const fullRes = resMap[key]
    return {
      ...fullRes,
      content: result.pageContent
    }
  }).filter((r) => r)

  return searchResult
  */
}

/**
 * Returns the number of bytes in the UTF-8 encoding of `str`.
 *
 * Despite the name this is a byte count, not a model token count.
 */
export const calculateTokenCount = function(str:string){
  const byteArray = new TextEncoder().encode(str);
  return byteArray.length;
}