2025-08-20 18:36:48 +08:00

427 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html"
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import {
getIsSimpleInternetSearch,
totalSearchResults
} from "@/services/search"
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory"
import type { IodRegistryEntry } from "~/types/iod"
import { PageAssitDatabase } from "@/db"
import exp from "constants"
import { Segment, useDefault, cnPOSTag, enPOSTag} from 'segmentit';
/** Shared `segmentit` segmenter instance, created once and wrapped by `useDefault`. */
const segment = useDefault(new Segment());

/**
 * Segments `input` with the shared segmenter and returns the text of every
 * segmented word that is longer than one character.
 *
 * Before filtering, all segmented words are logged together with the value
 * `enPOSTag` returns for their `p` field.
 *
 * @param input - Text to segment.
 * @returns The `w` field of each kept word, in the order the segmenter produced them.
 */
export const tokenizeInput = function (input: string): string[] {
  const segmented = segment.doSegment(input, { simple: false });

  // Debug output: each word next to its converted tag.
  const tagged = segmented.map((entry) => ({ w: entry.w, p: enPOSTag(entry.p) }));
  console.log(tagged);

  // Keep only words longer than one character.
  return segmented
    .filter((entry) => entry.w.length > 1)
    .map((entry) => entry.w);
}
// doipUrl = tcp://reg01.public.internetofdata.cn:21037
/**
 * Endpoints and identifiers read by the request builders in this file.
 *
 * - `gatewayUrl`: sent as the `doipUrl` of every request payload.
 * - `registry`: the `id` targeted by "Search" requests.
 * - `localRepository`: the `id` targeted by "Update" requests.
 * - `doBrowser`: HTTP endpoint the payloads are POSTed to.
 */
export const iodConfig = {
  gatewayUrl: "tcp://reg01.public.internetofdata.cn:21037",
  registry: "data/Registry",
  localRepository: "data/Repository",
  doBrowser: "http://021.node.internetapi.cn:21030/SCIDE/SCManager"
}
/**
 * Variant of the IoD configuration that points at 127.0.0.1 endpoints.
 * It has the same four fields as `iodConfig`; nothing in the visible part of
 * this file reads it.
 */
export const iodConfigLocal = {
  gatewayUrl: "tcp://127.0.0.1:21036",
  registry: "bdware/Registry",
  localRepository: "bdtest.local/myrepo1",
  doBrowser: "http://127.0.0.1:21030/SCIDE/SCManager"
}
/** Words that the keyword-search builders leave out of their search clauses. */
const GREP_LIST_WORDS: ReadonlySet<string> = new Set([
  "什么", "问题", "需要", "合适", "设计", "考虑",
  "合作", "精度", "传感器", "最新", "研究", "药物"
]);

/**
 * Tells whether `str` is exactly one of the filtered words.
 *
 * Membership is an exact match. The previous substring test against the
 * pipe-joined list also reported the separator `"|"`, the empty string, and
 * fragments spanning or inside entries (e.g. `"么|问"`, `"传感"`) as listed.
 *
 * @param str - Candidate keyword.
 * @returns `true` when `str` is one of the filtered words, otherwise `false`.
 */
function inGrepList(str: string): boolean {
  return GREP_LIST_WORDS.has(str);
}
/**
 * Builds the `executeContract` payload for a registry "Search" restricted to
 * one `data_type`.
 *
 * The search clauses are, in order:
 * 1. a MUST clause on `data_type` with the value `dataType`;
 * 2. for a string `keyword`, one MUST clause on `description`;
 *    for an array, one SHOULD clause on `description` per element that
 *    `inGrepList` does not report as filtered.
 *
 * @param count - Value placed in `attributes.count`.
 * @param keyword - A single phrase or a list of keywords.
 * @param dataType - Value of the mandatory `data_type` clause.
 * @returns The request payload; the registry id and gateway URL come from `iodConfig`.
 */
export const makeSearchParamsWithDataType = function(count: number, keyword: string| string[], dataType: string){
  const clauses = [];
  clauses.push({ key: "data_type", type: "MUST", value: dataType });

  if (typeof keyword === 'string') {
    // A plain string becomes a single mandatory description clause.
    clauses.push({ key: "description", type: "MUST", value: keyword });
  } else if (Array.isArray(keyword)) {
    // An array becomes one optional description clause per non-filtered element.
    const optionalClauses = keyword
      .filter((term) => !inGrepList(term))
      .map((term) => ({ key: "description", type: "SHOULD", value: term }));
    for (const clause of optionalClauses) {
      clauses.push(clause);
    }
  }

  const attributes = {
    offset: 0,
    count,
    bodyBase64Encoded: false,
    searchMode: clauses
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg: {
      id: iodConfig.registry,
      //doipUrl:"tcp://127.0.0.1:21039",
      doipUrl: iodConfig.gatewayUrl,
      op: "Search",
      vars: { timeout: 15000 },
      attributes,
      body: ""
    }
  };
}
/**
 * Builds the `executeContract` payload for a registry "Search" on the
 * `description` field, without any data-type restriction.
 *
 * A string `keyword` yields one MUST clause. An array yields one SHOULD
 * clause per element that `inGrepList` does not report as filtered.
 *
 * @param count - Value placed in `attributes.count`.
 * @param keyword - A single phrase or a list of keywords.
 * @returns The request payload; the registry id and gateway URL come from `iodConfig`.
 */
export const makeRegSearchParams = function(count: number, keyword: string| string[]){
  const descriptionClauses = [];

  if (typeof keyword === 'string') {
    // Single phrase: the description must match it.
    descriptionClauses.push({ key: "description", type: "MUST", value: keyword });
  } else if (Array.isArray(keyword)) {
    // Keyword list: every non-filtered word is an optional match.
    const keptWords = keyword.filter((word) => !inGrepList(word));
    for (const word of keptWords) {
      descriptionClauses.push({ key: "description", type: "SHOULD", value: word });
    }
  }

  const arg = {
    id: iodConfig.registry,
    //doipUrl:"tcp://127.0.0.1:21039",
    doipUrl: iodConfig.gatewayUrl,
    op: "Search",
    vars: { timeout: 15000 },
    attributes: {
      offset: 0,
      count,
      bodyBase64Encoded: false,
      searchMode: descriptionClauses
    },
    body: ""
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg
  };
}
/**
 * Wraps a single DOIP operation in the `executeContract` envelope used by the
 * other helpers in this file.
 *
 * @param doId - Placed in `arg.id`.
 * @param op - Operation name, placed in `arg.op` (callers in this file pass "Retrieve" and "Update").
 * @param attributes - Placed unchanged in `arg.attributes`.
 * @param requestBody - Placed unchanged in `arg.body`.
 * @returns The request payload; `arg.doipUrl` is taken from `iodConfig.gatewayUrl`.
 */
export const makeDOIPParams = (doId:string, op:string, attributes:Object, requestBody: string) => {
  const arg = {
    id: doId,
    doipUrl: iodConfig.gatewayUrl,
    op,
    attributes,
    body: requestBody
  };

  return {
    action: "executeContract",
    contractID: "BDBrowser",
    operation: "sendRequestDirectly",
    arg
  };
}
/**
 * Sends a "Retrieve" request for `doId` to `iodConfig.doBrowser` and turns
 * the response into a document-shaped object.
 *
 * The request is aborted if it has not completed within 10 seconds. The
 * abort timer is cleared as soon as the request settles, so it does not stay
 * pending after the call has finished.
 *
 * @param doId - Identifier of the object to retrieve.
 * @returns An object whose `pageContent` is `result.body` of the response and
 *   whose `metadata.traceId` is `result.header.attributes.traceId` (or
 *   `undefined` when the header or its attributes are absent).
 * @throws Rejects when the request fails or is aborted, when the response is
 *   not valid JSON, or when the response carries no `result`.
 */
export const retrieveDoc = function(doId: string) : Promise<Document> {
  console.log("retriveDoc:"+doId)
  const params = makeDOIPParams(doId,"Retrieve",{
    bodyBase64Encoded: false
  }, "");
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  return fetch(iodConfig.doBrowser, {
    method: "POST",
    body: JSON.stringify(params),
    signal: abortController.signal
  })
    .then((response) => {
      console.log("responseIn retrieveDoc:");
      console.log(response);
      return response.json()
    })
    .then((res) => {
      // Fail with a descriptive error instead of a property access on undefined.
      if (res?.result == null) {
        throw new Error("retrieveDoc: response for " + doId + " has no result")
      }
      console.log("res:");
      console.log(res.result.body);
      //TODO
      return {
        metadata:{traceId:res.result.header?.attributes?.traceId},
        pageContent:res.result.body
      }
    })
    // The timer covers the body read as well; release it once everything settled.
    .finally(() => clearTimeout(abortTimer))
}
/**
 * Sends an "Update" request to the local repository (`iodConfig.localRepository`)
 * carrying `requestBody` serialized as JSON, tagged with `aiDialogID = historyId`.
 *
 * The request is aborted if it has not completed within 10 seconds. The
 * abort timer is cleared once the request settles.
 *
 * @param historyId - Sent as the `aiDialogID` attribute.
 * @param requestBody - Object serialized with `JSON.stringify` into the request body.
 * @returns The `body` of the parsed response. When the response has no
 *   top-level `body`, `result.body` is returned instead.
 * @throws Rejects when the request fails or is aborted, or when the response
 *   is not valid JSON.
 */
export const updateInLocalRepo = function(historyId: string, requestBody: Object) : Promise<string> {
  const params = makeDOIPParams(iodConfig.localRepository,"Update",{
    "aiDialogID": historyId,
    bodyBase64Encoded: false
  }, JSON.stringify(requestBody));
  const abortController = new AbortController()
  const abortTimer = setTimeout(() => abortController.abort(), 10000)
  return fetch(iodConfig.doBrowser, {
    method: "POST",
    body: JSON.stringify(params),
    signal: abortController.signal
  })
    .then((response) => response.json())
    .then((res) => {
      console.log("update dialog:"+JSON.stringify(res))
      // NOTE(review): the other callers of this endpoint in this file read
      // `res.result.body`; the top-level `body` is kept as the first choice for
      // backward compatibility — confirm which shape "Update" really returns.
      return res?.body ?? res?.result?.body;
    })
    // Release the abort timer whether the request succeeded or failed.
    .finally(() => clearTimeout(abortTimer))
}
/**
 * Builds a dialog record (question, answer, model, web sources, IoD sources)
 * for one chat history and stores it through `updateInLocalRepo`.
 *
 * The question is the first message in the history whose role is `user`.
 * When no message has that role, the last message of the history is used.
 *
 * @param histroyId - Chat history id; also the prefix of every generated id.
 * @param botMessage - The answer message. Its `id`, `content`, optional
 *   `webSources` and optional `iodSources` are read.
 * @returns Whatever `updateInLocalRepo` resolves to.
 * @throws Rejects with an `Error` when the chat history is missing or empty,
 *   and with whatever the database or `updateInLocalRepo` rejects with.
 */
export const updateDialog = async function(histroyId : string, botMessage: any): Promise<string> {
  //TODO @Nex confused by Message/MessageType in ./db/index.ts!
  const db = new PageAssitDatabase()
  const chatHistory = await db.getChatHistory(histroyId)

  // Without any message there is no question to record; fail with a clear message
  // instead of dereferencing null further down.
  if (!chatHistory || chatHistory.length === 0) {
    throw new Error("updateDialog: chat history " + histroyId + " is empty")
  }

  // NOTE(review): falling back to the last message when no user message exists
  // mirrors the earlier loop; confirm that this fallback is wanted.
  const userMessage =
    chatHistory.find((message) => message.role === 'user') ??
    chatHistory[chatHistory.length - 1];

  // !!!IMPORTANT!!! traceId = histroyId+"/"+userMessage.id;
  // Update traceId in retrieveDoc!
  const questionId = histroyId+"/"+userMessage.id;

  const updateBody:any = {};
  updateBody.traceId = questionId;
  updateBody.question = {
    "id": questionId,
    "content": userMessage.content,
    "tokenCount": userMessage.content.length
  }
  updateBody.answer = {
    "id": histroyId+"/"+botMessage.id,
    "content": botMessage.content,
    "tokenCount": botMessage.content.length
  }
  //TODO set a correct model ID
  updateBody.model = {"id":"bdware.ollama/" + userMessage.name}
  //TODO incorrect tokenCount calculated!!
  updateBody.webSources = botMessage.webSources?.map((r) => ({
    url: r.url,
    tokenCount: r.url.length,
    content: r.url,
    traceId: r.traceId
  })) ?? [];
  updateBody.IoDSources = botMessage.iodSources?.map((r) => {
    // Prefer the retrieved content, fall back to the registry description.
    const text = r.content || r.description;
    return {
      id: r.doId,
      tokenCount: text ? calculateTokenCount(text) : 0,
      content: text,
      traceId: r.traceId
    };
  }) ?? [];
  console.log("updateBody:");
  console.log(updateBody)
  return updateInLocalRepo(histroyId,updateBody)
}
/**
 * Searches the IoD registry for `keywords` in three categories (`data`,
 * `scenario`, `organization`) and returns the merged entries, de-duplicated
 * by `doId`.
 *
 * The three requests run in parallel and share one 10-second abort timer.
 * A request that fails, is aborted, reports a non-"Success" status, carries
 * an unparsable body or a non-zero `code` is skipped; results of the other
 * requests are still returned.
 *
 * Each entry is normalized in place: `url` falls back to `pdf_url` and `doId`
 * falls back to `doid`. Entries without a `doId` are dropped.
 *
 * @param query - Not used by the current implementation.
 * @param keywords - Keywords turned into search clauses.
 * @returns The de-duplicated entries, or `[]` when nothing could be collected.
 */
export async function localIodSearch(
  query: string,
  keywords: string[]
): Promise<IodRegistryEntry[]> {
  const TOTAL_SEARCH_RESULTS = await totalSearchResults()
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => abortController.abort(), 10000);

  // Only logged below for debugging; this payload is never sent.
  const params = makeRegSearchParams(TOTAL_SEARCH_RESULTS, keywords);

  //TODO @Zhaoweijie These three are the search requests for data, scenarios and teams.
  const payloads = ["data", "scenario", "organization"].map((dataType) =>
    makeSearchParamsWithDataType(TOTAL_SEARCH_RESULTS, keywords, dataType)
  );

  try {
    console.log('params------->',params)

    // allSettled keeps the answers of successful requests when another one fails.
    const settled = await Promise.allSettled(
      payloads.map((payload) =>
        fetch(iodConfig.doBrowser, {
          method: "POST",
          body: JSON.stringify(payload),
          signal: abortController.signal
        }).then((response) => response.json())
      )
    );

    const seenDoIds = new Set<string>();
    const prunedResults: IodRegistryEntry[] = [];

    for (const outcome of settled) {
      if (outcome.status === "rejected") {
        console.warn("IoD search request failed", outcome.reason);
        continue;
      }
      const res = outcome.value;

      // Check the top-level status; skip failed requests.
      if (res?.status !== "Success") {
        continue;
      }

      let body;
      try {
        body = JSON.parse(res.result.body);
      } catch (e) {
        console.warn("Failed to parse result.body as JSON", e);
        continue;
      }
      if (body?.code !== 0) {
        continue;
      }

      const entries: IodRegistryEntry[] = Array.isArray(body.data?.results)
        ? body.data.results
        : [];

      for (const r of entries) {
        // Data cleaning: fill in url and doId from their alternative fields.
        r.url = r.url || r.pdf_url;
        r.doId = r.doId || r.doid;

        // Keep the first entry seen for each doId.
        if (r.doId && !seenDoIds.has(r.doId)) {
          seenDoIds.add(r.doId);
          prunedResults.push(r);
        }
      }
    }
    return prunedResults;
  } catch (e) {
    console.log(e);
    return [];
  } finally {
    // Release the abort timer once all requests have settled.
    clearTimeout(abortTimer);
  }
}
// Matches URLs that start with http://arxiv.org/ or https://arxiv.org/.
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
// Text looked for in loaded page content; searchIod treats a page containing it
// as having no HTML version.
const ARXIV_NO_HTM = "No HTML for"
/**
 * Runs an IoD registry search and, unless simple search mode is enabled,
 * enriches the results with retrieved content.
 *
 * In simple mode the registry results are returned as-is. Otherwise, for
 * every result:
 * - with a `doId`: the object is retrieved via `retrieveDoc`, and its content
 *   and trace id overwrite `description` and `traceId` of the result. A failed
 *   retrieval is logged and leaves the result unchanged;
 * - without a `doId` but with a `url`: the page (arXiv HTML variant first,
 *   then PDF or plain HTML) is loaded into a local document list. Loader
 *   failures are logged and skipped.
 *
 * @param query - Forwarded to `localIodSearch`.
 * @param keywords - Forwarded to `localIodSearch`.
 * @returns The (possibly enriched) registry results.
 */
export const searchIod = async (query: string, keywords: string[]) => {
  const searchResults = await localIodSearch(query, keywords)
  const isSimpleMode = await getIsSimpleInternetSearch()
  console.log("searchMode:"+isSimpleMode+"\n kw:"+JSON.stringify(keywords)+"\n"+" ->searchResult:\n"+JSON.stringify(searchResults))
  console.log("pruned Search Result:"+JSON.stringify(searchResults.map(r=>r.doId+" "+r.name)))
  if (isSimpleMode) {
    await getOllamaURL()
    return searchResults
  }

  // `docs` and `resMap` are only consumed by the disabled RAG code at the end.
  const docs: Document<Record<string, any>>[] = []
  const resMap = new Map<string, IodRegistryEntry>()

  for (const result of searchResults) {
    const url = result.url

    if (result.doId){
      //TODO !!!!@Nex traceId should be the id of history/question!
      try {
        const docFromRetrieve = await retrieveDoc(result.doId);
        console.log("doc from Retrieve:"+result.doId+" -->"+JSON.stringify(docFromRetrieve))
        docs.push(docFromRetrieve)
        result.description = docFromRetrieve.pageContent;
        result.traceId = docFromRetrieve.metadata?.traceId;
      } catch (e) {
        // Best effort, like the loaders below: one failed retrieval must not
        // discard the whole search.
        console.log(e)
      }
      continue;
    }

    if (!url) {
      continue;
    }

    // arXiv PDF links have an HTML counterpart; try that first.
    let htmlUrl = ""
    if (ARXIV_URL_PATTERN.test(url)) {
      htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
    }

    let noHtml = htmlUrl === ""
    if (!noHtml) {
      const loader = new PageAssistHtmlLoader({
        html: "",
        url: htmlUrl
      })
      try {
        const documents = await loader.loadByURL()
        for (const doc of documents) {
          if (doc.pageContent.includes(ARXIV_NO_HTM)) {
            // Stop reading this page and fall back to the original URL below.
            // (A bare `return` here used to end the whole search with undefined.)
            noHtml = true
            break
          }
          docs.push(doc)
        }
      } catch (e) {
        console.log(e)
        noHtml = true
      }
    }

    if (noHtml) {
      if (url.endsWith(".pdf")) {
        const loader = new PageAssistPDFUrlLoader({
          name: result.name,
          url
        })
        try {
          const documents = await loader.load()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      } else {
        const loader = new PageAssistHtmlLoader({
          html: "",
          url
        })
        try {
          const documents = await loader.loadByURL()
          for (const doc of documents) {
            docs.push(doc)
          }
        } catch (e) {
          console.log(e)
        }
      }
    }
  }
  return searchResults
  /*
  const ollamaUrl = await getOllamaURL()
  const embeddingModle = await defaultEmbeddingModelForRag()
  const ollamaEmbedding = await pageAssistEmbeddingModel({
  model: embeddingModle || "",
  baseUrl: cleanUrl(ollamaUrl)
  })
  const textSplitter = await getPageAssistTextSplitter()
  const chunks = await textSplitter.splitDocuments(docs)
  const store = new MemoryVectorStore(ollamaEmbedding)
  await store.addDocuments(chunks)
  const resultsWithEmbeddings = await store.similaritySearch(query, 3)
  const searchResult = resultsWithEmbeddings.map((result) => {
  // `source` for PDF type
  const key = result.metadata.url || result.metadata.source
  if (!key) return null
  const fullRes = resMap[key]
  return {
  ...fullRes,
  content: result.pageContent
  }
  }).filter((r) => r)
  return searchResult
  */
}
/**
 * Returns the number of bytes `str` occupies when encoded as UTF-8.
 *
 * Despite the name this is a byte length, not a model token count.
 *
 * @param str - Text to measure.
 * @returns The UTF-8 byte length of `str`.
 */
export const calculateTokenCount = function(str:string){
  const { length: byteLength } = new TextEncoder().encode(str);
  return byteLength;
}