From a56e46a98dbdb271352e4967377c9b2e46ada2a0 Mon Sep 17 00:00:00 2001 From: Nex Zhu Date: Fri, 14 Feb 2025 23:24:27 +0800 Subject: [PATCH] feat: IoD search process HTML/PDF content --- src/types/iod.ts | 7 ++++ src/web/iod.ts | 84 +++++++++++++++++++++++++++++++++++++++--------- src/web/web.ts | 9 ++---- 3 files changed, 77 insertions(+), 23 deletions(-) create mode 100644 src/types/iod.ts diff --git a/src/types/iod.ts b/src/types/iod.ts new file mode 100644 index 0000000..6cc661f --- /dev/null +++ b/src/types/iod.ts @@ -0,0 +1,7 @@ +export type IodRegistryEntry = { + doId: string + name: string + url?: string + pdf_url?: string + description: string +} diff --git a/src/web/iod.ts b/src/web/iod.ts index 12a8780..dd9a980 100644 --- a/src/web/iod.ts +++ b/src/web/iod.ts @@ -1,5 +1,6 @@ import { cleanUrl } from "@/libs/clean-url" import { PageAssistHtmlLoader } from "@/loader/html" +import { PageAssistPDFUrlLoader } from "@/loader/pdf-url" import { pageAssistEmbeddingModel } from "@/models/embedding" import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" import { @@ -9,6 +10,7 @@ import { import { getPageAssistTextSplitter } from "@/utils/text-splitter" import type { Document } from "@langchain/core/documents" import { MemoryVectorStore } from "langchain/vectorstores/memory" +import type { IodRegistryEntry } from "~/types/iod" const makeRegSearchParams = (count: number, keyword: string) => ({ action: "executeContract", @@ -44,7 +46,10 @@ const makeRegSearchParams = (count: number, keyword: string) => ({ } }) -export const localIodSearch = async (query: string, keywords: string[]) => { +export async function localIodSearch( + query: string, + keywords: string[] +): Promise<IodRegistryEntry[]> { const TOTAL_SEARCH_RESULTS = await totalSearchResults() const results = ( @@ -71,11 +76,11 @@ export const localIodSearch = async (query: string, keywords: string[]) => { console.log(body) return [] } - const results = + const results: IodRegistryEntry[] = 
body.data?.results?.filter((r) => r.url || r.pdf_url) || [] - results.forEach((r) => { + for (const r of results) { r.url = r.url || r.pdf_url - }) + } return results }) .catch((e) => { @@ -89,7 +94,8 @@ export const localIodSearch = async (query: string, keywords: string[]) => { return results } -const ARXIV_URL = /^https:\/\/arxiv.org\// +const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\// +const ARXIV_NO_HTM = "No HTML for" export const searchIod = async (query: string, keywords: string[]) => { const searchResults = await localIodSearch(query, keywords) @@ -103,21 +109,67 @@ export const searchIod = async (query: string, keywords: string[]) => { const docs: Document<Record<string, any>>[] = [] for (const result of searchResults) { - let url = result.url - if (ARXIV_URL.test(result.url)) { - url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "") + const url = result.url + if (!url) continue + + let htmlUrl = "" + if (ARXIV_URL_PATTERN.test(url)) { + htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "") } - const loader = new PageAssistHtmlLoader({ - html: "", - url - }) + let noHtml = htmlUrl === "" + if (!noHtml) { + const loader = new PageAssistHtmlLoader({ + html: "", + url: htmlUrl + }) - const documents = await loader.loadByURL() + try { + const documents = await loader.loadByURL() + for (const doc of documents) { + if (doc.pageContent.includes(ARXIV_NO_HTM)) { + noHtml = true + break /* leave only this loop; a bare return would exit searchIod and skip the fallback below */ + } + docs.push(doc) + } + } catch (e) { + console.log(e) + noHtml = true + } + } - documents.forEach((doc) => { - docs.push(doc) - }) + if (noHtml) { + if (url.endsWith(".pdf")) { + const loader = new PageAssistPDFUrlLoader({ + name: result.name, + url + }) + + try { + const documents = await loader.load() + for (const doc of documents) { + docs.push(doc) + } + } catch (e) { + console.log(e) + } + } else { + const loader = new PageAssistHtmlLoader({ + html: "", + url + }) + + try { + const documents = await loader.loadByURL() + for (const doc of documents) { + docs.push(doc) + } 
+ } catch (e) { + console.log(e) + } + } + } } const ollamaUrl = await getOllamaURL() diff --git a/src/web/web.ts b/src/web/web.ts index 6d06c06..d28c5c9 100644 --- a/src/web/web.ts +++ b/src/web/web.ts @@ -9,6 +9,7 @@ import { searxngSearch } from "./search-engines/searxng" import { braveAPISearch } from "./search-engines/brave-api" import { webBaiduSearch } from "./search-engines/baidu" import { searchIod } from "./iod" +import type { IodRegistryEntry } from "~/types/iod" const getHostName = (url: string) => { try { @@ -72,13 +73,7 @@ export const getSystemPromptForWeb = async ( // .join("\n") } - let iodSearchResults: { - doId: string - name: string - url?: string - // pdf_url?: string - description: string - }[] = [] + let iodSearchResults: IodRegistryEntry[] = [] // let search_results_iod = "" if (iodSearch) {