feat: IoD search process HTML/PDF content

This commit is contained in:
Nex Zhu 2025-02-14 23:24:27 +08:00
parent e8471f1802
commit a56e46a98d
3 changed files with 77 additions and 23 deletions

7
src/types/iod.ts Normal file
View File

@ -0,0 +1,7 @@
/**
 * One entry returned by the IoD registry search.
 *
 * This is the element type of the array resolved by `localIodSearch` and of
 * the `iodSearchResults` list used when building the web system prompt.
 *
 * `localIodSearch` only keeps entries that carry `url` or `pdf_url`, and
 * backfills `url` from `pdf_url` when `url` is missing. Entries from that
 * function therefore have a truthy `url`, even though the type keeps the
 * field optional.
 */
export type IodRegistryEntry = {
// Identifier of the entry in the registry.
// NOTE(review): presumably a digital-object ID; confirm against the registry API.
doId: string
// Entry name; passed as `name` to the PDF URL loader when the content is fetched as a PDF.
name: string
// Location of the content. May be absent on raw registry results (see the backfill note above).
url?: string
// Location of a PDF version of the content; used as the fallback value for `url`.
pdf_url?: string
// Free-text description of the entry.
// NOTE(review): how this is consumed is not visible in this diff; confirm with callers.
description: string
}

View File

@ -1,5 +1,6 @@
import { cleanUrl } from "@/libs/clean-url" import { cleanUrl } from "@/libs/clean-url"
import { PageAssistHtmlLoader } from "@/loader/html" import { PageAssistHtmlLoader } from "@/loader/html"
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
import { pageAssistEmbeddingModel } from "@/models/embedding" import { pageAssistEmbeddingModel } from "@/models/embedding"
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
import { import {
@ -9,6 +10,7 @@ import {
import { getPageAssistTextSplitter } from "@/utils/text-splitter" import { getPageAssistTextSplitter } from "@/utils/text-splitter"
import type { Document } from "@langchain/core/documents" import type { Document } from "@langchain/core/documents"
import { MemoryVectorStore } from "langchain/vectorstores/memory" import { MemoryVectorStore } from "langchain/vectorstores/memory"
import type { IodRegistryEntry } from "~/types/iod"
const makeRegSearchParams = (count: number, keyword: string) => ({ const makeRegSearchParams = (count: number, keyword: string) => ({
action: "executeContract", action: "executeContract",
@ -44,7 +46,10 @@ const makeRegSearchParams = (count: number, keyword: string) => ({
} }
}) })
export const localIodSearch = async (query: string, keywords: string[]) => { export async function localIodSearch(
query: string,
keywords: string[]
): Promise<IodRegistryEntry[]> {
const TOTAL_SEARCH_RESULTS = await totalSearchResults() const TOTAL_SEARCH_RESULTS = await totalSearchResults()
const results = ( const results = (
@ -71,11 +76,11 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
console.log(body) console.log(body)
return [] return []
} }
const results = const results: IodRegistryEntry[] =
body.data?.results?.filter((r) => r.url || r.pdf_url) || [] body.data?.results?.filter((r) => r.url || r.pdf_url) || []
results.forEach((r) => { for (const r of results) {
r.url = r.url || r.pdf_url r.url = r.url || r.pdf_url
}) }
return results return results
}) })
.catch((e) => { .catch((e) => {
@ -89,7 +94,8 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
return results return results
} }
const ARXIV_URL = /^https:\/\/arxiv.org\// const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
const ARXIV_NO_HTM = "No HTML for"
export const searchIod = async (query: string, keywords: string[]) => { export const searchIod = async (query: string, keywords: string[]) => {
const searchResults = await localIodSearch(query, keywords) const searchResults = await localIodSearch(query, keywords)
@ -103,21 +109,67 @@ export const searchIod = async (query: string, keywords: string[]) => {
const docs: Document<Record<string, any>>[] = [] const docs: Document<Record<string, any>>[] = []
for (const result of searchResults) { for (const result of searchResults) {
let url = result.url const url = result.url
if (ARXIV_URL.test(result.url)) { if (!url) continue
url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "")
let htmlUrl = ""
if (ARXIV_URL_PATTERN.test(url)) {
htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
} }
const loader = new PageAssistHtmlLoader({ let noHtml = htmlUrl === ""
html: "", if (!noHtml) {
url const loader = new PageAssistHtmlLoader({
}) html: "",
url: htmlUrl
})
const documents = await loader.loadByURL() try {
const documents = await loader.loadByURL()
for (const doc of documents) {
if (doc.pageContent.includes(ARXIV_NO_HTM)) {
noHtml = true
return
}
docs.push(doc)
}
} catch (e) {
console.log(e)
noHtml = true
}
}
documents.forEach((doc) => { if (noHtml) {
docs.push(doc) if (url.endsWith(".pdf")) {
}) const loader = new PageAssistPDFUrlLoader({
name: result.name,
url
})
try {
const documents = await loader.load()
for (const doc of documents) {
docs.push(doc)
}
} catch (e) {
console.log(e)
}
} else {
const loader = new PageAssistHtmlLoader({
html: "",
url
})
try {
const documents = await loader.loadByURL()
for (const doc of documents) {
docs.push(doc)
}
} catch (e) {
console.log(e)
}
}
}
} }
const ollamaUrl = await getOllamaURL() const ollamaUrl = await getOllamaURL()

View File

@ -9,6 +9,7 @@ import { searxngSearch } from "./search-engines/searxng"
import { braveAPISearch } from "./search-engines/brave-api" import { braveAPISearch } from "./search-engines/brave-api"
import { webBaiduSearch } from "./search-engines/baidu" import { webBaiduSearch } from "./search-engines/baidu"
import { searchIod } from "./iod" import { searchIod } from "./iod"
import type { IodRegistryEntry } from "~/types/iod"
const getHostName = (url: string) => { const getHostName = (url: string) => {
try { try {
@ -72,13 +73,7 @@ export const getSystemPromptForWeb = async (
// .join("\n") // .join("\n")
} }
let iodSearchResults: { let iodSearchResults: IodRegistryEntry[] = []
doId: string
name: string
url?: string
// pdf_url?: string
description: string
}[] = []
// let search_results_iod = "" // let search_results_iod = ""
if (iodSearch) { if (iodSearch) {