feat: IoD search process HTML/PDF content
This commit is contained in:
parent
e8471f1802
commit
a56e46a98d
7
src/types/iod.ts
Normal file
7
src/types/iod.ts
Normal file
@ -0,0 +1,7 @@
|
||||
/**
 * Shape of a single IoD registry search entry.
 *
 * `url` and `pdf_url` are both optional, so consumers must check for their
 * presence before fetching content.
 */
export type IodRegistryEntry = {
|
||||
/** Identifier of the entry (presumably a digital-object ID; confirm against the registry API). */
doId: string
|
||||
/** Name of the entry. */
name: string
|
||||
/** Optional URL of the entry's content. */
url?: string
|
||||
/** Optional URL of a PDF for the entry. */
pdf_url?: string
|
||||
/** Description text of the entry. */
description: string
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
import { cleanUrl } from "@/libs/clean-url"
|
||||
import { PageAssistHtmlLoader } from "@/loader/html"
|
||||
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
|
||||
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
||||
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
||||
import {
|
||||
@ -9,6 +10,7 @@ import {
|
||||
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
||||
import type { Document } from "@langchain/core/documents"
|
||||
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||
import type { IodRegistryEntry } from "~/types/iod"
|
||||
|
||||
const makeRegSearchParams = (count: number, keyword: string) => ({
|
||||
action: "executeContract",
|
||||
@ -44,7 +46,10 @@ const makeRegSearchParams = (count: number, keyword: string) => ({
|
||||
}
|
||||
})
|
||||
|
||||
export const localIodSearch = async (query: string, keywords: string[]) => {
|
||||
export async function localIodSearch(
|
||||
query: string,
|
||||
keywords: string[]
|
||||
): Promise<IodRegistryEntry[]> {
|
||||
const TOTAL_SEARCH_RESULTS = await totalSearchResults()
|
||||
|
||||
const results = (
|
||||
@ -71,11 +76,11 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
|
||||
console.log(body)
|
||||
return []
|
||||
}
|
||||
const results =
|
||||
const results: IodRegistryEntry[] =
|
||||
body.data?.results?.filter((r) => r.url || r.pdf_url) || []
|
||||
results.forEach((r) => {
|
||||
for (const r of results) {
|
||||
r.url = r.url || r.pdf_url
|
||||
})
|
||||
}
|
||||
return results
|
||||
})
|
||||
.catch((e) => {
|
||||
@ -89,7 +94,8 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
|
||||
return results
|
||||
}
|
||||
|
||||
const ARXIV_URL = /^https:\/\/arxiv.org\//
|
||||
// Matches URLs that begin with "http://arxiv.org/" or "https://arxiv.org/".
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
|
||||
// Substring looked for in loaded page content; presumably the notice arXiv
// serves when a paper has no HTML rendition (confirm against a live page).
const ARXIV_NO_HTM = "No HTML for"
|
||||
|
||||
export const searchIod = async (query: string, keywords: string[]) => {
|
||||
const searchResults = await localIodSearch(query, keywords)
|
||||
@ -103,21 +109,67 @@ export const searchIod = async (query: string, keywords: string[]) => {
|
||||
|
||||
const docs: Document<Record<string, any>>[] = []
|
||||
for (const result of searchResults) {
|
||||
let url = result.url
|
||||
if (ARXIV_URL.test(result.url)) {
|
||||
url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "")
|
||||
const url = result.url
|
||||
if (!url) continue
|
||||
|
||||
let htmlUrl = ""
|
||||
if (ARXIV_URL_PATTERN.test(url)) {
|
||||
htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
|
||||
}
|
||||
|
||||
const loader = new PageAssistHtmlLoader({
|
||||
html: "",
|
||||
url
|
||||
})
|
||||
let noHtml = htmlUrl === ""
|
||||
if (!noHtml) {
|
||||
const loader = new PageAssistHtmlLoader({
|
||||
html: "",
|
||||
url: htmlUrl
|
||||
})
|
||||
|
||||
const documents = await loader.loadByURL()
|
||||
try {
|
||||
const documents = await loader.loadByURL()
|
||||
// Collect the loaded documents, stopping at the first one whose content
// carries the ARXIV_NO_HTM marker.
for (const doc of documents) {
|
||||
if (doc.pageContent.includes(ARXIV_NO_HTM)) {
|
||||
noHtml = true
|
||||
// Leave only this loop. A `return` here would exit the enclosing async
// function with `undefined`, making the `noHtml` assignment dead, skipping
// the `if (noHtml)` fallback that follows, and dropping every remaining
// search result.
break
|
||||
}
|
||||
docs.push(doc)
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
noHtml = true
|
||||
}
|
||||
}
|
||||
|
||||
documents.forEach((doc) => {
|
||||
docs.push(doc)
|
||||
})
|
||||
if (noHtml) {
|
||||
if (url.endsWith(".pdf")) {
|
||||
const loader = new PageAssistPDFUrlLoader({
|
||||
name: result.name,
|
||||
url
|
||||
})
|
||||
|
||||
try {
|
||||
const documents = await loader.load()
|
||||
for (const doc of documents) {
|
||||
docs.push(doc)
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
}
|
||||
} else {
|
||||
const loader = new PageAssistHtmlLoader({
|
||||
html: "",
|
||||
url
|
||||
})
|
||||
|
||||
try {
|
||||
const documents = await loader.loadByURL()
|
||||
for (const doc of documents) {
|
||||
docs.push(doc)
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
const ollamaUrl = await getOllamaURL()
|
||||
|
||||
|
@ -9,6 +9,7 @@ import { searxngSearch } from "./search-engines/searxng"
|
||||
import { braveAPISearch } from "./search-engines/brave-api"
|
||||
import { webBaiduSearch } from "./search-engines/baidu"
|
||||
import { searchIod } from "./iod"
|
||||
import type { IodRegistryEntry } from "~/types/iod"
|
||||
|
||||
const getHostName = (url: string) => {
|
||||
try {
|
||||
@ -72,13 +73,7 @@ export const getSystemPromptForWeb = async (
|
||||
// .join("\n")
|
||||
}
|
||||
|
||||
let iodSearchResults: {
|
||||
doId: string
|
||||
name: string
|
||||
url?: string
|
||||
// pdf_url?: string
|
||||
description: string
|
||||
}[] = []
|
||||
let iodSearchResults: IodRegistryEntry[] = []
|
||||
// let search_results_iod = ""
|
||||
|
||||
if (iodSearch) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user