feat: IoD search process HTML/PDF content
This commit is contained in:
parent
e8471f1802
commit
a56e46a98d
7
src/types/iod.ts
Normal file
7
src/types/iod.ts
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/**
 * Shape of a single IoD registry search result entry.
 *
 * Only the field names, their types and their optionality are fixed by this
 * declaration; what each value contains is decided by whatever produces the
 * entries.
 */
export type IodRegistryEntry = {
  /** Identifier of the entry (presumably a digital-object id; confirm against the registry API). */
  doId: string
  /** Name of the entry. */
  name: string
  /** Link associated with the entry; may be absent. */
  url?: string
  /**
   * Link to a PDF for the entry; may be absent.
   * NOTE(review): the snake_case name presumably mirrors the registry's
   * response field, so renaming it would likely break parsing (confirm).
   */
  pdf_url?: string
  /** Description text of the entry. */
  description: string
}
|
@ -1,5 +1,6 @@
|
|||||||
import { cleanUrl } from "@/libs/clean-url"
|
import { cleanUrl } from "@/libs/clean-url"
|
||||||
import { PageAssistHtmlLoader } from "@/loader/html"
|
import { PageAssistHtmlLoader } from "@/loader/html"
|
||||||
|
import { PageAssistPDFUrlLoader } from "@/loader/pdf-url"
|
||||||
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
import { pageAssistEmbeddingModel } from "@/models/embedding"
|
||||||
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama"
|
||||||
import {
|
import {
|
||||||
@ -9,6 +10,7 @@ import {
|
|||||||
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
import { getPageAssistTextSplitter } from "@/utils/text-splitter"
|
||||||
import type { Document } from "@langchain/core/documents"
|
import type { Document } from "@langchain/core/documents"
|
||||||
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
import { MemoryVectorStore } from "langchain/vectorstores/memory"
|
||||||
|
import type { IodRegistryEntry } from "~/types/iod"
|
||||||
|
|
||||||
const makeRegSearchParams = (count: number, keyword: string) => ({
|
const makeRegSearchParams = (count: number, keyword: string) => ({
|
||||||
action: "executeContract",
|
action: "executeContract",
|
||||||
@ -44,7 +46,10 @@ const makeRegSearchParams = (count: number, keyword: string) => ({
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
export const localIodSearch = async (query: string, keywords: string[]) => {
|
export async function localIodSearch(
|
||||||
|
query: string,
|
||||||
|
keywords: string[]
|
||||||
|
): Promise<IodRegistryEntry[]> {
|
||||||
const TOTAL_SEARCH_RESULTS = await totalSearchResults()
|
const TOTAL_SEARCH_RESULTS = await totalSearchResults()
|
||||||
|
|
||||||
const results = (
|
const results = (
|
||||||
@ -71,11 +76,11 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
|
|||||||
console.log(body)
|
console.log(body)
|
||||||
return []
|
return []
|
||||||
}
|
}
|
||||||
const results =
|
const results: IodRegistryEntry[] =
|
||||||
body.data?.results?.filter((r) => r.url || r.pdf_url) || []
|
body.data?.results?.filter((r) => r.url || r.pdf_url) || []
|
||||||
results.forEach((r) => {
|
for (const r of results) {
|
||||||
r.url = r.url || r.pdf_url
|
r.url = r.url || r.pdf_url
|
||||||
})
|
}
|
||||||
return results
|
return results
|
||||||
})
|
})
|
||||||
.catch((e) => {
|
.catch((e) => {
|
||||||
@ -89,7 +94,8 @@ export const localIodSearch = async (query: string, keywords: string[]) => {
|
|||||||
return results
|
return results
|
||||||
}
|
}
|
||||||
|
|
||||||
const ARXIV_URL = /^https:\/\/arxiv.org\//
|
const ARXIV_URL_PATTERN = /^https?:\/\/arxiv\.org\//
|
||||||
|
const ARXIV_NO_HTM = "No HTML for"
|
||||||
|
|
||||||
export const searchIod = async (query: string, keywords: string[]) => {
|
export const searchIod = async (query: string, keywords: string[]) => {
|
||||||
const searchResults = await localIodSearch(query, keywords)
|
const searchResults = await localIodSearch(query, keywords)
|
||||||
@ -103,21 +109,67 @@ export const searchIod = async (query: string, keywords: string[]) => {
|
|||||||
|
|
||||||
const docs: Document<Record<string, any>>[] = []
|
const docs: Document<Record<string, any>>[] = []
|
||||||
for (const result of searchResults) {
|
for (const result of searchResults) {
|
||||||
let url = result.url
|
const url = result.url
|
||||||
if (ARXIV_URL.test(result.url)) {
|
if (!url) continue
|
||||||
url = result.url.replace("/pdf/", "/abs/").replace(".pdf", "")
|
|
||||||
|
let htmlUrl = ""
|
||||||
|
if (ARXIV_URL_PATTERN.test(url)) {
|
||||||
|
htmlUrl = url.replace("/pdf/", "/html/").replace(".pdf", "")
|
||||||
}
|
}
|
||||||
|
|
||||||
const loader = new PageAssistHtmlLoader({
|
let noHtml = htmlUrl === ""
|
||||||
html: "",
|
if (!noHtml) {
|
||||||
url
|
const loader = new PageAssistHtmlLoader({
|
||||||
})
|
html: "",
|
||||||
|
url: htmlUrl
|
||||||
|
})
|
||||||
|
|
||||||
const documents = await loader.loadByURL()
|
try {
|
||||||
|
const documents = await loader.loadByURL()
|
||||||
|
for (const doc of documents) {
|
||||||
|
if (doc.pageContent.includes(ARXIV_NO_HTM)) {
|
||||||
|
noHtml = true
|
||||||
|
return
|
||||||
|
}
|
||||||
|
docs.push(doc)
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.log(e)
|
||||||
|
noHtml = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
documents.forEach((doc) => {
|
if (noHtml) {
|
||||||
docs.push(doc)
|
if (url.endsWith(".pdf")) {
|
||||||
})
|
const loader = new PageAssistPDFUrlLoader({
|
||||||
|
name: result.name,
|
||||||
|
url
|
||||||
|
})
|
||||||
|
|
||||||
|
try {
|
||||||
|
const documents = await loader.load()
|
||||||
|
for (const doc of documents) {
|
||||||
|
docs.push(doc)
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.log(e)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const loader = new PageAssistHtmlLoader({
|
||||||
|
html: "",
|
||||||
|
url
|
||||||
|
})
|
||||||
|
|
||||||
|
try {
|
||||||
|
const documents = await loader.loadByURL()
|
||||||
|
for (const doc of documents) {
|
||||||
|
docs.push(doc)
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.log(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
const ollamaUrl = await getOllamaURL()
|
const ollamaUrl = await getOllamaURL()
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import { searxngSearch } from "./search-engines/searxng"
|
|||||||
import { braveAPISearch } from "./search-engines/brave-api"
|
import { braveAPISearch } from "./search-engines/brave-api"
|
||||||
import { webBaiduSearch } from "./search-engines/baidu"
|
import { webBaiduSearch } from "./search-engines/baidu"
|
||||||
import { searchIod } from "./iod"
|
import { searchIod } from "./iod"
|
||||||
|
import type { IodRegistryEntry } from "~/types/iod"
|
||||||
|
|
||||||
const getHostName = (url: string) => {
|
const getHostName = (url: string) => {
|
||||||
try {
|
try {
|
||||||
@ -72,13 +73,7 @@ export const getSystemPromptForWeb = async (
|
|||||||
// .join("\n")
|
// .join("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
let iodSearchResults: {
|
let iodSearchResults: IodRegistryEntry[] = []
|
||||||
doId: string
|
|
||||||
name: string
|
|
||||||
url?: string
|
|
||||||
// pdf_url?: string
|
|
||||||
description: string
|
|
||||||
}[] = []
|
|
||||||
// let search_results_iod = ""
|
// let search_results_iod = ""
|
||||||
|
|
||||||
if (iodSearch) {
|
if (iodSearch) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user