diff --git a/src/loader/html.ts b/src/loader/html.ts
index 786c60e..5eeb168 100644
--- a/src/loader/html.ts
+++ b/src/loader/html.ts
@@ -1,9 +1,9 @@
 import { BaseDocumentLoader } from "langchain/document_loaders/base"
 import { Document } from "@langchain/core/documents"
-import { compile } from "html-to-text"
 import { urlRewriteRuntime } from "~/libs/runtime"
 import { YtTranscript } from "yt-transcript"
 import { isWikipedia, parseWikipedia } from "@/parser/wiki"
+import { extractReadabilityContent } from "@/parser/reader"
 
 const YT_REGEX =
   /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:watch\?v=)?([a-zA-Z0-9_-]+)/
@@ -24,8 +24,7 @@ export interface WebLoaderParams {
 
 export class PageAssistHtmlLoader
   extends BaseDocumentLoader
-  implements WebLoaderParams
-{
+  implements WebLoaderParams {
   html: string
   url: string
 
@@ -52,30 +51,14 @@ export class PageAssistHtmlLoader
       {
         metadata: {
           source: this.url,
+          url: this.url,
          audio: { chunks: transcript }
        },
        pageContent: text
      }
    ]
  }
-
-    // let html = this.html
-
-    // if (isWikipedia(this.url)) {
-    //   console.log("Wikipedia URL detected")
-    //   html = parseWikipedia(html)
-    // }
-
-    // // else if (isTwitter(this.url)) {
-    // //   console.log("Twitter URL detected")
-    // //   html = parseTweet(html, this.url)
-    // // }
-
-    // const htmlCompiler = compile({
-    //   wordwrap: false
-    // })
-    // const text = htmlCompiler(html)
-    const metadata = { source: this.url }
+    const metadata = { source: this.url, url: this.url, }
     return [new Document({ pageContent: this.html, metadata })]
   }
 
@@ -95,6 +78,7 @@ export class PageAssistHtmlLoader
     return [
       {
         metadata: {
+          url: this.url,
           source: this.url,
           audio: { chunks: transcript }
         },
@@ -103,22 +87,15 @@ export class PageAssistHtmlLoader
       ]
     }
     await urlRewriteRuntime(this.url, "web")
-    const fetchHTML = await fetch(this.url)
-    let html = await fetchHTML.text()
-
+    let text = "";
     if (isWikipedia(this.url)) {
       console.log("Wikipedia URL detected")
-      html = parseWikipedia(await fetchHTML.text())
+      const fetchHTML = await fetch(this.url)
+      text = parseWikipedia(await fetchHTML.text())
+    } else {
+      text = await extractReadabilityContent(this.url)
     }
-    const htmlCompiler = compile({
-      wordwrap: false,
-      selectors: [
-        { selector: "img", format: "skip" },
-        { selector: "script", format: "skip" }
-      ]
-    })
-    const text = htmlCompiler(html)
     const metadata = { url: this.url }
     return [new Document({ pageContent: text, metadata })]
   }
diff --git a/src/parser/wiki.ts b/src/parser/wiki.ts
index 36f567c..2dbe88c 100644
--- a/src/parser/wiki.ts
+++ b/src/parser/wiki.ts
@@ -1,4 +1,5 @@
 import * as cheerio from "cheerio"
+import { defaultExtractContent } from "./default"
 
 export const isWikipedia = (url: string) => {
   const WIKI_REGEX = /wikipedia\.org\/wiki\//g
@@ -24,5 +25,5 @@ export const parseWikipedia = (html: string) => {
   content?.find("div.toc")?.remove()
   const newHtml = content?.html()
-  return `
-TITLE: ${title?.text()}
-${newHtml}
-`
+  return defaultExtractContent(`
+TITLE: ${title?.text()}
+${newHtml}
+`)
 }
diff --git a/src/services/search.ts b/src/services/search.ts
index 19e5f28..f548cfe 100644
--- a/src/services/search.ts
+++ b/src/services/search.ts
@@ -18,7 +18,7 @@ export const getIsSimpleInternetSearch = async () => {
 export const getIsVisitSpecificWebsite = async () => {
   const isVisitSpecificWebsite = await storage.get("isVisitSpecificWebsite")
   if (!isVisitSpecificWebsite || isVisitSpecificWebsite.length === 0) {
-    return false
+    return true
   }
   return isVisitSpecificWebsite === "true"
 }
diff --git a/src/web/website/index.ts b/src/web/website/index.ts
index e0ce399..ba0bed1 100644
--- a/src/web/website/index.ts
+++ b/src/web/website/index.ts
@@ -1,34 +1,16 @@
 import { cleanUrl } from "@/libs/clean-url"
-import { extractReadabilityContent } from "@/parser/reader"
+import { PageAssistHtmlLoader } from "@/loader/html"
 import {
   defaultEmbeddingChunkOverlap,
   defaultEmbeddingChunkSize,
   defaultEmbeddingModelForRag,
   getOllamaURL
 } from "@/services/ollama"
-import { getIsSimpleInternetSearch } from "@/services/search"
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
-import type { Document } from "@langchain/core/documents"
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { MemoryVectorStore } from "langchain/vectorstores/memory"
 
 export const processSingleWebsite = async (url: string, query: string) => {
-  let content = await extractReadabilityContent(url)
-
-  // const isSimpleMode = await getIsSimpleInternetSearch()
-
-  // if (isSimpleMode) {
-  //   return [
-  //     {
-  //       url,
-  //       content: content.length > 5000 ? content.slice(0, 5000) : content
-  //     }
-  //   ]
-  // }
-
-  const docs: Document<Record<string, any>>[] = [
-    {
-      metadata: {
-        url
-      },
-      pageContent: content
-    }
-  ]
+  const loader = new PageAssistHtmlLoader({
+    html: "",
+    url
+  })
+  const docs = await loader.loadByURL()
 
   const ollamaUrl = await getOllamaURL()