diff --git a/bun.lockb b/bun.lockb index 4c50846..ce5cb65 100644 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/package.json b/package.json index be825fa..d9a126b 100644 --- a/package.json +++ b/package.json @@ -50,6 +50,7 @@ "rehype-mathjax": "4.0.3", "remark-gfm": "3.0.1", "remark-math": "5.1.1", + "turndown": "^7.1.3", "yt-transcript": "^0.0.2", "zustand": "^4.5.0" }, @@ -63,6 +64,7 @@ "@types/react": "18.2.48", "@types/react-dom": "18.2.18", "@types/react-syntax-highlighter": "^15.5.11", + "@types/turndown": "^5.0.4", "autoprefixer": "^10.4.17", "postcss": "^8.4.33", "prettier": "3.2.4", diff --git a/src/components/Sidepanel/Chat/header.tsx b/src/components/Sidepanel/Chat/header.tsx index 0d52747..e7a1885 100644 --- a/src/components/Sidepanel/Chat/header.tsx +++ b/src/components/Sidepanel/Chat/header.tsx @@ -5,7 +5,7 @@ import { Tooltip } from "antd" import { BoxesIcon, CogIcon, EraserIcon, HistoryIcon } from "lucide-react" import { useTranslation } from "react-i18next" export const SidepanelHeader = () => { - const { clearChat, isEmbedding, messages } = useMessage() + const { clearChat, isEmbedding, messages, streaming } = useMessage() const { t } = useTranslation(["sidepanel", "common"]) return ( @@ -25,16 +25,15 @@ export const SidepanelHeader = () => { ) : null} - {messages.length > 0 && ( - - - + {messages.length > 0 && !streaming && ( + )} {/* diff --git a/src/hooks/useMessage.tsx b/src/hooks/useMessage.tsx index 89f5b27..07fe767 100644 --- a/src/hooks/useMessage.tsx +++ b/src/hooks/useMessage.tsx @@ -81,6 +81,7 @@ export const useMessage = () => { signal: AbortSignal, embeddingSignal: AbortSignal ) => { + setStreaming(true) const url = await getOllamaURL() const ollama = new ChatOllama({ @@ -320,6 +321,7 @@ export const useMessage = () => { history: ChatHistory, signal: AbortSignal ) => { + setStreaming(true) const url = await getOllamaURL() if (image.length > 0) { diff --git a/src/libs/get-html.ts b/src/libs/get-html.ts index bed1f43..40499d4 100644 --- a/src/libs/get-html.ts +++ b/src/libs/get-html.ts @@ -1,17 +1,25 @@ +import { defaultExtractContent } from "@/parser/default" import { getPdf } from "./pdf" +import { + isTweet, + isTwitterTimeline, + parseTweet, + parseTwitterTimeline, +} from "@/parser/twitter" +import { isGoogleDocs, parseGoogleDocs } from "@/parser/google-docs" +import { cleanUnwantedUnicode } from "@/utils/clean" -const _getHtml = async () => { +const _getHtml = () => { const url = window.location.href if (document.contentType === "application/pdf") { return { url, content: "", type: "pdf" } } - const html = Array.from(document.querySelectorAll("script")).reduce( - (acc, script) => { - return acc.replace(script.outerHTML, "") - }, - document.documentElement.outerHTML - ) - return { url, content: html, type: "html" } + + return { + content: document.documentElement.outerHTML, + url, + type: "html" + } } export const getDataFromCurrentTab = async () => { @@ -34,7 +42,6 @@ export const getDataFromCurrentTab = async () => { type: string }> - const { content, type, url } = await result if (type === "pdf") { @@ -47,31 +54,58 @@ export const getDataFromCurrentTab = async () => { const pdf = await getPdf(data) for (let i = 1; i <= pdf.numPages; i += 1) { - const page = await pdf.getPage(i); - const content = await page.getTextContent(); + const page = await pdf.getPage(i) + const content = await page.getTextContent() if (content?.items.length === 0) { - continue; + continue } - const text = content?.items.map((item: any) => item.str).join("\n") - .replace(/\x00/g, "").trim(); + const text = content?.items + .map((item: any) => item.str) + .join("\n") + .replace(/\x00/g, "") + .trim() pdfHtml.push({ content: text, page: i }) } - return { url, content: "", pdf: pdfHtml, type: "pdf" } - } - - return { url, content, type, pdf: [] } + if (isTwitterTimeline(url)) { + const data = parseTwitterTimeline(content) + return { + url, + content: data, + type: "html", + pdf: [] + } + } else if (isTweet(url)) { + const data = parseTweet(content) + return { + url, + content: data, + type: "html", + pdf: [] + } + } else if (isGoogleDocs(url)) { + const data = await parseGoogleDocs() + if (data) { + return { + url, + content: cleanUnwantedUnicode(data), + type: "html", + pdf: [] + } + } + } + const data = defaultExtractContent(content) + return { url, content: data, type, pdf: [] } } - diff --git a/src/loader/html.ts b/src/loader/html.ts index de567f0..94eaed0 100644 --- a/src/loader/html.ts +++ b/src/loader/html.ts @@ -59,24 +59,24 @@ export class PageAssistHtmlLoader ] } - let html = this.html + // let html = this.html - if (isWikipedia(this.url)) { - console.log("Wikipedia URL detected") - html = parseWikipedia(html) - } - - // else if (isTwitter(this.url)) { - // console.log("Twitter URL detected") - // html = parseTweet(html, this.url) + // if (isWikipedia(this.url)) { + // console.log("Wikipedia URL detected") + // html = parseWikipedia(html) // } - const htmlCompiler = compile({ - wordwrap: false - }) - const text = htmlCompiler(html) + // // else if (isTwitter(this.url)) { + // // console.log("Twitter URL detected") + // // html = parseTweet(html, this.url) + // // } + + // const htmlCompiler = compile({ + // wordwrap: false + // }) + // const text = htmlCompiler(html) const metadata = { source: this.url } - return [new Document({ pageContent: text, metadata })] + return [new Document({ pageContent: this.html, metadata })] } async loadByURL(): Promise>[]> { @@ -110,7 +110,7 @@ export class PageAssistHtmlLoader console.log("Wikipedia URL detected") html = parseWikipedia(await fetchHTML.text()) } - + // else if (isTwitter(this.url)) { // console.log("Twitter URL detected") // html = parseTweet(await fetchHTML.text(), this.url) diff --git a/src/parser/default.ts b/src/parser/default.ts new file mode 100644 index 0000000..e15c9ad --- /dev/null +++ b/src/parser/default.ts @@ -0,0 +1,10 @@ +import * as cheerio from "cheerio" +import TurndownService from "turndown" +let turndownService = new TurndownService() + +export const defaultExtractContent = (html: string) => { + const $ = cheerio.load(html) + const mainContent = $('[role="main"]').html() || $("main").html() || $.html() + const markdown = turndownService.turndown(mainContent) + return markdown +} diff --git a/src/parser/google-docs.ts b/src/parser/google-docs.ts new file mode 100644 index 0000000..23e7316 --- /dev/null +++ b/src/parser/google-docs.ts @@ -0,0 +1,119 @@ + +export const isGoogleDocs = (url: string) => { + const GOOGLE_DOCS_REGEX = /docs\.google\.com\/document/g + return GOOGLE_DOCS_REGEX.test(url) +} + +const getGoogleDocs = () => { + try { + function traverse( + obj: { [x: string]: any }, + predicate: { (_: any, value: any): boolean; (arg0: any, arg1: any): any }, + maxDepth: number, + propNames = Object.getOwnPropertyNames(obj) + ) { + const visited = new Set() + const results = [] + let iterations = 0 + + const traverseObj = ( + name: string, + value: unknown, + path: any[], + depth = 0 + ) => { + iterations++ + if (name === "prototype" || value instanceof Window || depth > maxDepth) + return + + const currentPath = [...path, name] + + try { + if (predicate(name, value)) { + results.push({ path: currentPath, value }) + return + } + } catch (error) {} + + if (value != null && !visited.has(value)) { + visited.add(value) + if (Array.isArray(value)) { + value.forEach((val, index) => { + try { + traverseObj(index.toString(), val, currentPath, depth + 1) + } catch (error) {} + }) + } else if (value instanceof Object) { + const propNamesForValue = + value && + // @ts-ignore + value.nodeType === 1 && + // @ts-ignore + typeof value.nodeName === "string" + ? Object.getOwnPropertyNames(obj) + : Object.getOwnPropertyNames(value) + + propNamesForValue.forEach((prop) => { + try { + traverseObj(prop, value[prop], currentPath, depth + 1) + } catch (error) {} + }) + } + } + } + + propNames.forEach((prop) => { + try { + traverseObj(prop, obj[prop], []) + } catch (error) {} + }) + + return { results, iterations } + } + + const result = traverse( + // @ts-ignore + window.KX_kixApp, + (_: any, value: { toString: () => string }) => + value && "\x03" === value.toString().charAt(0), + 5 + ) + if (result.results?.[0]?.value) { + return { + content: result.results[0].value + } + } + + return { + content: null + } + } catch (error) { + return { + content: null + } + } +} + +export const parseGoogleDocs = async () => { + const result = new Promise((resolve) => { + chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => { + const tab = tabs[0] + + const data = await chrome.scripting.executeScript({ + target: { tabId: tab.id }, + world: "MAIN", + func: getGoogleDocs + }) + + if (data.length > 0) { + resolve(data[0].result) + } + }) + }) as Promise<{ + content?: string + }> + + const { content } = await result + + return content +} diff --git a/src/parser/google-sheets.ts b/src/parser/google-sheets.ts new file mode 100644 index 0000000..21f3120 --- /dev/null +++ b/src/parser/google-sheets.ts @@ -0,0 +1,5 @@ +import * as cheerio from 'cheerio'; + +export const parseGoogleSheets = (html: string) => { + const $ = cheerio.load(html); +}; \ No newline at end of file diff --git a/src/parser/twitter.ts b/src/parser/twitter.ts index a998945..cfbbd3d 100644 --- a/src/parser/twitter.ts +++ b/src/parser/twitter.ts @@ -2,89 +2,101 @@ import * as cheerio from "cheerio" export const isTweet = (url: string) => { const TWEET_REGEX = /twitter\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g - return TWEET_REGEX.test(url) + const X_REGEX = /x\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g + return TWEET_REGEX.test(url) || X_REGEX.test(url) +} + +export const isTwitterTimeline = (url: string) => { + return url === "https://twitter.com/home" || url === "https://x.com/home" } export const isTwitterProfile = (url: string) => { const PROFILE_REGEX = /twitter\.com\/[a-zA-Z0-9_]+/g - return PROFILE_REGEX.test(url) + const X_REGEX = /x\.com\/[a-zA-Z0-9_]+/g + return PROFILE_REGEX.test(url) || X_REGEX.test(url) } -export const isTwitterTimeline = (url: string) => { - const TIMELINE_REGEX = /twitter\.com\/home/g - return TIMELINE_REGEX.test(url) +export const parseTwitterTimeline = (html: string) => { + const $ = cheerio.load(html) + const postElements = $("[data-testid=tweetText]") + const authorElements = $("[data-testid=User-Name]") + + const posts = postElements + .map((index, element) => { + const post = $(element).text() + const author = $(authorElements[index]).text() + return { + author, + post + } + }) + .get() + + return posts + .map((post) => { + return `## Author: ${post.author}\n\n${post.post}\n\n---\n\n` + }) + .filter((value, index, self) => self.indexOf(value) === index) + .join("\n") } -export const isTwitter = (url: string) => { - return isTweet(url) || isTwitterProfile(url) || isTwitterTimeline(url) +export const parseTweet = (html: string) => { + const $ = cheerio.load(html) + const postElements = $("[data-testid=tweetText]") + const authorElements = $("[data-testid=User-Name]") + + const posts = postElements + .map((index, element) => { + const post = $(element).text() + const author = $(authorElements[index]).text() + return { + author, + post, + isReply: index !== 0 + } + }) + .get() + + return posts + .map((post) => { + return `##Author: ${post.author}\n\n${post.isReply ? "Reply:" : "Post:"} ${post.post}\n\n---\n\n` + }) + .join("\n") } -export const isTwitterNotification = (url: string) => { - const NOTIFICATION_REGEX = /twitter\.com\/notifications/g - return NOTIFICATION_REGEX.test(url) -} - -export const parseTweet = (html: string, url: string) => { - if (!html) { - return "" - } - +export const parseTweetProfile = (html: string) => { const $ = cheerio.load(html) - if (isTweet(url)) { - console.log("tweet") - const tweet = $("div[data-testid='tweet']") - const tweetContent = tweet.find("div[lang]") - const tweetMedia = tweet.find("div[role='group']") - const author = tweet.find("a[role='link']").text() - const date = tweet.find("time").text() - return `
${author} ${tweetContent.text()} ${tweetMedia.html()} ${date}
` - } + const profileName = $("[data-testid=UserProfileHeader_Items]") + .find("h1") + .text() + const profileBio = $("[data-testid=UserProfileHeader_Items]").find("p").text() + const profileLocation = $("[data-testid=UserProfileHeader_Items]") + .find("span") + .text() + const profileJoinDate = $("[data-testid=UserProfileHeader_Items]") + .find("span") + .text() + const profileFollowers = $( + "[data-testid=UserProfileHeader_Items] span" + ).text() + const profileFollowing = $( + "[data-testid=UserProfileHeader_Items] span" + ).text() - if (isTwitterTimeline(url)) { - console.log("timeline") - const timeline = $("div[data-testid='primaryColumn']") - const timelineContent = timeline.find("div[data-testid='tweet']") - console.log(timelineContent.html()) - const tweet = timelineContent - .map((i, el) => { - const author = $(el).find("a[role='link']").text() - const content = $(el).find("div[lang]").text() - const media = $(el).find("div[role='group']").html() - const date = $(el).find("time").text() - return `
${author} ${content} ${media} ${date}
` - }) - .get() - .join("") - console.log(tweet) - return `
${tweet}
` - } + const postElements = $("[data-testid=tweetText]") + const authorElements = $("[data-testid=User-Name]") - if (isTwitterNotification(url)) { - console.log("notification") - const notification = $("div[data-testid='primaryColumn']") - const notificationContent = notification.find("div[data-testid='tweet']") - return `
${notificationContent.html()}
` - } - if (isTwitterProfile(url)) { - console.log("profile") - const profile = $("div[data-testid='primaryColumn']") - const profileContent = profile.find( - "div[data-testid='UserProfileHeader_Items']" - ) - const profileTweets = profile.find("div[data-testid='tweet']") - return `
${profileContent.html()}
${profileTweets.html()}
` - } - console.log("no match") - const timeline = $("div[data-testid='primaryColumn']") - const timelineContent = timeline.find("div[data-testid='tweet']") - const tweet = timelineContent.map((i, el) => { - const author = $(el).find("a[role='link']").text() - const content = $(el).find("div[lang]").text() - const media = $(el).find("div[role='group']").html() - const date = $(el).find("time").text() - return `
${author} ${content} ${media} ${date}
` - }) + const posts = postElements + .map((index, element) => { + const post = $(element).text() + const author = $(authorElements[index]).text() + return { + author, + post + } + }) + .get() - return `
${tweet}
` + return `## Profile: ${profileName}\n\nBio: ${profileBio}\n\nLocation: ${profileLocation}\n\nJoin Date: ${profileJoinDate}\n\nFollowers: ${profileFollowers}\n\nFollowing: ${profileFollowing}\n\nPosts: ${posts.map((post) => `Author: ${post.author}\n\nPost: ${post.post}\n\n---\n\n`).join("\n")}` } diff --git a/src/utils/clean.ts b/src/utils/clean.ts new file mode 100644 index 0000000..e49c44d --- /dev/null +++ b/src/utils/clean.ts @@ -0,0 +1,4 @@ +export const cleanUnwantedUnicode = (text: string) => { + const UNICODE_REGEX = /[\u200B-\u200D\uFEFF]/g + return text.replace(UNICODE_REGEX, "").trim() +}