Update dependencies and fix whitespace formatting in isTTSEnabled function in tts.ts

This commit is contained in:
n4ze3m 2024-04-15 11:32:30 +05:30
parent 476323d928
commit c914233610
11 changed files with 302 additions and 115 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -50,6 +50,7 @@
"rehype-mathjax": "4.0.3", "rehype-mathjax": "4.0.3",
"remark-gfm": "3.0.1", "remark-gfm": "3.0.1",
"remark-math": "5.1.1", "remark-math": "5.1.1",
"turndown": "^7.1.3",
"yt-transcript": "^0.0.2", "yt-transcript": "^0.0.2",
"zustand": "^4.5.0" "zustand": "^4.5.0"
}, },
@ -63,6 +64,7 @@
"@types/react": "18.2.48", "@types/react": "18.2.48",
"@types/react-dom": "18.2.18", "@types/react-dom": "18.2.18",
"@types/react-syntax-highlighter": "^15.5.11", "@types/react-syntax-highlighter": "^15.5.11",
"@types/turndown": "^5.0.4",
"autoprefixer": "^10.4.17", "autoprefixer": "^10.4.17",
"postcss": "^8.4.33", "postcss": "^8.4.33",
"prettier": "3.2.4", "prettier": "3.2.4",

View File

@ -5,7 +5,7 @@ import { Tooltip } from "antd"
import { BoxesIcon, CogIcon, EraserIcon, HistoryIcon } from "lucide-react" import { BoxesIcon, CogIcon, EraserIcon, HistoryIcon } from "lucide-react"
import { useTranslation } from "react-i18next" import { useTranslation } from "react-i18next"
export const SidepanelHeader = () => { export const SidepanelHeader = () => {
const { clearChat, isEmbedding, messages } = useMessage() const { clearChat, isEmbedding, messages, streaming } = useMessage()
const { t } = useTranslation(["sidepanel", "common"]) const { t } = useTranslation(["sidepanel", "common"])
return ( return (
@ -25,16 +25,15 @@ export const SidepanelHeader = () => {
<BoxesIcon className="h-5 w-5 text-gray-500 dark:text-gray-400 animate-bounce animate-infinite" /> <BoxesIcon className="h-5 w-5 text-gray-500 dark:text-gray-400 animate-bounce animate-infinite" />
</Tooltip> </Tooltip>
) : null} ) : null}
{messages.length > 0 && ( {messages.length > 0 && !streaming && (
<Tooltip title={t("tooltip.clear")}>
<button <button
title={t("tooltip.clear")}
onClick={() => { onClick={() => {
clearChat() clearChat()
}} }}
className="flex items-center space-x-1 focus:outline-none focus-visible:ring-2 focus-visible:ring-pink-700"> className="flex items-center space-x-1 focus:outline-none focus-visible:ring-2 focus-visible:ring-pink-700">
<EraserIcon className="h-5 w-5 text-gray-500 dark:text-gray-400" /> <EraserIcon className="h-5 w-5 text-gray-500 dark:text-gray-400" />
</button> </button>
</Tooltip>
)} )}
{/* <Tooltip title={t("tooltip.history")}> {/* <Tooltip title={t("tooltip.history")}>
<Link to="/history"> <Link to="/history">

View File

@ -81,6 +81,7 @@ export const useMessage = () => {
signal: AbortSignal, signal: AbortSignal,
embeddingSignal: AbortSignal embeddingSignal: AbortSignal
) => { ) => {
setStreaming(true)
const url = await getOllamaURL() const url = await getOllamaURL()
const ollama = new ChatOllama({ const ollama = new ChatOllama({
@ -320,6 +321,7 @@ export const useMessage = () => {
history: ChatHistory, history: ChatHistory,
signal: AbortSignal signal: AbortSignal
) => { ) => {
setStreaming(true)
const url = await getOllamaURL() const url = await getOllamaURL()
if (image.length > 0) { if (image.length > 0) {

View File

@ -1,17 +1,25 @@
import { defaultExtractContent } from "@/parser/default"
import { getPdf } from "./pdf" import { getPdf } from "./pdf"
import {
isTweet,
isTwitterTimeline,
parseTweet,
parseTwitterTimeline,
} from "@/parser/twitter"
import { isGoogleDocs, parseGoogleDocs } from "@/parser/google-docs"
import { cleanUnwantedUnicode } from "@/utils/clean"
const _getHtml = async () => { const _getHtml = () => {
const url = window.location.href const url = window.location.href
if (document.contentType === "application/pdf") { if (document.contentType === "application/pdf") {
return { url, content: "", type: "pdf" } return { url, content: "", type: "pdf" }
} }
const html = Array.from(document.querySelectorAll("script")).reduce(
(acc, script) => { return {
return acc.replace(script.outerHTML, "") content: document.documentElement.outerHTML,
}, url,
document.documentElement.outerHTML type: "html"
) }
return { url, content: html, type: "html" }
} }
export const getDataFromCurrentTab = async () => { export const getDataFromCurrentTab = async () => {
@ -34,7 +42,6 @@ export const getDataFromCurrentTab = async () => {
type: string type: string
}> }>
const { content, type, url } = await result const { content, type, url } = await result
if (type === "pdf") { if (type === "pdf") {
@ -47,31 +54,58 @@ export const getDataFromCurrentTab = async () => {
const pdf = await getPdf(data) const pdf = await getPdf(data)
for (let i = 1; i <= pdf.numPages; i += 1) { for (let i = 1; i <= pdf.numPages; i += 1) {
const page = await pdf.getPage(i); const page = await pdf.getPage(i)
const content = await page.getTextContent(); const content = await page.getTextContent()
if (content?.items.length === 0) { if (content?.items.length === 0) {
continue; continue
} }
const text = content?.items.map((item: any) => item.str).join("\n") const text = content?.items
.replace(/\x00/g, "").trim(); .map((item: any) => item.str)
.join("\n")
.replace(/\x00/g, "")
.trim()
pdfHtml.push({ pdfHtml.push({
content: text, content: text,
page: i page: i
}) })
} }
return { return {
url, url,
content: "", content: "",
pdf: pdfHtml, pdf: pdfHtml,
type: "pdf" type: "pdf"
} }
} }
if (isTwitterTimeline(url)) {
return { url, content, type, pdf: [] } const data = parseTwitterTimeline(content)
return {
url,
content: data,
type: "html",
pdf: []
}
} else if (isTweet(url)) {
const data = parseTweet(content)
return {
url,
content: data,
type: "html",
pdf: []
}
} else if (isGoogleDocs(url)) {
const data = await parseGoogleDocs()
if (data) {
return {
url,
content: cleanUnwantedUnicode(data),
type: "html",
pdf: []
}
}
}
const data = defaultExtractContent(content)
return { url, content: data, type, pdf: [] }
} }

View File

@ -59,24 +59,24 @@ export class PageAssistHtmlLoader
] ]
} }
let html = this.html // let html = this.html
if (isWikipedia(this.url)) { // if (isWikipedia(this.url)) {
console.log("Wikipedia URL detected") // console.log("Wikipedia URL detected")
html = parseWikipedia(html) // html = parseWikipedia(html)
}
// else if (isTwitter(this.url)) {
// console.log("Twitter URL detected")
// html = parseTweet(html, this.url)
// } // }
const htmlCompiler = compile({ // // else if (isTwitter(this.url)) {
wordwrap: false // // console.log("Twitter URL detected")
}) // // html = parseTweet(html, this.url)
const text = htmlCompiler(html) // // }
// const htmlCompiler = compile({
// wordwrap: false
// })
// const text = htmlCompiler(html)
const metadata = { source: this.url } const metadata = { source: this.url }
return [new Document({ pageContent: text, metadata })] return [new Document({ pageContent: this.html, metadata })]
} }
async loadByURL(): Promise<Document<Record<string, any>>[]> { async loadByURL(): Promise<Document<Record<string, any>>[]> {

10
src/parser/default.ts Normal file
View File

@ -0,0 +1,10 @@
import * as cheerio from "cheerio"
import TurndownService from "turndown"
// Single converter instance shared by every call; it is never reassigned.
const turndownService = new TurndownService()

/**
 * Converts a page's HTML into markdown.
 *
 * The content is taken from the first of these that yields non-empty HTML:
 * an element with `role="main"`, a `<main>` element, or the whole document.
 *
 * @param html - Raw HTML of the page.
 * @returns The selected content rendered as markdown.
 */
export const defaultExtractContent = (html: string) => {
  const $ = cheerio.load(html)
  // Turndown does not remove any elements by default, so the text inside
  // <script> and <style> would otherwise end up in the markdown output.
  $("script, style").remove()
  const mainContent = $('[role="main"]').html() || $("main").html() || $.html()
  return turndownService.turndown(mainContent)
}

119
src/parser/google-docs.ts Normal file
View File

@ -0,0 +1,119 @@
/**
 * Reports whether a URL points at a Google Docs document.
 *
 * @param url - The URL to inspect.
 * @returns `true` when the URL contains `docs.google.com/document`.
 */
export const isGoogleDocs = (url: string): boolean => {
  // No `g` flag: a global regex keeps `lastIndex` state between `.test()`
  // calls, which is only a hazard for a one-shot membership check.
  const GOOGLE_DOCS_REGEX = /docs\.google\.com\/document/
  return GOOGLE_DOCS_REGEX.test(url)
}
// Searches the object graph under `window.KX_kixApp` for the first value whose
// string form starts with "\x03" and returns it as `{ content }`.
// Returns `{ content: null }` when nothing matches or when anything throws.
// This function is passed as `func` to `chrome.scripting.executeScript` by
// `parseGoogleDocs`, so it runs in the page rather than in the extension.
// NOTE(review): the helper is presumably nested so the function stays
// self-contained when injected — confirm before extracting it.
const getGoogleDocs = () => {
try {
// Depth-limited walk over `obj`; collects every value accepted by `predicate`
// together with the property path that led to it.
function traverse(
obj: { [x: string]: any },
predicate: { (_: any, value: any): boolean; (arg0: any, arg1: any): any },
maxDepth: number,
// Evaluating this default throws if `obj` is null/undefined; the outer
// try/catch turns that into `{ content: null }`.
propNames = Object.getOwnPropertyNames(obj)
) {
// Objects already descended into, so shared or cyclic references are walked once.
const visited = new Set()
const results = []
// Counts every call to `traverseObj`.
let iterations = 0
const traverseObj = (
name: string,
value: unknown,
path: any[],
depth = 0
) => {
iterations++
// Skip `prototype` properties, Window objects and anything deeper than `maxDepth`.
if (name === "prototype" || value instanceof Window || depth > maxDepth)
return
const currentPath = [...path, name]
try {
// A match is recorded and not descended into any further.
if (predicate(name, value)) {
results.push({ path: currentPath, value })
return
}
// A throwing predicate is treated as "no match"; the walk continues below.
} catch (error) {}
if (value != null && !visited.has(value)) {
visited.add(value)
if (Array.isArray(value)) {
value.forEach((val, index) => {
try {
traverseObj(index.toString(), val, currentPath, depth + 1)
} catch (error) {}
})
} else if (value instanceof Object) {
// NOTE(review): for element-like values (nodeType === 1 with a string
// nodeName) the property names are taken from the root `obj`, not from
// `value` itself — confirm this is intentional.
const propNamesForValue =
value &&
// @ts-ignore
value.nodeType === 1 &&
// @ts-ignore
typeof value.nodeName === "string"
? Object.getOwnPropertyNames(obj)
: Object.getOwnPropertyNames(value)
propNamesForValue.forEach((prop) => {
try {
// Property reads can throw (e.g. getters); such properties are skipped.
traverseObj(prop, value[prop], currentPath, depth + 1)
} catch (error) {}
})
}
}
}
propNames.forEach((prop) => {
try {
traverseObj(prop, obj[prop], [])
} catch (error) {}
})
return { results, iterations }
}
// Walk at most 5 levels below the root object.
// NOTE(review): "\x03" is presumably the marker Google Docs places at the start
// of the document text — confirm.
const result = traverse(
// @ts-ignore
window.KX_kixApp,
(_: any, value: { toString: () => string }) =>
value && "\x03" === value.toString().charAt(0),
5
)
// Only the first match is used.
if (result.results?.[0]?.value) {
return {
content: result.results[0].value
}
}
return {
content: null
}
} catch (error) {
return {
content: null
}
}
}
/**
 * Runs `getGoogleDocs` inside the active tab (page "MAIN" world) and returns
 * the document text it finds.
 *
 * The returned promise always settles: when there is no active tab, the
 * injection fails, or the script yields no result, the content is `undefined`
 * instead of the call hanging forever.
 *
 * @returns The extracted document content, or a falsy value when none could
 * be read.
 */
export const parseGoogleDocs = async () => {
  const result = new Promise<{ content?: string }>((resolve) => {
    chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => {
      try {
        const tab = tabs[0]
        if (tab?.id === undefined) {
          // No active tab to inject into.
          resolve({})
          return
        }
        const data = await chrome.scripting.executeScript({
          target: { tabId: tab.id },
          world: "MAIN",
          func: getGoogleDocs
        })
        // An empty result array or a null result must still settle the promise.
        resolve((data[0]?.result ?? {}) as { content?: string })
      } catch (error) {
        // A rejection inside this async callback would otherwise go unhandled
        // and leave the outer promise pending.
        console.error("parseGoogleDocs: failed to read the document", error)
        resolve({})
      }
    })
  })
  const { content } = await result
  return content
}

View File

@ -0,0 +1,5 @@
import * as cheerio from 'cheerio';
// Unfinished stub: loads the HTML into cheerio but never uses the result,
// so it currently returns undefined.
// TODO: implement the sheet extraction or remove this export.
export const parseGoogleSheets = (html: string) => {
const $ = cheerio.load(html);
};

View File

@ -2,89 +2,101 @@ import * as cheerio from "cheerio"
export const isTweet = (url: string) => { export const isTweet = (url: string) => {
const TWEET_REGEX = /twitter\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g const TWEET_REGEX = /twitter\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g
return TWEET_REGEX.test(url) const X_REGEX = /x\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g
return TWEET_REGEX.test(url) || X_REGEX.test(url)
}
export const isTwitterTimeline = (url: string) => {
return url === "https://twitter.com/home" || url === "https://x.com/home"
} }
export const isTwitterProfile = (url: string) => { export const isTwitterProfile = (url: string) => {
const PROFILE_REGEX = /twitter\.com\/[a-zA-Z0-9_]+/g const PROFILE_REGEX = /twitter\.com\/[a-zA-Z0-9_]+/g
return PROFILE_REGEX.test(url) const X_REGEX = /x\.com\/[a-zA-Z0-9_]+/g
} return PROFILE_REGEX.test(url) || X_REGEX.test(url)
export const isTwitterTimeline = (url: string) => {
const TIMELINE_REGEX = /twitter\.com\/home/g
return TIMELINE_REGEX.test(url)
}
export const isTwitter = (url: string) => {
return isTweet(url) || isTwitterProfile(url) || isTwitterTimeline(url)
}
export const isTwitterNotification = (url: string) => {
const NOTIFICATION_REGEX = /twitter\.com\/notifications/g
return NOTIFICATION_REGEX.test(url)
}
export const parseTweet = (html: string, url: string) => {
if (!html) {
return ""
} }
export const parseTwitterTimeline = (html: string) => {
const $ = cheerio.load(html) const $ = cheerio.load(html)
const postElements = $("[data-testid=tweetText]")
const authorElements = $("[data-testid=User-Name]")
if (isTweet(url)) { const posts = postElements
console.log("tweet") .map((index, element) => {
const tweet = $("div[data-testid='tweet']") const post = $(element).text()
const tweetContent = tweet.find("div[lang]") const author = $(authorElements[index]).text()
const tweetMedia = tweet.find("div[role='group']") return {
const author = tweet.find("a[role='link']").text() author,
const date = tweet.find("time").text() post
return `<div>${author} ${tweetContent.text()} ${tweetMedia.html()} ${date}</div>`
} }
if (isTwitterTimeline(url)) {
console.log("timeline")
const timeline = $("div[data-testid='primaryColumn']")
const timelineContent = timeline.find("div[data-testid='tweet']")
console.log(timelineContent.html())
const tweet = timelineContent
.map((i, el) => {
const author = $(el).find("a[role='link']").text()
const content = $(el).find("div[lang]").text()
const media = $(el).find("div[role='group']").html()
const date = $(el).find("time").text()
return `<div>${author} ${content} ${media} ${date}</div>`
}) })
.get() .get()
.join("")
console.log(tweet)
return `<div>${tweet}</div>`
}
if (isTwitterNotification(url)) { return posts
console.log("notification") .map((post) => {
const notification = $("div[data-testid='primaryColumn']") return `## Author: ${post.author}\n\n${post.post}\n\n---\n\n`
const notificationContent = notification.find("div[data-testid='tweet']")
return `<div>${notificationContent.html()}</div>`
}
if (isTwitterProfile(url)) {
console.log("profile")
const profile = $("div[data-testid='primaryColumn']")
const profileContent = profile.find(
"div[data-testid='UserProfileHeader_Items']"
)
const profileTweets = profile.find("div[data-testid='tweet']")
return `<div>${profileContent.html()}</div><div>${profileTweets.html()}</div>`
}
console.log("no match")
const timeline = $("div[data-testid='primaryColumn']")
const timelineContent = timeline.find("div[data-testid='tweet']")
const tweet = timelineContent.map((i, el) => {
const author = $(el).find("a[role='link']").text()
const content = $(el).find("div[lang]").text()
const media = $(el).find("div[role='group']").html()
const date = $(el).find("time").text()
return `<div>${author} ${content} ${media} ${date}</div>`
}) })
.filter((value, index, self) => self.indexOf(value) === index)
return `<div>${tweet}</div>` .join("\n")
}
/**
 * Converts the HTML of a tweet page into markdown.
 *
 * Every `[data-testid=tweetText]` element becomes one section; the first is
 * labelled "Post:" and all later ones "Reply:". Authors are taken from the
 * `[data-testid=User-Name]` element at the same index.
 *
 * @param html - Raw HTML of the tweet page.
 * @returns Markdown with one `## Author:` section per tweet found.
 */
export const parseTweet = (html: string) => {
  const $ = cheerio.load(html)
  const postElements = $("[data-testid=tweetText]")
  const authorElements = $("[data-testid=User-Name]")

  const posts = postElements
    .map((index, element) => {
      return {
        author: $(authorElements[index]).text(),
        post: $(element).text(),
        isReply: index !== 0
      }
    })
    .get()

  return posts
    .map((post) => {
      // "## " (with the space) is required for a markdown heading and matches
      // the format produced by parseTwitterTimeline.
      return `## Author: ${post.author}\n\n${post.isReply ? "Reply:" : "Post:"} ${post.post}\n\n---\n\n`
    })
    .join("\n")
}
// Builds a markdown summary of a profile page: header fields read from the
// `[data-testid=UserProfileHeader_Items]` element, followed by every post
// found on the page.
export const parseTweetProfile = (html: string) => {
const $ = cheerio.load(html)
const profileName = $("[data-testid=UserProfileHeader_Items]")
.find("h1")
.text()
const profileBio = $("[data-testid=UserProfileHeader_Items]").find("p").text()
// NOTE(review): location and join date use the same selector, so both hold
// the concatenated text of every <span> in the header — confirm the intended
// selectors.
const profileLocation = $("[data-testid=UserProfileHeader_Items]")
.find("span")
.text()
const profileJoinDate = $("[data-testid=UserProfileHeader_Items]")
.find("span")
.text()
// NOTE(review): followers and following also share one selector and will
// therefore always be equal — confirm the intended selectors.
const profileFollowers = $(
"[data-testid=UserProfileHeader_Items] span"
).text()
const profileFollowing = $(
"[data-testid=UserProfileHeader_Items] span"
).text()
const postElements = $("[data-testid=tweetText]")
const authorElements = $("[data-testid=User-Name]")
// Posts and authors are paired purely by their index in the two result sets.
const posts = postElements
.map((index, element) => {
const post = $(element).text()
const author = $(authorElements[index]).text()
return {
author,
post
}
})
.get()
return `## Profile: ${profileName}\n\nBio: ${profileBio}\n\nLocation: ${profileLocation}\n\nJoin Date: ${profileJoinDate}\n\nFollowers: ${profileFollowers}\n\nFollowing: ${profileFollowing}\n\nPosts: ${posts.map((post) => `Author: ${post.author}\n\nPost: ${post.post}\n\n---\n\n`).join("\n")}`
} }

4
src/utils/clean.ts Normal file
View File

@ -0,0 +1,4 @@
/**
 * Removes the characters U+200B–U+200D and U+FEFF from the text, then trims
 * leading and trailing whitespace.
 *
 * @param text - The text to clean.
 * @returns The cleaned, trimmed text.
 */
export const cleanUnwantedUnicode = (text: string) =>
  text.replace(/[\u200B-\u200D\uFEFF]/g, "").trim()