>[]> {
@@ -110,7 +110,7 @@ export class PageAssistHtmlLoader
console.log("Wikipedia URL detected")
html = parseWikipedia(await fetchHTML.text())
}
-
+
// else if (isTwitter(this.url)) {
// console.log("Twitter URL detected")
// html = parseTweet(await fetchHTML.text(), this.url)
diff --git a/src/parser/default.ts b/src/parser/default.ts
new file mode 100644
index 0000000..e15c9ad
--- /dev/null
+++ b/src/parser/default.ts
@@ -0,0 +1,10 @@
+import * as cheerio from "cheerio"
+import TurndownService from "turndown"
+let turndownService = new TurndownService()
+
+export const defaultExtractContent = (html: string) => {
+  const doc = cheerio.load(html)
+  // Prefer the ARIA main landmark, then <main>, then the whole document.
+  const source = doc('[role="main"]').html() || doc("main").html() || doc.html()
+  return turndownService.turndown(source)
+}
diff --git a/src/parser/google-docs.ts b/src/parser/google-docs.ts
new file mode 100644
index 0000000..23e7316
--- /dev/null
+++ b/src/parser/google-docs.ts
@@ -0,0 +1,119 @@
+
+export const isGoogleDocs = (url: string) => {
+  // True when "docs.google.com/document" appears anywhere in the URL.
+  return /docs\.google\.com\/document/g.test(url)
+}
+
+const getGoogleDocs = () => { // Searches window.KX_kixApp for the first value whose string form starts with "\x03"; returns { content: value }, or { content: null } when nothing matches or anything throws.
+ try {
+ function traverse( // nested inside getGoogleDocs — presumably so the function stays self-contained when passed as `func` to executeScript; confirm before hoisting
+ obj: { [x: string]: any }, // root object whose properties are searched
+ predicate: { (_: any, value: any): boolean; (arg0: any, arg1: any): any }, // called as predicate(name, value); a truthy result records the value and stops descending into it
+ maxDepth: number, // values nested deeper than this are skipped
+ propNames = Object.getOwnPropertyNames(obj) // root properties to start from
+ ) {
+ const visited = new Set() // values already expanded, so shared or cyclic references are walked once
+ const results = [] // matches as { path, value }, in discovery order
+ let iterations = 0 // number of traverseObj calls made
+
+ const traverseObj = (
+ name: string,
+ value: unknown,
+ path: any[],
+ depth = 0
+ ) => {
+ iterations++
+ if (name === "prototype" || value instanceof Window || depth > maxDepth) // skip prototypes, Window objects and anything past maxDepth
+ return
+
+ const currentPath = [...path, name]
+
+ try {
+ if (predicate(name, value)) {
+ results.push({ path: currentPath, value })
+ return
+ }
+ } catch (error) {} // a throwing predicate is treated as "no match"
+
+ if (value != null && !visited.has(value)) {
+ visited.add(value)
+ if (Array.isArray(value)) {
+ value.forEach((val, index) => {
+ try {
+ traverseObj(index.toString(), val, currentPath, depth + 1)
+ } catch (error) {} // errors while walking one element do not stop the others
+ })
+ } else if (value instanceof Object) {
+ const propNamesForValue =
+ value &&
+ // @ts-ignore
+ value.nodeType === 1 &&
+ // @ts-ignore
+ typeof value.nodeName === "string"
+ ? Object.getOwnPropertyNames(obj) // NOTE(review): for values that look like element nodes this lists the ROOT obj's property names, not value's — confirm this is intended
+ : Object.getOwnPropertyNames(value)
+
+ propNamesForValue.forEach((prop) => {
+ try {
+ traverseObj(prop, value[prop], currentPath, depth + 1)
+ } catch (error) {} // property reads that throw are skipped
+ })
+ }
+ }
+ }
+
+ propNames.forEach((prop) => {
+ try {
+ traverseObj(prop, obj[prop], [])
+ } catch (error) {}
+ })
+
+ return { results, iterations }
+ }
+
+ const result = traverse(
+ // @ts-ignore
+ window.KX_kixApp,
+ (_: any, value: { toString: () => string }) =>
+ value && "\x03" === value.toString().charAt(0), // match values whose text begins with the U+0003 control character
+ 5 // maxDepth
+ )
+ if (result.results?.[0]?.value) { // only the first match is used
+ return {
+ content: result.results[0].value
+ }
+ }
+
+ return {
+ content: null
+ }
+ } catch (error) { // any failure (e.g. KX_kixApp missing) is reported as "no content"
+ return {
+ content: null
+ }
+ }
+}
+
+// Returns the content getGoogleDocs finds in the active tab, running it in
+// the page's MAIN world. Always settles: a missing tab, a failed injection
+// or an empty result yields null content instead of hanging forever.
+export const parseGoogleDocs = async () => {
+  const result = new Promise((resolve) => {
+    chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => {
+      const tabId = tabs?.[0]?.id
+      if (tabId === undefined) return resolve({ content: null })
+      try {
+        const data = await chrome.scripting.executeScript({
+          target: { tabId },
+          world: "MAIN",
+          func: getGoogleDocs
+        })
+        resolve(data[0]?.result ?? { content: null })
+      } catch {
+        resolve({ content: null })
+      }
+    })
+  }) as Promise<{ content?: string }>
+  const { content } = await result
+  return content
+}
diff --git a/src/parser/google-sheets.ts b/src/parser/google-sheets.ts
new file mode 100644
index 0000000..21f3120
--- /dev/null
+++ b/src/parser/google-sheets.ts
@@ -0,0 +1,5 @@
+import * as cheerio from 'cheerio';
+
+export const parseGoogleSheets = (html: string) => {
+  cheerio.load(html); // placeholder: the HTML is parsed but nothing is extracted or returned yet
+};
\ No newline at end of file
diff --git a/src/parser/twitter.ts b/src/parser/twitter.ts
index a998945..cfbbd3d 100644
--- a/src/parser/twitter.ts
+++ b/src/parser/twitter.ts
@@ -2,89 +2,101 @@ import * as cheerio from "cheerio"
export const isTweet = (url: string) => {
const TWEET_REGEX = /twitter\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/g
- return TWEET_REGEX.test(url)
+  const X_REGEX = /(?:^|[\/.])x\.com\/[a-zA-Z0-9_]+\/status\/[0-9]+/ // host-anchored (string start, "/" or "." before it) so domains merely ending in "x.com", e.g. dropbox.com, do not match
+  return TWEET_REGEX.test(url) || X_REGEX.test(url) // true for twitter.com or x.com single-status URLs
+}
+
+export const isTwitterTimeline = (url: string) => {
+  return ["https://twitter.com/home", "https://x.com/home"].includes(url) // exact match only: no query string, hash or trailing slash
}
export const isTwitterProfile = (url: string) => {
const PROFILE_REGEX = /twitter\.com\/[a-zA-Z0-9_]+/g
- return PROFILE_REGEX.test(url)
+  const X_REGEX = /(?:^|[\/.])x\.com\/[a-zA-Z0-9_]+/ // host-anchored (string start, "/" or "." before it) so e.g. "https://dropbox.com/home" is no longer mistaken for an x.com profile
+  return PROFILE_REGEX.test(url) || X_REGEX.test(url) // NOTE(review): also true for status and /home URLs — callers presumably test isTweet / isTwitterTimeline first; confirm
}
-export const isTwitterTimeline = (url: string) => {
- const TIMELINE_REGEX = /twitter\.com\/home/g
- return TIMELINE_REGEX.test(url)
+export const parseTwitterTimeline = (html: string) => {
+  // Renders each tweet text found in the timeline HTML as a markdown section
+  // and joins the distinct sections in first-seen order.
+  const $ = cheerio.load(html)
+  const textNodes = $("[data-testid=tweetText]")
+  const nameNodes = $("[data-testid=User-Name]")
+  const sections: string[] = []
+
+  // NOTE(review): text and author nodes are paired by position only —
+  // presumably both lists line up one-to-one; confirm against real markup.
+  textNodes.each((position, textNode) => {
+    const post = $(textNode).text()
+    const author = $(nameNodes[position]).text()
+    sections.push(`## Author: ${author}\n\n${post}\n\n---\n\n`)
+  })
+  const seen = new Set<string>()
+  const unique = sections.filter((section) => {
+    if (seen.has(section)) return false
+    seen.add(section)
+    return true
+  })
+  return unique.join("\n")
}
-export const isTwitter = (url: string) => {
- return isTweet(url) || isTwitterProfile(url) || isTwitterTimeline(url)
+export const parseTweet = (html: string) => {
+  // Formats the tweets on a status page as markdown sections: the first
+  // tweetText node is labelled "Post:", every later one "Reply:".
+
+  const $ = cheerio.load(html)
+  const postElements = $("[data-testid=tweetText]")
+  const authorElements = $("[data-testid=User-Name]")
+
+  const sections = postElements
+    .map((index, element) => {
+      const post = $(element).text()
+      // NOTE(review): authors are matched to posts by position only — confirm
+      // both node lists line up on real pages.
+      const author = $(authorElements[index]).text()
+      const label = index !== 0 ? "Reply:" : "Post:"
+      // "## " needs the space: markdown only treats a "#" run followed by a
+      // space as a heading (this also matches parseTwitterTimeline's output).
+      return `## Author: ${author}\n\n${label} ${post}\n\n---\n\n`
+    })
+    .get()
+
+  return sections.join("\n")
}
-export const isTwitterNotification = (url: string) => {
- const NOTIFICATION_REGEX = /twitter\.com\/notifications/g
- return NOTIFICATION_REGEX.test(url)
-}
-
-export const parseTweet = (html: string, url: string) => {
- if (!html) {
- return ""
- }
-
+export const parseTweetProfile = (html: string) => { // Builds a markdown summary of a profile page: header fields first, then every post found in the HTML.
const $ = cheerio.load(html)
- if (isTweet(url)) {
- console.log("tweet")
- const tweet = $("div[data-testid='tweet']")
- const tweetContent = tweet.find("div[lang]")
- const tweetMedia = tweet.find("div[role='group']")
- const author = tweet.find("a[role='link']").text()
- const date = tweet.find("time").text()
- return `${author} ${tweetContent.text()} ${tweetMedia.html()} ${date}
`
- }
+ const profileName = $("[data-testid=UserProfileHeader_Items]") // text of any <h1> inside the header-items element
+ .find("h1")
+ .text()
+ const profileBio = $("[data-testid=UserProfileHeader_Items]").find("p").text() // text of any <p> inside the same element
+ const profileLocation = $("[data-testid=UserProfileHeader_Items]") // NOTE(review): location and join date run the identical query (all <span> text under the header items), so both always hold the same string
+ .find("span")
+ .text()
+ const profileJoinDate = $("[data-testid=UserProfileHeader_Items]")
+ .find("span")
+ .text()
+ const profileFollowers = $( // NOTE(review): followers and following share one selector that targets the same spans as above, so these also repeat that text — presumably placeholders; confirm intended selectors
+ "[data-testid=UserProfileHeader_Items] span"
+ ).text()
+ const profileFollowing = $(
+ "[data-testid=UserProfileHeader_Items] span"
+ ).text()
- if (isTwitterTimeline(url)) {
- console.log("timeline")
- const timeline = $("div[data-testid='primaryColumn']")
- const timelineContent = timeline.find("div[data-testid='tweet']")
- console.log(timelineContent.html())
- const tweet = timelineContent
- .map((i, el) => {
- const author = $(el).find("a[role='link']").text()
- const content = $(el).find("div[lang]").text()
- const media = $(el).find("div[role='group']").html()
- const date = $(el).find("time").text()
- return `${author} ${content} ${media} ${date}
`
- })
- .get()
- .join("")
- console.log(tweet)
- return `${tweet}
`
- }
+ const postElements = $("[data-testid=tweetText]") // tweet bodies, in document order
+ const authorElements = $("[data-testid=User-Name]") // author name blocks, in document order
- if (isTwitterNotification(url)) {
- console.log("notification")
- const notification = $("div[data-testid='primaryColumn']")
- const notificationContent = notification.find("div[data-testid='tweet']")
- return `${notificationContent.html()}
`
- }
- if (isTwitterProfile(url)) {
- console.log("profile")
- const profile = $("div[data-testid='primaryColumn']")
- const profileContent = profile.find(
- "div[data-testid='UserProfileHeader_Items']"
- )
- const profileTweets = profile.find("div[data-testid='tweet']")
- return `${profileContent.html()}
${profileTweets.html()}
`
- }
- console.log("no match")
- const timeline = $("div[data-testid='primaryColumn']")
- const timelineContent = timeline.find("div[data-testid='tweet']")
- const tweet = timelineContent.map((i, el) => {
- const author = $(el).find("a[role='link']").text()
- const content = $(el).find("div[lang]").text()
- const media = $(el).find("div[role='group']").html()
- const date = $(el).find("time").text()
- return `${author} ${content} ${media} ${date}
`
- })
+ const posts = postElements
+ .map((index, element) => {
+ const post = $(element).text()
+ const author = $(authorElements[index]).text() // paired with the post by position only; a missing author node yields ""
+ return {
+ author,
+ post
+ }
+ })
+ .get() // unwrap the cheerio collection into a plain array
- return `${tweet}
`
+ return `## Profile: ${profileName}\n\nBio: ${profileBio}\n\nLocation: ${profileLocation}\n\nJoin Date: ${profileJoinDate}\n\nFollowers: ${profileFollowers}\n\nFollowing: ${profileFollowing}\n\nPosts: ${posts.map((post) => `Author: ${post.author}\n\nPost: ${post.post}\n\n---\n\n`).join("\n")}`
}
diff --git a/src/utils/clean.ts b/src/utils/clean.ts
new file mode 100644
index 0000000..e49c44d
--- /dev/null
+++ b/src/utils/clean.ts
@@ -0,0 +1,4 @@
+export const cleanUnwantedUnicode = (text: string) => {
+  // Drops zero-width characters (U+200B–U+200D) and the BOM (U+FEFF), then trims.
+  return text.replace(/[\u200B-\u200D\uFEFF]/g, "").trim()
+}