Update dependencies and fix whitespace formatting in isTTSEnabled function in tts.ts

2024-04-15 11:32:30 +05:30
parent 476323d928
commit c914233610
11 changed files with 302 additions and 115 deletions
--- a/src/libs/get-html.ts
+++ b/src/libs/get-html.ts
@@ -1,17 +1,25 @@
+import { defaultExtractContent } from "@/parser/default"
 import { getPdf } from "./pdf"
+import {
+  isTweet,
+  isTwitterTimeline,
+  parseTweet,
+  parseTwitterTimeline,
+} from "@/parser/twitter"
+import { isGoogleDocs, parseGoogleDocs } from "@/parser/google-docs"
+import { cleanUnwantedUnicode } from "@/utils/clean"

-const _getHtml = async () => {
+const _getHtml = () => {
  const url = window.location.href
  if (document.contentType === "application/pdf") {
    return { url, content: "", type: "pdf" }
  }
-  const html = Array.from(document.querySelectorAll("script")).reduce(
-    (acc, script) => {
-      return acc.replace(script.outerHTML, "")
-    },
-    document.documentElement.outerHTML
-  )
-  return { url, content: html, type: "html" }
+
+  return {
+    content: document.documentElement.outerHTML,
+    url,
+    type: "html"
+  }
 }

 export const getDataFromCurrentTab = async () => {
@@ -34,7 +42,6 @@ export const getDataFromCurrentTab = async () => {
    type: string
  }>

-
  const { content, type, url } = await result

  if (type === "pdf") {
@@ -47,31 +54,58 @@ export const getDataFromCurrentTab = async () => {
    const pdf = await getPdf(data)

    for (let i = 1; i <= pdf.numPages; i += 1) {
-      const page = await pdf.getPage(i);
-      const content = await page.getTextContent();
+      const page = await pdf.getPage(i)
+      const content = await page.getTextContent()

      if (content?.items.length === 0) {
-        continue;
+        continue
      }

-      const text = content?.items.map((item: any) => item.str).join("\n")
-        .replace(/\x00/g, "").trim();
+      const text = content?.items
+        .map((item: any) => item.str)
+        .join("\n")
+        .replace(/\x00/g, "")
+        .trim()
      pdfHtml.push({
        content: text,
        page: i
      })
    }

-
    return {
      url,
      content: "",
      pdf: pdfHtml,
      type: "pdf"
    }
-
  }
-
-  return { url, content, type, pdf: [] }
+  if (isTwitterTimeline(url)) {
+    const data = parseTwitterTimeline(content)
+    return {
+      url,
+      content: data,
+      type: "html",
+      pdf: []
+    }
+  } else if (isTweet(url)) {
+    const data = parseTweet(content)
+    return {
+      url,
+      content: data,
+      type: "html",
+      pdf: []
+    }
+  } else if (isGoogleDocs(url)) {
+    const data = await parseGoogleDocs()
+    if (data) {
+      return {
+        url,
+        content: cleanUnwantedUnicode(data),
+        type: "html",
+        pdf: []
+      }
+    }
+  }
+  const data = defaultExtractContent(content)
+  return { url, content: data, type, pdf: [] }
 }
-