feat: Add ElevenLabs TTS provider and response splitting options

n4ze3m
2024-12-28 20:10:50 +05:30
parent aa49f03f63
commit 3ddb7f1ad8
7 changed files with 454 additions and 57 deletions


@@ -1,44 +1,101 @@
 import { useEffect, useState } from "react"
 import { notification } from "antd"
-import { getVoice, isSSMLEnabled } from "@/services/tts"
+import {
+  getElevenLabsApiKey,
+  getElevenLabsModel,
+  getElevenLabsVoiceId,
+  getTTSProvider,
+  getVoice,
+  isSSMLEnabled
+} from "@/services/tts"
 import { markdownToSSML } from "@/utils/markdown-to-ssml"
-type VoiceOptions = {
+import { generateSpeech } from "@/services/elevenlabs"
+import { splitMessageContent } from "@/utils/tts"
+export interface VoiceOptions {
   utterance: string
 }
 export const useTTS = () => {
   const [isSpeaking, setIsSpeaking] = useState(false)
+  const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(
+    null
+  )
   const speak = async ({ utterance }: VoiceOptions) => {
     try {
       const voice = await getVoice()
-      const isSSML = await isSSMLEnabled()
-      if (isSSML) {
-        utterance = markdownToSSML(utterance)
-      }
-      if (import.meta.env.BROWSER === "chrome") {
-        chrome.tts.speak(utterance, {
-          voiceName: voice,
-          onEvent(event) {
-            if (event.type === "start") {
-              setIsSpeaking(true)
-            } else if (event.type === "end") {
-              setIsSpeaking(false)
-            }
-          }
-        })
-      } else {
-        // browser tts
-        window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
-        window.speechSynthesis.onvoiceschanged = () => {
-          const voices = window.speechSynthesis.getVoices()
-          const voice = voices.find((v) => v.name === voice)
-          const utter = new SpeechSynthesisUtterance(utterance)
-          utter.voice = voice
-          window.speechSynthesis.speak(utter)
+      const provider = await getTTSProvider()
+      if (provider === "browser") {
+        const isSSML = await isSSMLEnabled()
+        if (isSSML) {
+          utterance = markdownToSSML(utterance)
+        }
+        if (import.meta.env.BROWSER === "chrome") {
+          chrome.tts.speak(utterance, {
+            voiceName: voice,
+            onEvent(event) {
+              if (event.type === "start") {
+                setIsSpeaking(true)
+              } else if (event.type === "end") {
+                setIsSpeaking(false)
+              }
+            }
+          })
+        } else {
+          window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
+          window.speechSynthesis.onvoiceschanged = () => {
+            const voices = window.speechSynthesis.getVoices()
+            // Look up the configured voice by name; a distinct identifier
+            // is needed here so the outer `voice` is not shadowed.
+            const matched = voices.find((v) => v.name === voice)
+            const utter = new SpeechSynthesisUtterance(utterance)
+            utter.voice = matched ?? null
+            window.speechSynthesis.speak(utter)
+          }
+        }
+      } else if (provider === "elevenlabs") {
+        const apiKey = await getElevenLabsApiKey()
+        const modelId = await getElevenLabsModel()
+        const voiceId = await getElevenLabsVoiceId()
+        if (!apiKey || !modelId || !voiceId) {
+          throw new Error("Missing ElevenLabs configuration")
+        }
+        const sentences = splitMessageContent(utterance)
+        // Prefetch the next chunk while the current one plays; holding the
+        // pending promise makes the result available on the next iteration.
+        let nextAudioPromise: Promise<ArrayBuffer | null> | null = null
+        for (let i = 0; i < sentences.length; i++) {
+          setIsSpeaking(true)
+          const currentAudioData =
+            (await nextAudioPromise) ??
+            (await generateSpeech(apiKey, sentences[i], voiceId, modelId))
+          nextAudioPromise =
+            i < sentences.length - 1
+              ? generateSpeech(apiKey, sentences[i + 1], voiceId, modelId).catch(
+                  (err) => {
+                    // Prefetch failures fall back to a fresh request above.
+                    console.error(err)
+                    return null
+                  }
+                )
+              : null
+          const blob = new Blob([currentAudioData], { type: "audio/mpeg" })
+          const url = URL.createObjectURL(blob)
+          const audio = new Audio(url)
+          setAudioElement(audio)
+          await new Promise((resolve) => {
+            audio.onended = resolve
+            audio.play()
+          })
+          URL.revokeObjectURL(url)
+        }
+        setIsSpeaking(false)
+        setAudioElement(null)
+      }
     } catch (error) {
+      setIsSpeaking(false)
+      setAudioElement(null)
       notification.error({
         message: "Error",
         description: "Something went wrong while trying to play the audio"
@@ -47,6 +104,14 @@ export const useTTS = () => {
   }
 
   const cancel = () => {
+    // Stop ElevenLabs playback first, if an <audio> element is active.
+    if (audioElement) {
+      audioElement.pause()
+      audioElement.currentTime = 0
+      setAudioElement(null)
+      setIsSpeaking(false)
+      return
+    }
     if (import.meta.env.BROWSER === "chrome") {
       chrome.tts.stop()
     } else {
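
The new ElevenLabs path leans on two helpers added elsewhere in this commit but not shown above. splitMessageContent (from @/utils/tts) breaks a response into chunks so synthesis can begin before the whole reply is processed. Its implementation is not visible in this file, so the following is only a rough sketch, assuming sentence-boundary splitting with a hypothetical maximum chunk length:

// Hypothetical sketch only — the helper shipped in @/utils/tts may differ.
// Splits on sentence boundaries, then packs sentences into chunks of at
// most maxLength characters so each TTS request stays small.
export const splitMessageContent = (
  text: string,
  maxLength = 500
): string[] => {
  const sentences = text.match(/[^.!?]+[.!?]+(\s+|$)|[^.!?]+$/g) ?? [text]
  const chunks: string[] = []
  let current = ""
  for (const sentence of sentences) {
    if (current && (current + sentence).length > maxLength) {
      chunks.push(current.trim())
      current = ""
    }
    current += sentence
  }
  if (current.trim()) {
    chunks.push(current.trim())
  }
  return chunks
}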
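
generateSpeech (from @/services/elevenlabs) is the other new helper. Judging by the call sites, it takes the API key, a text chunk, a voice id, and a model id, and resolves to raw audio bytes. A plausible sketch against ElevenLabs' public text-to-speech endpoint follows; treat the request shape as an assumption rather than the commit's actual wrapper:

// Sketch of @/services/elevenlabs — endpoint and payload follow ElevenLabs'
// public TTS API; the wrapper added by this commit may differ.
export const generateSpeech = async (
  apiKey: string,
  text: string,
  voiceId: string,
  modelId: string
): Promise<ArrayBuffer> => {
  const res = await fetch(
    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
    {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({ text, model_id: modelId })
    }
  )
  if (!res.ok) {
    throw new Error(`ElevenLabs request failed: ${res.status}`)
  }
  // The endpoint returns audio/mpeg, which the hook wraps in a Blob URL.
  return await res.arrayBuffer()
}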
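
For context, consuming the hook could look like the sketch below. The returned shape is assumed from the state and functions defined above, since the hook's return statement falls outside the visible diff, and the import path is illustrative:

import { useTTS } from "@/hooks/useTTS" // path assumed

const SpeakButton = ({ text }: { text: string }) => {
  const { speak, cancel, isSpeaking } = useTTS()
  return (
    <button
      onClick={() =>
        isSpeaking ? cancel() : speak({ utterance: text })
      }>
      {isSpeaking ? "Stop" : "Speak"}
    </button>
  )
}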