feat: Add ElevenLabs TTS provider and response splitting options

n4ze3m
2024-12-28 20:10:50 +05:30
parent aa49f03f63
commit 3ddb7f1ad8
7 changed files with 454 additions and 57 deletions


@@ -1,44 +1,101 @@
 import { useEffect, useState } from "react"
 import { notification } from "antd"
-import { getVoice, isSSMLEnabled } from "@/services/tts"
+import {
+  getElevenLabsApiKey,
+  getElevenLabsModel,
+  getElevenLabsVoiceId,
+  getTTSProvider,
+  getVoice,
+  isSSMLEnabled
+} from "@/services/tts"
 import { markdownToSSML } from "@/utils/markdown-to-ssml"
-type VoiceOptions = {
+import { generateSpeech } from "@/services/elevenlabs"
+import { splitMessageContent } from "@/utils/tts"
+export interface VoiceOptions {
   utterance: string
 }
 export const useTTS = () => {
   const [isSpeaking, setIsSpeaking] = useState(false)
+  const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(
+    null
+  )
   const speak = async ({ utterance }: VoiceOptions) => {
     try {
       const voice = await getVoice()
-      const isSSML = await isSSMLEnabled()
-      if (isSSML) {
-        utterance = markdownToSSML(utterance)
-      }
-      if (import.meta.env.BROWSER === "chrome") {
-        chrome.tts.speak(utterance, {
-          voiceName: voice,
-          onEvent(event) {
-            if (event.type === "start") {
-              setIsSpeaking(true)
-            } else if (event.type === "end") {
-              setIsSpeaking(false)
-            }
-          }
-        })
-      } else {
-        // browser tts
-        window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
-        window.speechSynthesis.onvoiceschanged = () => {
-          const voices = window.speechSynthesis.getVoices()
-          const voice = voices.find((v) => v.name === voice)
-          const utter = new SpeechSynthesisUtterance(utterance)
-          utter.voice = voice
-          window.speechSynthesis.speak(utter)
+      const provider = await getTTSProvider()
+      if (provider === "browser") {
+        const isSSML = await isSSMLEnabled()
+        if (isSSML) {
+          utterance = markdownToSSML(utterance)
+        }
+        if (import.meta.env.BROWSER === "chrome") {
+          chrome.tts.speak(utterance, {
+            voiceName: voice,
+            onEvent(event) {
+              if (event.type === "start") {
+                setIsSpeaking(true)
+              } else if (event.type === "end") {
+                setIsSpeaking(false)
+              }
+            }
+          })
+        } else {
+          window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
+          window.speechSynthesis.onvoiceschanged = () => {
+            const voices = window.speechSynthesis.getVoices()
+            // Look up the configured voice by name; a distinct identifier
+            // is needed here so the outer `voice` is not shadowed.
+            const matched = voices.find((v) => v.name === voice)
+            const utter = new SpeechSynthesisUtterance(utterance)
+            utter.voice = matched ?? null
+            window.speechSynthesis.speak(utter)
+          }
+        }
+      } else if (provider === "elevenlabs") {
+        const apiKey = await getElevenLabsApiKey()
+        const modelId = await getElevenLabsModel()
+        const voiceId = await getElevenLabsVoiceId()
+        if (!apiKey || !modelId || !voiceId) {
+          throw new Error("Missing ElevenLabs configuration")
+        }
+        const sentences = splitMessageContent(utterance)
+        // Prefetch the next chunk while the current one plays; holding the
+        // pending promise makes the result available on the next iteration.
+        let nextAudioPromise: Promise<ArrayBuffer | null> | null = null
+        for (let i = 0; i < sentences.length; i++) {
+          setIsSpeaking(true)
+          const currentAudioData =
+            (await nextAudioPromise) ??
+            (await generateSpeech(apiKey, sentences[i], voiceId, modelId))
+          nextAudioPromise =
+            i < sentences.length - 1
+              ? generateSpeech(apiKey, sentences[i + 1], voiceId, modelId).catch(
+                  (err) => {
+                    // Prefetch failures fall back to a fresh request above.
+                    console.error(err)
+                    return null
+                  }
+                )
+              : null
+          const blob = new Blob([currentAudioData], { type: "audio/mpeg" })
+          const url = URL.createObjectURL(blob)
+          const audio = new Audio(url)
+          setAudioElement(audio)
+          await new Promise((resolve) => {
+            audio.onended = resolve
+            audio.play()
+          })
+          URL.revokeObjectURL(url)
+        }
+        setIsSpeaking(false)
+        setAudioElement(null)
+      }
     } catch (error) {
+      setIsSpeaking(false)
+      setAudioElement(null)
       notification.error({
         message: "Error",
         description: "Something went wrong while trying to play the audio"
@@ -47,6 +104,14 @@ export const useTTS = () => {
   }
 
   const cancel = () => {
+    // Stop ElevenLabs playback first, if an <audio> element is active.
+    if (audioElement) {
+      audioElement.pause()
+      audioElement.currentTime = 0
+      setAudioElement(null)
+      setIsSpeaking(false)
+      return
+    }
     if (import.meta.env.BROWSER === "chrome") {
       chrome.tts.stop()
     } else {
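
The new ElevenLabs path leans on two helpers added elsewhere in this commit but not shown above. splitMessageContent (from @/utils/tts) breaks a response into chunks so synthesis can begin before the whole reply is processed. Its implementation is not visible in this file, so the following is only a rough sketch, assuming sentence-boundary splitting with a hypothetical maximum chunk length:

// Hypothetical sketch only — the helper shipped in @/utils/tts may differ.
// Splits on sentence boundaries, then packs sentences into chunks of at
// most maxLength characters so each TTS request stays small.
export const splitMessageContent = (
  text: string,
  maxLength = 500
): string[] => {
  const sentences = text.match(/[^.!?]+[.!?]+(\s+|$)|[^.!?]+$/g) ?? [text]
  const chunks: string[] = []
  let current = ""
  for (const sentence of sentences) {
    if (current && (current + sentence).length > maxLength) {
      chunks.push(current.trim())
      current = ""
    }
    current += sentence
  }
  if (current.trim()) {
    chunks.push(current.trim())
  }
  return chunks
}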
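
generateSpeech (from @/services/elevenlabs) is the other new helper. Judging by the call sites, it takes the API key, a text chunk, a voice id, and a model id, and resolves to raw audio bytes. A plausible sketch against ElevenLabs' public text-to-speech endpoint follows; treat the request shape as an assumption rather than the commit's actual wrapper:

// Sketch of @/services/elevenlabs — endpoint and payload follow ElevenLabs'
// public TTS API; the wrapper added by this commit may differ.
export const generateSpeech = async (
  apiKey: string,
  text: string,
  voiceId: string,
  modelId: string
): Promise<ArrayBuffer> => {
  const res = await fetch(
    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
    {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({ text, model_id: modelId })
    }
  )
  if (!res.ok) {
    throw new Error(`ElevenLabs request failed: ${res.status}`)
  }
  // The endpoint returns audio/mpeg, which the hook wraps in a Blob URL.
  return await res.arrayBuffer()
}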
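
For context, consuming the hook could look like the sketch below. The returned shape is assumed from the state and functions defined above, since the hook's return statement falls outside the visible diff, and the import path is illustrative:

import { useTTS } from "@/hooks/useTTS" // path assumed

const SpeakButton = ({ text }: { text: string }) => {
  const { speak, cancel, isSpeaking } = useTTS()
  return (
    <button
      onClick={() =>
        isSpeaking ? cancel() : speak({ utterance: text })
      }>
      {isSpeaking ? "Stop" : "Speak"}
    </button>
  )
}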