feat: Add ElevenLabs TTS provider and response splitting options

n4ze3m 2024-12-28 20:10:50 +05:30
parent aa49f03f63
commit 3ddb7f1ad8
7 changed files with 454 additions and 57 deletions

View File

@@ -113,6 +113,9 @@
},
"ssmlEnabled": {
"label": "Enable SSML (Speech Synthesis Markup Language)"
},
"responseSplitting": {
"label": "Response Splitting"
}
}
},

View File

@@ -1,9 +1,10 @@
import { SaveButton } from "@/components/Common/SaveButton"
import { getModels, getVoices } from "@/services/elevenlabs"
import { getTTSSettings, setTTSSettings } from "@/services/tts"
import { useWebUI } from "@/store/webui"
import { useForm } from "@mantine/form"
import { useQuery } from "@tanstack/react-query"
import { Select, Skeleton, Switch } from "antd"
import { Input, message, Select, Skeleton, Switch } from "antd"
import { useTranslation } from "react-i18next"
export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
@@ -15,7 +16,11 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
ttsEnabled: false,
ttsProvider: "",
voice: "",
ssmlEnabled: false
ssmlEnabled: false,
elevenLabsApiKey: "",
elevenLabsVoiceId: "",
elevenLabsModel: "",
responseSplitting: ""
}
})
@@ -28,6 +33,27 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
}
})
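// Fetch the ElevenLabs voice and model lists once an API key is entered; re-runs whenever the key changes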
const { data: elevenLabsData } = useQuery({
queryKey: ["fetchElevenLabsData", form.values.elevenLabsApiKey],
queryFn: async () => {
try {
if (
form.values.ttsProvider === "elevenlabs" &&
form.values.elevenLabsApiKey
) {
const voices = await getVoices(form.values.elevenLabsApiKey)
const models = await getModels(form.values.elevenLabsApiKey)
return { voices, models }
}
} catch (e) {
console.log(e)
message.error("Error fetching ElevenLabs data")
}
return null
},
enabled:
form.values.ttsProvider === "elevenlabs" && !!form.values.elevenLabsApiKey
})
if (status === "pending" || status === "error") {
return <Skeleton active />
}
@@ -72,29 +98,103 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
<Select
placeholder={t("generalSettings.tts.ttsProvider.placeholder")}
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
options={[{ label: "Browser TTS", value: "browser" }]}
options={[
{ label: "Browser TTS", value: "browser" },
{
label: "ElevenLabs",
value: "elevenlabs"
}
]}
{...form.getInputProps("ttsProvider")}
/>
</div>
</div>
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50 ">
{t("generalSettings.tts.ttsVoice.label")}
</span>
<div>
<Select
placeholder={t("generalSettings.tts.ttsVoice.placeholder")}
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
options={data?.browserTTSVoices?.map(
(voice) => ({
{form.values.ttsProvider === "browser" && (
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50 ">
{t("generalSettings.tts.ttsVoice.label")}
</span>
<div>
<Select
placeholder={t("generalSettings.tts.ttsVoice.placeholder")}
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
options={data?.browserTTSVoices?.map((voice) => ({
label: `${voice.voiceName} - ${voice.lang}`.trim(),
value: voice.voiceName
})
)}
{...form.getInputProps("voice")}
/>
}))}
{...form.getInputProps("voice")}
/>
</div>
</div>
</div>
)}
{form.values.ttsProvider === "elevenlabs" && (
<>
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50">
API Key
</span>
<Input.Password
placeholder="sk_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
className=" mt-4 sm:mt-0 !w-[300px] sm:w-[200px]"
required
{...form.getInputProps("elevenLabsApiKey")}
/>
</div>
{elevenLabsData && (
<>
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50">
TTS Voice
</span>
<Select
options={elevenLabsData.voices.map((v) => ({
label: v.name,
value: v.voice_id
}))}
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
placeholder="Select a voice"
{...form.getInputProps("elevenLabsVoiceId")}
/>
</div>
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50">
TTS Model
</span>
<Select
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
placeholder="Select a model"
options={elevenLabsData.models.map((m) => ({
label: m.name,
value: m.model_id
}))}
{...form.getInputProps("elevenLabsModel")}
/>
</div>
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50 ">
{t("generalSettings.tts.responseSplitting.label")}
</span>
<div>
<Select
placeholder={t(
"generalSettings.tts.responseSplitting.placeholder"
)}
className="w-full mt-4 sm:mt-0 sm:w-[200px]"
options={[
{ label: "None", value: "none" },
{ label: "Punctuation", value: "punctuation" },
{ label: "Paragraph", value: "paragraph" }
]}
{...form.getInputProps("responseSplitting")}
/>
</div>
</div>
</>
)}
</>
)}
<div className="flex sm:flex-row flex-col space-y-4 sm:space-y-0 sm:justify-between">
<span className="text-gray-700 dark:text-neutral-50 ">
{t("generalSettings.tts.ssmlEnabled.label")}

View File

@@ -1,44 +1,101 @@
import { useEffect, useState } from "react"
import { notification } from "antd"
import { getVoice, isSSMLEnabled } from "@/services/tts"
import {
getElevenLabsApiKey,
getElevenLabsModel,
getElevenLabsVoiceId,
getResponseSplitting,
getTTSProvider,
getVoice,
isSSMLEnabled
} from "@/services/tts"
import { markdownToSSML } from "@/utils/markdown-to-ssml"
type VoiceOptions = {
import { generateSpeech } from "@/services/elevenlabs"
import { splitMessageContent } from "@/utils/tts"
export interface VoiceOptions {
utterance: string
}
export const useTTS = () => {
const [isSpeaking, setIsSpeaking] = useState(false)
const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(
null
)
const speak = async ({ utterance }: VoiceOptions) => {
try {
const voice = await getVoice()
const isSSML = await isSSMLEnabled()
if (isSSML) {
utterance = markdownToSSML(utterance)
}
if (import.meta.env.BROWSER === "chrome") {
chrome.tts.speak(utterance, {
voiceName: voice,
onEvent(event) {
if (event.type === "start") {
setIsSpeaking(true)
} else if (event.type === "end") {
setIsSpeaking(false)
}
}
})
} else {
// browser tts
window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
window.speechSynthesis.onvoiceschanged = () => {
const voices = window.speechSynthesis.getVoices()
const voice = voices.find((v) => v.name === voice)
const utter = new SpeechSynthesisUtterance(utterance)
utter.voice = voice
window.speechSynthesis.speak(utter)
const provider = await getTTSProvider()
if (provider === "browser") {
const isSSML = await isSSMLEnabled()
if (isSSML) {
utterance = markdownToSSML(utterance)
}
if (import.meta.env.BROWSER === "chrome") {
chrome.tts.speak(utterance, {
voiceName: voice,
onEvent(event) {
if (event.type === "start") {
setIsSpeaking(true)
} else if (event.type === "end") {
setIsSpeaking(false)
}
}
})
} else {
window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
window.speechSynthesis.onvoiceschanged = () => {
const voices = window.speechSynthesis.getVoices()
const selectedVoice = voices.find((v) => v.name === voice)
const utter = new SpeechSynthesisUtterance(utterance)
if (selectedVoice) {
utter.voice = selectedVoice
}
window.speechSynthesis.speak(utter)
}
}
} else if (provider === "elevenlabs") {
const apiKey = await getElevenLabsApiKey()
const modelId = await getElevenLabsModel()
const voiceId = await getElevenLabsVoiceId()
const splitBy = await getResponseSplitting()
const sentences = splitMessageContent(utterance, splitBy)
let nextAudioData: ArrayBuffer | null = null
if (!apiKey || !modelId || !voiceId) {
throw new Error("Missing ElevenLabs configuration")
}
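// Play the chunks back to back; while one chunk plays, prefetch audio for the next so playback starts sooner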
for (let i = 0; i < sentences.length; i++) {
setIsSpeaking(true)
const currentAudioData =
nextAudioData ||
(await generateSpeech(apiKey, sentences[i], voiceId, modelId))
nextAudioData = null
if (i < sentences.length - 1) {
generateSpeech(apiKey, sentences[i + 1], voiceId, modelId)
.then((data) => {
// stash the prefetched audio for the next iteration
nextAudioData = data
})
.catch(console.error)
}
const blob = new Blob([currentAudioData], { type: "audio/mpeg" })
const url = URL.createObjectURL(blob)
const audio = new Audio(url)
setAudioElement(audio)
await new Promise((resolve) => {
audio.onended = resolve
audio.play()
})
URL.revokeObjectURL(url)
}
setIsSpeaking(false)
setAudioElement(null)
}
} catch (error) {
setIsSpeaking(false)
setAudioElement(null)
notification.error({
message: "Error",
description: "Something went wrong while trying to play the audio"
@@ -47,6 +104,14 @@ export const useTTS = () => {
}
const cancel = () => {
if (audioElement) {
audioElement.pause()
audioElement.currentTime = 0
setAudioElement(null)
setIsSpeaking(false)
return
}
if (import.meta.env.BROWSER === "chrome") {
chrome.tts.stop()
} else {

View File

@@ -0,0 +1,49 @@
import axios from 'axios';
export interface Voice {
voice_id: string;
name: string;
}
export interface Model {
model_id: string;
name: string;
}
const BASE_URL = 'https://api.elevenlabs.io/v1';
export const getVoices = async (apiKey: string): Promise<Voice[]> => {
const response = await axios.get(`${BASE_URL}/voices`, {
headers: { 'xi-api-key': apiKey }
});
return response.data.voices;
};
export const getModels = async (apiKey: string): Promise<Model[]> => {
const response = await axios.get(`${BASE_URL}/models`, {
headers: { 'xi-api-key': apiKey }
});
return response.data;
};
export const generateSpeech = async (
apiKey: string,
text: string,
voiceId: string,
modelId: string
): Promise<ArrayBuffer> => {
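// Returns the raw audio bytes (MPEG); callers wrap them in a Blob/object URL to play them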
const response = await axios.post(
`${BASE_URL}/text-to-speech/${voiceId}`,
{
text,
model_id: modelId,
},
{
headers: {
'xi-api-key': apiKey,
'Content-Type': 'application/json',
},
responseType: 'arraybuffer',
}
);
return response.data;
};
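For reference, a minimal sketch of how these helpers compose (not part of the commit; demoElevenLabs, the placeholder key, and the sample text are hypothetical, and playback mirrors what the useTTS hook above does):
import { generateSpeech, getModels, getVoices } from "@/services/elevenlabs";
const demoElevenLabs = async (apiKey: string) => {
  // list the account's voices and models, then synthesize one line with the first of each
  const [voices, models] = await Promise.all([getVoices(apiKey), getModels(apiKey)]);
  const audioData = await generateSpeech(
    apiKey,
    "Hello from Page Assist",
    voices[0].voice_id,
    models[0].model_id
  );
  // wrap the raw bytes in a Blob and play them through an HTMLAudioElement
  const player = new Audio(URL.createObjectURL(new Blob([audioData], { type: "audio/mpeg" })));
  await player.play();
};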

View File

@@ -4,7 +4,7 @@ const storage = new Storage()
const DEFAULT_TTS_PROVIDER = "browser"
const AVAILABLE_TTS_PROVIDERS = ["browser"] as const
const AVAILABLE_TTS_PROVIDERS = ["browser", "elevenlabs"] as const
export const getTTSProvider = async (): Promise<
(typeof AVAILABLE_TTS_PROVIDERS)[number]
@@ -63,22 +63,78 @@ export const setSSMLEnabled = async (isSSMLEnabled: boolean) => {
await storage.set("isSSMLEnabled", isSSMLEnabled.toString())
}
export const getElevenLabsApiKey = async () => {
const data = await storage.get("elevenLabsApiKey")
return data
}
export const setElevenLabsApiKey = async (elevenLabsApiKey: string) => {
await storage.set("elevenLabsApiKey", elevenLabsApiKey)
}
export const getElevenLabsVoiceId = async () => {
const data = await storage.get("elevenLabsVoiceId")
return data
}
export const setElevenLabsVoiceId = async (elevenLabsVoiceId: string) => {
await storage.set("elevenLabsVoiceId", elevenLabsVoiceId)
}
export const getElevenLabsModel = async () => {
const data = await storage.get("elevenLabsModel")
return data
}
export const setElevenLabsModel = async (elevenLabsModel: string) => {
await storage.set("elevenLabsModel", elevenLabsModel)
}
export const getResponseSplitting = async () => {
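// Fall back to sentence-level ("punctuation") splitting when the setting has never been saved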
const data = await storage.get("ttsResponseSplitting")
if (!data || data.length === 0 || data === "") {
return "punctuation"
}
return data
}
export const setResponseSplitting = async (responseSplitting: string) => {
await storage.set("ttsResponseSplitting", responseSplitting)
}
export const getTTSSettings = async () => {
const [ttsEnabled, ttsProvider, browserTTSVoices, voice, ssmlEnabled] =
await Promise.all([
isTTSEnabled(),
getTTSProvider(),
getBrowserTTSVoices(),
getVoice(),
isSSMLEnabled()
])
const [
ttsEnabled,
ttsProvider,
browserTTSVoices,
voice,
ssmlEnabled,
elevenLabsApiKey,
elevenLabsVoiceId,
elevenLabsModel,
responseSplitting
] = await Promise.all([
isTTSEnabled(),
getTTSProvider(),
getBrowserTTSVoices(),
getVoice(),
isSSMLEnabled(),
getElevenLabsApiKey(),
getElevenLabsVoiceId(),
getElevenLabsModel(),
getResponseSplitting()
])
return {
ttsEnabled,
ttsProvider,
browserTTSVoices,
voice,
ssmlEnabled
ssmlEnabled,
elevenLabsApiKey,
elevenLabsVoiceId,
elevenLabsModel,
responseSplitting
}
}
@@ -86,17 +142,29 @@ export const setTTSSettings = async ({
ttsEnabled,
ttsProvider,
voice,
ssmlEnabled
ssmlEnabled,
elevenLabsApiKey,
elevenLabsVoiceId,
elevenLabsModel,
responseSplitting
}: {
ttsEnabled: boolean
ttsProvider: string
voice: string
ssmlEnabled: boolean
elevenLabsApiKey: string
elevenLabsVoiceId: string
elevenLabsModel: string
responseSplitting: string
}) => {
await Promise.all([
setTTSEnabled(ttsEnabled),
setTTSProvider(ttsProvider),
setVoice(voice),
setSSMLEnabled(ssmlEnabled)
setSSMLEnabled(ssmlEnabled),
setElevenLabsApiKey(elevenLabsApiKey),
setElevenLabsVoiceId(elevenLabsVoiceId),
setElevenLabsModel(elevenLabsModel),
setResponseSplitting(responseSplitting)
])
}

src/utils/tts.ts (new file, 112 additions)
View File

@@ -0,0 +1,112 @@
// inspired from https://github.com/open-webui/open-webui/blob/2299f4843003759290cc6bf823595c6578ee4470/src/lib/utils/index.ts
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
export const sanitizeEmojis = (text: string): string => {
const EMOJI_PATTERN = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
return text.replace(EMOJI_PATTERN, '');
};
export const sanitizeMarkdown = (text: string): string => {
return text
.replace(/(```[\s\S]*?```)/g, '')
.replace(/^\|.*\|$/gm, '')
.replace(/(?:\*\*|__)(.*?)(?:\*\*|__)/g, '$1')
.replace(/(?:[*_])(.*?)(?:[*_])/g, '$1')
.replace(/~~(.*?)~~/g, '$1')
.replace(/`([^`]+)`/g, '$1')
.replace(/!?\[([^\]]*)\](?:\([^)]+\)|\[[^\]]*\])/g, '$1')
.replace(/^\[[^\]]+\]:\s*.*$/gm, '')
.replace(/^#{1,6}\s+/gm, '')
.replace(/^\s*[-*+]\s+/gm, '')
.replace(/^\s*(?:\d+\.)\s+/gm, '')
.replace(/^\s*>[> ]*/gm, '')
.replace(/^\s*:\s+/gm, '')
.replace(/\[\^[^\]]*\]/g, '')
.replace(/[-*_~]/g, '')
.replace(/\n{2,}/g, '\n');
};
export const sanitizeText = (content: string): string => {
return sanitizeMarkdown(sanitizeEmojis(content.trim()));
};
export const parseTextIntoSentences = (text: string): string[] => {
const codeBlocks: string[] = [];
let blockIndex = 0;
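// Swap fenced code blocks for placeholder tokens so the sentence regex cannot split inside them; sanitizeText strips the restored blocks afterwards anyway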
const processedText = text.replace(CODE_BLOCK_PATTERN, (match) => {
const placeholder = `\u0000${blockIndex}\u0000`;
codeBlocks[blockIndex++] = match;
return placeholder;
});
const sentences = processedText.split(/(?<=[.!?])\s+/);
return sentences
.map(sentence =>
sentence.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx])
)
.map(sanitizeText)
.filter(Boolean);
};
export const parseTextIntoParagraphs = (text: string): string[] => {
const codeBlocks: string[] = [];
let blockIndex = 0;
const processedText = text.replace(CODE_BLOCK_PATTERN, (match) => {
const placeholder = `\u0000${blockIndex}\u0000`;
codeBlocks[blockIndex++] = match;
return placeholder;
});
return processedText
.split(/\n+/)
.map(paragraph =>
paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx])
)
.map(sanitizeText)
.filter(Boolean);
};
export const optimizeSentencesForSpeech = (text: string): string[] => {
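// Merge very short sentences (under 4 words or 50 characters) into the previous chunk so each TTS request carries enough text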
return parseTextIntoSentences(text).reduce((optimizedTexts, currentText) => {
const lastIndex = optimizedTexts.length - 1;
if (lastIndex >= 0) {
const previousText = optimizedTexts[lastIndex];
const wordCount = previousText.split(/\s+/).length;
const charCount = previousText.length;
if (wordCount < 4 || charCount < 50) {
optimizedTexts[lastIndex] = `${previousText} ${currentText}`;
} else {
optimizedTexts.push(currentText);
}
} else {
optimizedTexts.push(currentText);
}
return optimizedTexts;
}, [] as string[]);
};
export const splitMessageContent = (content: string, splitBy: string = 'punctuation') => {
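// splitBy mirrors the "Response Splitting" setting: "punctuation" (sentence chunks), "paragraph", or "none" (single chunk)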
const messageContentParts: string[] = [];
switch (splitBy) {
case 'punctuation':
messageContentParts.push(...optimizeSentencesForSpeech(content));
break;
case 'paragraph':
messageContentParts.push(...parseTextIntoParagraphs(content));
break;
case 'none':
messageContentParts.push(sanitizeText(content));
break;
default:
}
return messageContentParts;
};
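A rough illustration of how the three splitting modes behave (the sample reply and the commented results are illustrative, traced by hand rather than taken from the commit):
import { splitMessageContent } from "@/utils/tts";
const reply = "**Done!** The build passed.\n\nNext, run the tests. Then deploy.";
splitMessageContent(reply, "none");
// => ["Done! The build passed.\nNext, run the tests. Then deploy."] (one sanitized chunk)
splitMessageContent(reply, "paragraph");
// => ["Done! The build passed.", "Next, run the tests. Then deploy."]
splitMessageContent(reply, "punctuation");
// => ["Done! The build passed. Next, run the tests. Then deploy."] (short sentences merge until a chunk reaches roughly 4 words / 50 characters)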

View File

@@ -51,7 +51,7 @@ export default defineConfig({
outDir: "build",
manifest: {
version: "1.3.10",
version: "1.4.0",
name:
process.env.TARGET === "firefox"
? "Page Assist - A Web UI for Local AI Models"