diff --git a/src/assets/locale/en/settings.json b/src/assets/locale/en/settings.json
index 20eb45c..3f6af27 100644
--- a/src/assets/locale/en/settings.json
+++ b/src/assets/locale/en/settings.json
@@ -113,6 +113,9 @@
},
"ssmlEnabled": {
"label": "Enable SSML (Speech Synthesis Markup Language)"
+ },
+ "responseSplitting": {
+ "label": "Response Splitting"
}
}
},
diff --git a/src/components/Option/Settings/tts-mode.tsx b/src/components/Option/Settings/tts-mode.tsx
index 7360419..2de6be3 100644
--- a/src/components/Option/Settings/tts-mode.tsx
+++ b/src/components/Option/Settings/tts-mode.tsx
@@ -1,9 +1,10 @@
import { SaveButton } from "@/components/Common/SaveButton"
+import { getModels, getVoices } from "@/services/elevenlabs"
import { getTTSSettings, setTTSSettings } from "@/services/tts"
import { useWebUI } from "@/store/webui"
import { useForm } from "@mantine/form"
import { useQuery } from "@tanstack/react-query"
-import { Select, Skeleton, Switch } from "antd"
+import { Input, message, Select, Skeleton, Switch } from "antd"
import { useTranslation } from "react-i18next"
export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
@@ -15,7 +16,11 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
ttsEnabled: false,
ttsProvider: "",
voice: "",
- ssmlEnabled: false
+ ssmlEnabled: false,
+ elevenLabsApiKey: "",
+ elevenLabsVoiceId: "",
+ elevenLabsModel: "",
+ responseSplitting: ""
}
})
@@ -28,6 +33,27 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
}
})
+  // Loads the ElevenLabs voice and model lists for the API key currently in
+  // the form. Resolves to null when the provider/key precondition is not met
+  // or when a request fails (an error toast is shown in the latter case).
+  const { data: elevenLabsData } = useQuery({
+    queryKey: ["fetchElevenLabsData", form.values.elevenLabsApiKey],
+    queryFn: async () => {
+      const { ttsProvider, elevenLabsApiKey } = form.values
+      if (ttsProvider !== "elevenlabs" || !elevenLabsApiKey) {
+        return null
+      }
+      try {
+        // The two lookups do not depend on each other, so run them concurrently.
+        const [voices, models] = await Promise.all([
+          getVoices(elevenLabsApiKey),
+          getModels(elevenLabsApiKey)
+        ])
+        return { voices, models }
+      } catch (e) {
+        console.error(e)
+        message.error("Error fetching ElevenLabs data")
+        return null
+      }
+    },
+    enabled:
+      form.values.ttsProvider === "elevenlabs" && !!form.values.elevenLabsApiKey
+  })
if (status === "pending" || status === "error") {
return
}
@@ -72,29 +98,103 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => {
-
-
- {t("generalSettings.tts.ttsVoice.label")}
-
-
-
+ )}
+ {form.values.ttsProvider === "elevenlabs" && (
+ <>
+
+
+ API Key
+
+
+
+
+ {elevenLabsData && (
+ <>
+
+
+ TTS Voice
+
+ ({
+ label: v.name,
+ value: v.voice_id
+ }))}
+ className="w-full mt-4 sm:mt-0 sm:w-[200px]"
+ placeholder="Select a voice"
+ {...form.getInputProps("elevenLabsVoiceId")}
+ />
+
+
+
+
+ TTS Model
+
+ ({
+ label: m.name,
+ value: m.model_id
+ }))}
+ {...form.getInputProps("elevenLabsModel")}
+ />
+
+
+
+ {t("generalSettings.tts.responseSplitting.label")}
+
+
+
+
+
+ >
+ )}
+ >
+ )}
{t("generalSettings.tts.ssmlEnabled.label")}
diff --git a/src/hooks/useTTS.tsx b/src/hooks/useTTS.tsx
index 5274956..6573ec4 100644
--- a/src/hooks/useTTS.tsx
+++ b/src/hooks/useTTS.tsx
@@ -1,44 +1,101 @@
import { useEffect, useState } from "react"
import { notification } from "antd"
-import { getVoice, isSSMLEnabled } from "@/services/tts"
+import {
+ getElevenLabsApiKey,
+ getElevenLabsModel,
+ getElevenLabsVoiceId,
+ getTTSProvider,
+ getVoice,
+ isSSMLEnabled
+} from "@/services/tts"
import { markdownToSSML } from "@/utils/markdown-to-ssml"
-type VoiceOptions = {
+import { generateSpeech } from "@/services/elevenlabs"
+import { splitMessageContent } from "@/utils/tts"
+
+export interface VoiceOptions {
utterance: string
}
export const useTTS = () => {
const [isSpeaking, setIsSpeaking] = useState(false)
+  // Audio element of the ElevenLabs clip that is currently playing (null when
+  // idle); cancel() reads it to pause playback.
+  const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(
+    null
+  )
const speak = async ({ utterance }: VoiceOptions) => {
try {
const voice = await getVoice()
- const isSSML = await isSSMLEnabled()
- if (isSSML) {
- utterance = markdownToSSML(utterance)
- }
- if (import.meta.env.BROWSER === "chrome") {
- chrome.tts.speak(utterance, {
- voiceName: voice,
- onEvent(event) {
- if (event.type === "start") {
- setIsSpeaking(true)
- } else if (event.type === "end") {
- setIsSpeaking(false)
- }
- }
- })
- } else {
- // browser tts
- window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance))
- window.speechSynthesis.onvoiceschanged = () => {
- const voices = window.speechSynthesis.getVoices()
- const voice = voices.find((v) => v.name === voice)
- const utter = new SpeechSynthesisUtterance(utterance)
- utter.voice = voice
- window.speechSynthesis.speak(utter)
+ const provider = await getTTSProvider()
+
+      // Dispatch on the configured provider: the browser's own speech engines,
+      // or audio synthesized through the ElevenLabs API.
+      if (provider === "browser") {
+        const isSSML = await isSSMLEnabled()
+        if (isSSML) {
+          utterance = markdownToSSML(utterance)
+        }
+        if (import.meta.env.BROWSER === "chrome") {
+          chrome.tts.speak(utterance, {
+            voiceName: voice,
+            onEvent(event) {
+              if (event.type === "start") {
+                setIsSpeaking(true)
+              } else if (event.type === "end") {
+                setIsSpeaking(false)
+              }
+            }
+          })
+        } else {
+          // Queue the utterance exactly once. The configured voice is applied
+          // when the engine already lists it; otherwise the engine default is
+          // used. The lookup result gets its own name so it cannot shadow the
+          // outer `voice` it is compared against.
+          const utter = new SpeechSynthesisUtterance(utterance)
+          const selectedVoice = window.speechSynthesis
+            .getVoices()
+            .find((v) => v.name === voice)
+          if (selectedVoice) {
+            utter.voice = selectedVoice
+          }
+          window.speechSynthesis.speak(utter)
+        }
+      } else if (provider === "elevenlabs") {
+        const apiKey = await getElevenLabsApiKey()
+        const modelId = await getElevenLabsModel()
+        const voiceId = await getElevenLabsVoiceId()
+        // Validate before doing any work so a misconfiguration fails fast.
+        if (!apiKey || !modelId || !voiceId) {
+          throw new Error("Missing ElevenLabs configuration")
+        }
+        // Chunk the text according to the saved "Response Splitting" setting.
+        const splitBy = await getResponseSplitting()
+        const sentences = splitMessageContent(utterance, splitBy)
+        // In-flight request for the chunk after the one being played. Holding
+        // the promise (rather than a value filled in by a callback) means each
+        // chunk is requested exactly once and the loop can await a prefetch
+        // that has not finished yet.
+        let nextAudioData: Promise<ArrayBuffer> | null = null
+        for (let i = 0; i < sentences.length; i++) {
+          setIsSpeaking(true)
+
+          const currentAudioData = await (nextAudioData ??
+            generateSpeech(apiKey, sentences[i], voiceId, modelId))
+
+          // Start synthesizing the following chunk while this one plays.
+          nextAudioData =
+            i < sentences.length - 1
+              ? generateSpeech(apiKey, sentences[i + 1], voiceId, modelId)
+              : null
+          // Attach a no-op handler so a failure during playback is not reported
+          // as an unhandled rejection; the error still surfaces when the
+          // promise is awaited on the next iteration.
+          void nextAudioData?.catch(() => {})
+
+          const blob = new Blob([currentAudioData], { type: "audio/mpeg" })
+          const url = URL.createObjectURL(blob)
+          try {
+            const audio = new Audio(url)
+            setAudioElement(audio)
+
+            // Settle on playback errors too, so a blocked or failed play()
+            // reaches the surrounding catch instead of hanging forever.
+            await new Promise<void>((resolve, reject) => {
+              audio.onended = () => resolve()
+              audio.onerror = () => reject(new Error("Audio playback failed"))
+              audio.play().catch(reject)
+            })
+          } finally {
+            // Always release the blob URL, including on failure.
+            URL.revokeObjectURL(url)
+          }
+        }
+
+        setIsSpeaking(false)
+        setAudioElement(null)
+      }
} catch (error) {
+ setIsSpeaking(false)
+ setAudioElement(null)
notification.error({
message: "Error",
description: "Something went wrong while trying to play the audio"
@@ -47,6 +104,14 @@ export const useTTS = () => {
}
const cancel = () => {
+ if (audioElement) {
+ audioElement.pause()
+ audioElement.currentTime = 0
+ setAudioElement(null)
+ setIsSpeaking(false)
+ return
+ }
+
if (import.meta.env.BROWSER === "chrome") {
chrome.tts.stop()
} else {
diff --git a/src/services/elevenlabs.ts b/src/services/elevenlabs.ts
new file mode 100644
index 0000000..d87cdef
--- /dev/null
+++ b/src/services/elevenlabs.ts
@@ -0,0 +1,49 @@
+import axios from 'axios';
+export interface Voice { // presumably the element shape of the `voices` list fetched by getVoices — confirm against the API response
+ voice_id: string; // voice identifier
+ name: string; // voice name
+}
+
+export interface Model { // presumably the element shape of the list fetched by getModels — confirm against the API response
+ model_id: string; // model identifier
+ name: string; // model name
+}
+
+const BASE_URL = 'https://api.elevenlabs.io/v1'; // URL prefix that every request path in this module is appended to
+
+/**
+ * Fetches the list of voices from the ElevenLabs `/voices` endpoint.
+ *
+ * @param apiKey - API key sent in the `xi-api-key` request header.
+ * @returns The `voices` array of the response body.
+ * @throws Propagates whatever error axios raises for a failed request.
+ */
+export const getVoices = async (apiKey: string): Promise<Voice[]> => {
+  const response = await axios.get<{ voices: Voice[] }>(`${BASE_URL}/voices`, {
+    headers: { 'xi-api-key': apiKey }
+  });
+  return response.data.voices;
+};
+
+/**
+ * Fetches the list of models from the ElevenLabs `/models` endpoint.
+ *
+ * @param apiKey - API key sent in the `xi-api-key` request header.
+ * @returns The response body, which is the model list itself.
+ * @throws Propagates whatever error axios raises for a failed request.
+ */
+export const getModels = async (apiKey: string): Promise<Model[]> => {
+  const response = await axios.get<Model[]>(`${BASE_URL}/models`, {
+    headers: { 'xi-api-key': apiKey }
+  });
+  return response.data;
+};
+
+/**
+ * Synthesizes `text` through the ElevenLabs `/text-to-speech/{voiceId}`
+ * endpoint.
+ *
+ * @param apiKey - API key sent in the `xi-api-key` request header.
+ * @param text - Text to convert to speech.
+ * @param voiceId - Voice identifier; placed in the URL path.
+ * @param modelId - Model identifier; sent as `model_id` in the JSON body.
+ * @returns The raw response body as an ArrayBuffer.
+ * @throws Propagates whatever error axios raises for a failed request.
+ */
+export const generateSpeech = async (
+  apiKey: string,
+  text: string,
+  voiceId: string,
+  modelId: string
+): Promise<ArrayBuffer> => {
+  // Encode the id so characters such as "/" or "?" cannot alter the request path.
+  const endpoint = `${BASE_URL}/text-to-speech/${encodeURIComponent(voiceId)}`;
+  const response = await axios.post<ArrayBuffer>(
+    endpoint,
+    {
+      text,
+      model_id: modelId,
+    },
+    {
+      headers: {
+        'xi-api-key': apiKey,
+        'Content-Type': 'application/json',
+      },
+      // Ask axios for binary data instead of a decoded string.
+      responseType: 'arraybuffer',
+    }
+  );
+  return response.data;
+};
\ No newline at end of file
diff --git a/src/services/tts.ts b/src/services/tts.ts
index 847efb4..ace7b05 100644
--- a/src/services/tts.ts
+++ b/src/services/tts.ts
@@ -4,7 +4,7 @@ const storage = new Storage()
const DEFAULT_TTS_PROVIDER = "browser"
-const AVAILABLE_TTS_PROVIDERS = ["browser"] as const
+const AVAILABLE_TTS_PROVIDERS = ["browser", "elevenlabs"] as const
export const getTTSProvider = async (): Promise<
(typeof AVAILABLE_TTS_PROVIDERS)[number]
@@ -63,22 +63,78 @@ export const setSSMLEnabled = async (isSSMLEnabled: boolean) => {
await storage.set("isSSMLEnabled", isSSMLEnabled.toString())
}
+/** Reads the value stored under the "elevenLabsApiKey" storage key. */
+export const getElevenLabsApiKey = async () => {
+  return await storage.get("elevenLabsApiKey")
+}
+
+/** Stores `elevenLabsApiKey` under the "elevenLabsApiKey" storage key. */
+export const setElevenLabsApiKey = async (elevenLabsApiKey: string) => {
+  const storageKey = "elevenLabsApiKey"
+  await storage.set(storageKey, elevenLabsApiKey)
+}
+
+/** Reads the value stored under the "elevenLabsVoiceId" storage key. */
+export const getElevenLabsVoiceId = async () => {
+  return await storage.get("elevenLabsVoiceId")
+}
+
+/** Stores `elevenLabsVoiceId` under the "elevenLabsVoiceId" storage key. */
+export const setElevenLabsVoiceId = async (elevenLabsVoiceId: string) => {
+  const storageKey = "elevenLabsVoiceId"
+  await storage.set(storageKey, elevenLabsVoiceId)
+}
+
+/** Reads the value stored under the "elevenLabsModel" storage key. */
+export const getElevenLabsModel = async () => {
+  return await storage.get("elevenLabsModel")
+}
+
+/** Stores `elevenLabsModel` under the "elevenLabsModel" storage key. */
+export const setElevenLabsModel = async (elevenLabsModel: string) => {
+  const storageKey = "elevenLabsModel"
+  await storage.set(storageKey, elevenLabsModel)
+}
+
+/**
+ * Reads the value stored under the "ttsResponseSplitting" storage key,
+ * returning "punctuation" when the stored value is missing or empty.
+ */
+export const getResponseSplitting = async () => {
+  const stored = await storage.get("ttsResponseSplitting")
+  // A falsy value (including "") or a zero-length value counts as unset.
+  const isUnset = !stored || stored.length === 0
+  return isUnset ? "punctuation" : stored
+}
+
+/** Stores `responseSplitting` under the "ttsResponseSplitting" storage key. */
+export const setResponseSplitting = async (responseSplitting: string) => {
+  const storageKey = "ttsResponseSplitting"
+  await storage.set(storageKey, responseSplitting)
+}
+
export const getTTSSettings = async () => {
- const [ttsEnabled, ttsProvider, browserTTSVoices, voice, ssmlEnabled] =
- await Promise.all([
- isTTSEnabled(),
- getTTSProvider(),
- getBrowserTTSVoices(),
- getVoice(),
- isSSMLEnabled()
- ])
+ const [
+ ttsEnabled,
+ ttsProvider,
+ browserTTSVoices,
+ voice,
+ ssmlEnabled,
+ elevenLabsApiKey,
+ elevenLabsVoiceId,
+ elevenLabsModel,
+ responseSplitting
+ ] = await Promise.all([
+ isTTSEnabled(),
+ getTTSProvider(),
+ getBrowserTTSVoices(),
+ getVoice(),
+ isSSMLEnabled(),
+ getElevenLabsApiKey(),
+ getElevenLabsVoiceId(),
+ getElevenLabsModel(),
+ getResponseSplitting()
+ ])
return {
ttsEnabled,
ttsProvider,
browserTTSVoices,
voice,
- ssmlEnabled
+ ssmlEnabled,
+ elevenLabsApiKey,
+ elevenLabsVoiceId,
+ elevenLabsModel,
+ responseSplitting
}
}
@@ -86,17 +142,29 @@ export const setTTSSettings = async ({
ttsEnabled,
ttsProvider,
voice,
- ssmlEnabled
+ ssmlEnabled,
+ elevenLabsApiKey,
+ elevenLabsVoiceId,
+ elevenLabsModel,
+ responseSplitting
}: {
ttsEnabled: boolean
ttsProvider: string
voice: string
ssmlEnabled: boolean
+ elevenLabsApiKey: string
+ elevenLabsVoiceId: string
+ elevenLabsModel: string
+ responseSplitting: string
}) => {
await Promise.all([
setTTSEnabled(ttsEnabled),
setTTSProvider(ttsProvider),
setVoice(voice),
- setSSMLEnabled(ssmlEnabled)
+ setSSMLEnabled(ssmlEnabled),
+ setElevenLabsApiKey(elevenLabsApiKey),
+ setElevenLabsVoiceId(elevenLabsVoiceId),
+ setElevenLabsModel(elevenLabsModel),
+ setResponseSplitting(responseSplitting)
])
}
diff --git a/src/utils/tts.ts b/src/utils/tts.ts
new file mode 100644
index 0000000..c79683b
--- /dev/null
+++ b/src/utils/tts.ts
@@ -0,0 +1,112 @@
+// inspired from https://github.com/open-webui/open-webui/blob/2299f4843003759290cc6bf823595c6578ee4470/src/lib/utils/index.ts
+
+const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g; // Matches every triple-backtick fenced block; lazy quantifier so each block ends at its nearest closing fence
+
+/**
+ * Removes emoji from `text` so they are not passed on to speech synthesis.
+ *
+ * Matches by Unicode property rather than by surrogate-pair range, so
+ * non-emoji supplementary-plane characters (for example rare CJK ideographs)
+ * are preserved. Removed are:
+ * - characters with default emoji presentation, skin-tone modifiers and
+ *   regional-indicator (flag) letters;
+ * - text-style pictographs explicitly followed by the emoji variation
+ *   selector U+FE0F;
+ * - leftover variation selectors, keycap marks (U+20E3) and tag characters;
+ * - a zero-width joiner directly following any of the above.
+ *
+ * Text-style symbols without a variation selector (such as a bare copyright
+ * sign) are left in place.
+ *
+ * @param text - Input text.
+ * @returns `text` with the matched emoji sequences removed.
+ */
+export const sanitizeEmojis = (text: string): string => {
+  const EMOJI_PATTERN =
+    /(?:\p{Emoji_Presentation}|\p{Emoji_Modifier}|\p{Regional_Indicator}|\p{Extended_Pictographic}\uFE0F|[\uFE0F\u20E3\u{E0020}-\u{E007F}])\u200D?/gu;
+  return text.replace(EMOJI_PATTERN, '');
+};
+
+/**
+ * Strips Markdown syntax from `text`, keeping the readable content.
+ *
+ * The rules are applied in the order listed, each to the output of the
+ * previous one.
+ *
+ * @param text - Markdown source.
+ * @returns The text with the Markdown markers removed.
+ */
+export const sanitizeMarkdown = (text: string): string => {
+  const rules: Array<[RegExp, string]> = [
+    [/(```[\s\S]*?```)/g, ''], // fenced code blocks
+    [/^\|.*\|$/gm, ''], // lines that start and end with a pipe (table rows)
+    [/(?:\*\*|__)(.*?)(?:\*\*|__)/g, '$1'], // bold -> inner text
+    [/(?:[*_])(.*?)(?:[*_])/g, '$1'], // emphasis -> inner text
+    [/~~(.*?)~~/g, '$1'], // strikethrough -> inner text
+    [/`([^`]+)`/g, '$1'], // inline code -> inner text
+    [/!?\[([^\]]*)\](?:\([^)]+\)|\[[^\]]*\])/g, '$1'], // links and images -> label
+    [/^\[[^\]]+\]:\s*.*$/gm, ''], // reference-style link definitions
+    [/^#{1,6}\s+/gm, ''], // heading markers
+    [/^\s*[-*+]\s+/gm, ''], // bullet list markers
+    [/^\s*(?:\d+\.)\s+/gm, ''], // numbered list markers
+    [/^\s*>[> ]*/gm, ''], // blockquote markers
+    [/^\s*:\s+/gm, ''], // leading colon markers
+    [/\[\^[^\]]*\]/g, ''], // footnote references
+    [/[-*_~]/g, ''], // any remaining - * _ ~ characters
+    [/\n{2,}/g, '\n'], // collapse runs of newlines into one
+  ];
+
+  return rules.reduce(
+    (current, [pattern, replacement]) => current.replace(pattern, replacement),
+    text
+  );
+};
+
+/**
+ * Prepares `content` for speech: trims it, removes emoji, then strips
+ * Markdown, in that order.
+ */
+export const sanitizeText = (content: string): string => {
+  const trimmed = content.trim();
+  const withoutEmojis = sanitizeEmojis(trimmed);
+  return sanitizeMarkdown(withoutEmojis);
+};
+
+/**
+ * Splits `text` into sanitized sentences.
+ *
+ * A break is made at whitespace that follows `.`, `!` or `?`. Fenced code
+ * blocks are swapped for NUL-delimited index tokens before splitting, so
+ * punctuation inside them cannot create breaks, and are put back before each
+ * piece is sanitized. Pieces that sanitize to an empty string are dropped.
+ *
+ * @param text - Raw message text.
+ * @returns The non-empty, sanitized sentences in their original order.
+ */
+export const parseTextIntoSentences = (text: string): string[] => {
+  const stash: string[] = [];
+  const masked = text.replace(CODE_BLOCK_PATTERN, (block) => {
+    stash.push(block);
+    return `\u0000${stash.length - 1}\u0000`;
+  });
+
+  const result: string[] = [];
+  for (const piece of masked.split(/(?<=[.!?])\s+/)) {
+    const restored = piece.replace(/\u0000(\d+)\u0000/g, (_, idx) => stash[idx]);
+    const cleaned = sanitizeText(restored);
+    if (cleaned) {
+      result.push(cleaned);
+    }
+  }
+  return result;
+};
+
+/**
+ * Splits `text` into sanitized paragraphs.
+ *
+ * A break is made at every run of newlines. Fenced code blocks are swapped
+ * for NUL-delimited index tokens before splitting, so newlines inside them
+ * cannot create breaks, and are put back before each piece is sanitized.
+ * Pieces that sanitize to an empty string are dropped.
+ *
+ * @param text - Raw message text.
+ * @returns The non-empty, sanitized paragraphs in their original order.
+ */
+export const parseTextIntoParagraphs = (text: string): string[] => {
+  const stash: string[] = [];
+  const masked = text.replace(CODE_BLOCK_PATTERN, (block) => {
+    stash.push(block);
+    return `\u0000${stash.length - 1}\u0000`;
+  });
+
+  const result: string[] = [];
+  for (const piece of masked.split(/\n+/)) {
+    const restored = piece.replace(/\u0000(\d+)\u0000/g, (_, idx) => stash[idx]);
+    const cleaned = sanitizeText(restored);
+    if (cleaned) {
+      result.push(cleaned);
+    }
+  }
+  return result;
+};
+
+/**
+ * Splits `text` into sentences and merges short ones into larger chunks.
+ *
+ * A sentence is appended (space-separated) to the previous chunk while that
+ * chunk has fewer than 4 whitespace-separated words or fewer than 50
+ * characters; otherwise it starts a new chunk.
+ *
+ * @param text - Raw message text.
+ * @returns The merged chunks in their original order.
+ */
+export const optimizeSentencesForSpeech = (text: string): string[] => {
+  const merged: string[] = [];
+
+  for (const sentence of parseTextIntoSentences(text)) {
+    if (merged.length === 0) {
+      merged.push(sentence);
+      continue;
+    }
+
+    const tailIndex = merged.length - 1;
+    const tail = merged[tailIndex];
+    const tailIsShort = tail.split(/\s+/).length < 4 || tail.length < 50;
+
+    if (tailIsShort) {
+      merged[tailIndex] = `${tail} ${sentence}`;
+    } else {
+      merged.push(sentence);
+    }
+  }
+
+  return merged;
+};
+
+/**
+ * Splits message content into the chunks that are sent to speech synthesis.
+ *
+ * @param content - Raw message text.
+ * @param splitBy - Splitting mode: `'punctuation'` (merged sentences),
+ *   `'paragraph'` (one chunk per paragraph) or `'none'` (the whole sanitized
+ *   text as one chunk). Any other value is treated as `'punctuation'`, so an
+ *   unexpected setting never yields an empty result for non-empty content.
+ * @returns The non-empty sanitized chunks in their original order.
+ */
+export const splitMessageContent = (content: string, splitBy: string = 'punctuation'): string[] => {
+  switch (splitBy) {
+    case 'paragraph':
+      return parseTextIntoParagraphs(content);
+    case 'none': {
+      // Skip the chunk when nothing is left after sanitizing, matching the
+      // other modes, which never emit empty chunks.
+      const whole = sanitizeText(content);
+      return whole ? [whole] : [];
+    }
+    case 'punctuation':
+    default:
+      return optimizeSentencesForSpeech(content);
+  }
+};
\ No newline at end of file
diff --git a/wxt.config.ts b/wxt.config.ts
index 40f728e..0aa73c2 100644
--- a/wxt.config.ts
+++ b/wxt.config.ts
@@ -51,7 +51,7 @@ export default defineConfig({
outDir: "build",
manifest: {
- version: "1.3.10",
+ version: "1.4.0",
name:
process.env.TARGET === "firefox"
? "Page Assist - A Web UI for Local AI Models"