diff --git a/src/assets/locale/en/settings.json b/src/assets/locale/en/settings.json index 20eb45c..3f6af27 100644 --- a/src/assets/locale/en/settings.json +++ b/src/assets/locale/en/settings.json @@ -113,6 +113,9 @@ }, "ssmlEnabled": { "label": "Enable SSML (Speech Synthesis Markup Language)" + }, + "responseSplitting": { + "label": "Response Splitting" } } }, diff --git a/src/components/Option/Settings/tts-mode.tsx b/src/components/Option/Settings/tts-mode.tsx index 7360419..2de6be3 100644 --- a/src/components/Option/Settings/tts-mode.tsx +++ b/src/components/Option/Settings/tts-mode.tsx @@ -1,9 +1,10 @@ import { SaveButton } from "@/components/Common/SaveButton" +import { getModels, getVoices } from "@/services/elevenlabs" import { getTTSSettings, setTTSSettings } from "@/services/tts" import { useWebUI } from "@/store/webui" import { useForm } from "@mantine/form" import { useQuery } from "@tanstack/react-query" -import { Select, Skeleton, Switch } from "antd" +import { Input, message, Select, Skeleton, Switch } from "antd" import { useTranslation } from "react-i18next" export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => { @@ -15,7 +16,11 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => { ttsEnabled: false, ttsProvider: "", voice: "", - ssmlEnabled: false + ssmlEnabled: false, + elevenLabsApiKey: "", + elevenLabsVoiceId: "", + elevenLabsModel: "", + responseSplitting: "" } }) @@ -28,6 +33,27 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => { } }) + const { data: elevenLabsData } = useQuery({ + queryKey: ["fetchElevenLabsData", form.values.elevenLabsApiKey], + queryFn: async () => { + try { + if ( + form.values.ttsProvider === "elevenlabs" && + form.values.elevenLabsApiKey + ) { + const voices = await getVoices(form.values.elevenLabsApiKey) + const models = await getModels(form.values.elevenLabsApiKey) + return { voices, models } + } + } catch (e) { + console.log(e) + message.error("Error fetching ElevenLabs data") + } + return null + }, + enabled: + form.values.ttsProvider === "elevenlabs" && !!form.values.elevenLabsApiKey + }) if (status === "pending" || status === "error") { return } @@ -72,29 +98,103 @@ export const TTSModeSettings = ({ hideBorder }: { hideBorder?: boolean }) => { ({ + {form.values.ttsProvider === "browser" && ( +
+ + {t("generalSettings.tts.ttsVoice.label")} + +
+ ({ + label: v.name, + value: v.voice_id + }))} + className="w-full mt-4 sm:mt-0 sm:w-[200px]" + placeholder="Select a voice" + {...form.getInputProps("elevenLabsVoiceId")} + /> +
+ +
+ + TTS Model + + +
+
+ + )} + + )}
{t("generalSettings.tts.ssmlEnabled.label")} diff --git a/src/hooks/useTTS.tsx b/src/hooks/useTTS.tsx index 5274956..6573ec4 100644 --- a/src/hooks/useTTS.tsx +++ b/src/hooks/useTTS.tsx @@ -1,44 +1,101 @@ import { useEffect, useState } from "react" import { notification } from "antd" -import { getVoice, isSSMLEnabled } from "@/services/tts" +import { + getElevenLabsApiKey, + getElevenLabsModel, + getElevenLabsVoiceId, + getTTSProvider, + getVoice, + isSSMLEnabled +} from "@/services/tts" import { markdownToSSML } from "@/utils/markdown-to-ssml" -type VoiceOptions = { +import { generateSpeech } from "@/services/elevenlabs" +import { splitMessageContent } from "@/utils/tts" + +export interface VoiceOptions { utterance: string } export const useTTS = () => { const [isSpeaking, setIsSpeaking] = useState(false) + const [audioElement, setAudioElement] = useState( + null + ) const speak = async ({ utterance }: VoiceOptions) => { try { const voice = await getVoice() - const isSSML = await isSSMLEnabled() - if (isSSML) { - utterance = markdownToSSML(utterance) - } - if (import.meta.env.BROWSER === "chrome") { - chrome.tts.speak(utterance, { - voiceName: voice, - onEvent(event) { - if (event.type === "start") { - setIsSpeaking(true) - } else if (event.type === "end") { - setIsSpeaking(false) - } - } - }) - } else { - // browser tts - window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance)) - window.speechSynthesis.onvoiceschanged = () => { - const voices = window.speechSynthesis.getVoices() - const voice = voices.find((v) => v.name === voice) - const utter = new SpeechSynthesisUtterance(utterance) - utter.voice = voice - window.speechSynthesis.speak(utter) + const provider = await getTTSProvider() + + if (provider === "browser") { + const isSSML = await isSSMLEnabled() + if (isSSML) { + utterance = markdownToSSML(utterance) } + if (import.meta.env.BROWSER === "chrome") { + chrome.tts.speak(utterance, { + voiceName: voice, + onEvent(event) { + if (event.type === "start") { + setIsSpeaking(true) + } else if (event.type === "end") { + setIsSpeaking(false) + } + } + }) + } else { + window.speechSynthesis.speak(new SpeechSynthesisUtterance(utterance)) + window.speechSynthesis.onvoiceschanged = () => { + const voices = window.speechSynthesis.getVoices() + const voice = voices.find((v) => v.name === voice) + const utter = new SpeechSynthesisUtterance(utterance) + utter.voice = voice + window.speechSynthesis.speak(utter) + } + } + } else if (provider === "elevenlabs") { + const apiKey = await getElevenLabsApiKey() + const modelId = await getElevenLabsModel() + const voiceId = await getElevenLabsVoiceId() + const sentences = splitMessageContent(utterance) + let nextAudioData: ArrayBuffer | null = null + if (!apiKey || !modelId || !voiceId) { + throw new Error("Missing ElevenLabs configuration") + } + for (let i = 0; i < sentences.length; i++) { + setIsSpeaking(true) + + let currentAudioData = + nextAudioData || + (await generateSpeech(apiKey, sentences[i], voiceId, modelId)) + + if (i < sentences.length - 1) { + generateSpeech(apiKey, sentences[i + 1], voiceId, modelId) + .then((nextAudioData) => { + nextAudioData = nextAudioData + }) + .catch(console.error) + } + + const blob = new Blob([currentAudioData], { type: "audio/mpeg" }) + const url = URL.createObjectURL(blob) + const audio = new Audio(url) + setAudioElement(audio) + + await new Promise((resolve) => { + audio.onended = resolve + audio.play() + }) + + URL.revokeObjectURL(url) + } + + setIsSpeaking(false) + setAudioElement(null) } } catch (error) { + setIsSpeaking(false) + setAudioElement(null) notification.error({ message: "Error", description: "Something went wrong while trying to play the audio" @@ -47,6 +104,14 @@ export const useTTS = () => { } const cancel = () => { + if (audioElement) { + audioElement.pause() + audioElement.currentTime = 0 + setAudioElement(null) + setIsSpeaking(false) + return + } + if (import.meta.env.BROWSER === "chrome") { chrome.tts.stop() } else { diff --git a/src/services/elevenlabs.ts b/src/services/elevenlabs.ts new file mode 100644 index 0000000..d87cdef --- /dev/null +++ b/src/services/elevenlabs.ts @@ -0,0 +1,49 @@ +import axios from 'axios'; +export interface Voice { + voice_id: string; + name: string; +} + +export interface Model { + model_id: string; + name: string; +} + +const BASE_URL = 'https://api.elevenlabs.io/v1'; + +export const getVoices = async (apiKey: string): Promise => { + const response = await axios.get(`${BASE_URL}/voices`, { + headers: { 'xi-api-key': apiKey } + }); + return response.data.voices; +}; + +export const getModels = async (apiKey: string): Promise => { + const response = await axios.get(`${BASE_URL}/models`, { + headers: { 'xi-api-key': apiKey } + }); + return response.data; +}; + +export const generateSpeech = async ( + apiKey: string, + text: string, + voiceId: string, + modelId: string +): Promise => { + const response = await axios.post( + `${BASE_URL}/text-to-speech/${voiceId}`, + { + text, + model_id: modelId, + }, + { + headers: { + 'xi-api-key': apiKey, + 'Content-Type': 'application/json', + }, + responseType: 'arraybuffer', + } + ); + return response.data; +}; \ No newline at end of file diff --git a/src/services/tts.ts b/src/services/tts.ts index 847efb4..ace7b05 100644 --- a/src/services/tts.ts +++ b/src/services/tts.ts @@ -4,7 +4,7 @@ const storage = new Storage() const DEFAULT_TTS_PROVIDER = "browser" -const AVAILABLE_TTS_PROVIDERS = ["browser"] as const +const AVAILABLE_TTS_PROVIDERS = ["browser", "elevenlabs"] as const export const getTTSProvider = async (): Promise< (typeof AVAILABLE_TTS_PROVIDERS)[number] @@ -63,22 +63,78 @@ export const setSSMLEnabled = async (isSSMLEnabled: boolean) => { await storage.set("isSSMLEnabled", isSSMLEnabled.toString()) } +export const getElevenLabsApiKey = async () => { + const data = await storage.get("elevenLabsApiKey") + return data +} + +export const setElevenLabsApiKey = async (elevenLabsApiKey: string) => { + await storage.set("elevenLabsApiKey", elevenLabsApiKey) +} + +export const getElevenLabsVoiceId = async () => { + const data = await storage.get("elevenLabsVoiceId") + return data +} + +export const setElevenLabsVoiceId = async (elevenLabsVoiceId: string) => { + await storage.set("elevenLabsVoiceId", elevenLabsVoiceId) +} + +export const getElevenLabsModel = async () => { + const data = await storage.get("elevenLabsModel") + return data +} + +export const setElevenLabsModel = async (elevenLabsModel: string) => { + await storage.set("elevenLabsModel", elevenLabsModel) +} + +export const getResponseSplitting = async () => { + const data = await storage.get("ttsResponseSplitting") + if (!data || data.length === 0 || data === "") { + return "punctuation" + } + return data +} + +export const setResponseSplitting = async (responseSplitting: string) => { + await storage.set("ttsResponseSplitting", responseSplitting) +} + export const getTTSSettings = async () => { - const [ttsEnabled, ttsProvider, browserTTSVoices, voice, ssmlEnabled] = - await Promise.all([ - isTTSEnabled(), - getTTSProvider(), - getBrowserTTSVoices(), - getVoice(), - isSSMLEnabled() - ]) + const [ + ttsEnabled, + ttsProvider, + browserTTSVoices, + voice, + ssmlEnabled, + elevenLabsApiKey, + elevenLabsVoiceId, + elevenLabsModel, + responseSplitting + ] = await Promise.all([ + isTTSEnabled(), + getTTSProvider(), + getBrowserTTSVoices(), + getVoice(), + isSSMLEnabled(), + getElevenLabsApiKey(), + getElevenLabsVoiceId(), + getElevenLabsModel(), + getResponseSplitting() + ]) return { ttsEnabled, ttsProvider, browserTTSVoices, voice, - ssmlEnabled + ssmlEnabled, + elevenLabsApiKey, + elevenLabsVoiceId, + elevenLabsModel, + responseSplitting } } @@ -86,17 +142,29 @@ export const setTTSSettings = async ({ ttsEnabled, ttsProvider, voice, - ssmlEnabled + ssmlEnabled, + elevenLabsApiKey, + elevenLabsVoiceId, + elevenLabsModel, + responseSplitting }: { ttsEnabled: boolean ttsProvider: string voice: string ssmlEnabled: boolean + elevenLabsApiKey: string + elevenLabsVoiceId: string + elevenLabsModel: string + responseSplitting: string }) => { await Promise.all([ setTTSEnabled(ttsEnabled), setTTSProvider(ttsProvider), setVoice(voice), - setSSMLEnabled(ssmlEnabled) + setSSMLEnabled(ssmlEnabled), + setElevenLabsApiKey(elevenLabsApiKey), + setElevenLabsVoiceId(elevenLabsVoiceId), + setElevenLabsModel(elevenLabsModel), + setResponseSplitting(responseSplitting) ]) } diff --git a/src/utils/tts.ts b/src/utils/tts.ts new file mode 100644 index 0000000..c79683b --- /dev/null +++ b/src/utils/tts.ts @@ -0,0 +1,112 @@ +// inspired from https://github.com/open-webui/open-webui/blob/2299f4843003759290cc6bf823595c6578ee4470/src/lib/utils/index.ts + +const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g; + +export const sanitizeEmojis = (text: string): string => { + const EMOJI_PATTERN = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g; + return text.replace(EMOJI_PATTERN, ''); +}; + +export const sanitizeMarkdown = (text: string): string => { + return text + .replace(/(```[\s\S]*?```)/g, '') + .replace(/^\|.*\|$/gm, '') + .replace(/(?:\*\*|__)(.*?)(?:\*\*|__)/g, '$1') + .replace(/(?:[*_])(.*?)(?:[*_])/g, '$1') + .replace(/~~(.*?)~~/g, '$1') + .replace(/`([^`]+)`/g, '$1') + .replace(/!?\[([^\]]*)\](?:\([^)]+\)|\[[^\]]*\])/g, '$1') + .replace(/^\[[^\]]+\]:\s*.*$/gm, '') + .replace(/^#{1,6}\s+/gm, '') + .replace(/^\s*[-*+]\s+/gm, '') + .replace(/^\s*(?:\d+\.)\s+/gm, '') + .replace(/^\s*>[> ]*/gm, '') + .replace(/^\s*:\s+/gm, '') + .replace(/\[\^[^\]]*\]/g, '') + .replace(/[-*_~]/g, '') + .replace(/\n{2,}/g, '\n'); +}; + +export const sanitizeText = (content: string): string => { + return sanitizeMarkdown(sanitizeEmojis(content.trim())); +}; + +export const parseTextIntoSentences = (text: string): string[] => { + const codeBlocks: string[] = []; + let blockIndex = 0; + + const processedText = text.replace(CODE_BLOCK_PATTERN, (match) => { + const placeholder = `\u0000${blockIndex}\u0000`; + codeBlocks[blockIndex++] = match; + return placeholder; + }); + + const sentences = processedText.split(/(?<=[.!?])\s+/); + + return sentences + .map(sentence => + sentence.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]) + ) + .map(sanitizeText) + .filter(Boolean); +}; + +export const parseTextIntoParagraphs = (text: string): string[] => { + const codeBlocks: string[] = []; + let blockIndex = 0; + + const processedText = text.replace(CODE_BLOCK_PATTERN, (match) => { + const placeholder = `\u0000${blockIndex}\u0000`; + codeBlocks[blockIndex++] = match; + return placeholder; + }); + + return processedText + .split(/\n+/) + .map(paragraph => + paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]) + ) + .map(sanitizeText) + .filter(Boolean); +}; + +export const optimizeSentencesForSpeech = (text: string): string[] => { + return parseTextIntoSentences(text).reduce((optimizedTexts, currentText) => { + const lastIndex = optimizedTexts.length - 1; + + if (lastIndex >= 0) { + const previousText = optimizedTexts[lastIndex]; + const wordCount = previousText.split(/\s+/).length; + const charCount = previousText.length; + + if (wordCount < 4 || charCount < 50) { + optimizedTexts[lastIndex] = `${previousText} ${currentText}`; + } else { + optimizedTexts.push(currentText); + } + } else { + optimizedTexts.push(currentText); + } + + return optimizedTexts; + }, [] as string[]); +}; + +export const splitMessageContent = (content: string, splitBy: string = 'punctuation') => { + const messageContentParts: string[] = []; + + switch (splitBy) { + case 'punctuation': + messageContentParts.push(...optimizeSentencesForSpeech(content)); + break; + case 'paragraph': + messageContentParts.push(...parseTextIntoParagraphs(content)); + break; + case 'none': + messageContentParts.push(sanitizeText(content)); + break; + default: + } + + return messageContentParts; +}; \ No newline at end of file diff --git a/wxt.config.ts b/wxt.config.ts index 40f728e..0aa73c2 100644 --- a/wxt.config.ts +++ b/wxt.config.ts @@ -51,7 +51,7 @@ export default defineConfig({ outDir: "build", manifest: { - version: "1.3.10", + version: "1.4.0", name: process.env.TARGET === "firefox" ? "Page Assist - A Web UI for Local AI Models"