From be366777d45ca15c97a0bd6603b639b7e6ee7082 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Wed, 18 Feb 2026 10:11:13 +0100 Subject: [PATCH] added voice functionality and increased avatar size --- backend/.env.example | 3 +++ backend/app/config.py | 3 +++ backend/app/routers/voice.py | 9 ++++---- backend/app/services/voice_service.py | 8 +++++-- frontend/css/style.css | 32 +++++++++++++++++++++++++-- frontend/index.html | 2 +- frontend/js/voice.js | 13 ++++++++++- 7 files changed, 59 insertions(+), 11 deletions(-) diff --git a/backend/.env.example b/backend/.env.example index bad9067..71080a2 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -18,6 +18,9 @@ LLM_PROVIDER=gemini LLM_API_KEY=your-api-key-here LLM_MODEL=gemini-2.0-flash +# Optional: Dedicated OpenAI Key for Voice (if LLM_PROVIDER is not openai) +OPENAI_API_KEY=sk-... + # Voice mode: "api" (OpenAI Whisper/TTS) or "browser" (Web Speech API fallback) VOICE_MODE=browser TTS_MODEL=tts-1 diff --git a/backend/app/config.py b/backend/app/config.py index b0afd81..ab42101 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -27,6 +27,9 @@ class Settings(BaseSettings): llm_provider: str = "openai" # used by litellm routing llm_api_key: str = "" llm_model: str = "gpt-4o-mini" + + # OpenAI API Key (specifically for Voice/TTS if LLM_PROVIDER is different) + openai_api_key: str = "" # Voice feature flag: "api" = LLM provider Whisper/TTS, "browser" = Web Speech API voice_mode: Literal["api", "browser"] = "api" diff --git a/backend/app/routers/voice.py b/backend/app/routers/voice.py index 68c7dea..b21dd95 100644 --- a/backend/app/routers/voice.py +++ b/backend/app/routers/voice.py @@ -20,11 +20,10 @@ async def voice_config(user: User = Depends(get_current_user)): """Return current voice mode so frontend knows whether to use browser or API.""" settings = get_settings() # API STT (Whisper) works with OpenAI-compatible providers - api_available = bool( - settings.voice_mode == "api" - and settings.llm_api_key - and settings.llm_provider in ("openai",) - ) + # Check if we have a dedicated voice key OR a generic LLM key for OpenAI + has_key = bool(settings.openai_api_key or (settings.llm_api_key and settings.llm_provider == "openai")) + + api_available = bool(settings.voice_mode == "api" and has_key) return VoiceConfigOut( voice_mode=settings.voice_mode, voice_api_available=api_available, diff --git a/backend/app/services/voice_service.py b/backend/app/services/voice_service.py index e555bae..c000cca 100644 --- a/backend/app/services/voice_service.py +++ b/backend/app/services/voice_service.py @@ -10,7 +10,9 @@ from app.config import get_settings async def transcribe(audio_bytes: bytes, filename: str = "audio.webm") -> str: """Transcribe audio to text using OpenAI Whisper API.""" settings = get_settings() - client = openai.AsyncOpenAI(api_key=settings.llm_api_key) + # Use dedicated OpenAI key if available, otherwise fallback to LLM key + api_key = settings.openai_api_key or settings.llm_api_key + client = openai.AsyncOpenAI(api_key=api_key) audio_file = io.BytesIO(audio_bytes) audio_file.name = filename @@ -25,7 +27,9 @@ async def transcribe(audio_bytes: bytes, filename: str = "audio.webm") -> str: async def synthesize(text: str) -> bytes: """Synthesize text to speech using OpenAI TTS API.""" settings = get_settings() - client = openai.AsyncOpenAI(api_key=settings.llm_api_key) + # Use dedicated OpenAI key if available, otherwise fallback to LLM key + api_key = settings.openai_api_key or settings.llm_api_key + client = openai.AsyncOpenAI(api_key=api_key) response = await client.audio.speech.create( model=settings.tts_model, diff --git a/frontend/css/style.css b/frontend/css/style.css index d8d7ae0..cd79eb8 100644 --- a/frontend/css/style.css +++ b/frontend/css/style.css @@ -845,9 +845,20 @@ tr:hover td { /* ── Avatar ───────────────────────────────────────────────────────── */ .avatar-container { position: relative; - width: 72px; - height: 72px; + width: 120px; + height: 120px; flex-shrink: 0; + transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1); +} + +.avatar-container.speaking { + transform: scale(1.05); +} + +.avatar-container.speaking .avatar-ring { + border-color: var(--accent); + box-shadow: 0 0 20px var(--accent-glow); + animation: avatarSpeakPulse 1.2s infinite; } .avatar-ring { @@ -1199,4 +1210,21 @@ tr:hover td { color: var(--text-muted); opacity: 0.6; font-family: monospace; +} + +@keyframes avatarSpeakPulse { + 0% { + transform: scale(1); + opacity: 0.8; + } + + 50% { + transform: scale(1.15); + opacity: 0.4; + } + + 100% { + transform: scale(1); + opacity: 0.8; + } } \ No newline at end of file diff --git a/frontend/index.html b/frontend/index.html index a481406..19123b3 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -30,7 +30,7 @@ - v0.1.1 + v0.2.0 diff --git a/frontend/js/voice.js b/frontend/js/voice.js index 12ca8f3..758c10d 100644 --- a/frontend/js/voice.js +++ b/frontend/js/voice.js @@ -199,8 +199,19 @@ class VoiceManager { const audioBlob = await response.blob(); const audioUrl = URL.createObjectURL(audioBlob); const audio = new Audio(audioUrl); + + // Visual feedback + const avatarContainer = document.querySelector('.avatar-container'); + if (avatarContainer) avatarContainer.classList.add('speaking'); + await audio.play(); - return new Promise(resolve => { audio.onended = resolve; }); + + return new Promise(resolve => { + audio.onended = () => { + if (avatarContainer) avatarContainer.classList.remove('speaking'); + resolve(); + }; + }); } } catch (e) { console.warn('[Voice] API TTS failed:', e);