Added voice functionality and increased avatar size
All checks were successful
Deploy FluentGerman.ai / deploy (push) Successful in 53s

This commit is contained in:
2026-02-18 10:11:13 +01:00
parent 6df7b17261
commit be366777d4
7 changed files with 59 additions and 11 deletions

View File

@@ -18,6 +18,9 @@ LLM_PROVIDER=gemini
LLM_API_KEY=your-api-key-here
LLM_MODEL=gemini-2.0-flash
# Optional: Dedicated OpenAI Key for Voice (if LLM_PROVIDER is not openai)
OPENAI_API_KEY=sk-...
# Voice mode: "api" (OpenAI Whisper/TTS) or "browser" (Web Speech API fallback)
VOICE_MODE=browser
TTS_MODEL=tts-1

View File

@@ -27,6 +27,9 @@ class Settings(BaseSettings):
llm_provider: str = "openai" # used by litellm routing
llm_api_key: str = ""
llm_model: str = "gpt-4o-mini"
# OpenAI API Key (specifically for Voice/TTS if LLM_PROVIDER is different)
openai_api_key: str = ""
# Voice feature flag: "api" = LLM provider Whisper/TTS, "browser" = Web Speech API
voice_mode: Literal["api", "browser"] = "api"

View File

@@ -20,11 +20,10 @@ async def voice_config(user: User = Depends(get_current_user)):
"""Return current voice mode so frontend knows whether to use browser or API."""
settings = get_settings()
# API STT (Whisper) works with OpenAI-compatible providers
api_available = bool(
settings.voice_mode == "api"
and settings.llm_api_key
and settings.llm_provider in ("openai",)
)
# Check if we have a dedicated voice key OR a generic LLM key for OpenAI
has_key = bool(settings.openai_api_key or (settings.llm_api_key and settings.llm_provider == "openai"))
api_available = bool(settings.voice_mode == "api" and has_key)
return VoiceConfigOut(
voice_mode=settings.voice_mode,
voice_api_available=api_available,

View File

@@ -10,7 +10,9 @@ from app.config import get_settings
async def transcribe(audio_bytes: bytes, filename: str = "audio.webm") -> str:
"""Transcribe audio to text using OpenAI Whisper API."""
settings = get_settings()
client = openai.AsyncOpenAI(api_key=settings.llm_api_key)
# Use dedicated OpenAI key if available, otherwise fallback to LLM key
api_key = settings.openai_api_key or settings.llm_api_key
client = openai.AsyncOpenAI(api_key=api_key)
audio_file = io.BytesIO(audio_bytes)
audio_file.name = filename
@@ -25,7 +27,9 @@ async def transcribe(audio_bytes: bytes, filename: str = "audio.webm") -> str:
async def synthesize(text: str) -> bytes:
"""Synthesize text to speech using OpenAI TTS API."""
settings = get_settings()
client = openai.AsyncOpenAI(api_key=settings.llm_api_key)
# Use dedicated OpenAI key if available, otherwise fallback to LLM key
api_key = settings.openai_api_key or settings.llm_api_key
client = openai.AsyncOpenAI(api_key=api_key)
response = await client.audio.speech.create(
model=settings.tts_model,