/* FluentGerman.ai — Voice module (API-only TTS, browser + API STT) */
class VoiceManager {
  /**
   * Manages voice input/output for the app:
   *  - STT via the browser SpeechRecognition API ('browser' mode) or via the
   *    server `/voice/transcribe` endpoint ('api' mode, MediaRecorder upload).
   *  - TTS via the server `/voice/synthesize` endpoint only.
   * Relies on globals `api()` (fetch wrapper) and `showToast()` defined elsewhere.
   */
  constructor() {
    this.mode = 'browser';             // 'browser' | 'api' — resolved in init()
    this.recognition = null;           // SpeechRecognition instance (browser mode)
    this.isRecording = false;
    this.isDisabled = false;           // true when no STT method is available
    this.lastInputWasVoice = false;    // lets the app auto-speak replies to voice input
    this.mediaRecorder = null;         // MediaRecorder instance (api mode)
    this.audioChunks = [];
    this.onResult = null;              // callback(text) — final transcription
    this.onStateChange = null;         // callback(isRecording)
    this.browserSTTSupported = false;
    this.apiAvailable = false;         // server-side voice API reachable
    this.onProcessing = null;          // New callback for "Transcribing..." state
  }

  /**
   * Probe browser STT support, fetch the server voice config, then pick the
   * best available STT mode — falling back to the other mode when the
   * preferred one is unavailable, or disabling voice if neither works.
   */
  async init() {
    this._initBrowserSTT();

    try {
      const response = await api('/voice/config');
      if (response?.ok) {
        const config = await response.json();
        this.mode = config.voice_mode;
        this.apiAvailable = config.voice_api_available || false;
        console.log('[Voice] Server mode:', this.mode, '| API available:', this.apiAvailable);
      }
    } catch (e) {
      console.warn('[Voice] Could not fetch config, using browser mode');
      this.mode = 'browser';
    }

    // Determine best STT method
    if (this.mode === 'browser' && !this.browserSTTSupported) {
      if (this.apiAvailable) {
        this.mode = 'api';
      } else {
        this.isDisabled = true;
      }
    } else if (this.mode === 'api' && !this.apiAvailable) {
      if (this.browserSTTSupported) {
        this.mode = 'browser';
      } else {
        this.isDisabled = true;
      }
    }

    console.log('[Voice] Final mode:', this.isDisabled ? 'DISABLED' : this.mode);
  }

  /**
   * Set up browser SpeechRecognition (German, single utterance, final
   * results only). Sets `browserSTTSupported`; no-op where unsupported.
   */
  _initBrowserSTT() {
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!SpeechRecognition) {
      this.browserSTTSupported = false;
      return;
    }

    this.browserSTTSupported = true;
    this.recognition = new SpeechRecognition();
    this.recognition.continuous = false;       // stop after one utterance
    this.recognition.interimResults = false;   // final results only
    this.recognition.lang = 'de-DE';

    this.recognition.onresult = (event) => {
      const text = event.results[0][0].transcript;
      console.log('[Voice] STT result:', text);
      this.lastInputWasVoice = true;
      if (this.onResult) this.onResult(text);
    };

    this.recognition.onend = () => {
      this.isRecording = false;
      if (this.onStateChange) this.onStateChange(false);
    };

    this.recognition.onerror = (event) => {
      console.error('[Voice] STT error:', event.error);
      this.isRecording = false;
      if (this.onStateChange) this.onStateChange(false);

      if (event.error === 'not-allowed') {
        showToast('Microphone access denied. Allow it in browser settings.', 'error');
      } else if (event.error === 'no-speech') {
        showToast('No speech detected. Try again.', 'error');
      }
    };
  }

  /**
   * Start recording in the active mode. In 'api' mode this requests the
   * microphone and records webm chunks; in 'browser' mode it starts the
   * SpeechRecognition session. UI state callbacks fire only after the
   * hardware actually starts, to avoid a "fake" recording state.
   */
  async startRecording() {
    if (this.isDisabled) {
      showToast('Voice requires Chrome or Edge (HTTPS).', 'error');
      return;
    }

    if (this.mode === 'api') {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        this.audioChunks = [];
        this.mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });

        this.mediaRecorder.ondataavailable = (e) => {
          if (e.data.size > 0) this.audioChunks.push(e.data);
        };

        this.mediaRecorder.onstop = async () => {
          // Release the mic before uploading for transcription.
          stream.getTracks().forEach(t => t.stop());
          const blob = new Blob(this.audioChunks, { type: 'audio/webm' });
          await this._transcribeAPI(blob);
        };

        this.mediaRecorder.start();

        // Now we are truly recording
        this.isRecording = true;
        this.lastInputWasVoice = true;
        if (this.onStateChange) this.onStateChange(true);

      } catch (e) {
        console.error('[Voice] Mic access error:', e);
        showToast('Microphone access denied or error', 'error');
        this.isRecording = false;
        if (this.onStateChange) this.onStateChange(false);
      }
    } else {
      if (this.recognition) {
        try {
          this.recognition.start();
          // Recognition 'onstart' would be better, but this is okay for browser mode
          this.isRecording = true;
          this.lastInputWasVoice = true;
          if (this.onStateChange) this.onStateChange(true);
        } catch (e) {
          this.isRecording = false;
          if (this.onStateChange) this.onStateChange(false);
          showToast('Voice recognition failed. Try again.', 'error');
        }
      }
    }
  }

  /**
   * Stop the active recording. In 'api' mode stopping the MediaRecorder
   * triggers `onstop` → `_transcribeAPI`; the "processing" indicator is shown
   * immediately rather than waiting for that async callback.
   */
  stopRecording() {
    if (this.mode === 'api') {
      if (this.mediaRecorder?.state === 'recording') {
        // Show processing state immediately (don't wait for onstop callback)
        if (this.onProcessing) this.onProcessing(true);
        this.mediaRecorder.stop();
      } else {
        // Recorder never started (or already stopped): just reset UI state.
        this.isRecording = false;
        if (this.onStateChange) this.onStateChange(false);
      }
    } else {
      try { this.recognition?.stop(); } catch (e) { /* already stopped */ }
      this.isRecording = false;
      if (this.onStateChange) this.onStateChange(false);
    }
  }

  /** Toggle between start and stop; shows an error toast when disabled. */
  toggleRecording() {
    if (this.isDisabled) {
      showToast('Voice requires Chrome or Edge (HTTPS).', 'error');
      return;
    }
    this.isRecording ? this.stopRecording() : this.startRecording();
  }

  /**
   * Upload a recorded audio blob to the server for transcription and feed
   * the result to `onResult`. Always clears recording/processing state.
   * @param {Blob} blob - audio/webm recording
   */
  async _transcribeAPI(blob) {
    if (this.onProcessing) this.onProcessing(true);

    try {
      const formData = new FormData();
      formData.append('audio', blob, 'recording.webm');

      const response = await api('/voice/transcribe', {
        method: 'POST',
        body: formData,
      });

      if (response?.ok) {
        const data = await response.json();
        this.lastInputWasVoice = true;
        if (this.onResult) this.onResult(data.text);
      } else {
        const err = await response.json().catch(() => ({}));
        showToast(`Transcription failed: ${err.detail || 'Unknown error'}`, 'error');
      }
    } catch (e) {
      showToast('Transcription network error', 'error');
    } finally {
      this.isRecording = false;
      // Stop processing state
      if (this.onProcessing) this.onProcessing(false);
      if (this.onStateChange) this.onStateChange(false);
    }
  }

  /**
   * Fetch TTS audio blob for text (API only).
   * Returns a blob object URL or null.
   * NOTE(review): callers own the returned object URL; it is never revoked
   * here because the inline player supports replay — confirm lifecycle.
   * @param {string} text - markdown-capable text; stripped before synthesis
   * @returns {Promise<string|null>}
   */
  async fetchAudio(text) {
    if (!this.apiAvailable) return null;

    const clean = VoiceManager.stripMarkdown(text);
    try {
      const response = await api(`/voice/synthesize?text=${encodeURIComponent(clean)}`, {
        method: 'POST',
      });

      if (response?.ok) {
        const audioBlob = await response.blob();
        return URL.createObjectURL(audioBlob);
      } else {
        const err = await response.json().catch(() => ({}));
        console.warn('[Voice] TTS error:', err);
      }
    } catch (e) {
      console.warn('[Voice] TTS network error:', e);
    }
    return null;
  }

  /**
   * Play audio with an inline mini-player (progress bar, seek, replay).
   * @param {string} audioUrl – blob URL from fetchAudio()
   * @param {HTMLElement} [containerEl] – element to append the player into
   * @returns {Promise} resolves when first playback ends
   */
  async playAudio(audioUrl, containerEl) {
    if (!audioUrl) return;

    const audio = new Audio(audioUrl);

    // Visual feedback — avatar pulse
    const avatarContainer = document.querySelector('.avatar-container');
    if (avatarContainer) avatarContainer.classList.add('speaking');

    // ── Build player DOM ──────────────────────────────────────────
    const player = document.createElement('div');
    player.className = 'audio-player';

    const playBtn = document.createElement('button');
    playBtn.className = 'audio-player-btn playing';
    playBtn.innerHTML = VoiceManager._pauseIcon();
    playBtn.title = 'Pause';

    const track = document.createElement('div');
    track.className = 'audio-player-track';
    const fill = document.createElement('div');
    fill.className = 'audio-player-fill';
    track.appendChild(fill);

    const timeLabel = document.createElement('span');
    timeLabel.className = 'audio-player-time';
    timeLabel.textContent = '0:00 / 0:00';

    player.appendChild(playBtn);
    player.appendChild(track);
    player.appendChild(timeLabel);

    if (containerEl) {
      containerEl.appendChild(player);
    }

    // ── Helpers ───────────────────────────────────────────────────
    // Format seconds as m:ss; tolerate NaN/Infinity before metadata loads.
    function fmt(s) {
      if (!isFinite(s)) return '0:00';
      const m = Math.floor(s / 60);
      const sec = Math.floor(s % 60);
      return `${m}:${sec.toString().padStart(2, '0')}`;
    }

    function updateProgress() {
      if (!audio.duration) return;
      const pct = (audio.currentTime / audio.duration) * 100;
      fill.style.width = pct + '%';
      timeLabel.textContent = `${fmt(audio.currentTime)} / ${fmt(audio.duration)}`;
    }

    // ── Events ────────────────────────────────────────────────────
    audio.addEventListener('timeupdate', updateProgress);

    audio.addEventListener('loadedmetadata', () => {
      timeLabel.textContent = `0:00 / ${fmt(audio.duration)}`;
    });

    // Seek on track click (guard: duration is NaN until metadata loads,
    // and assigning NaN to currentTime throws).
    track.addEventListener('click', (e) => {
      if (!isFinite(audio.duration)) return;
      const rect = track.getBoundingClientRect();
      const pct = (e.clientX - rect.left) / rect.width;
      audio.currentTime = pct * audio.duration;
      updateProgress();
    });

    // Play/pause toggle
    playBtn.addEventListener('click', () => {
      if (audio.paused) {
        audio.play();
        playBtn.classList.add('playing');
        playBtn.innerHTML = VoiceManager._pauseIcon();
        playBtn.title = 'Pause';
        if (avatarContainer) avatarContainer.classList.add('speaking');
      } else {
        audio.pause();
        playBtn.classList.remove('playing');
        playBtn.innerHTML = VoiceManager._playIcon();
        playBtn.title = 'Play';
        if (avatarContainer) avatarContainer.classList.remove('speaking');
      }
    });

    // ── Playback ──────────────────────────────────────────────────
    try {
      // Wait for audio to be fully buffered before playing
      await new Promise((resolve, reject) => {
        audio.addEventListener('canplaythrough', resolve, { once: true });
        audio.addEventListener('error', reject, { once: true });
        audio.load(); // Explicitly trigger loading
      });
      audio.currentTime = 0; // Ensure we start from the very beginning
      await audio.play();
      return new Promise(resolve => {
        audio.onended = () => {
          if (avatarContainer) avatarContainer.classList.remove('speaking');
          playBtn.classList.remove('playing');
          playBtn.innerHTML = VoiceManager._playIcon();
          playBtn.title = 'Replay';
          fill.style.width = '100%';
          // Reset to beginning for replay
          audio.currentTime = 0;
          resolve();
        };
        audio.onerror = () => {
          if (avatarContainer) avatarContainer.classList.remove('speaking');
          resolve();
        };
      });
    } catch (e) {
      console.error('Playback failed', e);
      if (avatarContainer) avatarContainer.classList.remove('speaking');
      playBtn.classList.remove('playing');
      playBtn.innerHTML = VoiceManager._playIcon();
    }
  }

  // ── SVG icons (inline, no external deps) ──────────────────────────
  static _playIcon() {
    return `<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor"><polygon points="6,3 20,12 6,21"/></svg>`;
  }

  static _pauseIcon() {
    return `<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor"><rect x="5" y="3" width="4" height="18"/><rect x="15" y="3" width="4" height="18"/></svg>`;
  }

  /**
   * Legacy method for backward compatibility if needed,
   * or for simple direct speech.
   */
  async speak(text) {
    const url = await this.fetchAudio(text);
    if (url) await this.playAudio(url);
  }

  /**
   * Strip markdown formatting from text so TTS reads naturally.
   * Images are stripped before links so `![alt](url)` is fully removed
   * instead of being half-consumed by the link rule; blockquote markers
   * are only removed at the start of a line so prose like "5 > 3" survives.
   * @param {string} text
   * @returns {string}
   */
  static stripMarkdown(text) {
    return text
      .replace(/```[\s\S]*?```/g, '')         // code blocks
      .replace(/`([^`]+)`/g, '$1')            // inline code
      .replace(/#{1,6}\s+/g, '')              // headings
      .replace(/\*\*([^*]+)\*\*/g, '$1')      // bold
      .replace(/\*([^*]+)\*/g, '$1')          // italic
      .replace(/__([^_]+)__/g, '$1')          // bold alt
      .replace(/_([^_]+)_/g, '$1')            // italic alt
      .replace(/~~([^~]+)~~/g, '$1')          // strikethrough
      .replace(/^\s*[-*+]\s+/gm, '')          // unordered lists
      .replace(/^\s*\d+\.\s+/gm, '')          // ordered lists
      .replace(/!\[([^\]]*)\]\([^)]+\)/g, '') // images (before links)
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')// links
      .replace(/^>\s+/gm, '')                 // blockquotes (line-start only)
      .replace(/\n{2,}/g, '. ')               // paragraph breaks → pause
      .replace(/\n/g, ' ')                    // newlines → space
      .trim();
  }
}