Files
sp/src/pages/web-tools/whisper-cpp.astro
suvodip ghosh 6071cd5228 update feat
2025-06-06 05:29:39 +00:00

276 lines
11 KiB
Plaintext

---
import Layout from "../../layouts/Layout.astro"
---
<Layout title="Whisper.cpp STT Streaming">
<div class="min-h-screen">
<div class="container mx-auto px-4 py-8">
<div class="max-w-3xl mx-auto bg-white rounded-xl shadow-md overflow-hidden p-6">
<h1 class="text-3xl font-bold text-center text-gray-800 mb-6">Whisper.cpp STT Streaming</h1>
<div class="mb-6">
<div class="flex justify-center space-x-4 mb-4">
<button id="startBtn" class="bg-green-500 hover:bg-green-600 text-white font-bold py-2 px-4 rounded">
Start Recording
</button>
<button id="stopBtn" disabled class="bg-red-500 hover:bg-red-600 text-white font-bold py-2 px-4 rounded">
Stop Recording
</button>
<button id="clearBtn" class="bg-gray-500 hover:bg-gray-600 text-white font-bold py-2 px-4 rounded">
Clear Text
</button>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="language">
Language
</label>
<select id="language" class="shadow border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline">
<option value="auto">Auto-detect</option>
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
</select>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="model">
Model
</label>
<select id="model" class="shadow border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline">
<option value="tiny">Tiny</option>
<option value="base">Base</option>
<option value="small">Small</option>
<option value="medium">Medium</option>
<option value="large" selected>Large</option>
</select>
</div>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="status">
Status
</label>
<div id="status" class="bg-gray-100 p-3 rounded text-sm text-gray-700">
Ready to start recording...
</div>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="transcript">
Transcript
</label>
<div id="transcript" class="bg-gray-50 p-4 rounded min-h-32 border border-gray-200">
<!-- Transcript will appear here -->
</div>
</div>
<div class="text-xs text-gray-500 mt-6">
<p>Note: This interface connects to a whisper.cpp server for processing. Audio is streamed in real-time.</p>
</div>
</div>
</div>
</div>
</Layout>
<script is:inline>
// DOM Elements
// DOM Elements used by the recording/transcription UI.
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const statusDiv = document.getElementById('status');
const transcriptDiv = document.getElementById('transcript');
const languageSelect = document.getElementById('language');
const modelSelect = document.getElementById('model');
// Audio context and variables
// Shared mutable state for one recording session; reset by cleanup().
let audioContext;      // Web Audio context driving the capture graph
let mediaStream;       // microphone MediaStream from getUserMedia
let processor;         // ScriptProcessorNode feeding audio chunks to the socket
let audioSocket;       // WebSocket connection to the whisper.cpp server
let silenceTimeout;    // timer that auto-stops recording after sustained silence
const SILENCE_THRESHOLD = 0.02; // RMS below this counts as silence; adjust based on testing
const SILENCE_TIMEOUT_MS = 2000; // 2 seconds of silence before stopping
// WebSocket URL - adjust to your whisper.cpp server
const WS_URL = 'ws://localhost:8765';
// Initialize
// Feature-detect the browser APIs this page depends on; if either is
// missing, surface the problem in the status line and disable recording.
document.addEventListener('DOMContentLoaded', () => {
  const hasWebAudio = Boolean(window.AudioContext || window.webkitAudioContext);
  if (!hasWebAudio) {
    statusDiv.textContent = 'Web Audio API not supported in this browser';
    startBtn.disabled = true;
    return;
  }
  if (!window.WebSocket) {
    statusDiv.textContent = 'WebSocket not supported in this browser';
    startBtn.disabled = true;
    return;
  }
});
// Wire the three control buttons to their handlers.
startBtn.addEventListener('click', startRecording);
stopBtn.addEventListener('click', stopRecording);
clearBtn.addEventListener('click', clearTranscript);
/**
 * Start capturing microphone audio and streaming it to the whisper.cpp
 * server over a WebSocket. Builds the Web Audio graph, opens the socket,
 * sends a config message, then forwards non-silent PCM chunks until the
 * user stops, the server errors, or sustained silence is detected.
 */
async function startRecording() {
  try {
    statusDiv.textContent = 'Requesting microphone...';
    // Get microphone access.
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Build the audio graph: mic source -> script processor -> destination.
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    const source = audioContext.createMediaStreamSource(mediaStream);
    // NOTE: createScriptProcessor is deprecated in favor of AudioWorklet,
    // but it is kept here for broad browser compatibility.
    processor = audioContext.createScriptProcessor(4096, 1, 1);
    source.connect(processor);
    processor.connect(audioContext.destination);
    // Initialize WebSocket connection to the whisper.cpp server.
    audioSocket = new WebSocket(WS_URL);
    audioSocket.onopen = () => {
      statusDiv.textContent = 'Connected to server. Recording...';
      startBtn.disabled = true;
      stopBtn.disabled = false;
      // Send the language/model configuration before any audio data.
      audioSocket.send(JSON.stringify({
        type: 'config',
        language: languageSelect.value,
        model: modelSelect.value
      }));
      // Arm the silence timer immediately so a session that never produces
      // audible input still auto-stops after SILENCE_TIMEOUT_MS.
      resetSilenceTimer();
    };
    audioSocket.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.type === 'transcript') {
        // Append a new line to the transcript and keep it scrolled to the end.
        const p = document.createElement('p');
        p.className = 'mb-2';
        p.textContent = data.text;
        transcriptDiv.appendChild(p);
        transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
      } else if (data.type === 'status') {
        statusDiv.textContent = data.message;
      } else if (data.type === 'error') {
        statusDiv.textContent = `Error: ${data.message}`;
        stopRecording();
      }
    };
    audioSocket.onclose = () => {
      // Distinguish a user-initiated stop (status already set by
      // stopRecording) from an unexpected disconnect.
      if (statusDiv.textContent !== 'Recording stopped.') {
        statusDiv.textContent = 'Connection closed unexpectedly.';
      }
      cleanup();
    };
    audioSocket.onerror = () => {
      // WebSocket 'error' events are plain Events with no .message property,
      // so report a fixed, human-readable failure instead of "undefined".
      statusDiv.textContent = 'WebSocket error: could not reach the transcription server.';
      cleanup();
    };
    // Forward each captured audio chunk to the server, skipping silence.
    processor.onaudioprocess = (event) => {
      if (!audioSocket || audioSocket.readyState !== WebSocket.OPEN) return;
      const audioData = event.inputBuffer.getChannelData(0);
      if (!isAudioSilent(audioData)) {
        // Audible input: push the auto-stop deadline out again.
        resetSilenceTimer();
        // Convert Float32 samples to 16-bit PCM and stream them.
        audioSocket.send(convertFloat32ToInt16(audioData));
      }
    };
  } catch (error) {
    statusDiv.textContent = `Error: ${error.message}`;
    console.error(error);
    cleanup();
  }
}

// (Re)start the silence countdown; when it fires, recording stops.
function resetSilenceTimer() {
  clearTimeout(silenceTimeout);
  silenceTimeout = setTimeout(() => {
    statusDiv.textContent = 'Silence detected, stopping recording...';
    stopRecording();
  }, SILENCE_TIMEOUT_MS);
}
/**
 * Stop the current recording session: notify the server with an EOF
 * message, close the socket, and release all captured resources.
 */
function stopRecording() {
  // Set the status first so the socket's onclose handler can tell a
  // deliberate stop apart from an unexpected disconnect.
  statusDiv.textContent = 'Recording stopped.';
  const socketIsOpen = audioSocket && audioSocket.readyState === WebSocket.OPEN;
  if (socketIsOpen) {
    audioSocket.send(JSON.stringify({ type: 'eof' }));
    audioSocket.close();
  }
  cleanup();
}
/** Remove every transcript line from the display. */
function clearTranscript() {
  transcriptDiv.replaceChildren();
}
/**
 * Release all recording resources (audio node, microphone tracks, audio
 * context, silence timer) and restore the start/stop buttons to idle state.
 * Safe to call repeatedly; every teardown step is guarded.
 */
function cleanup() {
  if (processor) {
    processor.disconnect();
    processor = null;
  }
  if (mediaStream) {
    // Stopping each track releases the microphone indicator in the browser.
    for (const track of mediaStream.getTracks()) {
      track.stop();
    }
    mediaStream = null;
  }
  if (audioContext) {
    audioContext.close().catch(console.error);
    audioContext = null;
  }
  clearTimeout(silenceTimeout);
  startBtn.disabled = false;
  stopBtn.disabled = true;
}
// Helper functions
/**
 * Decide whether an audio buffer counts as silence by comparing its RMS
 * (root mean square) level against a threshold.
 *
 * @param {Float32Array} audioData - PCM samples in the range [-1, 1].
 * @param {number} [threshold=SILENCE_THRESHOLD] - RMS level below which the
 *   buffer is considered silent (parameterized for reuse/testing; default
 *   preserves the original behavior).
 * @returns {boolean} true if the buffer is silent.
 */
function isAudioSilent(audioData, threshold = SILENCE_THRESHOLD) {
  // An empty buffer carries no sound; guard it explicitly (the original
  // 0/0 division produced NaN, which compared as "not silent").
  if (audioData.length === 0) return true;
  let sumOfSquares = 0;
  for (let i = 0; i < audioData.length; i++) {
    sumOfSquares += audioData[i] * audioData[i];
  }
  const rms = Math.sqrt(sumOfSquares / audioData.length);
  return rms < threshold;
}
/**
 * Convert Float32 audio samples ([-1, 1]) into signed 16-bit PCM, the
 * format the whisper.cpp server expects on the wire.
 *
 * @param {Float32Array} buffer - floating-point PCM samples.
 * @returns {ArrayBuffer} the underlying buffer of the Int16Array result.
 */
function convertFloat32ToInt16(buffer) {
  const pcm = Int16Array.from(buffer, (sample) => {
    // Clamp out-of-range samples, then scale asymmetrically so that
    // -1 maps to -32768 and +1 maps to +32767.
    const clamped = Math.min(1, Math.max(-1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
  });
  return pcm.buffer;
}
</script>