Files
sp/src/pages/web-tools/whisper-cpp.astro
suvodip ghosh 6071cd5228 update feat
2025-06-06 05:29:39 +00:00

276 lines
11 KiB
Plaintext

---
import Layout from "../../layouts/Layout.astro"
---
<Layout title="Whisper.cpp STT Streaming">
<div class="min-h-screen">
<div class="container mx-auto px-4 py-8">
<div class="max-w-3xl mx-auto bg-white rounded-xl shadow-md overflow-hidden p-6">
<h1 class="text-3xl font-bold text-center text-gray-800 mb-6">Whisper.cpp STT Streaming</h1>
<div class="mb-6">
<div class="flex justify-center space-x-4 mb-4">
<button id="startBtn" class="bg-green-500 hover:bg-green-600 text-white font-bold py-2 px-4 rounded">
Start Recording
</button>
<button id="stopBtn" disabled class="bg-red-500 hover:bg-red-600 text-white font-bold py-2 px-4 rounded">
Stop Recording
</button>
<button id="clearBtn" class="bg-gray-500 hover:bg-gray-600 text-white font-bold py-2 px-4 rounded">
Clear Text
</button>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="language">
Language
</label>
<select id="language" class="shadow border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline">
<option value="auto">Auto-detect</option>
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
</select>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="model">
Model
</label>
<select id="model" class="shadow border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline">
<option value="tiny">Tiny</option>
<option value="base">Base</option>
<option value="small">Small</option>
<option value="medium">Medium</option>
<option value="large" selected>Large</option>
</select>
</div>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="status">
Status
</label>
<div id="status" class="bg-gray-100 p-3 rounded text-sm text-gray-700">
Ready to start recording...
</div>
</div>
<div class="mb-4">
<label class="block text-gray-700 text-sm font-bold mb-2" for="transcript">
Transcript
</label>
<div id="transcript" class="bg-gray-50 p-4 rounded min-h-32 border border-gray-200">
<!-- Transcript will appear here -->
</div>
</div>
<div class="text-xs text-gray-500 mt-6">
<p>Note: This interface connects to a whisper.cpp server for processing. Audio is streamed in real-time.</p>
</div>
</div>
</div>
</div>
</Layout>
<script is:inline>
// DOM Elements
// DOM Elements used by the recording/transcription UI.
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const statusDiv = document.getElementById('status');
const transcriptDiv = document.getElementById('transcript');
const languageSelect = document.getElementById('language');
const modelSelect = document.getElementById('model');
// Audio context and variables
// Shared mutable state for one recording session; reset by cleanup().
let audioContext;      // Web Audio context driving the capture graph
let mediaStream;       // microphone MediaStream from getUserMedia
let processor;         // ScriptProcessorNode feeding audio chunks to the socket
let audioSocket;       // WebSocket connection to the whisper.cpp server
let silenceTimeout;    // timer that auto-stops recording after sustained silence
const SILENCE_THRESHOLD = 0.02; // RMS below this counts as silence; adjust based on testing
const SILENCE_TIMEOUT_MS = 2000; // 2 seconds of silence before stopping
// WebSocket URL - adjust to your whisper.cpp server
const WS_URL = 'ws://localhost:8765';
// Initialize
// Feature-detect the browser APIs this page depends on; if either is
// missing, surface the problem in the status line and disable recording.
document.addEventListener('DOMContentLoaded', () => {
  const hasWebAudio = Boolean(window.AudioContext || window.webkitAudioContext);
  if (!hasWebAudio) {
    statusDiv.textContent = 'Web Audio API not supported in this browser';
    startBtn.disabled = true;
    return;
  }
  if (!window.WebSocket) {
    statusDiv.textContent = 'WebSocket not supported in this browser';
    startBtn.disabled = true;
    return;
  }
});
// Wire the three control buttons to their handlers.
startBtn.addEventListener('click', startRecording);
stopBtn.addEventListener('click', stopRecording);
clearBtn.addEventListener('click', clearTranscript);
/**
 * Start capturing microphone audio and streaming it to the whisper.cpp
 * server over a WebSocket. Builds the Web Audio graph, opens the socket,
 * sends a config message, then forwards non-silent PCM chunks until the
 * user stops, the server errors, or sustained silence is detected.
 */
async function startRecording() {
  try {
    statusDiv.textContent = 'Requesting microphone...';
    // Get microphone access.
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Build the audio graph: mic source -> script processor -> destination.
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    const source = audioContext.createMediaStreamSource(mediaStream);
    // NOTE: createScriptProcessor is deprecated in favor of AudioWorklet,
    // but it is kept here for broad browser compatibility.
    processor = audioContext.createScriptProcessor(4096, 1, 1);
    source.connect(processor);
    processor.connect(audioContext.destination);
    // Initialize WebSocket connection to the whisper.cpp server.
    audioSocket = new WebSocket(WS_URL);
    audioSocket.onopen = () => {
      statusDiv.textContent = 'Connected to server. Recording...';
      startBtn.disabled = true;
      stopBtn.disabled = false;
      // Send the language/model configuration before any audio data.
      audioSocket.send(JSON.stringify({
        type: 'config',
        language: languageSelect.value,
        model: modelSelect.value
      }));
      // Arm the silence timer immediately so a session that never produces
      // audible input still auto-stops after SILENCE_TIMEOUT_MS.
      resetSilenceTimer();
    };
    audioSocket.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.type === 'transcript') {
        // Append a new line to the transcript and keep it scrolled to the end.
        const p = document.createElement('p');
        p.className = 'mb-2';
        p.textContent = data.text;
        transcriptDiv.appendChild(p);
        transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
      } else if (data.type === 'status') {
        statusDiv.textContent = data.message;
      } else if (data.type === 'error') {
        statusDiv.textContent = `Error: ${data.message}`;
        stopRecording();
      }
    };
    audioSocket.onclose = () => {
      // Distinguish a user-initiated stop (status already set by
      // stopRecording) from an unexpected disconnect.
      if (statusDiv.textContent !== 'Recording stopped.') {
        statusDiv.textContent = 'Connection closed unexpectedly.';
      }
      cleanup();
    };
    audioSocket.onerror = () => {
      // WebSocket 'error' events are plain Events with no .message property,
      // so report a fixed, human-readable failure instead of "undefined".
      statusDiv.textContent = 'WebSocket error: could not reach the transcription server.';
      cleanup();
    };
    // Forward each captured audio chunk to the server, skipping silence.
    processor.onaudioprocess = (event) => {
      if (!audioSocket || audioSocket.readyState !== WebSocket.OPEN) return;
      const audioData = event.inputBuffer.getChannelData(0);
      if (!isAudioSilent(audioData)) {
        // Audible input: push the auto-stop deadline out again.
        resetSilenceTimer();
        // Convert Float32 samples to 16-bit PCM and stream them.
        audioSocket.send(convertFloat32ToInt16(audioData));
      }
    };
  } catch (error) {
    statusDiv.textContent = `Error: ${error.message}`;
    console.error(error);
    cleanup();
  }
}

// (Re)start the silence countdown; when it fires, recording stops.
function resetSilenceTimer() {
  clearTimeout(silenceTimeout);
  silenceTimeout = setTimeout(() => {
    statusDiv.textContent = 'Silence detected, stopping recording...';
    stopRecording();
  }, SILENCE_TIMEOUT_MS);
}
/**
 * Stop the current recording session: notify the server with an EOF
 * message, close the socket, and release all captured resources.
 */
function stopRecording() {
  // Set the status first so the socket's onclose handler can tell a
  // deliberate stop apart from an unexpected disconnect.
  statusDiv.textContent = 'Recording stopped.';
  const socketIsOpen = audioSocket && audioSocket.readyState === WebSocket.OPEN;
  if (socketIsOpen) {
    audioSocket.send(JSON.stringify({ type: 'eof' }));
    audioSocket.close();
  }
  cleanup();
}
/** Remove every transcript line from the display. */
function clearTranscript() {
  transcriptDiv.replaceChildren();
}
/**
 * Release all recording resources (audio node, microphone tracks, audio
 * context, silence timer) and restore the start/stop buttons to idle state.
 * Safe to call repeatedly; every teardown step is guarded.
 */
function cleanup() {
  if (processor) {
    processor.disconnect();
    processor = null;
  }
  if (mediaStream) {
    // Stopping each track releases the microphone indicator in the browser.
    for (const track of mediaStream.getTracks()) {
      track.stop();
    }
    mediaStream = null;
  }
  if (audioContext) {
    audioContext.close().catch(console.error);
    audioContext = null;
  }
  clearTimeout(silenceTimeout);
  startBtn.disabled = false;
  stopBtn.disabled = true;
}
// Helper functions
/**
 * Decide whether an audio buffer counts as silence by comparing its RMS
 * (root mean square) level against a threshold.
 *
 * @param {Float32Array} audioData - PCM samples in the range [-1, 1].
 * @param {number} [threshold=SILENCE_THRESHOLD] - RMS level below which the
 *   buffer is considered silent (parameterized for reuse/testing; default
 *   preserves the original behavior).
 * @returns {boolean} true if the buffer is silent.
 */
function isAudioSilent(audioData, threshold = SILENCE_THRESHOLD) {
  // An empty buffer carries no sound; guard it explicitly (the original
  // 0/0 division produced NaN, which compared as "not silent").
  if (audioData.length === 0) return true;
  let sumOfSquares = 0;
  for (let i = 0; i < audioData.length; i++) {
    sumOfSquares += audioData[i] * audioData[i];
  }
  const rms = Math.sqrt(sumOfSquares / audioData.length);
  return rms < threshold;
}
/**
 * Convert Float32 audio samples ([-1, 1]) into signed 16-bit PCM, the
 * format the whisper.cpp server expects on the wire.
 *
 * @param {Float32Array} buffer - floating-point PCM samples.
 * @returns {ArrayBuffer} the underlying buffer of the Int16Array result.
 */
function convertFloat32ToInt16(buffer) {
  const pcm = Int16Array.from(buffer, (sample) => {
    // Clamp out-of-range samples, then scale asymmetrically so that
    // -1 maps to -32768 and +1 maps to +32767.
    const clamped = Math.min(1, Math.max(-1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
  });
  return pcm.buffer;
}
</script>