// sp/src/components/Tools/AudioToText.jsx

import { useState, useRef, useEffect, useCallback, useMemo } from 'react';
import { Button } from '../ui/button';
const API_OPTIONS = [
{
id: 'whisper',
name: 'Whisper (GPU)',
endpoint: 'https://stt-41.siliconpin.com/stt',
description: 'Free tier: 2 requests/min, 10/day'
},
{
id: 'vosk',
name: 'Vosk (CPU)',
endpoint: 'https://api.vosk.ai/stt',
description: 'Free tier: 10 requests/min, 100/day'
},
];
const MAX_FILE_SIZE_MB = 5;
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
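
// A minimal sketch (not part of the original component) of the request
// contract this file assumes: one multipart/form-data POST with the audio
// under the "audio" field, returning JSON carrying `text` or `transcript`.
// The helper name `transcribeBlob` is illustrative.
export async function transcribeBlob(blob, endpoint = API_OPTIONS[0].endpoint, filename = 'audio.wav') {
  const form = new FormData();
  form.append('audio', blob, filename);
  const res = await fetch(endpoint, { method: 'POST', body: form });
  if (!res.ok) throw new Error(`STT request failed with status ${res.status}`);
  return res.json();
}
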
export default function AudioUploader() {
// State management
const [file, setFile] = useState(null);
const [status, setStatus] = useState('No file uploaded yet');
const [response, setResponse] = useState(null);
const [isLoading, setIsLoading] = useState(false);
const [error, setError] = useState(null);
const [selectedApi, setSelectedApi] = useState('whisper');
const [copied, setCopied] = useState(false);
const [debugLogs, setDebugLogs] = useState([]);
const [recordingTime, setRecordingTime] = useState(0);
const [isRecording, setIsRecording] = useState(false);
const [audioBlob, setAudioBlob] = useState(null);
const [audioUrl, setAudioUrl] = useState(null);
const [showDebug, setShowDebug] = useState(false);
// Refs
const fileInputRef = useRef(null);
const mediaRecorderRef = useRef(null);
const audioChunksRef = useRef([]);
const timerRef = useRef(null);
const audioContextRef = useRef(null);
const analyserRef = useRef(null);
const canvasRef = useRef(null);
const animationRef = useRef(null);
const streamRef = useRef(null);
// Debug logging with useCallback to prevent infinite loops
const addDebugLog = useCallback((message) => {
const timestamp = new Date().toISOString().split('T')[1].split('.')[0];
const logMessage = `${timestamp}: ${message}`;
setDebugLogs(prev => [...prev.slice(-99), logMessage]); // keep at most 100 entries
console.debug(logMessage);
}, []);
// Timer effect
useEffect(() => {
if (isRecording) {
timerRef.current = setInterval(() => {
setRecordingTime(prev => prev + 1);
}, 1000);
return () => clearInterval(timerRef.current);
}
}, [isRecording]);
// Clean up on unmount
useEffect(() => {
return () => {
addDebugLog('Component unmounting - cleaning up resources');
// Tear down through refs rather than calling stopRecording(), which this
// cleanup would otherwise capture as a stale first-render closure
if (mediaRecorderRef.current?.state === 'recording') {
mediaRecorderRef.current.stop();
}
streamRef.current?.getTracks().forEach(track => track.stop());
if (animationRef.current) {
cancelAnimationFrame(animationRef.current);
}
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close().catch(err => {
addDebugLog(`Error closing AudioContext: ${err.message}`);
});
}
clearInterval(timerRef.current);
};
}, [addDebugLog]);
// Handle file change - completely stable implementation
const handleFileChange = useCallback((e) => {
const selectedFile = e.target.files[0];
if (!selectedFile) {
setFile(null);
setStatus('No file selected');
addDebugLog('No file selected');
return;
}
if (!['audio/wav', 'audio/mpeg', 'audio/ogg', 'audio/webm'].includes(selectedFile.type) &&
!selectedFile.name.match(/\.(wav|mp3|ogg|webm)$/i)) {
const errorMsg = 'Unsupported file format. Please use WAV, MP3, OGG, or WebM';
setError(errorMsg);
setStatus('Invalid file type');
setFile(null);
e.target.value = '';
addDebugLog(errorMsg);
return;
}
if (selectedFile.size > MAX_FILE_SIZE_BYTES) {
const errorMsg = `File size exceeds ${MAX_FILE_SIZE_MB}MB limit`;
setError(errorMsg);
setStatus('File too large');
setFile(null);
e.target.value = '';
addDebugLog(errorMsg);
return;
}
setFile(selectedFile);
setStatus(`File selected: ${selectedFile.name}`);
setResponse(null);
setError(null);
setAudioBlob(null);
setAudioUrl(null);
addDebugLog(`File selected: ${selectedFile.name} (${(selectedFile.size / (1024 * 1024)).toFixed(2)} MB)`);
}, [addDebugLog]);
// Create WAV blob - stable implementation
const createWavBlob = useCallback(async (audioBlob) => {
try {
addDebugLog('Starting WAV blob creation');
const arrayBuffer = await audioBlob.arrayBuffer();
const audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 16000
});
const decodedData = await audioContext.decodeAudioData(arrayBuffer);
await audioContext.close(); // release the temporary context once decoding is done
addDebugLog(`Decoded audio data: ${decodedData.length} samples, ${decodedData.numberOfChannels} channels`);
let audioData;
if (decodedData.numberOfChannels > 1) {
// Hoist the channel buffers instead of calling getChannelData() per sample
const left = decodedData.getChannelData(0);
const right = decodedData.getChannelData(1);
audioData = new Float32Array(decodedData.length);
for (let i = 0; i < decodedData.length; i++) {
audioData[i] = (left[i] + right[i]) / 2;
}
addDebugLog('Converted stereo to mono');
} else {
audioData = decodedData.getChannelData(0);
}
const pcmData = new Int16Array(audioData.length);
for (let i = 0; i < audioData.length; i++) {
const s = Math.max(-1, Math.min(1, audioData[i]));
pcmData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
}
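// The asymmetric scale factors above map the float range [-1, 1] onto the
// full signed 16-bit range: -1 * 0x8000 = -32768 and 1 * 0x7FFF = 32767,
// so neither extreme overflows Int16Array.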
addDebugLog(`Converted to 16-bit PCM: ${pcmData.length} samples`);
const wavHeader = createWaveHeader(pcmData.length * 2, {
sampleRate: 16000,
numChannels: 1,
bitDepth: 16
});
const wavBlob = new Blob([wavHeader, pcmData], { type: 'audio/wav' });
addDebugLog(`Created WAV blob: ${(wavBlob.size / 1024).toFixed(2)} KB`);
return wavBlob;
} catch (err) {
const errorMsg = `Error creating WAV blob: ${err.message}`;
addDebugLog(errorMsg);
throw new Error('Failed to process audio recording');
}
}, [addDebugLog]);
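// Per the Web Audio spec, decodeAudioData resamples decoded audio to the
// AudioContext's sampleRate, so the 16 kHz rate written into the WAV header
// matches the PCM data regardless of the recording's native rate.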
// Handle submit - stable implementation
const handleSubmit = useCallback(async () => {
let fileToSubmit;
try {
setIsLoading(true);
const apiName = API_OPTIONS.find(api => api.id === selectedApi)?.name;
setStatus(`Processing with ${apiName}...`);
setError(null);
addDebugLog(`Starting submission with ${apiName}`);
if (audioBlob) {
addDebugLog('Processing recorded audio blob');
fileToSubmit = await createWavBlob(audioBlob);
} else if (file) {
addDebugLog('Processing uploaded file');
fileToSubmit = file;
} else {
const errorMsg = 'No audio file selected';
addDebugLog(errorMsg);
throw new Error(errorMsg);
}
if (fileToSubmit.size > MAX_FILE_SIZE_BYTES) {
const errorMsg = `File size exceeds ${MAX_FILE_SIZE_MB}MB limit`;
addDebugLog(errorMsg);
throw new Error(errorMsg);
}
const formData = new FormData();
// Preserve the original filename for uploads; processed recordings go up as audio.wav
const uploadName = fileToSubmit instanceof File ? fileToSubmit.name : 'audio.wav';
formData.append('audio', fileToSubmit, uploadName);
addDebugLog(`Created FormData with ${(fileToSubmit.size / 1024).toFixed(2)} KB file`);
const apiConfig = API_OPTIONS.find(api => api.id === selectedApi);
if (!apiConfig) {
const errorMsg = 'Selected API not found';
addDebugLog(errorMsg);
throw new Error(errorMsg);
}
addDebugLog(`Sending request to ${apiConfig.endpoint}`);
const apiResponse = await fetch(apiConfig.endpoint, {
method: 'POST',
body: formData,
});
if (!apiResponse.ok) {
let errorMessage = `API returned error status: ${apiResponse.status}`;
try {
const errorData = await apiResponse.json();
errorMessage = errorData.message || errorData.error || errorMessage;
} catch (e) {
addDebugLog('Failed to parse error response');
}
addDebugLog(`API error: ${errorMessage}`);
throw new Error(errorMessage);
}
const result = await apiResponse.json();
addDebugLog('Received successful response from API');
setResponse({
api: selectedApi,
data: result
});
setStatus('Processing complete');
} catch (err) {
const errorMsg = err.message.includes('Failed to fetch')
? 'Network error: Could not connect to the API server'
: err.message;
addDebugLog(`Error during submission: ${errorMsg}`);
setError(errorMsg);
setStatus('Processing failed');
setResponse(null);
} finally {
setIsLoading(false);
addDebugLog('Submission process completed');
}
}, [selectedApi, audioBlob, file, createWavBlob, addDebugLog]);
// Helper functions
const createWaveHeader = useCallback((dataLength, config) => {
const byteRate = config.sampleRate * config.numChannels * (config.bitDepth / 8);
const blockAlign = config.numChannels * (config.bitDepth / 8);
const buffer = new ArrayBuffer(44);
const view = new DataView(buffer);
writeString(view, 0, 'RIFF');
view.setUint32(4, 36 + dataLength, true);
writeString(view, 8, 'WAVE');
writeString(view, 12, 'fmt ');
view.setUint32(16, 16, true);
view.setUint16(20, 1, true);
view.setUint16(22, config.numChannels, true);
view.setUint32(24, config.sampleRate, true);
view.setUint32(28, byteRate, true);
view.setUint16(32, blockAlign, true);
view.setUint16(34, config.bitDepth, true);
writeString(view, 36, 'data');
view.setUint32(40, dataLength, true);
return new Uint8Array(buffer);
}, []);
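// Byte layout of the canonical 44-byte PCM WAV header written above:
//   0-3   "RIFF"          4-7   file size - 8         8-11  "WAVE"
//   12-15 "fmt "          16-19 fmt chunk size (16)   20-21 format (1 = PCM)
//   22-23 channel count   24-27 sample rate           28-31 byte rate
//   32-33 block align     34-35 bits per sample       36-39 "data"
//   40-43 PCM data length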
const writeString = useCallback((view, offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}, []);
const copyToClipboard = useCallback((text) => {
navigator.clipboard.writeText(text).then(() => {
setCopied(true);
setTimeout(() => setCopied(false), 2000);
addDebugLog('Text copied to clipboard');
}).catch(err => {
const errorMsg = 'Failed to copy text to clipboard';
addDebugLog(`${errorMsg}: ${err.message}`);
setError(errorMsg);
});
}, [addDebugLog]);
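// Note: navigator.clipboard is only available in secure contexts (HTTPS or
// localhost), and writes generally must follow a user gesture, which the
// "Copy Text" button click satisfies.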
// Derive display text in a pure useMemo (no logging side effects during render)
const displayText = useMemo(() => {
if (!response?.data) {
return null;
}
if (typeof response.data === 'string') {
return response.data;
}
if (response.data.text) {
return response.data.text;
}
if (response.data.transcript) {
return response.data.transcript;
}
if (response.data.results?.[0]?.alternatives?.[0]?.transcript) {
return response.data.results[0].alternatives[0].transcript;
}
return "Received response but couldn't extract text. View full response for details.";
}, [response]);
// Recording functions
const startRecording = useCallback(async () => {
try {
addDebugLog('Attempting to start recording');
setStatus("Requesting microphone access...");
if (audioBlob) {
addDebugLog('Clearing previous recording');
if (audioUrl) URL.revokeObjectURL(audioUrl); // release the previous object URL
setAudioBlob(null);
setAudioUrl(null);
}
setRecordingTime(0); // restart the elapsed-time counter for the new session
// Initialize audio context
audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 16000
});
addDebugLog(`AudioContext created with sample rate: ${audioContextRef.current.sampleRate}`);
// Get user media
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
streamRef.current = stream;
addDebugLog('Microphone access granted, stream created');
// Setup visualization
setupVisualizer(stream);
// Initialize MediaRecorder
mediaRecorderRef.current = new MediaRecorder(stream);
audioChunksRef.current = [];
mediaRecorderRef.current.ondataavailable = (e) => {
if (e.data.size > 0) audioChunksRef.current.push(e.data); // skip empty chunks
};
mediaRecorderRef.current.onstop = () => {
// Chrome/Firefox record WebM by default; Safari's MediaRecorder emits MP4,
// so the type label here is a best-effort hint rather than a guarantee
const blob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
setAudioBlob(blob);
setAudioUrl(URL.createObjectURL(blob));
setStatus("Recording stopped. Ready to process.");
};
mediaRecorderRef.current.start(100); // Collect data every 100ms
setIsRecording(true);
setStatus("Recording (16kHz, 16-bit mono)...");
addDebugLog('Recording started');
} catch (err) {
const errorMsg = `Error starting recording: ${err.message}`;
addDebugLog(errorMsg);
setError(errorMsg);
setStatus("Recording failed");
setIsRecording(false);
}
}, [audioBlob, audioUrl, addDebugLog]);
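// Note: getUserMedia requires a secure context, and browsers may ignore or
// reject the 16 kHz AudioContext sampleRate hint; the conversion to 16 kHz
// is guaranteed later by createWavBlob, not here.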
const stopRecording = useCallback(() => {
addDebugLog('Stop recording initiated');
if (!isRecording) {
addDebugLog('Not currently recording, ignoring stop request');
return;
}
try {
setIsRecording(false);
addDebugLog('Recording state updated to false');
if (mediaRecorderRef.current?.state === 'recording') {
mediaRecorderRef.current.stop();
}
if (streamRef.current) {
streamRef.current.getTracks().forEach(track => {
track.stop();
addDebugLog(`Stopped track: ${track.kind}`);
});
}
if (animationRef.current) {
cancelAnimationFrame(animationRef.current);
animationRef.current = null;
addDebugLog('Visualization animation stopped');
}
addDebugLog('Recording successfully stopped');
} catch (err) {
const errorMsg = `Error stopping recording: ${err.message}`;
addDebugLog(errorMsg);
setError(errorMsg);
setStatus("Recording stop failed");
}
}, [isRecording, addDebugLog]);
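// Note: MediaRecorder's onstop handler (set in startRecording) fires
// asynchronously after stop(), so audioBlob and audioUrl become available a
// moment after this function returns, not synchronously.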
const playRecording = useCallback(() => {
if (audioUrl) {
addDebugLog('Playing recording');
const audio = new Audio(audioUrl);
audio.play();
setStatus("Playing recording...");
audio.onended = () => {
addDebugLog('Playback finished');
setStatus("Playback finished");
};
} else {
addDebugLog('No audio URL available for playback');
}
}, [audioUrl, addDebugLog]);
const setupVisualizer = useCallback((stream) => {
if (!audioContextRef.current) {
addDebugLog('AudioContext not available for visualization');
return;
}
try {
const source = audioContextRef.current.createMediaStreamSource(stream);
analyserRef.current = audioContextRef.current.createAnalyser();
analyserRef.current.fftSize = 64;
source.connect(analyserRef.current);
addDebugLog('Visualizer audio nodes connected');
const bufferLength = analyserRef.current.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
const draw = () => {
animationRef.current = requestAnimationFrame(draw);
analyserRef.current.getByteFrequencyData(dataArray);
const canvas = canvasRef.current;
if (!canvas) return;
const ctx = canvas.getContext('2d');
if (!ctx) return;
ctx.clearRect(0, 0, canvas.width, canvas.height);
const barWidth = (canvas.width / bufferLength) * 2.5;
let x = 0;
for (let i = 0; i < bufferLength; i++) {
const barHeight = dataArray[i] / 2;
ctx.fillStyle = `rgb(${barHeight + 100}, 50, 50)`;
ctx.fillRect(x, canvas.height - barHeight, barWidth, barHeight);
x += barWidth + 1;
}
};
draw();
addDebugLog('Visualizer animation started');
} catch (err) {
addDebugLog(`Error setting up visualizer: ${err.message}`);
}
}, [addDebugLog]);
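// With fftSize = 64, frequencyBinCount is 32, so the visualizer draws 32
// frequency bars per frame; getByteFrequencyData scales each bin to 0-255.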
// Helper functions
const formatTime = useCallback((seconds) => {
const mins = Math.floor(seconds / 60).toString().padStart(2, '0');
const secs = (seconds % 60).toString().padStart(2, '0');
return `${mins}:${secs}`;
}, []);
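// e.g. formatTime(75) === "01:15", formatTime(9) === "00:09"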
const clearDebugLogs = useCallback(() => {
setDebugLogs([]);
addDebugLog('Debug logs cleared');
}, [addDebugLog]);
const toggleDebug = useCallback(() => {
setShowDebug(!showDebug);
addDebugLog(`Debug panel ${showDebug ? 'hidden' : 'shown'}`);
}, [showDebug, addDebugLog]);
return (
<div className="container mx-auto px-4 max-w-4xl my-6">
<div className="bg-white rounded-lg shadow-md p-6">
<div className="flex justify-between items-center mb-6">
<h1 className="text-2xl font-bold text-gray-800">Speech to Text Converter</h1>
<Button
onClick={toggleDebug}
variant="outline"
size="sm"
>
{showDebug ? 'Hide Debug' : 'Show Debug'}
</Button>
</div>
{/* Recording Section */}
<div className="mb-8 p-4 border border-dashed border-[#6d9e37] rounded-lg bg-gray-50">
<h2 className="text-lg font-semibold mb-3 text-gray-700">Record Audio (16kHz, 16-bit mono)</h2>
<canvas
ref={canvasRef}
className="w-full h-20 bg-gray-200 rounded mb-3"
style={{ display: isRecording ? 'block' : 'none' }}
/>
<div className="flex flex-wrap gap-2 mb-3">
<Button
onClick={startRecording}
disabled={isRecording || isLoading}
variant="outline"
className="px-4 py-2"
>
Start Recording
</Button>
<Button
onClick={stopRecording}
disabled={!isRecording || isLoading}
variant="outline"
className="px-4 py-2"
>
Stop Recording
</Button>
<Button
onClick={playRecording}
disabled={!audioUrl || isRecording || isLoading}
variant="outline"
className="px-4 py-2"
>
Play Recording
</Button>
</div>
{isRecording && (
<div className="text-center text-[#6d9e37] font-medium">
Recording: {formatTime(recordingTime)}
</div>
)}
{audioUrl && !isRecording && (
<div className="flex items-center justify-between bg-white p-3 rounded border mt-2">
<div className="flex items-center">
<svg className="w-5 h-5 text-[#6d9e37] mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 12l2 2 4-4m6 2a9 9 0 11-18 0 9 9 0 0118 0z" />
</svg>
<span className="text-sm font-medium text-[#6d9e37]">recording.wav</span>
</div>
<span className="text-xs text-[#6d9e37]">
{(audioBlob?.size / (1024 * 1024)).toFixed(2)} MB
</span>
</div>
)}
</div>
<div className="flex items-center justify-center w-full mb-3 -mt-2">
<span className="flex-grow border-b-2 border-[#6d9e37]"></span>
<h2 className="mx-2 text-[#6d9e37] font-bold">OR</h2>
<span className="flex-grow border-b-2 border-[#6d9e37]"></span>
</div>
{/* File Upload Section */}
<div className="mb-8 p-4 border border-dashed border-[#6d9e37] rounded-lg bg-gray-50">
<div className="flex items-center justify-between mb-4">
<div>
<label className="block text-sm font-medium text-gray-700 mb-1">
Upload Audio File
</label>
<p className="text-xs text-gray-500">Supports WAV, MP3, OGG formats (max {MAX_FILE_SIZE_MB}MB)</p>
</div>
<Button
onClick={() => fileInputRef.current?.click()}
variant="outline"
className="px-4 py-2"
disabled={isLoading || isRecording}
>
Select File
</Button>
<input
type="file"
ref={fileInputRef}
accept="audio/*,.wav,.mp3,.ogg"
onChange={handleFileChange}
className="hidden"
disabled={isLoading || isRecording}
/>
</div>
{file && (
<div className="flex items-center justify-between bg-white p-3 rounded border">
<div className="flex items-center">
<svg className="w-5 h-5 text-[#6d9e37] mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 12l2 2 4-4m6 2a9 9 0 11-18 0 9 9 0 0118 0z" />
</svg>
<span className="text-sm font-medium text-[#6d9e37]">{file.name}</span>
</div>
<span className="text-xs text-[#6d9e37]">
{(file.size / (1024 * 1024)).toFixed(2)} MB
</span>
</div>
)}
</div>
{/* API Selection Section */}
<div className="mb-8">
<h2 className="text-lg font-semibold mb-3 text-gray-700">Select Speech Recognition API</h2>
<div className="grid grid-cols-1 md:grid-cols-2 gap-3">
{API_OPTIONS.map(api => (
<div
key={api.id}
onClick={() => !isLoading && !isRecording && setSelectedApi(api.id)}
className={`p-4 border-[1.5px] rounded-lg cursor-pointer transition-all ${
selectedApi === api.id
? 'border-[#6d9e37] bg-[#6d9e3720]'
: 'border-gray-200 hover:border-gray-300'
} ${isLoading || isRecording ? 'opacity-50 cursor-not-allowed' : ''}`}
>
<div className="flex items-center">
<input
type="radio"
checked={selectedApi === api.id}
onChange={() => {}}
className="mr-2 h-4 w-4 accent-[#6d9e37]"
disabled={isLoading || isRecording}
/>
<div>
<h3 className="font-bold text-[#6d9e37]">{api.name}</h3>
<p className="text-xs text-gray-500">{api.description}</p>
</div>
</div>
</div>
))}
</div>
</div>
{/* Submit Button */}
<div className="flex justify-center mb-8">
<Button
onClick={handleSubmit}
disabled={(!file && !audioBlob) || isLoading || isRecording ||
(file && file.size > MAX_FILE_SIZE_BYTES)}
className="px-6 py-3 text-lg w-full md:w-auto"
>
{isLoading ? (
<span className="flex items-center justify-center">
<svg className="animate-spin -ml-1 mr-2 h-4 w-4 text-white" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4"></circle>
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg>
Processing...
</span>
) : (
`Convert with ${API_OPTIONS.find(api => api.id === selectedApi)?.name}`
)}
</Button>
</div>
{/* Status */}
<div className="mb-6 text-center">
<p className={`text-sm ${
error ? 'text-red-600' :
isLoading ? 'text-[#6d9e37]' :
isRecording ? 'text-[#6d9e37]' :
'text-gray-600'
}`}>
{status}
</p>
</div>
{/* Error Display */}
{error && (
<div className="mt-6 p-4 bg-red-50 border border-red-200 rounded-lg">
<div className="flex items-center text-red-600">
<svg className="w-5 h-5 mr-2" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 8v4m0 4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z" />
</svg>
<h3 className="font-medium">Error</h3>
</div>
<p className="mt-2 text-sm text-red-600">{error}</p>
{error.includes(`${MAX_FILE_SIZE_MB}MB`) && (
<p className="mt-1 text-xs text-red-500">
Please select a smaller audio file or record a shorter clip
</p>
)}
</div>
)}
{/* Results Display */}
{response && (
<div className="mt-6 border rounded-lg overflow-hidden">
<div className="bg-gray-50 px-4 py-3 border-b flex justify-between items-center">
<h3 className="font-medium text-[#6d9e37]">{API_OPTIONS.find(api => api.id === response.api)?.name} Results</h3>
{displayText && (
<Button
size="sm"
variant="outline"
onClick={() => copyToClipboard(displayText)}
className="text-sm">
{copied ? (<span className="flex items-center">Copied!</span>) : 'Copy Text'}
</Button>
)}
</div>
<div className="p-4 bg-white">
{displayText ? (
<div className="space-y-2">
<h4 className="text-sm font-medium text-gray-700">Transcription:</h4>
<div className="relative">
<p className="p-3 bg-gray-50 rounded text-sm text-gray-600 font-medium">{displayText}</p>
</div>
</div>
) : (
<p className="text-sm text-gray-600">
No text transcription found in the response.
</p>
)}
</div>
</div>
)}
{/* Debug Section */}
{showDebug && (
<div className="mt-8 border rounded-lg overflow-hidden">
<div className="bg-gray-50 px-4 py-3 border-b flex justify-between items-center">
<h3 className="font-medium text-gray-700">Debug Logs</h3>
<div className="flex gap-2">
<Button
size="sm"
variant="outline"
onClick={clearDebugLogs}
className="text-sm">
Clear Logs
</Button>
<Button
size="sm"
variant="outline"
onClick={() => {
const blob = new Blob([debugLogs.join('\n')], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `audio-recorder-debug-${new Date().toISOString()}.log`;
a.click();
URL.revokeObjectURL(url);
}}
className="text-sm">
Export Logs
</Button>
</div>
</div>
<div className="p-4 bg-white max-h-60 overflow-y-auto">
{debugLogs.length > 0 ? (
<div className="space-y-1">
{debugLogs.map((log, index) => (
<div key={index} className="text-xs font-mono text-gray-600 border-b border-gray-100 pb-1">
{log}
</div>
))}
</div>
) : (
<p className="text-sm text-gray-500">No debug logs yet. Interactions will appear here.</p>
)}
</div>
</div>
)}
</div>
</div>
);
}
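
// Usage sketch (illustrative import path, not from this repo):
//   import AudioToText from './components/Tools/AudioToText';
//   <AudioToText />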