diff --git a/speech_processor.py b/speech_processor.py
index 7c95c23..ab589da 100644
--- a/speech_processor.py
+++ b/speech_processor.py
@@ -2,120 +2,141 @@
 import vosk
 import sys
 import json
+import struct
+import numpy as np
+from queue import Queue
+from threading import Thread
+import soundfile as sf
 import tempfile
 import os
-import wave
-import soundfile as sf

-# Global model - load once
-model = None
+# Global recognizer
 recognizer = None
+audio_queue = Queue()
+result_queue = Queue()

 def initialize_vosk():
-    """Initialize Vosk model"""
-    global model, recognizer
+    global recognizer
+    model_path = "vosk-model"  # Update this path
-    model_path = "/app/vosk-model"
     if not os.path.exists(model_path):
-        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
+        return {"success": False, "error": "Model not found"}

     try:
-        vosk.SetLogLevel(-1)  # Reduce log verbosity
+        vosk.SetLogLevel(-1)
         model = vosk.Model(model_path)
         recognizer = vosk.KaldiRecognizer(model, 16000)
         return {"success": True}
     except Exception as e:
-        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
+        return {"success": False, "error": str(e)}

-def process_audio_chunk(audio_data):
-    """Process audio data and return transcription"""
+def audio_worker():
     global recognizer
-
-    if not recognizer:
-        init_result = initialize_vosk()
-        if not init_result["success"]:
-            return init_result
-
-    try:
-        # Write audio data to temporary file
-        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
-            temp_file.write(audio_data)
-            temp_filename = temp_file.name
+    while True:
+        audio_data, request_id = audio_queue.get()

         try:
-            # Read audio file with soundfile
-            audio_data, sample_rate = sf.read(temp_filename)
-
-            # Convert to 16-bit PCM at 16kHz if needed
-            if sample_rate != 16000:
-                # Simple resampling (for better quality, use librosa)
-                import numpy as np
-                audio_data = np.interp(
-                    np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
-                    np.arange(len(audio_data)),
-                    audio_data
-                )
-
-            # Convert to bytes
-            audio_bytes = (audio_data * 32767).astype('int16').tobytes()
-
-            # Process with Vosk
-            if recognizer.AcceptWaveform(audio_bytes):
-                result = json.loads(recognizer.Result())
-                text = result.get('text', '')
-            else:
-                result = json.loads(recognizer.PartialResult())
-                text = result.get('partial', '')
-
-            # Clean up
-            os.unlink(temp_filename)
-
-            return {"success": True, "text": text}
+            # Write to temp file and read with soundfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+                f.write(audio_data)
+                temp_path = f.name
+            try:
+                data, samplerate = sf.read(temp_path, dtype='float32')
+
+                # Resample if needed
+                if samplerate != 16000:
+                    duration = len(data) / samplerate
+                    data = np.interp(
+                        np.linspace(0, len(data) - 1, int(duration * 16000)),
+                        np.arange(len(data)),
+                        data
+                    )
+
+                # Convert to 16-bit PCM
+                data = (data * 32767).astype('int16')
+
+                # Process with Vosk
+                if recognizer.AcceptWaveform(data.tobytes()):
+                    text = json.loads(recognizer.Result()).get('text', '')
+                    is_final = True
+                else:
+                    text = json.loads(recognizer.PartialResult()).get('partial', '')
+                    is_final = False
+
+                result_queue.put(({
+                    "success": True,
+                    "text": text,
+                    "is_final": is_final,
+                    "requestId": request_id
+                }, request_id))
+
+            finally:
+                os.unlink(temp_path)
+
         except Exception as e:
-            os.unlink(temp_filename)
-            return {"success": False, "error": f"Audio processing error: {str(e)}"}
-
-    except Exception as e:
-        return {"success": False, "error": f"General error: {str(e)}"}
+            result_queue.put(({
+                "success": False,
+                "error": str(e),
+                "requestId": request_id
+            }, request_id))

 def main():
-    """Main loop to process audio chunks from stdin"""
-    # Initialize Vosk on startup
+    # Initialize Vosk
     init_result = initialize_vosk()
     if not init_result["success"]:
-        error_response = json.dumps(init_result).encode('utf-8')
-        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
-        sys.stdout.buffer.write(error_response)
+        error = json.dumps({
+            "success": False,
+            "error": init_result["error"],
+            "requestId": 0
+        }).encode()
+        sys.stdout.buffer.write(struct.pack('>I', len(error)))
+        sys.stdout.buffer.write(error)
         sys.stdout.buffer.flush()
-        sys.exit(1)
+        return
+
+    # Start worker thread
+    Thread(target=audio_worker, daemon=True).start()

     while True:
         try:
-            # Read length of incoming data
-            length_data = sys.stdin.buffer.read(4)
-            if not length_data:
+            # Read message length (4 bytes)
+            length_bytes = sys.stdin.buffer.read(4)
+            if not length_bytes:
                 break
-
-            length = int.from_bytes(length_data, byteorder='big')
+            length = struct.unpack('>I', length_bytes)[0]
+
+            # Read request ID (4 bytes)
+            id_bytes = sys.stdin.buffer.read(4)
+            if not id_bytes:
+                break
+            request_id = struct.unpack('>I', id_bytes)[0]

             # Read audio data
             audio_data = sys.stdin.buffer.read(length)
+            if len(audio_data) != length:
+                break

-            # Process audio
-            result = process_audio_chunk(audio_data)
-
-            # Send result back
-            response = json.dumps(result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
-            sys.stdout.buffer.write(response)
-            sys.stdout.buffer.flush()
+            # Add to processing queue
+            audio_queue.put((audio_data, request_id))

+            # Check for results
+            while not result_queue.empty():
+                result, res_id = result_queue.get()
+                response = json.dumps(result).encode()
+                sys.stdout.buffer.write(struct.pack('>I', len(response)))
+                sys.stdout.buffer.write(struct.pack('>I', res_id))
+                sys.stdout.buffer.write(response)
+                sys.stdout.buffer.flush()
+
         except Exception as e:
-            error_result = {"success": False, "error": str(e)}
-            response = json.dumps(error_result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
-            sys.stdout.buffer.write(response)
+            error = json.dumps({
+                "success": False,
+                "error": str(e),
+                "requestId": request_id if 'request_id' in locals() else 0
+            }).encode()
+            sys.stdout.buffer.write(struct.pack('>I', len(error)))
+            sys.stdout.buffer.write(error)
            sys.stdout.buffer.flush()

 if __name__ == "__main__":