From 1015c8a2faade115ec6c20c6a80c53951ff7e0a4 Mon Sep 17 00:00:00 2001
From: Kar
Date: Thu, 5 Jun 2025 10:41:50 +0000
Subject: [PATCH] Update speech_processor.py

---
 speech_processor.py | 81 +++++++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/speech_processor.py b/speech_processor.py
index 7c95c23..f07d505 100644
--- a/speech_processor.py
+++ b/speech_processor.py
@@ -4,8 +4,9 @@ import sys
 import json
 import tempfile
 import os
-import wave
 import soundfile as sf
+import numpy as np
+import struct
 
 # Global model - load once
 model = None
@@ -15,9 +16,9 @@ def initialize_vosk():
     """Initialize Vosk model"""
     global model, recognizer
 
-    model_path = "/app/vosk-model"
+    model_path = "/app/vosk-model" # Update this path to your model location
     if not os.path.exists(model_path):
-        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
+        return {"success": False, "error": f"Vosk model not found at {model_path}"}
 
     try:
         vosk.SetLogLevel(-1) # Reduce log verbosity
@@ -27,14 +28,14 @@ def initialize_vosk():
     except Exception as e:
         return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
 
-def process_audio_chunk(audio_data):
+def process_audio_chunk(audio_data, request_id):
     """Process audio data and return transcription"""
     global recognizer
 
     if not recognizer:
         init_result = initialize_vosk()
         if not init_result["success"]:
-            return init_result
+            return {**init_result, "requestId": request_id}
 
     try:
         # Write audio data to temporary file
@@ -44,77 +45,107 @@ def process_audio_chunk(audio_data):
 
         # Read audio file with soundfile
         try:
-            audio_data, sample_rate = sf.read(temp_filename)
+            audio_data, sample_rate = sf.read(temp_filename, dtype='float32')
 
-            # Convert to 16-bit PCM at 16kHz if needed
+            # Convert to 16kHz if needed
             if sample_rate != 16000:
-                # Simple resampling (for better quality, use librosa)
-                import numpy as np
+                # Simple resampling (for production, use proper resampling)
+                num_samples = int(len(audio_data) * 16000 / sample_rate)
                 audio_data = np.interp(
-                    np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
+                    np.linspace(0, len(audio_data)-1, num_samples),
                     np.arange(len(audio_data)),
                     audio_data
                 )
+                sample_rate = 16000
 
-            # Convert to bytes
-            audio_bytes = (audio_data * 32767).astype('int16').tobytes()
+            # Convert to 16-bit PCM
+            audio_data = (audio_data * 32767).astype('int16')
 
             # Process with Vosk
-            if recognizer.AcceptWaveform(audio_bytes):
+            if recognizer.AcceptWaveform(audio_data.tobytes()):
                 result = json.loads(recognizer.Result())
                 text = result.get('text', '')
+                is_final = True
             else:
                 result = json.loads(recognizer.PartialResult())
                 text = result.get('partial', '')
+                is_final = False
 
             # Clean up
             os.unlink(temp_filename)
 
-            return {"success": True, "text": text}
+            return {
+                "success": True,
+                "text": text,
+                "is_final": is_final,
+                "requestId": request_id
+            }
 
         except Exception as e:
             os.unlink(temp_filename)
-            return {"success": False, "error": f"Audio processing error: {str(e)}"}
+            return {
+                "success": False,
+                "error": f"Audio processing error: {str(e)}",
+                "requestId": request_id
+            }
 
     except Exception as e:
-        return {"success": False, "error": f"General error: {str(e)}"}
+        return {
+            "success": False,
+            "error": f"General error: {str(e)}",
+            "requestId": request_id
+        }
 
 def main():
     """Main loop to process audio chunks from stdin"""
     # Initialize Vosk on startup
     init_result = initialize_vosk()
     if not init_result["success"]:
-        error_response = json.dumps(init_result).encode('utf-8')
-        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
+        error_response = json.dumps({
+            **init_result,
+            "requestId": 0
+        }).encode('utf-8')
+        sys.stdout.buffer.write(struct.pack('>I', len(error_response)))
         sys.stdout.buffer.write(error_response)
         sys.stdout.buffer.flush()
         sys.exit(1)
 
     while True:
         try:
-            # Read length of incoming data
+            # Read length of incoming data (4 bytes)
             length_data = sys.stdin.buffer.read(4)
             if not length_data:
                 break
-
-            length = int.from_bytes(length_data, byteorder='big')
+            length = struct.unpack('>I', length_data)[0]
+
+            # Read request ID (4 bytes)
+            id_data = sys.stdin.buffer.read(4)
+            if not id_data:
+                break
+            request_id = struct.unpack('>I', id_data)[0]
 
             # Read audio data
             audio_data = sys.stdin.buffer.read(length)
+            if len(audio_data) != length:
+                break
 
             # Process audio
-            result = process_audio_chunk(audio_data)
+            result = process_audio_chunk(audio_data, request_id)
 
             # Send result back
             response = json.dumps(result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(struct.pack('>I', len(response)))
             sys.stdout.buffer.write(response)
             sys.stdout.buffer.flush()
 
         except Exception as e:
-            error_result = {"success": False, "error": str(e)}
+            error_result = {
+                "success": False,
+                "error": str(e),
+                "requestId": request_id if 'request_id' in locals() else 0
+            }
             response = json.dumps(error_result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(struct.pack('>I', len(response)))
             sys.stdout.buffer.write(response)
             sys.stdout.buffer.flush()
 