diff --git a/speech_processor.py b/speech_processor.py
index ab589da..a2456d5 100644
--- a/speech_processor.py
+++ b/speech_processor.py
@@ -4,20 +4,18 @@
 import sys
 import json
 import struct
 import numpy as np
-from queue import Queue
-from threading import Thread
 import soundfile as sf
 import tempfile
 import os
+from threading import Lock
 
-# Global recognizer
+# Global recognizer with thread lock
 recognizer = None
-audio_queue = Queue()
-result_queue = Queue()
+recognizer_lock = Lock()
 
 def initialize_vosk():
     global recognizer
-    model_path = "vosk-model"  # Update this path
+    model_path = "vosk-model"  # Update this path to your model
     if not os.path.exists(model_path):
         return {"success": False, "error": "Model not found"}
@@ -30,56 +28,59 @@ def initialize_vosk():
     except Exception as e:
         return {"success": False, "error": str(e)}
 
-def audio_worker():
+def process_audio(audio_data, request_id):
     global recognizer
-    while True:
-        audio_data, request_id = audio_queue.get()
+
+    if not recognizer:
+        init_result = initialize_vosk()
+        if not init_result["success"]:
+            return {**init_result, "requestId": request_id}
+
+    try:
+        # Write to temp file
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+            f.write(audio_data)
+            temp_path = f.name
 
         try:
-            # Write to temp file and read with soundfile
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
-                f.write(audio_data)
-                temp_path = f.name
+            # Read with soundfile
+            data, samplerate = sf.read(temp_path, dtype='float32')
 
-            try:
-                data, samplerate = sf.read(temp_path, dtype='float32')
-
-                # Resample if needed
-                if samplerate != 16000:
-                    duration = len(data) / samplerate
-                    data = np.interp(
-                        np.linspace(0, len(data)-1, int(duration * 16000)),
-                        np.arange(len(data)),
-                        data
-                    )
-
-                # Convert to 16-bit PCM
-                data = (data * 32767).astype('int16')
-
-                # Process with Vosk
+            # Convert to 16kHz if needed
+            if samplerate != 16000:
+                duration = len(data) / samplerate
+                data = np.interp(
+                    np.linspace(0, len(data)-1, int(duration * 16000)),
+                    np.arange(len(data)),
+                    data
+                )
+
+            # Convert to 16-bit PCM
+            data = (data * 32767).astype('int16')
+
+            # Process with thread-safe recognizer
+            with recognizer_lock:
                 if recognizer.AcceptWaveform(data.tobytes()):
                     text = json.loads(recognizer.Result()).get('text', '')
                     is_final = True
                 else:
                     text = json.loads(recognizer.PartialResult()).get('partial', '')
                     is_final = False
-
-                result_queue.put(({
-                    "success": True,
-                    "text": text,
-                    "is_final": is_final,
-                    "requestId": request_id
-                }, request_id))
-
-            finally:
-                os.unlink(temp_path)
-
-        except Exception as e:
-            result_queue.put(({
-                "success": False,
-                "error": str(e),
+
+            return {
+                "success": True,
+                "text": text,
+                "is_final": is_final,
                 "requestId": request_id
-            }, request_id))
+            }
+        finally:
+            os.unlink(temp_path)
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "requestId": request_id
+        }
 
 def main():
     # Initialize Vosk
@@ -95,9 +96,6 @@ def main():
         sys.stdout.buffer.flush()
         return
 
-    # Start worker thread
-    Thread(target=audio_worker, daemon=True).start()
-
     while True:
         try:
             # Read message length (4 bytes)
@@ -117,18 +115,14 @@
             if len(audio_data) != length:
                 break
 
-            # Add to processing queue
-            audio_queue.put((audio_data, request_id))
+            # Process and send response
+            result = process_audio(audio_data, request_id)
+            response = json.dumps(result).encode()
+            sys.stdout.buffer.write(struct.pack('>I', len(response)))
+            sys.stdout.buffer.write(struct.pack('>I', request_id))
+            sys.stdout.buffer.write(response)
+            sys.stdout.buffer.flush()
 
-            # Check for results
-            while not result_queue.empty():
-                result, res_id = result_queue.get()
-                response = json.dumps(result).encode()
-                sys.stdout.buffer.write(struct.pack('>I', len(response)))
-                sys.stdout.buffer.write(struct.pack('>I', res_id)))
-                sys.stdout.buffer.write(response)
-                sys.stdout.buffer.flush()
-
        except Exception as e:
            error = json.dumps({
                "success": False,