Update speech_processor.py

2025-06-05 12:41:33 +00:00
parent c0c3c7405d
commit b68e390d4c
1 changed files with 100 additions and 79 deletions
--- a/speech_processor.py
+++ b/speech_processor.py
@@ -2,120 +2,141 @@
 import vosk
 import sys
 import json
 import struct
 import numpy as np
 from queue import Queue
 from threading import Thread
 import soundfile as sf
 import tempfile
 import os
 import wave
 import soundfile as sf
-# Global model - load once
+# Global recognizer
 model = None
 recognizer = None
 audio_queue = Queue()
 result_queue = Queue()
 def initialize_vosk():
-    """Initialize Vosk model"""
+    global recognizer
-    global model, recognizer
+    model_path = "vosk-model"  # Update this path
    model_path = "/app/vosk-model"
    if not os.path.exists(model_path):
-        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
+        return {"success": False, "error": "Model not found"}
    try:
-        vosk.SetLogLevel(-1)  # Reduce log verbosity
+        vosk.SetLogLevel(-1)
        model = vosk.Model(model_path)
        recognizer = vosk.KaldiRecognizer(model, 16000)
        return {"success": True}
    except Exception as e:
-        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
+        return {"success": False, "error": str(e)}
-def process_audio_chunk(audio_data):
+def audio_worker():
    """Process audio data and return transcription"""
    global recognizer
    while True:
        audio_data, request_id = audio_queue.get()
    if not recognizer:
        init_result = initialize_vosk()
        if not init_result["success"]:
            return init_result
    try:
        # Write audio data to temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_file.write(audio_data)
            temp_filename = temp_file.name
        # Read audio file with soundfile
        try:
-            audio_data, sample_rate = sf.read(temp_filename)
+            # Write to temp file and read with soundfile
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_data)
                temp_path = f.name
-            # Convert to 16-bit PCM at 16kHz if needed
+            try:
-            if sample_rate != 16000:
+                data, samplerate = sf.read(temp_path, dtype='float32')
                # Simple resampling (for better quality, use librosa)
                import numpy as np
                audio_data = np.interp(
                    np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
                    np.arange(len(audio_data)),
                    audio_data
                )
-            # Convert to bytes
+                # Resample if needed
-            audio_bytes = (audio_data * 32767).astype('int16').tobytes()
+                if samplerate != 16000:
                    duration = len(data) / samplerate
                    data = np.interp(
                        np.linspace(0, len(data)-1, int(duration * 16000)),
                        np.arange(len(data)),
                        data
                    )
-            # Process with Vosk
+                # Convert to 16-bit PCM
-            if recognizer.AcceptWaveform(audio_bytes):
+                data = (data * 32767).astype('int16')
                result = json.loads(recognizer.Result())
                text = result.get('text', '')
            else:
                result = json.loads(recognizer.PartialResult())
                text = result.get('partial', '')
-            # Clean up
+                # Process with Vosk
-            os.unlink(temp_filename)
+                if recognizer.AcceptWaveform(data.tobytes()):
                    text = json.loads(recognizer.Result()).get('text', '')
                    is_final = True
                else:
                    text = json.loads(recognizer.PartialResult()).get('partial', '')
                    is_final = False
-            return {"success": True, "text": text}
+                result_queue.put(({
                    "success": True,
                    "text": text,
                    "is_final": is_final,
                    "requestId": request_id
                }, request_id))
            finally:
                os.unlink(temp_path)
        except Exception as e:
-            os.unlink(temp_filename)
+            result_queue.put(({
-            return {"success": False, "error": f"Audio processing error: {str(e)}"}
+                "success": False,
-        
+                "error": str(e),
-    except Exception as e:
+                "requestId": request_id
-        return {"success": False, "error": f"General error: {str(e)}"}
+            }, request_id))
 def main():
-    """Main loop to process audio chunks from stdin"""
+    # Initialize Vosk
    # Initialize Vosk on startup
    init_result = initialize_vosk()
    if not init_result["success"]:
-        error_response = json.dumps(init_result).encode('utf-8')
+        error = json.dumps({
-        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
+            "success": False,
-        sys.stdout.buffer.write(error_response)
+            "error": init_result["error"],
            "requestId": 0
        }).encode()
        sys.stdout.buffer.write(struct.pack('>I', len(error)))
        sys.stdout.buffer.write(error)
        sys.stdout.buffer.flush()
-        sys.exit(1)
+        return
    # Start worker thread
    Thread(target=audio_worker, daemon=True).start()
    while True:
        try:
-            # Read length of incoming data
+            # Read message length (4 bytes)
-            length_data = sys.stdin.buffer.read(4)
+            length_bytes = sys.stdin.buffer.read(4)
-            if not length_data:
+            if not length_bytes:
                break
            length = struct.unpack('>I', length_bytes)[0]
-            length = int.from_bytes(length_data, byteorder='big')
+            # Read request ID (4 bytes)
            id_bytes = sys.stdin.buffer.read(4)
            if not id_bytes:
                break
            request_id = struct.unpack('>I', id_bytes)[0]
            # Read audio data
            audio_data = sys.stdin.buffer.read(length)
            if len(audio_data) != length:
                break
-            # Process audio
+            # Add to processing queue
-            result = process_audio_chunk(audio_data)
+            audio_queue.put((audio_data, request_id))
-            # Send result back
+            # Check for results
-            response = json.dumps(result).encode('utf-8')
+            while not result_queue.empty():
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+                result, res_id = result_queue.get()
-            sys.stdout.buffer.write(response)
+                response = json.dumps(result).encode()
-            sys.stdout.buffer.flush()
+                sys.stdout.buffer.write(struct.pack('>I', len(response)))
                sys.stdout.buffer.write(struct.pack('>I', res_id)))
                sys.stdout.buffer.write(response)
                sys.stdout.buffer.flush()
        except Exception as e:
-            error_result = {"success": False, "error": str(e)}
+            error = json.dumps({
-            response = json.dumps(error_result).encode('utf-8')
+                "success": False,
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+                "error": str(e),
-            sys.stdout.buffer.write(response)
+                "requestId": request_id if 'request_id' in locals() else 0
            }).encode()
            sys.stdout.buffer.write(struct.pack('>I', len(error)))
            sys.stdout.buffer.write(error)
            sys.stdout.buffer.flush()
 if __name__ == "__main__":