Update speech_processor.py

c2
Kar 2025-06-05 12:41:33 +00:00
parent c0c3c7405d
commit b68e390d4c
1 changed file with 100 additions and 79 deletions

speech_processor.py

import vosk
import sys
import json
import struct
import numpy as np
from queue import Queue
from threading import Thread
import soundfile as sf
import tempfile
import os

# Global recognizer, shared with the worker thread
recognizer = None
audio_queue = Queue()
result_queue = Queue()


def initialize_vosk():
    """Initialize Vosk model"""
    global recognizer
    model_path = "/app/vosk-model"
    if not os.path.exists(model_path):
        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
    try:
        vosk.SetLogLevel(-1)  # Reduce log verbosity
        model = vosk.Model(model_path)
        recognizer = vosk.KaldiRecognizer(model, 16000)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}


def audio_worker():
    """Consume audio chunks from the queue and push transcription results"""
    global recognizer
    while True:
        audio_data, request_id = audio_queue.get()
        try:
            # Write to temp file and read with soundfile
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_data)
                temp_path = f.name
            try:
                data, samplerate = sf.read(temp_path, dtype='float32')

                # Resample to 16 kHz if needed (simple linear interpolation)
                if samplerate != 16000:
                    duration = len(data) / samplerate
                    data = np.interp(
                        np.linspace(0, len(data) - 1, int(duration * 16000)),
                        np.arange(len(data)),
                        data
                    )

                # Convert to 16-bit PCM
                data = (data * 32767).astype('int16')

                # Process with Vosk
                if recognizer.AcceptWaveform(data.tobytes()):
                    text = json.loads(recognizer.Result()).get('text', '')
                    is_final = True
                else:
                    text = json.loads(recognizer.PartialResult()).get('partial', '')
                    is_final = False

                result_queue.put(({
                    "success": True,
                    "text": text,
                    "is_final": is_final,
                    "requestId": request_id
                }, request_id))
            finally:
                os.unlink(temp_path)
        except Exception as e:
            result_queue.put(({
                "success": False,
                "error": str(e),
                "requestId": request_id
            }, request_id))


def main():
    """Main loop to process audio chunks from stdin"""
    # Initialize Vosk
    init_result = initialize_vosk()
    if not init_result["success"]:
        error = json.dumps({
            "success": False,
            "error": init_result["error"],
            "requestId": 0
        }).encode()
        sys.stdout.buffer.write(struct.pack('>I', len(error)))
        sys.stdout.buffer.write(error)
        sys.stdout.buffer.flush()
        return

    # Start worker thread
    Thread(target=audio_worker, daemon=True).start()

    while True:
        try:
            # Read message length (4 bytes)
            length_bytes = sys.stdin.buffer.read(4)
            if not length_bytes:
                break
            length = struct.unpack('>I', length_bytes)[0]

            # Read request ID (4 bytes)
            id_bytes = sys.stdin.buffer.read(4)
            if not id_bytes:
                break
            request_id = struct.unpack('>I', id_bytes)[0]

            # Read audio data
            audio_data = sys.stdin.buffer.read(length)
            if len(audio_data) != length:
                break

            # Add to processing queue
            audio_queue.put((audio_data, request_id))

            # Check for results
            while not result_queue.empty():
                result, res_id = result_queue.get()
                response = json.dumps(result).encode()
                sys.stdout.buffer.write(struct.pack('>I', len(response)))
                sys.stdout.buffer.write(struct.pack('>I', res_id))
                sys.stdout.buffer.write(response)
                sys.stdout.buffer.flush()
        except Exception as e:
            error = json.dumps({
                "success": False,
                "error": str(e),
                "requestId": request_id if 'request_id' in locals() else 0
            }).encode()
            sys.stdout.buffer.write(struct.pack('>I', len(error)))
            sys.stdout.buffer.write(error)
            sys.stdout.buffer.flush()


if __name__ == "__main__":
    main()
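
The updated script speaks a simple length-prefixed protocol over stdin/stdout: each request is a 4-byte big-endian payload length, a 4-byte request ID, and the WAV bytes; each transcription result comes back as a 4-byte length, a 4-byte request ID, and a JSON payload. The sketch below shows how a parent process might drive it; it is not part of this commit, and the interpreter name, script path, file name "sample.wav", and the helper names send_chunk/read_result are illustrative assumptions.

import json
import struct
import subprocess

# Hypothetical parent process driving speech_processor.py over pipes.
# "python3", the script path, and "sample.wav" are placeholders.
proc = subprocess.Popen(
    ["python3", "speech_processor.py"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
)

def send_chunk(wav_bytes, request_id):
    # Request frame: 4-byte big-endian length, 4-byte request ID, WAV payload
    proc.stdin.write(struct.pack(">I", len(wav_bytes)))
    proc.stdin.write(struct.pack(">I", request_id))
    proc.stdin.write(wav_bytes)
    proc.stdin.flush()

def read_result():
    # Result frame: 4-byte big-endian length, 4-byte request ID, JSON payload
    length = struct.unpack(">I", proc.stdout.read(4))[0]
    request_id = struct.unpack(">I", proc.stdout.read(4))[0]
    return request_id, json.loads(proc.stdout.read(length))

with open("sample.wav", "rb") as f:
    send_chunk(f.read(), request_id=1)
# Results are only flushed when a later frame arrives, so read_result()
# may block until more audio has been sent.

Two caveats follow from the script as written: results are drained from result_queue only after the next incoming frame is read, so a single request may not see its reply until more audio is sent; and the error frames written directly from main() (initialization failure and the catch-all except) omit the separate 4-byte request-ID field, leaving the request ID only inside the JSON payload, so a client has to treat those frames differently.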