From 1015c8a2faade115ec6c20c6a80c53951ff7e0a4 Mon Sep 17 00:00:00 2001
From: Kar <kar@siliconpin.com>
Date: Thu, 5 Jun 2025 10:41:50 +0000
Subject: [PATCH] Update speech_processor.py: add request IDs and is_final to responses

---
 speech_processor.py | 81 +++++++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/speech_processor.py b/speech_processor.py
index 7c95c23..f07d505 100644
--- a/speech_processor.py
+++ b/speech_processor.py
@@ -4,8 +4,9 @@ import sys
 import json
 import tempfile
 import os
-import wave
 import soundfile as sf
+import numpy as np
+import struct
 
 # Global model - load once
 model = None
@@ -15,9 +16,9 @@ def initialize_vosk():
     """Initialize Vosk model"""
     global model, recognizer
     
-    model_path = "/app/vosk-model"
+    model_path = "/app/vosk-model"  # Update this path to your model location
     if not os.path.exists(model_path):
-        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
+        return {"success": False, "error": f"Vosk model not found at {model_path}"}
     
     try:
         vosk.SetLogLevel(-1)  # Reduce log verbosity
@@ -27,14 +28,14 @@ def initialize_vosk():
     except Exception as e:
         return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
 
-def process_audio_chunk(audio_data):
+def process_audio_chunk(audio_data, request_id):
     """Process audio data and return transcription"""
     global recognizer
     
     if not recognizer:
         init_result = initialize_vosk()
         if not init_result["success"]:
-            return init_result
+            return {**init_result, "requestId": request_id}
     
     try:
         # Write audio data to temporary file
@@ -44,77 +45,107 @@ def process_audio_chunk(audio_data):
         
         # Read audio file with soundfile
         try:
-            audio_data, sample_rate = sf.read(temp_filename)
+            audio_data, sample_rate = sf.read(temp_filename, dtype='float32')
             
-            # Convert to 16-bit PCM at 16kHz if needed
+            # Convert to 16kHz if needed
             if sample_rate != 16000:
-                # Simple resampling (for better quality, use librosa)
-                import numpy as np
+                # Simple resampling (for production, use proper resampling)
+                num_samples = int(len(audio_data) * 16000 / sample_rate)
                 audio_data = np.interp(
-                    np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
+                    np.linspace(0, len(audio_data)-1, num_samples),
                     np.arange(len(audio_data)),
                     audio_data
                 )
+                sample_rate = 16000
             
-            # Convert to bytes
-            audio_bytes = (audio_data * 32767).astype('int16').tobytes()
+            # Convert to 16-bit PCM
+            audio_data = (audio_data * 32767).astype('int16')
             
             # Process with Vosk
-            if recognizer.AcceptWaveform(audio_bytes):
+            if recognizer.AcceptWaveform(audio_data.tobytes()):
                 result = json.loads(recognizer.Result())
                 text = result.get('text', '')
+                is_final = True
             else:
                 result = json.loads(recognizer.PartialResult())
                 text = result.get('partial', '')
+                is_final = False
             
             # Clean up
             os.unlink(temp_filename)
             
-            return {"success": True, "text": text}
+            return {
+                "success": True,
+                "text": text,
+                "is_final": is_final,
+                "requestId": request_id
+            }
             
         except Exception as e:
             os.unlink(temp_filename)
-            return {"success": False, "error": f"Audio processing error: {str(e)}"}
+            return {
+                "success": False,
+                "error": f"Audio processing error: {str(e)}",
+                "requestId": request_id
+            }
         
     except Exception as e:
-        return {"success": False, "error": f"General error: {str(e)}"}
+        return {
+            "success": False,
+            "error": f"General error: {str(e)}",
+            "requestId": request_id
+        }
 
 def main():
     """Main loop to process audio chunks from stdin"""
     # Initialize Vosk on startup
     init_result = initialize_vosk()
     if not init_result["success"]:
-        error_response = json.dumps(init_result).encode('utf-8')
-        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
+        error_response = json.dumps({
+            **init_result,
+            "requestId": 0
+        }).encode('utf-8')
+        sys.stdout.buffer.write(struct.pack('>I', len(error_response)))
         sys.stdout.buffer.write(error_response)
         sys.stdout.buffer.flush()
         sys.exit(1)
     
     while True:
         try:
-            # Read length of incoming data
+            request_id = 0  # no ID parsed yet for this frame; next: 4-byte length prefix
             length_data = sys.stdin.buffer.read(4)
             if not length_data:
                 break
-                
-            length = int.from_bytes(length_data, byteorder='big')
+            length = struct.unpack('>I', length_data)[0]
+            
+            # Read request ID (4 bytes, big-endian unsigned)
+            id_data = sys.stdin.buffer.read(4)
+            if len(id_data) < 4:  # EOF or truncated ID: stop instead of raising struct.error
+                break
+            request_id = struct.unpack('>I', id_data)[0]
             
             # Read audio data
             audio_data = sys.stdin.buffer.read(length)
+            if len(audio_data) != length:  # stream ended mid-frame
+                break
             
             # Process audio
-            result = process_audio_chunk(audio_data)
+            result = process_audio_chunk(audio_data, request_id)
             
             # Send result back
             response = json.dumps(result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(struct.pack('>I', len(response)))
             sys.stdout.buffer.write(response)
             sys.stdout.buffer.flush()
             
         except Exception as e:
-            error_result = {"success": False, "error": str(e)}
+            error_result = {
+                "success": False,
+                "error": str(e),
+                "requestId": request_id  # 0 unless this frame's ID was parsed
+            }
             response = json.dumps(error_result).encode('utf-8')
-            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(struct.pack('>I', len(response)))
             sys.stdout.buffer.write(response)
             sys.stdout.buffer.flush()