Update speech_processor.py

c1
Kar 2025-06-05 10:41:50 +00:00
parent 878411e55b
commit 1015c8a2fa
1 changed file with 56 additions and 25 deletions

View File

@ -4,8 +4,9 @@ import sys
import json import json
import tempfile import tempfile
import os import os
import wave
import soundfile as sf import soundfile as sf
import numpy as np
import struct
# Global model - load once # Global model - load once
model = None model = None
@ -15,9 +16,9 @@ def initialize_vosk():
"""Initialize Vosk model""" """Initialize Vosk model"""
global model, recognizer global model, recognizer
model_path = "/app/vosk-model" model_path = "/app/vosk-model" # Update this path to your model location
if not os.path.exists(model_path): if not os.path.exists(model_path):
return {"success": False, "error": "Vosk model not found at /app/vosk-model"} return {"success": False, "error": f"Vosk model not found at {model_path}"}
try: try:
vosk.SetLogLevel(-1) # Reduce log verbosity vosk.SetLogLevel(-1) # Reduce log verbosity
@ -27,14 +28,14 @@ def initialize_vosk():
except Exception as e: except Exception as e:
return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"} return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
def process_audio_chunk(audio_data): def process_audio_chunk(audio_data, request_id):
"""Process audio data and return transcription""" """Process audio data and return transcription"""
global recognizer global recognizer
if not recognizer: if not recognizer:
init_result = initialize_vosk() init_result = initialize_vosk()
if not init_result["success"]: if not init_result["success"]:
return init_result return {**init_result, "requestId": request_id}
try: try:
# Write audio data to temporary file # Write audio data to temporary file
@ -44,77 +45,107 @@ def process_audio_chunk(audio_data):
# Read audio file with soundfile # Read audio file with soundfile
try: try:
audio_data, sample_rate = sf.read(temp_filename) audio_data, sample_rate = sf.read(temp_filename, dtype='float32')
# Convert to 16-bit PCM at 16kHz if needed # Convert to 16kHz if needed
if sample_rate != 16000: if sample_rate != 16000:
# Simple resampling (for better quality, use librosa) # Simple resampling (for production, use proper resampling)
import numpy as np num_samples = int(len(audio_data) * 16000 / sample_rate)
audio_data = np.interp( audio_data = np.interp(
np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)), np.linspace(0, len(audio_data)-1, num_samples),
np.arange(len(audio_data)), np.arange(len(audio_data)),
audio_data audio_data
) )
sample_rate = 16000
# Convert to bytes # Convert to 16-bit PCM
audio_bytes = (audio_data * 32767).astype('int16').tobytes() audio_data = (audio_data * 32767).astype('int16')
# Process with Vosk # Process with Vosk
if recognizer.AcceptWaveform(audio_bytes): if recognizer.AcceptWaveform(audio_data.tobytes()):
result = json.loads(recognizer.Result()) result = json.loads(recognizer.Result())
text = result.get('text', '') text = result.get('text', '')
is_final = True
else: else:
result = json.loads(recognizer.PartialResult()) result = json.loads(recognizer.PartialResult())
text = result.get('partial', '') text = result.get('partial', '')
is_final = False
# Clean up # Clean up
os.unlink(temp_filename) os.unlink(temp_filename)
return {"success": True, "text": text} return {
"success": True,
"text": text,
"is_final": is_final,
"requestId": request_id
}
except Exception as e: except Exception as e:
os.unlink(temp_filename) os.unlink(temp_filename)
return {"success": False, "error": f"Audio processing error: {str(e)}"} return {
"success": False,
"error": f"Audio processing error: {str(e)}",
"requestId": request_id
}
except Exception as e: except Exception as e:
return {"success": False, "error": f"General error: {str(e)}"} return {
"success": False,
"error": f"General error: {str(e)}",
"requestId": request_id
}
def _read_exact(stream, size):
    """Read exactly *size* bytes from *stream*; return None on EOF or truncation."""
    data = stream.read(size)
    if len(data) != size:
        return None
    return data


def _write_frame(payload):
    """Write *payload* to stdout as UTF-8 JSON behind a 4-byte big-endian length."""
    body = json.dumps(payload).encode('utf-8')
    out = sys.stdout.buffer
    out.write(struct.pack('>I', len(body)))
    out.write(body)
    out.flush()


def main():
    """Main loop to process audio chunks from stdin.

    Request frame (read from stdin):
        4 bytes  big-endian unsigned payload length
        4 bytes  big-endian unsigned request ID
        N bytes  audio payload (N = payload length)

    Response frame (written to stdout):
        4 bytes  big-endian unsigned length of the JSON body
        N bytes  UTF-8 JSON body, always carrying a "requestId" key

    If Vosk initialization fails, a single error frame with requestId 0 is
    written and the process exits with status 1. The loop ends when stdin
    reaches EOF or a frame is truncated.
    """
    # Initialize Vosk on startup
    init_result = initialize_vosk()
    if not init_result["success"]:
        _write_frame({**init_result, "requestId": 0})
        sys.exit(1)

    stdin = sys.stdin.buffer
    while True:
        # Reset per frame so an error raised before the ID is parsed is not
        # attributed to the previous request's ID.
        request_id = 0
        try:
            # Header: payload length followed by request ID (4 bytes each).
            # A short read means the peer closed the pipe mid-frame; stop
            # instead of letting struct.unpack raise on a partial header.
            header = _read_exact(stdin, 8)
            if header is None:
                break
            length, request_id = struct.unpack('>II', header)

            # Read audio data
            audio_data = _read_exact(stdin, length)
            if audio_data is None:
                break

            # Process audio and send result back
            _write_frame(process_audio_chunk(audio_data, request_id))

        except Exception as e:
            # Top-level boundary: report the failure to the caller and keep
            # serving subsequent frames.
            _write_frame({
                "success": False,
                "error": str(e),
                "requestId": request_id,
            })