Update speech_processor.py

c2
Kar 2025-06-05 12:41:33 +00:00
parent c0c3c7405d
commit b68e390d4c
1 changed file with 100 additions and 79 deletions

speech_processor.py

import vosk
import sys
import json
import struct
import numpy as np
from queue import Queue
from threading import Thread
import soundfile as sf
import tempfile
import os

# Global recognizer, shared with the worker thread
recognizer = None
audio_queue = Queue()
result_queue = Queue()


def initialize_vosk():
    """Initialize Vosk model"""
    global recognizer
    model_path = "/app/vosk-model"
    if not os.path.exists(model_path):
        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
    try:
        vosk.SetLogLevel(-1)  # Reduce log verbosity
        model = vosk.Model(model_path)
        recognizer = vosk.KaldiRecognizer(model, 16000)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}


def audio_worker():
    """Consume audio chunks from the queue and push transcription results"""
    global recognizer
    while True:
        audio_data, request_id = audio_queue.get()
        try:
            # Write to temp file and read with soundfile
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_data)
                temp_path = f.name
            try:
                data, samplerate = sf.read(temp_path, dtype='float32')

                # Resample to 16 kHz if needed (simple linear interpolation)
                if samplerate != 16000:
                    duration = len(data) / samplerate
                    data = np.interp(
                        np.linspace(0, len(data) - 1, int(duration * 16000)),
                        np.arange(len(data)),
                        data
                    )

                # Convert to 16-bit PCM
                data = (data * 32767).astype('int16')

                # Process with Vosk
                if recognizer.AcceptWaveform(data.tobytes()):
                    text = json.loads(recognizer.Result()).get('text', '')
                    is_final = True
                else:
                    text = json.loads(recognizer.PartialResult()).get('partial', '')
                    is_final = False

                result_queue.put(({
                    "success": True,
                    "text": text,
                    "is_final": is_final,
                    "requestId": request_id
                }, request_id))
            finally:
                os.unlink(temp_path)
        except Exception as e:
            result_queue.put(({
                "success": False,
                "error": str(e),
                "requestId": request_id
            }, request_id))


def main():
    """Main loop to process audio chunks from stdin"""
    # Initialize Vosk
    init_result = initialize_vosk()
    if not init_result["success"]:
        error = json.dumps({
            "success": False,
            "error": init_result["error"],
            "requestId": 0
        }).encode()
        sys.stdout.buffer.write(struct.pack('>I', len(error)))
        sys.stdout.buffer.write(error)
        sys.stdout.buffer.flush()
        return

    # Start worker thread
    Thread(target=audio_worker, daemon=True).start()

    while True:
        try:
            # Read message length (4 bytes)
            length_bytes = sys.stdin.buffer.read(4)
            if not length_bytes:
                break
            length = struct.unpack('>I', length_bytes)[0]

            # Read request ID (4 bytes)
            id_bytes = sys.stdin.buffer.read(4)
            if not id_bytes:
                break
            request_id = struct.unpack('>I', id_bytes)[0]

            # Read audio data
            audio_data = sys.stdin.buffer.read(length)
            if len(audio_data) != length:
                break

            # Add to processing queue
            audio_queue.put((audio_data, request_id))

            # Check for results
            while not result_queue.empty():
                result, res_id = result_queue.get()
                response = json.dumps(result).encode()
                sys.stdout.buffer.write(struct.pack('>I', len(response)))
                sys.stdout.buffer.write(struct.pack('>I', res_id))
                sys.stdout.buffer.write(response)
                sys.stdout.buffer.flush()
        except Exception as e:
            error = json.dumps({
                "success": False,
                "error": str(e),
                "requestId": request_id if 'request_id' in locals() else 0
            }).encode()
            sys.stdout.buffer.write(struct.pack('>I', len(error)))
            sys.stdout.buffer.write(error)
            sys.stdout.buffer.flush()


if __name__ == "__main__":
    main()
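
The updated script speaks a simple length-prefixed protocol over stdin/stdout: each request is a 4-byte big-endian payload length, a 4-byte request ID, and the WAV bytes; each transcription result comes back as a 4-byte length, a 4-byte request ID, and a JSON payload. The sketch below shows how a parent process might drive it; it is not part of this commit, and the interpreter name, script path, file name "sample.wav", and the helper names send_chunk/read_result are illustrative assumptions.

import json
import struct
import subprocess

# Hypothetical parent process driving speech_processor.py over pipes.
# "python3", the script path, and "sample.wav" are placeholders.
proc = subprocess.Popen(
    ["python3", "speech_processor.py"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
)

def send_chunk(wav_bytes, request_id):
    # Request frame: 4-byte big-endian length, 4-byte request ID, WAV payload
    proc.stdin.write(struct.pack(">I", len(wav_bytes)))
    proc.stdin.write(struct.pack(">I", request_id))
    proc.stdin.write(wav_bytes)
    proc.stdin.flush()

def read_result():
    # Result frame: 4-byte big-endian length, 4-byte request ID, JSON payload
    length = struct.unpack(">I", proc.stdout.read(4))[0]
    request_id = struct.unpack(">I", proc.stdout.read(4))[0]
    return request_id, json.loads(proc.stdout.read(length))

with open("sample.wav", "rb") as f:
    send_chunk(f.read(), request_id=1)
# Results are only flushed when a later frame arrives, so read_result()
# may block until more audio has been sent.

Two caveats follow from the script as written: results are drained from result_queue only after the next incoming frame is read, so a single request may not see its reply until more audio is sent; and the error frames written directly from main() (initialization failure and the catch-all except) omit the separate 4-byte request-ID field, leaving the request ID only inside the JSON payload, so a client has to treat those frames differently.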