Update speech_processor.py

c2
Kar 2025-06-05 12:41:33 +00:00
parent c0c3c7405d
commit b68e390d4c
1 changed file with 100 additions and 79 deletions

View File

@@ -2,120 +2,141 @@
import vosk import vosk
import sys import sys
import json import json
import struct
import numpy as np
from queue import Queue
from threading import Thread
import soundfile as sf
import tempfile import tempfile
import os import os
import wave
import soundfile as sf
# Global model - load once # Global recognizer
model = None
recognizer = None recognizer = None
audio_queue = Queue()
result_queue = Queue()
def initialize_vosk(): def initialize_vosk():
"""Initialize Vosk model""" global recognizer
global model, recognizer model_path = "vosk-model" # Update this path
model_path = "/app/vosk-model"
if not os.path.exists(model_path): if not os.path.exists(model_path):
return {"success": False, "error": "Vosk model not found at /app/vosk-model"} return {"success": False, "error": "Model not found"}
try: try:
vosk.SetLogLevel(-1) # Reduce log verbosity vosk.SetLogLevel(-1)
model = vosk.Model(model_path) model = vosk.Model(model_path)
recognizer = vosk.KaldiRecognizer(model, 16000) recognizer = vosk.KaldiRecognizer(model, 16000)
return {"success": True} return {"success": True}
except Exception as e: except Exception as e:
return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"} return {"success": False, "error": str(e)}
def process_audio_chunk(audio_data): def audio_worker():
"""Process audio data and return transcription"""
global recognizer global recognizer
while True:
audio_data, request_id = audio_queue.get()
if not recognizer:
init_result = initialize_vosk()
if not init_result["success"]:
return init_result
try:
# Write audio data to temporary file
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
temp_file.write(audio_data)
temp_filename = temp_file.name
# Read audio file with soundfile
try: try:
audio_data, sample_rate = sf.read(temp_filename) # Write to temp file and read with soundfile
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
f.write(audio_data)
temp_path = f.name
# Convert to 16-bit PCM at 16kHz if needed try:
if sample_rate != 16000: data, samplerate = sf.read(temp_path, dtype='float32')
# Simple resampling (for better quality, use librosa)
import numpy as np
audio_data = np.interp(
np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
np.arange(len(audio_data)),
audio_data
)
# Convert to bytes # Resample if needed
audio_bytes = (audio_data * 32767).astype('int16').tobytes() if samplerate != 16000:
duration = len(data) / samplerate
data = np.interp(
np.linspace(0, len(data)-1, int(duration * 16000)),
np.arange(len(data)),
data
)
# Process with Vosk # Convert to 16-bit PCM
if recognizer.AcceptWaveform(audio_bytes): data = (data * 32767).astype('int16')
result = json.loads(recognizer.Result())
text = result.get('text', '')
else:
result = json.loads(recognizer.PartialResult())
text = result.get('partial', '')
# Clean up # Process with Vosk
os.unlink(temp_filename) if recognizer.AcceptWaveform(data.tobytes()):
text = json.loads(recognizer.Result()).get('text', '')
is_final = True
else:
text = json.loads(recognizer.PartialResult()).get('partial', '')
is_final = False
return {"success": True, "text": text} result_queue.put(({
"success": True,
"text": text,
"is_final": is_final,
"requestId": request_id
}, request_id))
finally:
os.unlink(temp_path)
except Exception as e: except Exception as e:
os.unlink(temp_filename) result_queue.put(({
return {"success": False, "error": f"Audio processing error: {str(e)}"} "success": False,
"error": str(e),
except Exception as e: "requestId": request_id
return {"success": False, "error": f"General error: {str(e)}"} }, request_id))
def main(): def main():
"""Main loop to process audio chunks from stdin""" # Initialize Vosk
# Initialize Vosk on startup
init_result = initialize_vosk() init_result = initialize_vosk()
if not init_result["success"]: if not init_result["success"]:
error_response = json.dumps(init_result).encode('utf-8') error = json.dumps({
sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big')) "success": False,
sys.stdout.buffer.write(error_response) "error": init_result["error"],
"requestId": 0
}).encode()
sys.stdout.buffer.write(struct.pack('>I', len(error)))
sys.stdout.buffer.write(error)
sys.stdout.buffer.flush() sys.stdout.buffer.flush()
sys.exit(1) return
# Start worker thread
Thread(target=audio_worker, daemon=True).start()
while True: while True:
try: try:
# Read length of incoming data # Read message length (4 bytes)
length_data = sys.stdin.buffer.read(4) length_bytes = sys.stdin.buffer.read(4)
if not length_data: if not length_bytes:
break break
length = struct.unpack('>I', length_bytes)[0]
length = int.from_bytes(length_data, byteorder='big') # Read request ID (4 bytes)
id_bytes = sys.stdin.buffer.read(4)
if not id_bytes:
break
request_id = struct.unpack('>I', id_bytes)[0]
# Read audio data # Read audio data
audio_data = sys.stdin.buffer.read(length) audio_data = sys.stdin.buffer.read(length)
if len(audio_data) != length:
break
# Process audio # Add to processing queue
result = process_audio_chunk(audio_data) audio_queue.put((audio_data, request_id))
# Send result back # Check for results
response = json.dumps(result).encode('utf-8') while not result_queue.empty():
sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big')) result, res_id = result_queue.get()
sys.stdout.buffer.write(response) response = json.dumps(result).encode()
sys.stdout.buffer.flush() sys.stdout.buffer.write(struct.pack('>I', len(response)))
sys.stdout.buffer.write(struct.pack('>I', res_id)))
sys.stdout.buffer.write(response)
sys.stdout.buffer.flush()
except Exception as e: except Exception as e:
error_result = {"success": False, "error": str(e)} error = json.dumps({
response = json.dumps(error_result).encode('utf-8') "success": False,
sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big')) "error": str(e),
sys.stdout.buffer.write(response) "requestId": request_id if 'request_id' in locals() else 0
}).encode()
sys.stdout.buffer.write(struct.pack('>I', len(error)))
sys.stdout.buffer.write(error)
sys.stdout.buffer.flush() sys.stdout.buffer.flush()
if __name__ == "__main__": if __name__ == "__main__":