Update speech_processor.py
parent
878411e55b
commit
1015c8a2fa
|
@ -4,8 +4,9 @@ import sys
|
||||||
import json
|
import json
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
import wave
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
import numpy as np
|
||||||
|
import struct
|
||||||
|
|
||||||
# Global model - load once
|
# Global model - load once
|
||||||
model = None
|
model = None
|
||||||
|
@ -15,9 +16,9 @@ def initialize_vosk():
|
||||||
"""Initialize Vosk model"""
|
"""Initialize Vosk model"""
|
||||||
global model, recognizer
|
global model, recognizer
|
||||||
|
|
||||||
model_path = "/app/vosk-model"
|
model_path = "/app/vosk-model" # Update this path to your model location
|
||||||
if not os.path.exists(model_path):
|
if not os.path.exists(model_path):
|
||||||
return {"success": False, "error": "Vosk model not found at /app/vosk-model"}
|
return {"success": False, "error": f"Vosk model not found at {model_path}"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
vosk.SetLogLevel(-1) # Reduce log verbosity
|
vosk.SetLogLevel(-1) # Reduce log verbosity
|
||||||
|
@ -27,14 +28,14 @@ def initialize_vosk():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
|
return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
|
||||||
|
|
||||||
def process_audio_chunk(audio_data):
|
def process_audio_chunk(audio_data, request_id):
|
||||||
"""Process audio data and return transcription"""
|
"""Process audio data and return transcription"""
|
||||||
global recognizer
|
global recognizer
|
||||||
|
|
||||||
if not recognizer:
|
if not recognizer:
|
||||||
init_result = initialize_vosk()
|
init_result = initialize_vosk()
|
||||||
if not init_result["success"]:
|
if not init_result["success"]:
|
||||||
return init_result
|
return {**init_result, "requestId": request_id}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Write audio data to temporary file
|
# Write audio data to temporary file
|
||||||
|
@ -44,77 +45,107 @@ def process_audio_chunk(audio_data):
|
||||||
|
|
||||||
# Read audio file with soundfile
|
# Read audio file with soundfile
|
||||||
try:
|
try:
|
||||||
audio_data, sample_rate = sf.read(temp_filename)
|
audio_data, sample_rate = sf.read(temp_filename, dtype='float32')
|
||||||
|
|
||||||
# Convert to 16-bit PCM at 16kHz if needed
|
# Convert to 16kHz if needed
|
||||||
if sample_rate != 16000:
|
if sample_rate != 16000:
|
||||||
# Simple resampling (for better quality, use librosa)
|
# Simple resampling (for production, use proper resampling)
|
||||||
import numpy as np
|
num_samples = int(len(audio_data) * 16000 / sample_rate)
|
||||||
audio_data = np.interp(
|
audio_data = np.interp(
|
||||||
np.linspace(0, len(audio_data), int(len(audio_data) * 16000 / sample_rate)),
|
np.linspace(0, len(audio_data)-1, num_samples),
|
||||||
np.arange(len(audio_data)),
|
np.arange(len(audio_data)),
|
||||||
audio_data
|
audio_data
|
||||||
)
|
)
|
||||||
|
sample_rate = 16000
|
||||||
|
|
||||||
# Convert to bytes
|
# Convert to 16-bit PCM
|
||||||
audio_bytes = (audio_data * 32767).astype('int16').tobytes()
|
audio_data = (audio_data * 32767).astype('int16')
|
||||||
|
|
||||||
# Process with Vosk
|
# Process with Vosk
|
||||||
if recognizer.AcceptWaveform(audio_bytes):
|
if recognizer.AcceptWaveform(audio_data.tobytes()):
|
||||||
result = json.loads(recognizer.Result())
|
result = json.loads(recognizer.Result())
|
||||||
text = result.get('text', '')
|
text = result.get('text', '')
|
||||||
|
is_final = True
|
||||||
else:
|
else:
|
||||||
result = json.loads(recognizer.PartialResult())
|
result = json.loads(recognizer.PartialResult())
|
||||||
text = result.get('partial', '')
|
text = result.get('partial', '')
|
||||||
|
is_final = False
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
os.unlink(temp_filename)
|
os.unlink(temp_filename)
|
||||||
|
|
||||||
return {"success": True, "text": text}
|
return {
|
||||||
|
"success": True,
|
||||||
|
"text": text,
|
||||||
|
"is_final": is_final,
|
||||||
|
"requestId": request_id
|
||||||
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
os.unlink(temp_filename)
|
os.unlink(temp_filename)
|
||||||
return {"success": False, "error": f"Audio processing error: {str(e)}"}
|
return {
|
||||||
|
"success": False,
|
||||||
|
"error": f"Audio processing error: {str(e)}",
|
||||||
|
"requestId": request_id
|
||||||
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"success": False, "error": f"General error: {str(e)}"}
|
return {
|
||||||
|
"success": False,
|
||||||
|
"error": f"General error: {str(e)}",
|
||||||
|
"requestId": request_id
|
||||||
|
}
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main loop to process audio chunks from stdin"""
|
"""Main loop to process audio chunks from stdin"""
|
||||||
# Initialize Vosk on startup
|
# Initialize Vosk on startup
|
||||||
init_result = initialize_vosk()
|
init_result = initialize_vosk()
|
||||||
if not init_result["success"]:
|
if not init_result["success"]:
|
||||||
error_response = json.dumps(init_result).encode('utf-8')
|
error_response = json.dumps({
|
||||||
sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
|
**init_result,
|
||||||
|
"requestId": 0
|
||||||
|
}).encode('utf-8')
|
||||||
|
sys.stdout.buffer.write(struct.pack('>I', len(error_response)))
|
||||||
sys.stdout.buffer.write(error_response)
|
sys.stdout.buffer.write(error_response)
|
||||||
sys.stdout.buffer.flush()
|
sys.stdout.buffer.flush()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
# Read length of incoming data
|
# Read length of incoming data (4 bytes)
|
||||||
length_data = sys.stdin.buffer.read(4)
|
length_data = sys.stdin.buffer.read(4)
|
||||||
if not length_data:
|
if not length_data:
|
||||||
break
|
break
|
||||||
|
length = struct.unpack('>I', length_data)[0]
|
||||||
length = int.from_bytes(length_data, byteorder='big')
|
|
||||||
|
# Read request ID (4 bytes)
|
||||||
|
id_data = sys.stdin.buffer.read(4)
|
||||||
|
if not id_data:
|
||||||
|
break
|
||||||
|
request_id = struct.unpack('>I', id_data)[0]
|
||||||
|
|
||||||
# Read audio data
|
# Read audio data
|
||||||
audio_data = sys.stdin.buffer.read(length)
|
audio_data = sys.stdin.buffer.read(length)
|
||||||
|
if len(audio_data) != length:
|
||||||
|
break
|
||||||
|
|
||||||
# Process audio
|
# Process audio
|
||||||
result = process_audio_chunk(audio_data)
|
result = process_audio_chunk(audio_data, request_id)
|
||||||
|
|
||||||
# Send result back
|
# Send result back
|
||||||
response = json.dumps(result).encode('utf-8')
|
response = json.dumps(result).encode('utf-8')
|
||||||
sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
|
sys.stdout.buffer.write(struct.pack('>I', len(response)))
|
||||||
sys.stdout.buffer.write(response)
|
sys.stdout.buffer.write(response)
|
||||||
sys.stdout.buffer.flush()
|
sys.stdout.buffer.flush()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_result = {"success": False, "error": str(e)}
|
error_result = {
|
||||||
|
"success": False,
|
||||||
|
"error": str(e),
|
||||||
|
"requestId": request_id if 'request_id' in locals() else 0
|
||||||
|
}
|
||||||
response = json.dumps(error_result).encode('utf-8')
|
response = json.dumps(error_result).encode('utf-8')
|
||||||
sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
|
sys.stdout.buffer.write(struct.pack('>I', len(response)))
|
||||||
sys.stdout.buffer.write(response)
|
sys.stdout.buffer.write(response)
|
||||||
sys.stdout.buffer.flush()
|
sys.stdout.buffer.flush()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue