# stt-vosk-py-node/speech_processor.py

#!/usr/bin/env python3
import io
import json
import os
import struct
import sys
import tempfile
from threading import Lock

import numpy as np
import soundfile as sf
import vosk

# Global recognizer with thread lock.
# `recognizer` stays None until initialize_vosk() succeeds; process_audio()
# checks it and triggers initialization on demand.
recognizer = None
# Held by process_audio() around every call into the shared recognizer.
recognizer_lock = Lock()

def initialize_vosk(model_path="vosk-model-small-en-us-0.15"):
    """Load a Vosk model and install the module-level recognizer.

    Args:
        model_path: Filesystem path of the Vosk model to load. Defaults to
            the small English model directory, resolved relative to the
            current working directory.

    Returns:
        ``{"success": True}`` once the global ``recognizer`` has been
        created, otherwise ``{"success": False, "error": <message>}``.
        The function never raises; loader failures are reported through
        the returned dict.
    """
    global recognizer

    # Check up front so a missing model yields a stable, readable message
    # instead of whatever the loader would raise.
    if not os.path.exists(model_path):
        return {"success": False, "error": "Model not found"}

    try:
        # NOTE(review): -1 presumably silences Vosk's own log output - confirm.
        vosk.SetLogLevel(-1)
        model = vosk.Model(model_path)
        # 16000 must match the rate process_audio() resamples to.
        recognizer = vosk.KaldiRecognizer(model, 16000)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": str(e)}

def _decode_to_pcm16(audio_data, target_rate=16000):
    """Decode an encoded audio buffer to mono 16-bit PCM bytes at target_rate."""
    # soundfile can read from a file-like object, so no temp file is needed
    # (and none can be leaked if a write fails).
    data, samplerate = sf.read(io.BytesIO(audio_data), dtype='float32')

    # Multi-channel input comes back as (frames, channels); average the
    # channels so the resampler and the recognizer both see a 1-D signal.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Linear-interpolation resample. Skipped for empty input because
    # np.interp rejects an empty sample-point array.
    if samplerate != target_rate and len(data) > 0:
        duration = len(data) / samplerate
        data = np.interp(
            np.linspace(0, len(data) - 1, int(duration * target_rate)),
            np.arange(len(data)),
            data,
        )

    # Clip before scaling: values outside [-1, 1] would otherwise wrap
    # around when cast to int16.
    return (np.clip(data, -1.0, 1.0) * 32767).astype('int16').tobytes()


def process_audio(audio_data, request_id):
    """Transcribe one encoded audio buffer with the shared recognizer.

    Args:
        audio_data: Bytes of an encoded audio file (the caller is expected
            to send something soundfile can decode, e.g. WAV).
        request_id: Caller-supplied identifier echoed back as ``requestId``.

    Returns:
        On success, ``{"success": True, "text": str, "is_final": bool,
        "requestId": request_id}``. ``is_final`` is True when
        ``AcceptWaveform`` returned a truthy value and ``text`` came from
        ``Result()``; otherwise ``text`` is the ``PartialResult()`` text.
        On failure, ``{"success": False, "error": str, "requestId":
        request_id}``. Decoding and recognition errors are reported in the
        returned dict rather than raised.
    """
    # Lazy initialization in case the caller did not run initialize_vosk().
    if not recognizer:
        init_result = initialize_vosk()
        if not init_result["success"]:
            return {**init_result, "requestId": request_id}

    try:
        pcm = _decode_to_pcm16(audio_data)

        # The recognizer is shared module state, so serialize access to it.
        # NOTE(review): the recognizer is not reset between requests, so
        # partial state presumably carries over from one call to the next -
        # confirm that this streaming behavior is intended.
        with recognizer_lock:
            if recognizer.AcceptWaveform(pcm):
                text = json.loads(recognizer.Result()).get('text', '')
                is_final = True
            else:
                text = json.loads(recognizer.PartialResult()).get('partial', '')
                is_final = False

        return {
            "success": True,
            "text": text,
            "is_final": is_final,
            "requestId": request_id
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "requestId": request_id
        }

def _send_response(payload, request_id):
    """Write one response frame to stdout: length, request ID, JSON body."""
    body = json.dumps(payload).encode()
    out = sys.stdout.buffer
    # Two big-endian uint32 fields: body length, then request ID.
    out.write(struct.pack('>II', len(body), request_id))
    out.write(body)
    out.flush()


def main():
    """Serve transcription requests over stdin/stdout until EOF.

    Request frame:  4-byte big-endian audio length, 4-byte big-endian
    request ID, then that many bytes of audio.
    Response frame: 4-byte big-endian JSON length, 4-byte big-endian
    request ID, then the JSON body.

    Every response, including error reports, uses the same frame layout
    so the reader never loses frame alignment. Errors that are not tied
    to a request carry request ID 0. The loop ends on EOF or on a
    truncated frame.
    """
    # Initialize Vosk; report the failure as a regular frame and exit.
    init_result = initialize_vosk()
    if not init_result["success"]:
        _send_response(
            {
                "success": False,
                "error": init_result["error"],
                "requestId": 0
            },
            0,
        )
        return

    stdin = sys.stdin.buffer
    while True:
        # Reset each iteration so a failure before the header is parsed is
        # not attributed to the previous request.
        request_id = 0
        try:
            # Header: message length (4 bytes) + request ID (4 bytes).
            header = stdin.read(8)
            if len(header) != 8:
                # EOF, or a truncated header at end of stream.
                break
            length, request_id = struct.unpack('>II', header)

            # Read audio data
            audio_data = stdin.read(length)
            if len(audio_data) != length:
                break

            # Process and send response
            _send_response(process_audio(audio_data, request_id), request_id)

        except Exception as e:
            _send_response(
                {
                    "success": False,
                    "error": str(e),
                    "requestId": request_id
                },
                request_id,
            )

# Run the stdin/stdout service loop only when executed as a script.
if __name__ == "__main__":
    main()