stt-vosk-py-api/app.py

52 lines
1.6 KiB
Python

import json
import os
import shutil
import tempfile
import wave

from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from vosk import Model, KaldiRecognizer
app = FastAPI()

# Path handed to vosk.Model. It can be overridden with the VOSK_MODEL_PATH
# environment variable; when unset, the original relative path is used.
MODEL_PATH = os.environ.get("VOSK_MODEL_PATH", "model/vosk-model-small-en-us-0.15")

# Fail at import time with an explicit message if the model path is missing.
if not os.path.exists(MODEL_PATH):
    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}")

# Loaded once at import; the /stt handler reuses this instance for every request.
model = Model(MODEL_PATH)
@app.post("/stt")
async def transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded WAV file with the module-level Vosk model.

    The upload is spooled to a uniquely named temporary file, validated as
    16 kHz / 16-bit / mono PCM, fed to a ``KaldiRecognizer`` in chunks of
    4000 frames, and the recognised segments are joined into one string.

    Args:
        audio: The uploaded file; its name must end in ``.wav``
            (case-insensitive).

    Returns:
        A ``JSONResponse`` whose body is ``{"text": <transcript>}``.

    Raises:
        HTTPException: 400 when the file name is not ``.wav``, the payload
            is not a readable WAV file, or the audio is not 16 kHz 16-bit
            mono; 500 for any other failure during processing.
    """
    # NOTE(review): this coroutine contains no ``await``; the file copy and
    # recognition below run synchronously inside the handler.

    # The filename is client-supplied and may be missing entirely.
    filename = audio.filename or ""
    if not filename.lower().endswith(".wav"):
        raise HTTPException(status_code=400, detail="Only .wav files are supported")

    temp_path = None
    try:
        # Never build the path from the client-supplied name: that allows
        # path traversal and collisions between concurrent uploads.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            temp_path = tmp.name
            shutil.copyfileobj(audio.file, tmp)

        try:
            wf = wave.open(temp_path, "rb")
        except (wave.Error, EOFError) as exc:
            # A malformed or truncated WAV is a client error, not a server error.
            raise HTTPException(
                status_code=400, detail=f"Invalid WAV file: {exc}"
            ) from exc

        # The context manager closes the reader on every exit path.
        with wf:
            if (
                wf.getnchannels() != 1
                or wf.getsampwidth() != 2
                or wf.getframerate() != 16000
            ):
                raise HTTPException(
                    status_code=400, detail="Audio must be 16kHz 16-bit mono WAV"
                )

            rec = KaldiRecognizer(model, wf.getframerate())
            rec.SetWords(True)

            results = []
            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                if rec.AcceptWaveform(data):
                    results.append(json.loads(rec.Result()))
            results.append(json.loads(rec.FinalResult()))

        # Skip empty segments so the transcript has no stray spaces.
        texts = (res.get("text", "") for res in results)
        full_text = " ".join(text for text in texts if text)
        return JSONResponse(content={"text": full_text})
    except HTTPException:
        # Preserve deliberate 4xx responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # temp_path stays None if the temporary file was never created.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)