commit 603b36574b85678a2504e36b6f92c2cb7acdc32a
Author: Kar l5
Date:   Mon Jun 16 00:06:36 2025 +0530

    init

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1cd202a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ffmpeg \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application files
+COPY app.py .
+COPY model/ model/
+
+# Expose port
+EXPOSE 5082
+
+# Run FastAPI app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5082"]
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..03cfe12
--- /dev/null
+++ b/app.py
@@ -0,0 +1,109 @@
+"""Speech-to-text HTTP service backed by an offline Vosk model."""
+
+import json
+import os
+import shutil
+import tempfile
+import wave
+
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.concurrency import run_in_threadpool
+from fastapi.responses import JSONResponse
+from vosk import KaldiRecognizer, Model
+
+app = FastAPI()
+MODEL_PATH = "model/vosk-model-small-en-us-0.15"
+# Number of audio frames handed to the recognizer per read.
+FRAMES_PER_CHUNK = 4000
+
+# Fail at import time so a missing model is reported at startup rather than
+# on the first request.
+if not os.path.exists(MODEL_PATH):
+    raise RuntimeError(f"Vosk model not found at {MODEL_PATH}")
+
+model = Model(MODEL_PATH)
+
+
+def _recognize_wav(path):
+    """Transcribe the WAV file at *path* with the shared Vosk model.
+
+    Args:
+        path: Filesystem path of the WAV file to decode.
+
+    Returns:
+        The non-empty ``text`` fields of all recognizer results, joined by
+        single spaces.
+
+    Raises:
+        HTTPException: 400 if the file cannot be parsed as WAV or is not
+            16 kHz, 16-bit, mono.
+    """
+    try:
+        wf = wave.open(path, "rb")
+    except (wave.Error, EOFError) as exc:
+        # A malformed upload is a client error, not a server fault.
+        raise HTTPException(
+            status_code=400, detail=f"Invalid WAV file: {exc}"
+        ) from exc
+
+    # The context manager closes the reader on every exit path.
+    with wf:
+        if (
+            wf.getnchannels() != 1
+            or wf.getsampwidth() != 2
+            or wf.getframerate() != 16000
+        ):
+            raise HTTPException(
+                status_code=400, detail="Audio must be 16kHz 16-bit mono WAV"
+            )
+
+        rec = KaldiRecognizer(model, wf.getframerate())
+        rec.SetWords(True)
+
+        results = []
+        while True:
+            data = wf.readframes(FRAMES_PER_CHUNK)
+            if not data:
+                break
+            if rec.AcceptWaveform(data):
+                results.append(json.loads(rec.Result()))
+        results.append(json.loads(rec.FinalResult()))
+
+    # Drop empty segments so the transcript has no doubled or trailing spaces.
+    texts = (res.get("text", "") for res in results)
+    return " ".join(text for text in texts if text)
+
+
+@app.post("/stt")
+async def transcribe(audio: UploadFile = File(...)):
+    """Transcribe an uploaded WAV file and return ``{"text": ...}`` as JSON.
+
+    Args:
+        audio: Uploaded file; its name must end in ``.wav`` (any case) and
+            it must contain 16 kHz, 16-bit, mono audio.
+
+    Raises:
+        HTTPException: 400 for a wrong extension or audio format, 500 for
+            any other failure while processing the upload.
+    """
+    # The filename is client-supplied and may be missing.
+    if not (audio.filename or "").lower().endswith(".wav"):
+        raise HTTPException(status_code=400, detail="Only .wav files are supported")
+
+    # Never derive the path from the client's filename: that would allow path
+    # traversal and collisions between concurrent uploads of the same name.
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    try:
+        with tmp:
+            shutil.copyfileobj(audio.file, tmp)
+        # Decoding is blocking work; run it off the event loop thread.
+        text = await run_in_threadpool(_recognize_wav, tmp.name)
+    except HTTPException:
+        # Keep deliberate 4xx responses instead of masking them as 500.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        os.remove(tmp.name)
+
+    return JSONResponse(content={"text": text})
diff --git a/model/vosk-model-small-en-us-0.15/README b/model/vosk-model-small-en-us-0.15/README
new file mode 100644
index 0000000..a7f7931
--- /dev/null
+++ b/model/vosk-model-small-en-us-0.15/README
@@ -0,0 +1,9 @@
+US English model for mobile Vosk applications
+
+Copyright 2020 Alpha Cephei Inc
+
+Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
+Speed: 0.11xRT (desktop)
+Latency: 0.15s (right context)
+
+
diff --git a/model/vosk-model-small-en-us-0.15/am/final.mdl b/model/vosk-model-small-en-us-0.15/am/final.mdl
new file mode 100644
index 0000000..5596b31
Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/am/final.mdl differ
diff --git a/model/vosk-model-small-en-us-0.15/conf/mfcc.conf b/model/vosk-model-small-en-us-0.15/conf/mfcc.conf
new file mode 100644
index 0000000..eaa40c5
--- /dev/null
+++ b/model/vosk-model-small-en-us-0.15/conf/mfcc.conf
@@ -0,0 +1,7 @@
+--sample-frequency=16000
+--use-energy=false
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=20
+--high-freq=7600
+--allow-downsample=true
diff --git a/model/vosk-model-small-en-us-0.15/conf/model.conf b/model/vosk-model-small-en-us-0.15/conf/model.conf
new file mode 100644
index 0000000..9d5b0da
--- /dev/null
+++ b/model/vosk-model-small-en-us-0.15/conf/model.conf
@@ -0,0 +1,10 @@
+--min-active=200
+--max-active=3000
+--beam=10.0
+--lattice-beam=2.0
+--acoustic-scale=1.0
+--frame-subsampling-factor=3
+--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
+--endpoint.rule2.min-trailing-silence=0.5
+--endpoint.rule3.min-trailing-silence=0.75
+--endpoint.rule4.min-trailing-silence=1.0
diff --git a/model/vosk-model-small-en-us-0.15/graph/Gr.fst
b/model/vosk-model-small-en-us-0.15/graph/Gr.fst new file mode 100644 index 0000000..1f292e6 Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/graph/Gr.fst differ diff --git a/model/vosk-model-small-en-us-0.15/graph/HCLr.fst b/model/vosk-model-small-en-us-0.15/graph/HCLr.fst new file mode 100644 index 0000000..9797b26 Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/graph/HCLr.fst differ diff --git a/model/vosk-model-small-en-us-0.15/graph/disambig_tid.int b/model/vosk-model-small-en-us-0.15/graph/disambig_tid.int new file mode 100644 index 0000000..762fd5f --- /dev/null +++ b/model/vosk-model-small-en-us-0.15/graph/disambig_tid.int @@ -0,0 +1,17 @@ +10015 +10016 +10017 +10018 +10019 +10020 +10021 +10022 +10023 +10024 +10025 +10026 +10027 +10028 +10029 +10030 +10031 diff --git a/model/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int b/model/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int new file mode 100644 index 0000000..df23fd7 --- /dev/null +++ b/model/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int @@ -0,0 +1,166 @@ +1 nonword +2 begin +3 end +4 internal +5 singleton +6 nonword +7 begin +8 end +9 internal +10 singleton +11 begin +12 end +13 internal +14 singleton +15 begin +16 end +17 internal +18 singleton +19 begin +20 end +21 internal +22 singleton +23 begin +24 end +25 internal +26 singleton +27 begin +28 end +29 internal +30 singleton +31 begin +32 end +33 internal +34 singleton +35 begin +36 end +37 internal +38 singleton +39 begin +40 end +41 internal +42 singleton +43 begin +44 end +45 internal +46 singleton +47 begin +48 end +49 internal +50 singleton +51 begin +52 end +53 internal +54 singleton +55 begin +56 end +57 internal +58 singleton +59 begin +60 end +61 internal +62 singleton +63 begin +64 end +65 internal +66 singleton +67 begin +68 end +69 internal +70 singleton +71 begin +72 end +73 internal +74 singleton +75 begin +76 end +77 internal +78 singleton +79 begin +80 end +81 internal 
+82 singleton +83 begin +84 end +85 internal +86 singleton +87 begin +88 end +89 internal +90 singleton +91 begin +92 end +93 internal +94 singleton +95 begin +96 end +97 internal +98 singleton +99 begin +100 end +101 internal +102 singleton +103 begin +104 end +105 internal +106 singleton +107 begin +108 end +109 internal +110 singleton +111 begin +112 end +113 internal +114 singleton +115 begin +116 end +117 internal +118 singleton +119 begin +120 end +121 internal +122 singleton +123 begin +124 end +125 internal +126 singleton +127 begin +128 end +129 internal +130 singleton +131 begin +132 end +133 internal +134 singleton +135 begin +136 end +137 internal +138 singleton +139 begin +140 end +141 internal +142 singleton +143 begin +144 end +145 internal +146 singleton +147 begin +148 end +149 internal +150 singleton +151 begin +152 end +153 internal +154 singleton +155 begin +156 end +157 internal +158 singleton +159 begin +160 end +161 internal +162 singleton +163 begin +164 end +165 internal +166 singleton diff --git a/model/vosk-model-small-en-us-0.15/ivector/final.dubm b/model/vosk-model-small-en-us-0.15/ivector/final.dubm new file mode 100644 index 0000000..db789eb Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/ivector/final.dubm differ diff --git a/model/vosk-model-small-en-us-0.15/ivector/final.ie b/model/vosk-model-small-en-us-0.15/ivector/final.ie new file mode 100644 index 0000000..93737bf Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/ivector/final.ie differ diff --git a/model/vosk-model-small-en-us-0.15/ivector/final.mat b/model/vosk-model-small-en-us-0.15/ivector/final.mat new file mode 100644 index 0000000..c3ec635 Binary files /dev/null and b/model/vosk-model-small-en-us-0.15/ivector/final.mat differ diff --git a/model/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats b/model/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats new file mode 100644 index 0000000..b9d92ef --- /dev/null +++ 
b/model/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats @@ -0,0 +1,3 @@ + [ + 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 + 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] diff --git a/model/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf b/model/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf new file mode 100644 index 0000000..7748a4a --- /dev/null +++ b/model/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/model/vosk-model-small-en-us-0.15/ivector/splice.conf b/model/vosk-model-small-en-us-0.15/ivector/splice.conf new file mode 100644 index 0000000..960cd2e --- /dev/null +++ b/model/vosk-model-small-en-us-0.15/ivector/splice.conf @@ -0,0 +1,2 @@ +--left-context=3 +--right-context=3 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5004d6c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +fastapi +uvicorn +python-multipart 
+vosk