master
suvodip ghosh 2025-05-31 11:08:36 +00:00
parent 6bb4a49d74
commit 7c47f65494
18 changed files with 555 additions and 6 deletions

View File

@ -1,10 +1,11 @@
FROM python:3.9-slim as base FROM python:3.9-slim AS base
# Install dependencies # Install dependencies
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
python3-pip \ python3-pip \
ffmpeg \ ffmpeg \
wget \ wget \
unzip \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Install Python requirements # Install Python requirements
@ -21,15 +22,15 @@ RUN wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip &&
COPY app.py . COPY app.py .
# COPY Caddyfile . # COPY Caddyfile .
FROM base as production FROM base AS production
# Install gunicorn and eventlet # Install gunicorn and eventlet
RUN pip install gunicorn eventlet RUN pip install gunicorn eventlet
# Expose ports (8000 for app, 2019 for Caddy admin) # Expose port 5000 for the app
EXPOSE 8000 2019 EXPOSE 5000
CMD ["caddy", "run", "--config", "/app/Caddyfile"] #CMD ["caddy", "run", "--config", "/app/Caddyfile"]
FROM base as development FROM base AS development
# For development with auto-reload # For development with auto-reload
CMD ["python", "app.py"] CMD ["python", "app.py"]

2
app.py
View File

@ -34,4 +34,4 @@ def handle_audio_stream(audio_data):
if __name__ == '__main__': if __name__ == '__main__':
os.makedirs('static', exist_ok=True) os.makedirs('static', exist_ok=True)
socketio.run(app, host='0.0.0.0', port=8000) socketio.run(app, host='0.0.0.0', port=5000)

305
app2.py Normal file
View File

@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
Improved Real-time Speech-to-Text WebSocket Server using Vosk
with better audio format handling
"""
import asyncio
import websockets
import json
import logging
import subprocess
import tempfile
import os
import io
from vosk import Model, KaldiRecognizer
# Configure the root logger once at import time: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class ImprovedVoskSTTServer:
    """WebSocket speech-to-text server backed by a Vosk model.

    Each client connection gets its own ``KaldiRecognizer``. Binary
    messages are treated as audio (converted to raw PCM through FFmpeg
    when it is installed); text messages are treated as JSON commands.
    """

    def __init__(self, model_path="vosk-model-small-en-us-0.15", sample_rate=16000):
        """
        Initialize Improved Vosk STT Server with FFmpeg support

        Args:
            model_path: Path to Vosk model directory
            sample_rate: Audio sample rate (16000 is recommended)

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
        """
        self.model_path = model_path
        self.sample_rate = sample_rate
        self.model = None
        # Give the attribute a value up front so it exists even if the
        # dependency probe below is overridden or fails unexpectedly.
        self.has_ffmpeg = False
        self.check_dependencies()
        self.load_model()

    def check_dependencies(self):
        """Probe for the ``ffmpeg`` executable and record it in ``has_ffmpeg``."""
        try:
            subprocess.run(['ffmpeg', '-version'],
                           capture_output=True, check=True)
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing binary (FileNotFoundError) as well as
            # one that exists but cannot be executed (e.g. PermissionError).
            logger.warning("FFmpeg not found. Audio conversion may be limited.")
            self.has_ffmpeg = False
        else:
            logger.info("FFmpeg is available")
            self.has_ffmpeg = True

    def load_model(self):
        """Load the Vosk model from ``self.model_path`` into ``self.model``.

        Raises:
            FileNotFoundError: If the model directory does not exist.
            Exception: Any error raised while constructing the model is
                logged and re-raised unchanged.
        """
        try:
            if not os.path.exists(self.model_path):
                logger.error(f"Model path {self.model_path} does not exist!")
                self.print_model_setup_instructions()
                raise FileNotFoundError(f"Model not found at {self.model_path}")
            logger.info(f"Loading Vosk model from {self.model_path}...")
            self.model = Model(self.model_path)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def print_model_setup_instructions(self):
        """Log step-by-step instructions for downloading a Vosk model."""
        logger.info("=" * 60)
        logger.info("VOSK MODEL SETUP INSTRUCTIONS")
        logger.info("=" * 60)
        logger.info("1. Download a Vosk model (choose based on your needs):")
        logger.info("")
        logger.info(" Small English model (~50MB):")
        logger.info(" wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip")
        logger.info(" unzip vosk-model-small-en-us-0.15.zip")
        logger.info("")
        logger.info(" Large English model (~1.8GB, better accuracy):")
        logger.info(" wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip")
        logger.info(" unzip vosk-model-en-us-0.22.zip")
        logger.info("")
        logger.info(" Other languages available at: https://alphacephei.com/vosk/models")
        logger.info("")
        logger.info("2. Place the extracted model directory in the server folder")
        logger.info("3. Update the model path when starting the server")
        logger.info("=" * 60)

    async def handle_client(self, websocket, path=None):
        """Serve one WebSocket client until it disconnects.

        Args:
            websocket: The client connection.
            path: Request path. Unused; it defaults to ``None`` because
                newer ``websockets`` releases invoke the handler with the
                connection only, while older ones also pass the path.
        """
        client_ip = websocket.remote_address[0]
        logger.info(f"New client connected: {client_ip}")
        # Create recognizer for this client
        recognizer = KaldiRecognizer(self.model, self.sample_rate)
        try:
            await websocket.send(json.dumps({
                "type": "status",
                "message": "Connected to Vosk STT Server",
                "server_info": {
                    "sample_rate": self.sample_rate,
                    "has_ffmpeg": self.has_ffmpeg,
                    "model_path": self.model_path
                }
            }))
            async for message in websocket:
                try:
                    # Handle binary audio data
                    if isinstance(message, bytes):
                        await self.process_audio_chunk(websocket, recognizer, message)
                    # Handle text messages (commands, etc.)
                    elif isinstance(message, str):
                        await self.handle_text_message(websocket, recognizer, message)
                except Exception as e:
                    # Report the failure to the client and keep the
                    # connection open for subsequent messages.
                    logger.error(f"Error processing message: {e}")
                    await websocket.send(json.dumps({
                        "type": "error",
                        "message": str(e)
                    }))
        except websockets.exceptions.ConnectionClosed:
            logger.info(f"Client disconnected: {client_ip}")
        except Exception as e:
            logger.error(f"Error handling client {client_ip}: {e}")

    async def process_audio_chunk(self, websocket, recognizer, audio_data):
        """Convert one audio chunk, feed it to Vosk and send any transcript.

        Sends a ``"transcription"`` message with ``final=True`` when the
        recognizer completes an utterance, or ``final=False`` for a
        non-empty partial hypothesis. The ``timestamp`` field is the event
        loop's clock (``loop.time()``), not wall-clock time. Errors are
        logged and swallowed so a bad chunk does not end the session.
        """
        try:
            # Convert audio to PCM format for Vosk
            pcm_data = await self.convert_to_pcm(audio_data)
            if not pcm_data:
                return
            now = asyncio.get_running_loop().time()
            if recognizer.AcceptWaveform(pcm_data):
                # Final result
                result = json.loads(recognizer.Result())
                if result.get('text', '').strip():
                    await websocket.send(json.dumps({
                        "type": "transcription",
                        "text": result['text'],
                        "final": True,
                        "confidence": result.get('confidence', 0.0),
                        "timestamp": now
                    }))
                    logger.info(f"Final: {result['text']}")
            else:
                # Partial result
                partial_result = json.loads(recognizer.PartialResult())
                if partial_result.get('partial', '').strip():
                    await websocket.send(json.dumps({
                        "type": "transcription",
                        "text": partial_result['partial'],
                        "final": False,
                        "confidence": 0.0,
                        "timestamp": now
                    }))
        except Exception as e:
            logger.error(f"Error processing audio: {e}")

    async def convert_to_pcm(self, audio_data):
        """
        Convert various audio formats to PCM format using FFmpeg

        Args:
            audio_data: Encoded audio bytes received from the client.

        Returns:
            Raw signed 16-bit little-endian mono PCM at ``self.sample_rate``,
            the unmodified input when FFmpeg is unavailable, or ``None``
            when the conversion fails.
        """
        if not self.has_ffmpeg:
            # Fallback: assume audio is already in compatible format
            return audio_data

        input_path = None
        output_path = None
        try:
            # Create temporary files; record each path immediately so the
            # finally-clause can remove it even if a later step raises.
            with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as input_file:
                input_path = input_file.name
                input_file.write(audio_data)
            with tempfile.NamedTemporaryFile(suffix='.pcm', delete=False) as output_file:
                output_path = output_file.name

            # Ask FFmpeg for headerless PCM. A WAV container is avoided on
            # purpose: its header is not guaranteed to be exactly 44 bytes
            # (extra chunks may precede the samples), so slicing a fixed
            # offset could feed header bytes to the recognizer.
            cmd = [
                'ffmpeg',
                '-i', input_path,
                '-acodec', 'pcm_s16le',  # 16-bit PCM
                '-ac', '1',  # Mono
                '-ar', str(self.sample_rate),  # Sample rate
                '-f', 's16le',  # Raw samples, no container header
                '-y',  # Overwrite output
                output_path,
            ]
            # Run conversion asynchronously; stdin is closed so FFmpeg can
            # never block waiting for interactive input.
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdin=asyncio.subprocess.DEVNULL,
                stdout=asyncio.subprocess.DEVNULL,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await process.communicate()
            if process.returncode != 0:
                logger.error(
                    f"FFmpeg conversion failed: {stderr.decode(errors='replace')}"
                )
                return None
            # Read converted audio
            with open(output_path, 'rb') as f:
                return f.read()
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None
        finally:
            # Cleanup on every path, including exceptions.
            for temp_path in (input_path, output_path):
                if temp_path is None:
                    continue
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass  # already gone or not removable; nothing more to do

    async def handle_text_message(self, websocket, recognizer, message):
        """Handle text-based commands from client

        ``message`` is parsed as JSON and its ``command`` key selects the
        action: ``ping`` replies with ``pong``, ``reset`` resets the
        recognizer, ``get_info`` returns the server settings. Any other
        command is ignored; invalid JSON is logged and ignored.
        """
        try:
            data = json.loads(message)
        except json.JSONDecodeError:
            logger.error("Invalid JSON message received")
            return

        command = data.get('command')
        if command == 'ping':
            await websocket.send(json.dumps({
                "type": "pong",
                "timestamp": asyncio.get_running_loop().time()
            }))
        elif command == 'reset':
            # Reset recognizer
            recognizer.Reset()
            await websocket.send(json.dumps({
                "type": "status",
                "message": "Recognizer reset"
            }))
        elif command == 'get_info':
            await websocket.send(json.dumps({
                "type": "server_info",
                "sample_rate": self.sample_rate,
                "has_ffmpeg": self.has_ffmpeg,
                "model_path": self.model_path
            }))

    async def start_server(self, host="0.0.0.0", port=5000):
        """Start the WebSocket server and run until cancelled.

        Args:
            host: Interface to bind to.
            port: TCP port to listen on.

        Raises:
            Exception: Any error from starting or running the server is
                logged and re-raised.
        """
        logger.info(f"Starting Vosk STT WebSocket server on {host}:{port}")
        logger.info(f"Using model: {self.model_path}")
        logger.info(f"Sample rate: {self.sample_rate}")
        logger.info(f"FFmpeg available: {self.has_ffmpeg}")
        try:
            async with websockets.serve(self.handle_client, host, port):
                logger.info("Server started successfully!")
                logger.info("Waiting for client connections...")
                logger.info("Press Ctrl+C to stop the server")
                # Keep server running
                await asyncio.Future()  # run forever
        except Exception as e:
            logger.error(f"Server error: {e}")
            raise
def main():
    """Parse command-line options, then build and run the STT server.

    Returns:
        0 after a normal stop (including Ctrl+C), 1 when the server could
        not be created or terminated with an error.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Improved Vosk STT WebSocket Server')
    # Options are registered in this order so the generated help is stable.
    option_specs = (
        ('--host', {'default': '0.0.0.0', 'help': 'Host to bind to'}),
        ('--port', {'type': int, 'default': 8765, 'help': 'Port to bind to'}),
        ('--model', {'default': 'vosk-model-small-en-us-0.15',
                     'help': 'Path to Vosk model directory'}),
        ('--sample-rate', {'type': int, 'default': 16000,
                           'help': 'Audio sample rate'}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    try:
        # Building the server loads the model; running it blocks until stopped.
        stt_server = ImprovedVoskSTTServer(
            model_path=options.model,
            sample_rate=options.sample_rate,
        )
        asyncio.run(stt_server.start_server(host=options.host, port=options.port))
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
        return 0
    except Exception as e:
        logger.error(f"Server failed to start: {e}")
        return 1
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    # SystemExit is raised directly because the bare ``exit`` helper is
    # injected by the ``site`` module and is absent under ``python -S``.
    raise SystemExit(main())

9
model/README Normal file
View File

@ -0,0 +1,9 @@
US English model for mobile Vosk applications
Copyright 2020 Alpha Cephei Inc
Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
Speed: 0.11xRT (desktop)
Latency: 0.15s (right context)

BIN
model/am/final.mdl Normal file

Binary file not shown.

7
model/conf/mfcc.conf Normal file
View File

@ -0,0 +1,7 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true

10
model/conf/model.conf Normal file
View File

@ -0,0 +1,10 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=0.75
--endpoint.rule4.min-trailing-silence=1.0

BIN
model/graph/Gr.fst Normal file

Binary file not shown.

BIN
model/graph/HCLr.fst Normal file

Binary file not shown.

View File

@ -0,0 +1,17 @@
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031

View File

@ -0,0 +1,166 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton

BIN
model/ivector/final.dubm Normal file

Binary file not shown.

BIN
model/ivector/final.ie Normal file

Binary file not shown.

BIN
model/ivector/final.mat Normal file

Binary file not shown.

View File

@ -0,0 +1,3 @@
[
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]

View File

@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

View File

@ -0,0 +1,2 @@
--left-context=3
--right-context=3

28
todo.md Normal file
View File

@ -0,0 +1,28 @@
# Audio STT Streaming Project Todos
## Frontend Tasks
- [ ] Create audio capture interface with start/stop recording
- [ ] Implement WebSocket connection to server
- [ ] Stream audio data in real-time to server
- [ ] Display incoming transcribed text from server
- [ ] Add audio visualization (optional)
- [ ] Handle connection errors and reconnection
## Backend Tasks
- [ ] Set up WebSocket server (Node.js/Python)
- [ ] Integrate Vosk STT engine
- [ ] Handle incoming audio stream processing
- [ ] Stream transcribed text back to client
- [ ] Add error handling and logging
- [ ] Create deployment documentation
## Server Setup
- [ ] Create Python server with Vosk integration
- [ ] Add WebSocket support for real-time communication
- [ ] Configure audio format handling (WAV/PCM)
- [ ] Test with different audio sample rates
## Deployment
- [ ] Create VPS deployment guide
- [ ] Add environment configuration
- [ ] Test end-to-end functionality