wip

parent 6bb4a49d74
commit 7c47f65494

Dockerfile (11 lines changed)
@@ -1,10 +1,11 @@
-FROM python:3.9-slim as base
+FROM python:3.9-slim AS base
 
 # Install dependencies
 RUN apt-get update && apt-get install -y \
     python3-pip \
     ffmpeg \
     wget \
+    unzip \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Python requirements
@@ -21,15 +22,15 @@ RUN wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip &&
 COPY app.py .
 # COPY Caddyfile .
 
-FROM base as production
+FROM base AS production
 # Install gunicorn and eventlet
 RUN pip install gunicorn eventlet
 
 # Expose ports (8000 for app, 2019 for Caddy admin)
-EXPOSE 8000 2019
+EXPOSE 5000
 
-CMD ["caddy", "run", "--config", "/app/Caddyfile"]
+#CMD ["caddy", "run", "--config", "/app/Caddyfile"]
 
-FROM base as development
+FROM base AS development
 # For development with auto-reload
 CMD ["python", "app.py"]
app.py (2 lines changed)

@@ -34,4 +34,4 @@ def handle_audio_stream(audio_data):
 
 if __name__ == '__main__':
     os.makedirs('static', exist_ok=True)
-    socketio.run(app, host='0.0.0.0', port=8000)
+    socketio.run(app, host='0.0.0.0', port=5000)
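The only functional change in app.py is the Socket.IO port moving from 8000 to 5000, matching the new EXPOSE 5000 line in the Dockerfile. A quick connectivity check with the python-socketio client could look like the sketch below. This is only a sketch: the "audio_stream" event name is a guess inferred from the handler name handle_audio_stream, since the @socketio.on(...) decorator is outside this hunk, and the payload is dummy bytes.

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()

@sio.event
def connect():
    print("connected to app.py on port 5000")

sio.connect("http://localhost:5000")
# Hypothetical event name; the actual handler registration is not shown in this diff.
sio.emit("audio_stream", b"\x00" * 3200)  # dummy audio payload
sio.sleep(1)
sio.disconnect()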
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
Improved Real-time Speech-to-Text WebSocket Server using Vosk
with better audio format handling
"""

import asyncio
import websockets
import json
import logging
import subprocess
import tempfile
import os
import io
from vosk import Model, KaldiRecognizer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class ImprovedVoskSTTServer:
    def __init__(self, model_path="vosk-model-small-en-us-0.15", sample_rate=16000):
        """
        Initialize Improved Vosk STT Server with FFmpeg support

        Args:
            model_path: Path to Vosk model directory
            sample_rate: Audio sample rate (16000 is recommended)
        """
        self.model_path = model_path
        self.sample_rate = sample_rate
        self.model = None
        self.check_dependencies()
        self.load_model()

    def check_dependencies(self):
        """Check if FFmpeg is available"""
        try:
            subprocess.run(['ffmpeg', '-version'],
                           capture_output=True, check=True)
            logger.info("FFmpeg is available")
            self.has_ffmpeg = True
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.warning("FFmpeg not found. Audio conversion may be limited.")
            self.has_ffmpeg = False

    def load_model(self):
        """Load Vosk model"""
        try:
            if not os.path.exists(self.model_path):
                logger.error(f"Model path {self.model_path} does not exist!")
                self.print_model_setup_instructions()
                raise FileNotFoundError(f"Model not found at {self.model_path}")

            logger.info(f"Loading Vosk model from {self.model_path}...")
            self.model = Model(self.model_path)
            logger.info("Model loaded successfully!")

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def print_model_setup_instructions(self):
        """Print instructions for setting up Vosk model"""
        logger.info("=" * 60)
        logger.info("VOSK MODEL SETUP INSTRUCTIONS")
        logger.info("=" * 60)
        logger.info("1. Download a Vosk model (choose based on your needs):")
        logger.info("")
        logger.info("   Small English model (~50MB):")
        logger.info("   wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip")
        logger.info("   unzip vosk-model-small-en-us-0.15.zip")
        logger.info("")
        logger.info("   Large English model (~1.8GB, better accuracy):")
        logger.info("   wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip")
        logger.info("   unzip vosk-model-en-us-0.22.zip")
        logger.info("")
        logger.info("   Other languages available at: https://alphacephei.com/vosk/models")
        logger.info("")
        logger.info("2. Place the extracted model directory in the server folder")
        logger.info("3. Update the model path when starting the server")
        logger.info("=" * 60)

    async def handle_client(self, websocket, path):
        """Handle WebSocket client connection"""
        client_ip = websocket.remote_address[0]
        logger.info(f"New client connected: {client_ip}")

        # Create recognizer for this client
        recognizer = KaldiRecognizer(self.model, self.sample_rate)

        try:
            await websocket.send(json.dumps({
                "type": "status",
                "message": "Connected to Vosk STT Server",
                "server_info": {
                    "sample_rate": self.sample_rate,
                    "has_ffmpeg": self.has_ffmpeg,
                    "model_path": self.model_path
                }
            }))

            async for message in websocket:
                try:
                    # Handle binary audio data
                    if isinstance(message, bytes):
                        await self.process_audio_chunk(websocket, recognizer, message)

                    # Handle text messages (commands, etc.)
                    elif isinstance(message, str):
                        await self.handle_text_message(websocket, recognizer, message)

                except Exception as e:
                    logger.error(f"Error processing message: {e}")
                    await websocket.send(json.dumps({
                        "type": "error",
                        "message": str(e)
                    }))

        except websockets.exceptions.ConnectionClosed:
            logger.info(f"Client disconnected: {client_ip}")
        except Exception as e:
            logger.error(f"Error handling client {client_ip}: {e}")

    async def process_audio_chunk(self, websocket, recognizer, audio_data):
        """Process incoming audio chunk with Vosk"""
        try:
            # Convert audio to PCM format for Vosk
            pcm_data = await self.convert_to_pcm(audio_data)

            if pcm_data:
                # Feed audio to recognizer
                if recognizer.AcceptWaveform(pcm_data):
                    # Final result
                    result = json.loads(recognizer.Result())
                    if result.get('text', '').strip():
                        await websocket.send(json.dumps({
                            "type": "transcription",
                            "text": result['text'],
                            "final": True,
                            "confidence": result.get('confidence', 0.0),
                            "timestamp": asyncio.get_event_loop().time()
                        }))
                        logger.info(f"Final: {result['text']}")
                else:
                    # Partial result
                    partial_result = json.loads(recognizer.PartialResult())
                    if partial_result.get('partial', '').strip():
                        await websocket.send(json.dumps({
                            "type": "transcription",
                            "text": partial_result['partial'],
                            "final": False,
                            "confidence": 0.0,
                            "timestamp": asyncio.get_event_loop().time()
                        }))

        except Exception as e:
            logger.error(f"Error processing audio: {e}")

    async def convert_to_pcm(self, audio_data):
        """
        Convert various audio formats to PCM format using FFmpeg
        """
        if not self.has_ffmpeg:
            # Fallback: assume audio is already in compatible format
            return audio_data

        try:
            # Create temporary files
            with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as input_file:
                input_file.write(audio_data)
                input_path = input_file.name

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as output_file:
                output_path = output_file.name

            # Use FFmpeg to convert to PCM WAV format
            cmd = [
                'ffmpeg',
                '-i', input_path,
                '-acodec', 'pcm_s16le',        # 16-bit PCM
                '-ac', '1',                    # Mono
                '-ar', str(self.sample_rate),  # Sample rate
                '-f', 'wav',
                '-y',                          # Overwrite output
                output_path
            ]

            # Run conversion asynchronously
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await process.communicate()

            if process.returncode == 0:
                # Read converted audio
                with open(output_path, 'rb') as f:
                    wav_data = f.read()

                # Extract PCM data (skip WAV header - 44 bytes)
                pcm_data = wav_data[44:]

                # Cleanup
                os.unlink(input_path)
                os.unlink(output_path)

                return pcm_data
            else:
                logger.error(f"FFmpeg conversion failed: {stderr.decode()}")
                # Cleanup
                os.unlink(input_path)
                if os.path.exists(output_path):
                    os.unlink(output_path)
                return None

        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None

    async def handle_text_message(self, websocket, recognizer, message):
        """Handle text-based commands from client"""
        try:
            data = json.loads(message)
            command = data.get('command')

            if command == 'ping':
                await websocket.send(json.dumps({
                    "type": "pong",
                    "timestamp": asyncio.get_event_loop().time()
                }))

            elif command == 'reset':
                # Reset recognizer
                recognizer.Reset()
                await websocket.send(json.dumps({
                    "type": "status",
                    "message": "Recognizer reset"
                }))

            elif command == 'get_info':
                await websocket.send(json.dumps({
                    "type": "server_info",
                    "sample_rate": self.sample_rate,
                    "has_ffmpeg": self.has_ffmpeg,
                    "model_path": self.model_path
                }))

        except json.JSONDecodeError:
            logger.error("Invalid JSON message received")

    async def start_server(self, host="0.0.0.0", port=5000):
        """Start the WebSocket server"""
        logger.info(f"Starting Vosk STT WebSocket server on {host}:{port}")
        logger.info(f"Using model: {self.model_path}")
        logger.info(f"Sample rate: {self.sample_rate}")
        logger.info(f"FFmpeg available: {self.has_ffmpeg}")

        try:
            async with websockets.serve(self.handle_client, host, port):
                logger.info("Server started successfully!")
                logger.info("Waiting for client connections...")
                logger.info("Press Ctrl+C to stop the server")

                # Keep server running
                await asyncio.Future()  # run forever

        except Exception as e:
            logger.error(f"Server error: {e}")
            raise

def main():
    """Main entry point"""
    import argparse

    parser = argparse.ArgumentParser(description='Improved Vosk STT WebSocket Server')
    parser.add_argument('--host', default='0.0.0.0', help='Host to bind to')
    parser.add_argument('--port', type=int, default=8765, help='Port to bind to')
    parser.add_argument('--model', default='vosk-model-small-en-us-0.15',
                        help='Path to Vosk model directory')
    parser.add_argument('--sample-rate', type=int, default=16000,
                        help='Audio sample rate')

    args = parser.parse_args()

    try:
        # Create and start server
        server = ImprovedVoskSTTServer(model_path=args.model, sample_rate=args.sample_rate)
        asyncio.run(server.start_server(host=args.host, port=args.port))

    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as e:
        logger.error(f"Server failed to start: {e}")
        return 1

    return 0

if __name__ == "__main__":
    exit(main())
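The file above defines a small JSON protocol: binary frames carry audio, text frames carry commands ("ping", "reset", "get_info"), and the server answers with "status", "transcription", "pong", "server_info", or "error" messages. A minimal client sketch against that protocol might look like the following. It assumes the server was started with the argparse defaults (port 8765) and that some_audio.webm is a short recording in a format FFmpeg can decode; both names are placeholders, not part of the commit.

import asyncio
import json
import websockets

async def demo(uri="ws://localhost:8765", audio_path="some_audio.webm"):
    async with websockets.connect(uri) as ws:
        # The server greets every client with a "status" message.
        print(json.loads(await ws.recv()))

        # Binary frames are treated as audio and run through FFmpeg + Vosk.
        with open(audio_path, "rb") as f:
            await ws.send(f.read())

        # Text frames are JSON commands; "ping" should come back as "pong".
        await ws.send(json.dumps({"command": "ping"}))

        # Print whatever the server sends back until it goes quiet.
        try:
            while True:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=2))
                print(msg.get("type"), msg.get("text") or msg.get("message", ""))
        except asyncio.TimeoutError:
            pass

asyncio.run(demo())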
@@ -0,0 +1,9 @@
US English model for mobile Vosk applications

Copyright 2020 Alpha Cephei Inc

Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
Speed: 0.11xRT (desktop)
Latency: 0.15s (right context)
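For reference, this is roughly how a model directory like the one added in this commit is consumed by the Vosk Python API outside of any server. A sketch only: it assumes a 16 kHz, 16-bit mono WAV file (sample.wav is a placeholder name) and that the vosk-model-small-en-us-0.15 directory sits in the working directory.

import json
import wave
from vosk import Model, KaldiRecognizer

wf = wave.open("sample.wav", "rb")               # placeholder test file
model = Model("vosk-model-small-en-us-0.15")     # the directory added here
rec = KaldiRecognizer(model, wf.getframerate())

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        # Completed utterance
        print(json.loads(rec.Result()).get("text", ""))

# Flush whatever is still buffered
print(json.loads(rec.FinalResult()).get("text", ""))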
Binary file not shown.
@@ -0,0 +1,7 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true
@@ -0,0 +1,10 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=0.75
--endpoint.rule4.min-trailing-silence=1.0
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,17 @@
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
@@ -0,0 +1,166 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,3 @@
[
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
@@ -0,0 +1,2 @@
--left-context=3
--right-context=3
@@ -0,0 +1,28 @@
# Audio STT Streaming Project Todos

## Frontend Tasks
- [ ] Create audio capture interface with start/stop recording
- [ ] Implement WebSocket connection to server
- [ ] Stream audio data in real-time to server
- [ ] Display incoming transcribed text from server
- [ ] Add audio visualization (optional)
- [ ] Handle connection errors and reconnection

## Backend Tasks
- [ ] Set up WebSocket server (Node.js/Python)
- [ ] Integrate Vosk STT engine
- [ ] Handle incoming audio stream processing
- [ ] Stream transcribed text back to client
- [ ] Add error handling and logging
- [ ] Create deployment documentation

## Server Setup
- [ ] Create Python server with Vosk integration
- [ ] Add WebSocket support for real-time communication
- [ ] Configure audio format handling (WAV/PCM)
- [ ] Test with different audio sample rates

## Deployment
- [ ] Create VPS deployment guide
- [ ] Add environment configuration
- [ ] Test end-to-end functionality