# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ImprovedVoskSTTServer:
    """WebSocket speech-to-text server backed by a Vosk model.

    Each client connection gets its own ``KaldiRecognizer``. Binary
    WebSocket frames are treated as audio and (when FFmpeg is available)
    converted to 16-bit mono PCM before being fed to the recognizer; text
    frames are treated as JSON commands (``ping``, ``reset``, ``get_info``).
    """

    # Single source of truth for the listening port, shared by
    # start_server() and the command-line default so they cannot diverge.
    DEFAULT_PORT = 5000

    def __init__(self, model_path="vosk-model-small-en-us-0.15", sample_rate=16000):
        """
        Initialize Improved Vosk STT Server with FFmpeg support

        Args:
            model_path: Path to Vosk model directory
            sample_rate: Audio sample rate (16000 is recommended)

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
            Exception: Whatever the Vosk ``Model`` constructor raises.
        """
        self.model_path = model_path
        self.sample_rate = sample_rate
        self.model = None
        # Defined up front so the attribute exists even if the probe below
        # fails in an unexpected way.
        self.has_ffmpeg = False
        self.check_dependencies()
        self.load_model()

    def check_dependencies(self):
        """Probe for FFmpeg and record the result in ``self.has_ffmpeg``."""
        try:
            subprocess.run(['ffmpeg', '-version'],
                           capture_output=True, check=True)
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError (not installed) as well as
            # e.g. PermissionError (present but not executable).
            logger.warning("FFmpeg not found. Audio conversion may be limited.")
            self.has_ffmpeg = False
        else:
            logger.info("FFmpeg is available")
            self.has_ffmpeg = True

    def load_model(self):
        """Load the Vosk model from ``self.model_path`` into ``self.model``.

        Raises:
            FileNotFoundError: If the model directory does not exist (setup
                instructions are logged first).
            Exception: Re-raised from the Vosk ``Model`` constructor.
        """
        # Checked outside the try block so a missing path is reported once,
        # not a second time as a generic "Failed to load model".
        if not os.path.exists(self.model_path):
            logger.error(f"Model path {self.model_path} does not exist!")
            self.print_model_setup_instructions()
            raise FileNotFoundError(f"Model not found at {self.model_path}")

        logger.info(f"Loading Vosk model from {self.model_path}...")
        try:
            self.model = Model(self.model_path)
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
        logger.info("Model loaded successfully!")

    def print_model_setup_instructions(self):
        """Log step-by-step instructions for downloading a Vosk model."""
        banner = "=" * 60
        instructions = (
            banner,
            "VOSK MODEL SETUP INSTRUCTIONS",
            banner,
            "1. Download a Vosk model (choose based on your needs):",
            "",
            " Small English model (~50MB):",
            " wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip",
            " unzip vosk-model-small-en-us-0.15.zip",
            "",
            " Large English model (~1.8GB, better accuracy):",
            " wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip",
            " unzip vosk-model-en-us-0.22.zip",
            "",
            " Other languages available at: https://alphacephei.com/vosk/models",
            "",
            "2. Place the extracted model directory in the server folder",
            "3. Update the model path when starting the server",
            banner,
        )
        for line in instructions:
            logger.info(line)

    @staticmethod
    async def _send_error(websocket, message):
        """Send an ``error`` message with the given text to the client."""
        await websocket.send(json.dumps({
            "type": "error",
            "message": message
        }))

    @staticmethod
    def _transcription_message(text, final, confidence):
        """Serialize one transcription result as a JSON string."""
        return json.dumps({
            "type": "transcription",
            "text": text,
            "final": final,
            "confidence": confidence,
            # Event-loop clock (monotonic), not wall-clock/Unix time.
            "timestamp": asyncio.get_running_loop().time()
        })

    async def handle_client(self, websocket, path=None):
        """Handle WebSocket client connection

        Args:
            websocket: The client connection.
            path: Request path. Optional and unused: older ``websockets``
                releases pass it as a second argument, newer ones call the
                handler with the connection only.
        """
        remote = getattr(websocket, "remote_address", None)
        client_ip = remote[0] if remote else "unknown"
        logger.info(f"New client connected: {client_ip}")

        # Create recognizer for this client
        recognizer = KaldiRecognizer(self.model, self.sample_rate)

        try:
            await websocket.send(json.dumps({
                "type": "status",
                "message": "Connected to Vosk STT Server",
                "server_info": {
                    "sample_rate": self.sample_rate,
                    "has_ffmpeg": self.has_ffmpeg,
                    "model_path": self.model_path
                }
            }))

            async for message in websocket:
                try:
                    # Handle binary audio data
                    if isinstance(message, bytes):
                        await self.process_audio_chunk(websocket, recognizer, message)

                    # Handle text messages (commands, etc.)
                    elif isinstance(message, str):
                        await self.handle_text_message(websocket, recognizer, message)

                except Exception as e:
                    logger.error(f"Error processing message: {e}")
                    await self._send_error(websocket, str(e))

        except websockets.exceptions.ConnectionClosed:
            logger.info(f"Client disconnected: {client_ip}")
        except Exception as e:
            logger.error(f"Error handling client {client_ip}: {e}")

    async def process_audio_chunk(self, websocket, recognizer, audio_data):
        """Convert one audio chunk, feed it to Vosk and send any result.

        A final result is sent when the recognizer reports an utterance
        boundary, otherwise the current partial hypothesis is sent. Empty
        texts are not sent. Errors are logged and do not propagate.
        """
        try:
            # Convert audio to PCM format for Vosk
            pcm_data = await self.convert_to_pcm(audio_data)
            if not pcm_data:
                return

            if recognizer.AcceptWaveform(pcm_data):
                # Final result
                result = json.loads(recognizer.Result())
                text = result.get('text', '')
                if text.strip():
                    await websocket.send(self._transcription_message(
                        text, True, result.get('confidence', 0.0)))
                    logger.info(f"Final: {text}")
            else:
                # Partial result
                partial = json.loads(recognizer.PartialResult()).get('partial', '')
                if partial.strip():
                    await websocket.send(self._transcription_message(
                        partial, False, 0.0))

        except Exception as e:
            logger.error(f"Error processing audio: {e}")

    async def convert_to_pcm(self, audio_data):
        """
        Convert various audio formats to PCM format using FFmpeg

        Args:
            audio_data: Encoded audio bytes received from the client.

        Returns:
            Raw 16-bit little-endian mono PCM at ``self.sample_rate``, or
            ``None`` if the input is empty or the conversion fails. Without
            FFmpeg the input is returned unchanged.
        """
        if not self.has_ffmpeg:
            # Fallback: assume audio is already in compatible format
            return audio_data
        if not audio_data:
            # Nothing to convert; avoid spawning FFmpeg just to fail.
            return None

        input_path = None
        try:
            # FFmpeg reads from a real file so it can seek in the container.
            with tempfile.NamedTemporaryFile(suffix='.webm', delete=False) as input_file:
                input_path = input_file.name
                input_file.write(audio_data)

            # Emit headerless PCM on stdout. Writing a WAV file and slicing
            # off a fixed 44 bytes is unsafe because FFmpeg's WAV muxer may
            # place additional chunks before the sample data.
            cmd = [
                'ffmpeg',
                '-nostdin',
                '-i', input_path,
                '-acodec', 'pcm_s16le',        # 16-bit PCM
                '-ac', '1',                    # Mono
                '-ar', str(self.sample_rate),  # Sample rate
                '-f', 's16le',                 # Raw samples, no container
                'pipe:1',
            ]

            # Run conversion asynchronously
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdin=asyncio.subprocess.DEVNULL,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            pcm_data, stderr = await process.communicate()

            if process.returncode != 0:
                logger.error(
                    f"FFmpeg conversion failed: {stderr.decode(errors='replace')}")
                return None
            return pcm_data

        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None
        finally:
            # Always remove the temp file, including on error paths.
            if input_path is not None:
                try:
                    os.unlink(input_path)
                except OSError:
                    pass

    async def handle_text_message(self, websocket, recognizer, message):
        """Handle text-based commands from client

        Supported commands (``{"command": ...}``): ``ping``, ``reset`` and
        ``get_info``. Invalid JSON, non-object JSON and unknown commands are
        answered with an ``error`` message.
        """
        try:
            data = json.loads(message)
        except json.JSONDecodeError:
            logger.error("Invalid JSON message received")
            await self._send_error(websocket, "Invalid JSON message")
            return

        if not isinstance(data, dict):
            # e.g. "[1, 2]" or "42": valid JSON but it cannot carry a command.
            logger.error("JSON message is not an object")
            await self._send_error(websocket, "JSON message must be an object")
            return

        command = data.get('command')

        if command == 'ping':
            await websocket.send(json.dumps({
                "type": "pong",
                "timestamp": asyncio.get_running_loop().time()
            }))

        elif command == 'reset':
            # Reset recognizer
            recognizer.Reset()
            await websocket.send(json.dumps({
                "type": "status",
                "message": "Recognizer reset"
            }))

        elif command == 'get_info':
            await websocket.send(json.dumps({
                "type": "server_info",
                "sample_rate": self.sample_rate,
                "has_ffmpeg": self.has_ffmpeg,
                "model_path": self.model_path
            }))

        else:
            logger.warning("Unknown command received: %r", command)
            await self._send_error(websocket, f"Unknown command: {command!r}")

    async def start_server(self, host="0.0.0.0", port=DEFAULT_PORT):
        """Start the WebSocket server and serve until cancelled.

        Args:
            host: Interface to bind to.
            port: TCP port to listen on.

        Raises:
            Exception: Any error raised while starting or running the server
                is logged and re-raised.
        """
        logger.info(f"Starting Vosk STT WebSocket server on {host}:{port}")
        logger.info(f"Using model: {self.model_path}")
        logger.info(f"Sample rate: {self.sample_rate}")
        logger.info(f"FFmpeg available: {self.has_ffmpeg}")

        try:
            async with websockets.serve(self.handle_client, host, port):
                logger.info("Server started successfully!")
                logger.info("Waiting for client connections...")
                logger.info("Press Ctrl+C to stop the server")

                # Keep server running
                await asyncio.Future()  # run forever

        except Exception as e:
            logger.error(f"Server error: {e}")
            raise


def main(argv=None):
    """Main entry point

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on a clean stop (including Ctrl+C), 1 if the
        server could not be created or failed while running.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Improved Vosk STT WebSocket Server')
    parser.add_argument('--host', default='0.0.0.0', help='Host to bind to')
    # Same default as start_server(), so a bare invocation listens on it.
    parser.add_argument('--port', type=int,
                        default=ImprovedVoskSTTServer.DEFAULT_PORT,
                        help='Port to bind to')
    parser.add_argument('--model', default='vosk-model-small-en-us-0.15',
                        help='Path to Vosk model directory')
    parser.add_argument('--sample-rate', type=int, default=16000,
                        help='Audio sample rate')

    args = parser.parse_args(argv)

    try:
        # Create and start server
        server = ImprovedVoskSTTServer(model_path=args.model,
                                       sample_rate=args.sample_rate)
        asyncio.run(server.start_server(host=args.host, port=args.port))

    except KeyboardInterrupt:
        logger.info("Server stopped by user")
    except Exception as e:
        logger.error(f"Server failed to start: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to exist in every interpreter configuration.
    raise SystemExit(main())
/dev/null +++ b/model/graph/disambig_tid.int @@ -0,0 +1,17 @@ +10015 +10016 +10017 +10018 +10019 +10020 +10021 +10022 +10023 +10024 +10025 +10026 +10027 +10028 +10029 +10030 +10031 diff --git a/model/graph/phones/word_boundary.int b/model/graph/phones/word_boundary.int new file mode 100644 index 0000000..df23fd7 --- /dev/null +++ b/model/graph/phones/word_boundary.int @@ -0,0 +1,166 @@ +1 nonword +2 begin +3 end +4 internal +5 singleton +6 nonword +7 begin +8 end +9 internal +10 singleton +11 begin +12 end +13 internal +14 singleton +15 begin +16 end +17 internal +18 singleton +19 begin +20 end +21 internal +22 singleton +23 begin +24 end +25 internal +26 singleton +27 begin +28 end +29 internal +30 singleton +31 begin +32 end +33 internal +34 singleton +35 begin +36 end +37 internal +38 singleton +39 begin +40 end +41 internal +42 singleton +43 begin +44 end +45 internal +46 singleton +47 begin +48 end +49 internal +50 singleton +51 begin +52 end +53 internal +54 singleton +55 begin +56 end +57 internal +58 singleton +59 begin +60 end +61 internal +62 singleton +63 begin +64 end +65 internal +66 singleton +67 begin +68 end +69 internal +70 singleton +71 begin +72 end +73 internal +74 singleton +75 begin +76 end +77 internal +78 singleton +79 begin +80 end +81 internal +82 singleton +83 begin +84 end +85 internal +86 singleton +87 begin +88 end +89 internal +90 singleton +91 begin +92 end +93 internal +94 singleton +95 begin +96 end +97 internal +98 singleton +99 begin +100 end +101 internal +102 singleton +103 begin +104 end +105 internal +106 singleton +107 begin +108 end +109 internal +110 singleton +111 begin +112 end +113 internal +114 singleton +115 begin +116 end +117 internal +118 singleton +119 begin +120 end +121 internal +122 singleton +123 begin +124 end +125 internal +126 singleton +127 begin +128 end +129 internal +130 singleton +131 begin +132 end +133 internal +134 singleton +135 begin +136 end +137 internal +138 singleton +139 begin +140 end +141 
internal +142 singleton +143 begin +144 end +145 internal +146 singleton +147 begin +148 end +149 internal +150 singleton +151 begin +152 end +153 internal +154 singleton +155 begin +156 end +157 internal +158 singleton +159 begin +160 end +161 internal +162 singleton +163 begin +164 end +165 internal +166 singleton diff --git a/model/ivector/final.dubm b/model/ivector/final.dubm new file mode 100644 index 0000000..db789eb Binary files /dev/null and b/model/ivector/final.dubm differ diff --git a/model/ivector/final.ie b/model/ivector/final.ie new file mode 100644 index 0000000..93737bf Binary files /dev/null and b/model/ivector/final.ie differ diff --git a/model/ivector/final.mat b/model/ivector/final.mat new file mode 100644 index 0000000..c3ec635 Binary files /dev/null and b/model/ivector/final.mat differ diff --git a/model/ivector/global_cmvn.stats b/model/ivector/global_cmvn.stats new file mode 100644 index 0000000..b9d92ef --- /dev/null +++ b/model/ivector/global_cmvn.stats @@ -0,0 +1,3 @@ + [ + 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 + 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 
4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] diff --git a/model/ivector/online_cmvn.conf b/model/ivector/online_cmvn.conf new file mode 100644 index 0000000..7748a4a --- /dev/null +++ b/model/ivector/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/model/ivector/splice.conf b/model/ivector/splice.conf new file mode 100644 index 0000000..960cd2e --- /dev/null +++ b/model/ivector/splice.conf @@ -0,0 +1,2 @@ +--left-context=3 +--right-context=3 diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..6f983d5 --- /dev/null +++ b/todo.md @@ -0,0 +1,28 @@ +# Audio STT Streaming Project Todos + +## Frontend Tasks +- [ ] Create audio capture interface with start/stop recording +- [ ] Implement WebSocket connection to server +- [ ] Stream audio data in real-time to server +- [ ] Display incoming transcribed text from server +- [ ] Add audio visualization (optional) +- [ ] Handle connection errors and reconnection + +## Backend Tasks +- [ ] Set up WebSocket server (Node.js/Python) +- [ ] Integrate Vosk STT engine +- [ ] Handle incoming audio stream processing +- [ ] Stream transcribed text back to client +- [ ] Add error handling and logging +- [ ] Create deployment documentation + +## Server Setup +- [ ] Create Python server with Vosk integration +- [ ] Add WebSocket support for real-time communication +- [ ] Configure audio format handling (WAV/PCM) +- [ ] Test with different audio sample rates + +## Deployment +- [ ] Create VPS deployment guide +- [ ] Add environment configuration +- [ ] Test end-to-end functionality