From 9989eeb879efc36e6daf39d029e557d0036c2ebb Mon Sep 17 00:00:00 2001 From: Kar Date: Thu, 5 Jun 2025 14:31:34 +0530 Subject: [PATCH] init --- .claude/settings.local.json | 12 +++ .dockerignore | 11 ++ .gitignore | 52 +++++++++ .yarnrc | 1 + CLAUDE.md | 51 +++++++++ Dockerfile | 41 ++++++++ README.md | 136 ++++++++++++++++++++++++ client-example.js | 71 +++++++++++++ docker-compose.yml | 16 +++ package.json | 27 +++++ public/app.js | 205 ++++++++++++++++++++++++++++++++++++ public/index.html | 99 +++++++++++++++++ requirements.md | 24 +++++ requirements.txt | 3 + server.js | 144 +++++++++++++++++++++++++ speech_processor.py | 122 +++++++++++++++++++++ 16 files changed, 1015 insertions(+) create mode 100644 .claude/settings.local.json create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 .yarnrc create mode 100644 CLAUDE.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 client-example.js create mode 100644 docker-compose.yml create mode 100644 package.json create mode 100644 public/app.js create mode 100644 public/index.html create mode 100644 requirements.md create mode 100644 requirements.txt create mode 100644 server.js create mode 100644 speech_processor.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..e0a88e5 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,12 @@ +{ + "permissions": { + "allow": [ + "Bash(mkdir:*)", + "Bash(ls:*)", + "Bash(unzip:*)", + "Bash(mv:*)", + "Bash(docker-compose up:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2603746 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +node_modules +npm-debug.log +Dockerfile +.dockerignore +.git +.gitignore +README.md +.env +.nyc_output +coverage +.vscode \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68607f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +vosk-model + +# dependencies +/node_modules +/.pnp +.pnp.js +/.yarn/* +!/.yarn/releases +!/.yarn/plugins +!/.yarn/sdks + +# testing +/coverage + +# next.js +/.next/ +/out/ +public/sitemap.xml +.vercel + +# production +/build +*.xml + +# rss feed +/public/feed.xml + +# search +/public/search.json + +# misc +.DS_Store +.idea + +# debug +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# local env files +.env +.env.local +.env.development.local +.env.test.local +.env.production.local + +# Contentlayer +.contentlayer + diff --git a/.yarnrc b/.yarnrc new file mode 100644 index 0000000..6c8b0a1 --- /dev/null +++ b/.yarnrc @@ -0,0 +1 @@ +registry "https://registry.yarnpkg.com" \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..8bc3e4d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,51 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a speech-to-text proof of concept that runs entirely locally without third-party APIs. The system captures live microphone audio from a browser, sends it to a backend server, and converts it to text using open-source libraries like Vosk. 
+
+## Architecture
+
+The project consists of two main components:
+- **Frontend**: Basic HTML page with JavaScript for microphone capture and audio streaming
+- **Backend**: Server (Node.js or Python) that receives audio streams and performs speech-to-text conversion using local libraries
+
+## Development Environment
+
+- The user runs the fish shell
+- All processing must be local (no cloud services)
+- The system should use local hardware for speech recognition
+
+## Key Implementation Requirements
+
+- Real-time or near-real-time audio streaming from browser to backend
+- Local speech-to-text processing using libraries like Vosk
+- Display transcribed text on the frontend UI
+- Start/stop recording functionality
+- WebSocket or similar real-time communication between frontend and backend
+
+## Development Commands
+
+### Docker (Recommended)
+- `docker-compose up --build` - Build and start the application
+- `docker-compose down` - Stop the application
+
+### Local Development
+- `yarn install` - Install dependencies (Yarn is configured)
+- `yarn start` - Start the server
+- `yarn dev` - Start with nodemon for development
+
+## Technology Stack
+
+- **Backend**: Node.js with Express and a WebSocket server
+- **Frontend**: HTML5 + JavaScript with AudioWorklet for audio capture
+- **Speech Recognition**: Vosk library (Python) for local processing
+- **Communication**: WebSocket for real-time audio streaming and transcription
+
+## Setup Requirements
+
+- Download a Vosk model to the `./vosk-model/` directory
+- Server runs on http://localhost:3000
+- WebSocket API available at `ws://localhost:3000` for external clients
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e0f0b25
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,41 @@
+FROM node:18
+
+# Install Python and required dependencies
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    build-essential \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy Python requirements and install them in a virtual environment
+COPY requirements.txt ./
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+RUN pip install -r requirements.txt
+
+# Copy package files
+COPY package.json yarn.lock* ./
+COPY .yarnrc ./
+
+# Install Node.js dependencies (no yarn.lock is committed, so a
+# --frozen-lockfile install would fail; use a plain install)
+RUN yarn install
+
+# Copy application code
+COPY . .
+
+# Make Python script executable
+RUN chmod +x speech_processor.py
+
+# Expose port
+EXPOSE 3000
+
+# Ensure the virtual environment stays active at runtime
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Start the application
+CMD ["yarn", "start"]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..07de776
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
+# Speech-to-Text POC
+
+A speech-to-text proof of concept that processes audio locally using Vosk without requiring cloud APIs. The system exposes a WebSocket API that any client can connect to for real-time speech recognition.
+
+## Features
+
+- **Local Processing**: Uses Vosk for offline speech recognition
+- **WebSocket API**: Server exposes `ws://localhost:3000` for any client to connect
+- **Web Interface**: Browser-based demo for testing
+- **Docker Support**: Complete containerized solution
+- **No Cloud Dependencies**: Everything runs locally
+
+## Quick Start
+
+1. 
**Download Vosk model:** + ```bash + curl -L -o vosk-model-small-en-us-0.15.zip https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip + unzip vosk-model-small-en-us-0.15.zip + mv vosk-model-small-en-us-0.15 vosk-model + ``` + +2. **Start with Docker:** + ```bash + docker-compose up --build + ``` + +3. **Test the web interface:** + - Open `http://localhost:3000` in your browser + - Click "Start Recording" and speak + - See transcriptions appear in real-time + +## WebSocket API Usage + +The server exposes a WebSocket endpoint at `ws://localhost:3000` that accepts: + +- **Input**: Raw WAV audio data (16kHz, 16-bit, mono) +- **Output**: JSON messages with transcriptions + +### Example Client Usage + +```javascript +const WebSocket = require('ws'); +const fs = require('fs'); + +const ws = new WebSocket('ws://localhost:3000'); + +ws.on('open', () => { + // Send WAV audio file + const audioData = fs.readFileSync('audio.wav'); + ws.send(audioData); +}); + +ws.on('message', (data) => { + const message = JSON.parse(data); + if (message.type === 'transcription') { + console.log('Text:', message.text); + } +}); +``` + +See `client-example.js` for a complete Node.js client implementation. + +## Local Development Setup + +### Prerequisites +- Node.js 14+ +- Python 3.8+ +- Vosk model (downloaded as above) + +### Installation + +1. **Install Node.js dependencies:** + ```bash + yarn install + ``` + +2. **Install Python dependencies:** + ```bash + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -r requirements.txt + ``` + +3. **Start the server:** + ```bash + yarn start + ``` + +## Architecture + +- **Backend**: Node.js Express server with WebSocket support +- **Speech Processing**: Python subprocess using Vosk library +- **Frontend**: HTML5 + JavaScript with AudioWorklet for microphone capture +- **Communication**: WebSocket for bidirectional real-time communication + +## Supported Audio Formats + +- **Input**: WAV files (16kHz, 16-bit, mono preferred) +- **Browser**: Automatic conversion from microphone input +- **API**: Raw audio buffers or WAV format + +## Performance Notes + +- **Model Size**: Small model (~39MB) for fast loading +- **Latency**: Near real-time processing depending on audio chunk size +- **Accuracy**: Good for clear speech, may vary with background noise +- **Resource Usage**: Lightweight, suitable for local deployment + +## Troubleshooting + +### Common Issues + +1. **Model not found**: Ensure Vosk model is extracted to `./vosk-model/` directory +2. **Python errors**: Check that virtual environment is activated and dependencies installed +3. **WebSocket connection fails**: Verify server is running on port 3000 +4. 
**No audio**: Check browser microphone permissions + +### Docker Issues + +- **Build failures**: Ensure you have enough disk space for the image +- **Model mounting**: Verify `./vosk-model/` exists before running docker-compose +- **Permission errors**: Check file permissions on the vosk-model directory + +## Development + +- **Server logs**: `docker-compose logs -f` to see real-time logs +- **Rebuild**: `docker-compose up --build` after code changes +- **Stop**: `docker-compose down` to stop all services + +## Model Information + +- **Current**: Vosk Small English US (0.15) +- **Size**: ~39MB +- **Languages**: English (US) +- **Accuracy**: Optimized for speed over accuracy +- **Alternatives**: See [Vosk Models](https://alphacephei.com/vosk/models) for other languages/sizes \ No newline at end of file diff --git a/client-example.js b/client-example.js new file mode 100644 index 0000000..dc0f0fc --- /dev/null +++ b/client-example.js @@ -0,0 +1,71 @@ +// Example client that can connect to the WebSocket STT API +const WebSocket = require('ws'); +const fs = require('fs'); + +class STTClient { + constructor(serverUrl = 'ws://localhost:3000') { + this.ws = new WebSocket(serverUrl); + this.setupWebSocket(); + } + + setupWebSocket() { + this.ws.on('open', () => { + console.log('Connected to STT server'); + }); + + this.ws.on('message', (data) => { + const message = JSON.parse(data); + + if (message.type === 'transcription') { + console.log('Transcription:', message.text); + } else if (message.type === 'error') { + console.error('STT Error:', message.message); + } + }); + + this.ws.on('close', () => { + console.log('Disconnected from STT server'); + }); + + this.ws.on('error', (error) => { + console.error('WebSocket error:', error); + }); + } + + // Send audio file for transcription + sendAudioFile(filePath) { + if (this.ws.readyState === WebSocket.OPEN) { + const audioData = fs.readFileSync(filePath); + this.ws.send(audioData); + console.log(`Sent audio file: ${filePath}`); + } else { + console.error('WebSocket not connected'); + } + } + + // Send raw audio buffer + sendAudioBuffer(audioBuffer) { + if (this.ws.readyState === WebSocket.OPEN) { + this.ws.send(audioBuffer); + } else { + console.error('WebSocket not connected'); + } + } + + close() { + this.ws.close(); + } +} + +// Example usage +if (require.main === module) { + const client = new STTClient(); + + // Example: Send an audio file + // client.sendAudioFile('./test-audio.wav'); + + // Keep the process alive + process.stdin.resume(); +} + +module.exports = STTClient; \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1a19a87 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +services: + stt-app: + build: . 
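+    # NOTE: download and extract a Vosk model to ./vosk-model on the host
+    # before `docker-compose up` (see the README Quick Start); it is mounted
+    # read-only below.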
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./public:/app/public
+      - ./vosk-model:/app/vosk-model:ro
+    environment:
+      - NODE_ENV=development
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
\ No newline at end of file
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..1483fc4
--- /dev/null
+++ b/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "stt-simple",
+  "version": "1.0.0",
+  "description": "Simple Speech-to-Text POC using local libraries",
+  "main": "server.js",
+  "scripts": {
+    "start": "node server.js",
+    "dev": "nodemon server.js"
+  },
+  "dependencies": {
+    "ws": "^8.14.2",
+    "express": "^4.18.2",
+    "node-wav": "^0.0.2"
+  },
+  "devDependencies": {
+    "nodemon": "^3.0.2"
+  },
+  "keywords": [
+    "speech-to-text",
+    "vosk",
+    "websockets"
+  ],
+  "author": "",
+  "license": "MIT",
+  "packageManager": "yarn@1.22.22+sha512.a6b2f7906b721bba3d67d4aff083df04dad64c399707841b7acf00f6b133b7ac24255f2652fa22ae3534329dc6180534e98d17432037ff6fd140556e2bb3137e"
+}
diff --git a/public/app.js b/public/app.js
new file mode 100644
index 0000000..4bea216
--- /dev/null
+++ b/public/app.js
@@ -0,0 +1,205 @@
+class SpeechToTextApp {
+    constructor() {
+        this.ws = null;
+        this.audioContext = null;
+        this.processor = null;
+        this.stream = null;
+        this.isRecording = false;
+
+        this.startBtn = document.getElementById('startBtn');
+        this.stopBtn = document.getElementById('stopBtn');
+        this.clearBtn = document.getElementById('clearBtn');
+        this.status = document.getElementById('status');
+        this.transcription = document.getElementById('transcription');
+
+        this.initializeEventListeners();
+        this.connectWebSocket();
+    }
+
+    initializeEventListeners() {
+        this.startBtn.addEventListener('click', () => this.startRecording());
+        this.stopBtn.addEventListener('click', () => this.stopRecording());
+        this.clearBtn.addEventListener('click', () => this.clearTranscription());
+    }
+
+    connectWebSocket() {
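+        // Match the page's security context: browsers block plain ws:// on
+        // pages served over HTTPS (mixed content).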
+        const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+        const wsUrl = `${wsProtocol}//${window.location.host}`;
+
+        this.ws = new WebSocket(wsUrl);
+
+        this.ws.onopen = () => {
+            this.updateStatus('Connected to server', 'success');
+        };
+
+        this.ws.onmessage = (event) => {
+            const data = JSON.parse(event.data);
+            if (data.type === 'transcription' && data.text) {
+                this.appendTranscription(data.text);
+            }
+        };
+
+        this.ws.onclose = () => {
+            this.updateStatus('Disconnected from server', 'error');
+            // Reconnect automatically after 3 seconds
+            setTimeout(() => this.connectWebSocket(), 3000);
+        };
+
+        this.ws.onerror = (error) => {
+            this.updateStatus('WebSocket error', 'error');
+        };
+    }
+
+    async startRecording() {
+        try {
+            this.stream = await navigator.mediaDevices.getUserMedia({
+                audio: {
+                    sampleRate: 16000,
+                    channelCount: 1,
+                    echoCancellation: true,
+                    noiseSuppression: true
+                }
+            });
+
+            this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
+                sampleRate: 16000
+            });
+
+            const source = this.audioContext.createMediaStreamSource(this.stream);
+
+            // The worklet is inlined via a data: URL so no separate processor
+            // script has to be served.
+            await this.audioContext.audioWorklet.addModule('data:text/javascript,' + encodeURIComponent(`
+                class AudioProcessor extends AudioWorkletProcessor {
+                    constructor() {
+                        super();
+                        this.bufferSize = 4096; // 4096 samples at 16 kHz ~ 256 ms per chunk
+                        this.buffer = new Float32Array(this.bufferSize);
+                        this.bufferIndex = 0;
+                    }
+
+                    process(inputs) {
+                        const input = inputs[0];
+                        if (input.length > 0) {
+                            const audioData = input[0];
+
+                            for (let i = 0; i < audioData.length; i++) {
+                                this.buffer[this.bufferIndex] = audioData[i];
+                                this.bufferIndex++;
+
+                                if (this.bufferIndex >= this.bufferSize) {
+                                    // Convert float samples to 16-bit PCM
+                                    const int16Array = new Int16Array(this.bufferSize);
+                                    for (let j = 0; j < this.bufferSize; j++) {
+                                        int16Array[j] = Math.max(-32768, Math.min(32767, this.buffer[j] * 32768));
+                                    }
+
+                                    // Wrap the chunk in a WAV container
+                                    const wavBuffer = this.createWAVBuffer(int16Array);
+                                    this.port.postMessage(wavBuffer);
+
+                                    this.bufferIndex = 0;
+                                }
+                            }
+                        }
+                        return true;
+                    }
+
+                    createWAVBuffer(samples) {
+                        const length = samples.length;
+                        const buffer = new ArrayBuffer(44 + length * 2);
+                        const view = new DataView(buffer);
+
+                        // WAV header
+                        const writeString = (offset, string) => {
+                            for (let i = 0; i < string.length; i++) {
+                                view.setUint8(offset + i, string.charCodeAt(i));
+                            }
+                        };
+
+                        writeString(0, 'RIFF');
+                        view.setUint32(4, 36 + length * 2, true);
+                        writeString(8, 'WAVE');
+                        writeString(12, 'fmt ');
+                        view.setUint32(16, 16, true);        // fmt chunk size
+                        view.setUint16(20, 1, true);         // PCM format
+                        view.setUint16(22, 1, true);         // mono
+                        view.setUint32(24, 16000, true);     // sample rate
+                        view.setUint32(28, 16000 * 2, true); // byte rate
+                        view.setUint16(32, 2, true);         // block align
+                        view.setUint16(34, 16, true);        // bits per sample
+                        writeString(36, 'data');
+                        view.setUint32(40, length * 2, true);
+
+                        // Convert samples to bytes
+                        let offset = 44;
+                        for (let i = 0; i < length; i++) {
+                            view.setInt16(offset, samples[i], true);
+                            offset += 2;
+                        }
+
+                        return buffer;
+                    }
+                }
+                registerProcessor('audio-processor', AudioProcessor);
+            `));
+
+            this.processor = new AudioWorkletNode(this.audioContext, 'audio-processor');
+
+            this.processor.port.onmessage = (event) => {
+                if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+                    this.ws.send(event.data);
+                }
+            };
+
+            source.connect(this.processor);
+
+            this.isRecording = true;
+            this.startBtn.disabled = true;
+            this.stopBtn.disabled = false;
+            this.startBtn.textContent = 'Recording...';
+            this.startBtn.classList.add('recording');
+            this.updateStatus('🔴 Recording...', 'success');
+
+        } catch (error) {
+            this.updateStatus('Error accessing microphone: ' + error.message, 'error');
+            console.error('Error starting recording:', error);
+        }
+    }
+
+    stopRecording() {
+        if (this.stream) {
+            this.stream.getTracks().forEach(track => track.stop());
+        }
+
+        if (this.audioContext) {
+            this.audioContext.close();
+        }
+
+        this.isRecording = false;
+        this.startBtn.disabled = false;
+        this.stopBtn.disabled = true;
+        this.startBtn.textContent = 'Start Recording';
+        this.startBtn.classList.remove('recording');
+        this.updateStatus('Recording stopped', 'success');
+    }
+
+    clearTranscription() {
+        this.transcription.textContent = 'Transcribed text will appear here...';
+    }
+
+    appendTranscription(text) {
+        if (this.transcription.textContent === 'Transcribed text will appear here...') {
+            this.transcription.textContent = '';
+        }
+        this.transcription.textContent += text + ' ';
+        this.transcription.scrollTop = this.transcription.scrollHeight;
+    }
+
+    updateStatus(message, type = '') {
+        this.status.textContent = message;
+        this.status.className = `status ${type}`;
+    }
+}
+
+document.addEventListener('DOMContentLoaded', () => {
+    new SpeechToTextApp();
+});
\ No newline at end of file
diff --git a/public/index.html b/public/index.html
new file mode 100644
index 0000000..1e56940
--- /dev/null
+++ b/public/index.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Speech-to-Text POC</title>
+    <style>
+        /* styles for the container, buttons, status, and transcription box (not recovered) */
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>🎙️ Speech-to-Text POC</h1>
+
+        <div class="controls">
+            <button id="startBtn">Start Recording</button>
+            <button id="stopBtn" disabled>Stop Recording</button>
+            <button id="clearBtn">Clear</button>
+        </div>
+
+        <div id="status" class="status">Ready to record</div>
+
+        <div id="transcription" class="transcription">
+            Transcribed text will appear here...
+        </div>
+    </div>
+
+    <script src="app.js"></script>
+</body>
+</html>
\ No newline at end of file
diff --git a/requirements.md b/requirements.md
new file mode 100644
index 0000000..90f47eb
--- /dev/null
+++ b/requirements.md
@@ -0,0 +1,24 @@
+### 🧩 **Requirement: Speech-to-Text POC (No 3rd-Party APIs)**
+
+#### **Goal**
+
+Build a simple proof of concept (POC) that captures live microphone audio from the browser, sends it to a backend server, converts the audio to text using an open-source/local library, and displays the text on the UI.
+
+#### **Key Points**
+
+* A basic `index.html` page to:
+
+  * Start/stop microphone recording.
+  * Stream audio to the backend.
+  * Display the transcribed text in real time or after processing.
+* A backend server (e.g., Node.js or Python) that:
+
+  * Receives the audio stream.
+  * Uses a **local speech-to-text library** (e.g., [Vosk](https://alphacephei.com/vosk/)) — **no external APIs**.
+  * Sends the transcribed text back to the frontend.
+
+#### **Note**
+
+* I am using the fish shell.
+* The solution should run locally and use the system's hardware.
+* Avoid any third-party cloud services.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..757a20a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+vosk==0.3.45
+soundfile==0.12.1
+numpy==1.24.3
\ No newline at end of file
diff --git a/server.js b/server.js
new file mode 100644
index 0000000..c6484a2
--- /dev/null
+++ b/server.js
@@ -0,0 +1,144 @@
+const express = require('express');
+const WebSocket = require('ws');
+const { spawn } = require('child_process');
+
+const app = express();
+const PORT = 3000;
+
+app.use(express.static('public'));
+
+const server = app.listen(PORT, () => {
+    console.log(`Server running on http://localhost:${PORT}`);
+    console.log('Using a Python subprocess with Vosk for local STT');
+});
+
+const wss = new WebSocket.Server({ server });
+
+class SpeechProcessor {
+    constructor() {
+        this.pythonProcess = null;
+        this.initializePythonProcess();
+    }
+
+    initializePythonProcess() {
+        try {
+            this.pythonProcess = spawn('python3', ['speech_processor.py'], {
+                stdio: ['pipe', 'pipe', 'pipe']
+            });
+
+            this.pythonProcess.stderr.on('data', (data) => {
+                console.error('Python process error:', data.toString());
+            });
+
+            this.pythonProcess.on('close', (code) => {
+                console.log(`Python process closed with code ${code}`);
+                // Restart the process if it dies
+                setTimeout(() => this.initializePythonProcess(), 1000);
+            });
+
+            console.log('Python speech processor initialized');
+        } catch (error) {
+            console.error('Failed to initialize Python process:', error);
+        }
+    }
+
+    async processAudio(audioBuffer) {
+        return new Promise((resolve, reject) => {
+            if (!this.pythonProcess) {
+                reject(new Error('Python process not available'));
+                return;
+            }
+
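+            // Frame format on the Node <-> Python pipe (both directions):
+            //   [4-byte big-endian length][payload]
+            // The request payload is WAV bytes; the response payload is
+            // UTF-8 JSON such as {"success": true, "text": "..."}.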
+            // Send audio data length first
+            const lengthBuffer = Buffer.allocUnsafe(4);
+            lengthBuffer.writeUInt32BE(audioBuffer.length, 0);
+            this.pythonProcess.stdin.write(lengthBuffer);
+
+            // Send audio data
+            this.pythonProcess.stdin.write(audioBuffer);
+
+            // Read response
+            let responseLength = null;
+            let responseData = Buffer.alloc(0);
+            let expecting = 'length';
+
+            const onData = (data) => {
+                responseData = Buffer.concat([responseData, data]);
+
+                if (expecting === 'length' && responseData.length >= 4) {
+                    responseLength = responseData.readUInt32BE(0);
+                    responseData = responseData.slice(4);
+                    expecting = 'data';
+                }
+
+                if (expecting === 'data' && responseData.length >= responseLength) {
+                    const jsonData = responseData.slice(0, responseLength);
+                    this.pythonProcess.stdout.removeListener('data', onData);
+
+                    try {
+                        const result = JSON.parse(jsonData.toString());
+                        resolve(result);
+                    } catch (error) {
+                        reject(error);
+                    }
+                }
+            };
+
+            this.pythonProcess.stdout.on('data', onData);
+
+            // Time out after 10 seconds
+            setTimeout(() => {
+                this.pythonProcess.stdout.removeListener('data', onData);
+                reject(new Error('Speech processing timeout'));
+            }, 10000);
+        });
+    }
+}
+
+const speechProcessor = new SpeechProcessor();
+
+wss.on('connection', (ws) => {
+    console.log('Client connected');
+
+    ws.on('message', async (data) => {
+        try {
+            if (Buffer.isBuffer(data)) {
+                // Raw audio data received
+                const result = await speechProcessor.processAudio(data);
+
+                if (result.success && result.text) {
+                    ws.send(JSON.stringify({
+                        type: 'transcription',
+                        text: result.text
+                    }));
+                    console.log('Transcription:', result.text);
+                } else if (!result.success) {
+                    console.error('STT Error:', result.error);
+                    ws.send(JSON.stringify({
+                        type: 'error',
+                        message: result.error
+                    }));
+                }
+            } else {
+                // JSON message received
+                const message = JSON.parse(data);
+                console.log('Received message:', message);
+            }
+        } catch (error) {
+            console.error('Error processing message:', error);
+            ws.send(JSON.stringify({
+                type: 'error',
+                message: 'Error processing audio'
+            }));
+        }
+    });
+
+    ws.on('close', () => {
+        console.log('Client disconnected');
+    });
+
+    ws.on('error', (error) => {
+        console.error('WebSocket error:', error);
+    });
+});
\ No newline at end of file
diff --git a/speech_processor.py b/speech_processor.py
new file mode 100644
index 0000000..7c95c23
--- /dev/null
+++ b/speech_processor.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+import json
+import os
+import sys
+import tempfile
+
+import numpy as np
+import soundfile as sf
+import vosk
+
+# The Docker image mounts the model at /app/vosk-model; allow an override so
+# the script also works for local (non-Docker) runs.
+MODEL_PATH = os.environ.get("VOSK_MODEL_PATH", "/app/vosk-model")
+
+# Global model - load once
+model = None
+recognizer = None
+
+def initialize_vosk():
+    """Initialize the Vosk model."""
+    global model, recognizer
+
+    if not os.path.exists(MODEL_PATH):
+        return {"success": False, "error": f"Vosk model not found at {MODEL_PATH}"}
+
+    try:
+        vosk.SetLogLevel(-1)  # Reduce log verbosity
+        model = vosk.Model(MODEL_PATH)
+        recognizer = vosk.KaldiRecognizer(model, 16000)
+        return {"success": True}
+    except Exception as e:
+        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
+
+def process_audio_chunk(audio_data):
+    """Process audio data and return a transcription."""
+    global recognizer
+
+    if not recognizer:
+        init_result = initialize_vosk()
+        if not init_result["success"]:
+            return init_result
+
+    try:
+        # Write audio data to a temporary file
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+            temp_file.write(audio_data)
+            temp_filename = temp_file.name
+
+        # Read the audio file with soundfile
+        try:
+            samples, sample_rate = sf.read(temp_filename)
+
+            # Resample to 16 kHz if needed
+            if sample_rate != 16000:
+                # Simple linear-interpolation resampling (for better quality, use librosa)
+                samples = np.interp(
+                    np.linspace(0, len(samples), int(len(samples) * 16000 / sample_rate)),
+                    np.arange(len(samples)),
+                    samples
+                )
+
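+            # The KaldiRecognizer above was created for 16 kHz input and
+            # expects 16-bit mono PCM, hence the int16 scaling below.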
+            # Convert to 16-bit PCM bytes
+            audio_bytes = (samples * 32767).astype('int16').tobytes()
+
+            # Process with Vosk
+            if recognizer.AcceptWaveform(audio_bytes):
+                result = json.loads(recognizer.Result())
+                text = result.get('text', '')
+            else:
+                result = json.loads(recognizer.PartialResult())
+                text = result.get('partial', '')
+
+            # Clean up
+            os.unlink(temp_filename)
+
+            return {"success": True, "text": text}
+
+        except Exception as e:
+            os.unlink(temp_filename)
+            return {"success": False, "error": f"Audio processing error: {str(e)}"}
+
+    except Exception as e:
+        return {"success": False, "error": f"General error: {str(e)}"}
+
+def main():
+    """Main loop to process audio chunks from stdin."""
+    # Initialize Vosk on startup
+    init_result = initialize_vosk()
+    if not init_result["success"]:
+        error_response = json.dumps(init_result).encode('utf-8')
+        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
+        sys.stdout.buffer.write(error_response)
+        sys.stdout.buffer.flush()
+        sys.exit(1)
+
+    while True:
+        try:
+            # Read the length of the incoming data
+            length_data = sys.stdin.buffer.read(4)
+            if not length_data:
+                break
+
+            length = int.from_bytes(length_data, byteorder='big')
+
+            # Read the audio data
+            audio_data = sys.stdin.buffer.read(length)
+
+            # Process the audio
+            result = process_audio_chunk(audio_data)
+
+            # Send the result back
+            response = json.dumps(result).encode('utf-8')
+            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(response)
+            sys.stdout.buffer.flush()
+
+        except Exception as e:
+            error_result = {"success": False, "error": str(e)}
+            response = json.dumps(error_result).encode('utf-8')
+            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(response)
+            sys.stdout.buffer.flush()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
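---

Reviewer note: the length-prefixed pipe protocol above can be smoke-tested without the web UI or Docker. The sketch below is illustrative only; `test.wav` is a hypothetical 16 kHz, 16-bit mono recording, and `VOSK_MODEL_PATH` is the override that speech_processor.py reads:

```python
import json
import os
import struct
import subprocess

# Launch the processor, pointing it at a locally extracted model.
env = {**os.environ, "VOSK_MODEL_PATH": "./vosk-model"}
proc = subprocess.Popen(
    ["python3", "speech_processor.py"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    env=env,
)

with open("test.wav", "rb") as f:  # hypothetical sample recording
    wav = f.read()

# Request frame: [4-byte big-endian length][WAV bytes]
proc.stdin.write(struct.pack(">I", len(wav)) + wav)
proc.stdin.flush()

# Response frame: [4-byte big-endian length][UTF-8 JSON]
(resp_len,) = struct.unpack(">I", proc.stdout.read(4))
print(json.loads(proc.stdout.read(resp_len)))

proc.kill()
```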