From 9989eeb879efc36e6daf39d029e557d0036c2ebb Mon Sep 17 00:00:00 2001 From: Kar Date: Thu, 5 Jun 2025 14:31:34 +0530 Subject: [PATCH] init --- .claude/settings.local.json | 12 +++ .dockerignore | 11 ++ .gitignore | 52 +++++++++ .yarnrc | 1 + CLAUDE.md | 51 +++++++++ Dockerfile | 41 ++++++++ README.md | 136 ++++++++++++++++++++++++ client-example.js | 71 +++++++++++++ docker-compose.yml | 16 +++ package.json | 27 +++++ public/app.js | 205 ++++++++++++++++++++++++++++++++++++ public/index.html | 99 +++++++++++++++++ requirements.md | 24 +++++ requirements.txt | 3 + server.js | 144 +++++++++++++++++++++++++ speech_processor.py | 122 +++++++++++++++++++++ 16 files changed, 1015 insertions(+) create mode 100644 .claude/settings.local.json create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 .yarnrc create mode 100644 CLAUDE.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 client-example.js create mode 100644 docker-compose.yml create mode 100644 package.json create mode 100644 public/app.js create mode 100644 public/index.html create mode 100644 requirements.md create mode 100644 requirements.txt create mode 100644 server.js create mode 100644 speech_processor.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..e0a88e5 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,12 @@ +{ + "permissions": { + "allow": [ + "Bash(mkdir:*)", + "Bash(ls:*)", + "Bash(unzip:*)", + "Bash(mv:*)", + "Bash(docker-compose up:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2603746 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +node_modules +npm-debug.log +Dockerfile +.dockerignore +.git +.gitignore +README.md +.env +.nyc_output +coverage +.vscode \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..68607f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +vosk-model + +# dependencies +/node_modules +/.pnp +.pnp.js +/.yarn/* +!/.yarn/releases +!/.yarn/plugins +!/.yarn/sdks + +# testing +/coverage + +# next.js +/.next/ +/out/ +public/sitemap.xml +.vercel + +# production +/build +*.xml + +# rss feed +/public/feed.xml + +# search +/public/search.json + +# misc +.DS_Store +.idea + +# debug +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# local env files +.env +.env.local +.env.development.local +.env.test.local +.env.production.local + +# Contentlayer +.contentlayer + diff --git a/.yarnrc b/.yarnrc new file mode 100644 index 0000000..6c8b0a1 --- /dev/null +++ b/.yarnrc @@ -0,0 +1 @@ +registry "https://registry.yarnpkg.com" \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..8bc3e4d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,51 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a speech-to-text proof of concept that runs entirely locally without third-party APIs. The system captures live microphone audio from a browser, sends it to a backend server, and converts it to text using open-source libraries like Vosk. 
+
+## Architecture
+
+The project consists of two main components:
+- **Frontend**: Basic HTML page with JavaScript for microphone capture and audio streaming
+- **Backend**: Server (Node.js or Python) that receives audio streams and performs speech-to-text conversion using local libraries
+
+## Development Environment
+
+- The user runs the fish shell
+- All processing must be local (no cloud services)
+- The system should use local hardware for speech recognition
+
+## Key Implementation Requirements
+
+- Real-time or near-real-time audio streaming from browser to backend
+- Local speech-to-text processing using libraries like Vosk
+- Display transcribed text on the frontend UI
+- Start/stop recording functionality
+- WebSocket or similar real-time communication between frontend and backend
+
+## Development Commands
+
+### Docker (Recommended)
+- `docker-compose up --build` - Build and start the application
+- `docker-compose down` - Stop the application
+
+### Local Development
+- `yarn install` - Install dependencies (Yarn is configured)
+- `yarn start` - Start the server
+- `yarn dev` - Start with nodemon for development
+
+## Technology Stack
+
+- **Backend**: Node.js with Express and a WebSocket server
+- **Frontend**: HTML5 + JavaScript with AudioWorklet for audio capture
+- **Speech Recognition**: Vosk library (Python) for local processing
+- **Communication**: WebSocket for real-time audio streaming and transcription
+
+## Setup Requirements
+
+- Download a Vosk model to the `./vosk-model/` directory
+- Server runs on http://localhost:3000
+- WebSocket API available at `ws://localhost:3000` for external clients
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e0f0b25
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,41 @@
+FROM node:18
+
+# Install Python and required dependencies
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    build-essential \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy Python requirements and install them in a virtual environment
+COPY requirements.txt ./
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+RUN pip install -r requirements.txt
+
+# Copy package files
+COPY package.json yarn.lock* ./
+COPY .yarnrc ./
+
+# Install Node.js dependencies (no yarn.lock is committed, so a
+# --frozen-lockfile install would fail; use a plain install)
+RUN yarn install
+
+# Copy application code
+COPY . .
+
+# Make Python script executable
+RUN chmod +x speech_processor.py
+
+# Expose port
+EXPOSE 3000
+
+# Ensure the virtual environment stays active at runtime
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Start the application
+CMD ["yarn", "start"]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..07de776
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
+# Speech-to-Text POC
+
+A speech-to-text proof of concept that processes audio locally using Vosk without requiring cloud APIs. The system exposes a WebSocket API that any client can connect to for real-time speech recognition.
+
+## Features
+
+- **Local Processing**: Uses Vosk for offline speech recognition
+- **WebSocket API**: Server exposes `ws://localhost:3000` for any client to connect
+- **Web Interface**: Browser-based demo for testing
+- **Docker Support**: Complete containerized solution
+- **No Cloud Dependencies**: Everything runs locally
+
+## Quick Start
+
+1. 
**Download Vosk model:** + ```bash + curl -L -o vosk-model-small-en-us-0.15.zip https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip + unzip vosk-model-small-en-us-0.15.zip + mv vosk-model-small-en-us-0.15 vosk-model + ``` + +2. **Start with Docker:** + ```bash + docker-compose up --build + ``` + +3. **Test the web interface:** + - Open `http://localhost:3000` in your browser + - Click "Start Recording" and speak + - See transcriptions appear in real-time + +## WebSocket API Usage + +The server exposes a WebSocket endpoint at `ws://localhost:3000` that accepts: + +- **Input**: Raw WAV audio data (16kHz, 16-bit, mono) +- **Output**: JSON messages with transcriptions + +### Example Client Usage + +```javascript +const WebSocket = require('ws'); +const fs = require('fs'); + +const ws = new WebSocket('ws://localhost:3000'); + +ws.on('open', () => { + // Send WAV audio file + const audioData = fs.readFileSync('audio.wav'); + ws.send(audioData); +}); + +ws.on('message', (data) => { + const message = JSON.parse(data); + if (message.type === 'transcription') { + console.log('Text:', message.text); + } +}); +``` + +See `client-example.js` for a complete Node.js client implementation. + +## Local Development Setup + +### Prerequisites +- Node.js 14+ +- Python 3.8+ +- Vosk model (downloaded as above) + +### Installation + +1. **Install Node.js dependencies:** + ```bash + yarn install + ``` + +2. **Install Python dependencies:** + ```bash + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -r requirements.txt + ``` + +3. **Start the server:** + ```bash + yarn start + ``` + +## Architecture + +- **Backend**: Node.js Express server with WebSocket support +- **Speech Processing**: Python subprocess using Vosk library +- **Frontend**: HTML5 + JavaScript with AudioWorklet for microphone capture +- **Communication**: WebSocket for bidirectional real-time communication + +## Supported Audio Formats + +- **Input**: WAV files (16kHz, 16-bit, mono preferred) +- **Browser**: Automatic conversion from microphone input +- **API**: Raw audio buffers or WAV format + +## Performance Notes + +- **Model Size**: Small model (~39MB) for fast loading +- **Latency**: Near real-time processing depending on audio chunk size +- **Accuracy**: Good for clear speech, may vary with background noise +- **Resource Usage**: Lightweight, suitable for local deployment + +## Troubleshooting + +### Common Issues + +1. **Model not found**: Ensure Vosk model is extracted to `./vosk-model/` directory +2. **Python errors**: Check that virtual environment is activated and dependencies installed +3. **WebSocket connection fails**: Verify server is running on port 3000 +4. 
**No audio**: Check browser microphone permissions + +### Docker Issues + +- **Build failures**: Ensure you have enough disk space for the image +- **Model mounting**: Verify `./vosk-model/` exists before running docker-compose +- **Permission errors**: Check file permissions on the vosk-model directory + +## Development + +- **Server logs**: `docker-compose logs -f` to see real-time logs +- **Rebuild**: `docker-compose up --build` after code changes +- **Stop**: `docker-compose down` to stop all services + +## Model Information + +- **Current**: Vosk Small English US (0.15) +- **Size**: ~39MB +- **Languages**: English (US) +- **Accuracy**: Optimized for speed over accuracy +- **Alternatives**: See [Vosk Models](https://alphacephei.com/vosk/models) for other languages/sizes \ No newline at end of file diff --git a/client-example.js b/client-example.js new file mode 100644 index 0000000..dc0f0fc --- /dev/null +++ b/client-example.js @@ -0,0 +1,71 @@ +// Example client that can connect to the WebSocket STT API +const WebSocket = require('ws'); +const fs = require('fs'); + +class STTClient { + constructor(serverUrl = 'ws://localhost:3000') { + this.ws = new WebSocket(serverUrl); + this.setupWebSocket(); + } + + setupWebSocket() { + this.ws.on('open', () => { + console.log('Connected to STT server'); + }); + + this.ws.on('message', (data) => { + const message = JSON.parse(data); + + if (message.type === 'transcription') { + console.log('Transcription:', message.text); + } else if (message.type === 'error') { + console.error('STT Error:', message.message); + } + }); + + this.ws.on('close', () => { + console.log('Disconnected from STT server'); + }); + + this.ws.on('error', (error) => { + console.error('WebSocket error:', error); + }); + } + + // Send audio file for transcription + sendAudioFile(filePath) { + if (this.ws.readyState === WebSocket.OPEN) { + const audioData = fs.readFileSync(filePath); + this.ws.send(audioData); + console.log(`Sent audio file: ${filePath}`); + } else { + console.error('WebSocket not connected'); + } + } + + // Send raw audio buffer + sendAudioBuffer(audioBuffer) { + if (this.ws.readyState === WebSocket.OPEN) { + this.ws.send(audioBuffer); + } else { + console.error('WebSocket not connected'); + } + } + + close() { + this.ws.close(); + } +} + +// Example usage +if (require.main === module) { + const client = new STTClient(); + + // Example: Send an audio file + // client.sendAudioFile('./test-audio.wav'); + + // Keep the process alive + process.stdin.resume(); +} + +module.exports = STTClient; \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1a19a87 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +services: + stt-app: + build: . 
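+    # NOTE: download and extract a Vosk model to ./vosk-model on the host
+    # before `docker-compose up` (see the README Quick Start); it is mounted
+    # read-only below.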
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./public:/app/public
+      - ./vosk-model:/app/vosk-model:ro
+    environment:
+      - NODE_ENV=development
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
\ No newline at end of file
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..1483fc4
--- /dev/null
+++ b/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "stt-simple",
+  "version": "1.0.0",
+  "description": "Simple Speech-to-Text POC using local libraries",
+  "main": "server.js",
+  "scripts": {
+    "start": "node server.js",
+    "dev": "nodemon server.js"
+  },
+  "dependencies": {
+    "ws": "^8.14.2",
+    "express": "^4.18.2",
+    "node-wav": "^0.0.2"
+  },
+  "devDependencies": {
+    "nodemon": "^3.0.2"
+  },
+  "keywords": [
+    "speech-to-text",
+    "vosk",
+    "websockets"
+  ],
+  "author": "",
+  "license": "MIT",
+  "packageManager": "yarn@1.22.22+sha512.a6b2f7906b721bba3d67d4aff083df04dad64c399707841b7acf00f6b133b7ac24255f2652fa22ae3534329dc6180534e98d17432037ff6fd140556e2bb3137e"
+}
diff --git a/public/app.js b/public/app.js
new file mode 100644
index 0000000..4bea216
--- /dev/null
+++ b/public/app.js
@@ -0,0 +1,205 @@
+class SpeechToTextApp {
+    constructor() {
+        this.ws = null;
+        this.audioContext = null;
+        this.processor = null;
+        this.stream = null;
+        this.isRecording = false;
+
+        this.startBtn = document.getElementById('startBtn');
+        this.stopBtn = document.getElementById('stopBtn');
+        this.clearBtn = document.getElementById('clearBtn');
+        this.status = document.getElementById('status');
+        this.transcription = document.getElementById('transcription');
+
+        this.initializeEventListeners();
+        this.connectWebSocket();
+    }
+
+    initializeEventListeners() {
+        this.startBtn.addEventListener('click', () => this.startRecording());
+        this.stopBtn.addEventListener('click', () => this.stopRecording());
+        this.clearBtn.addEventListener('click', () => this.clearTranscription());
+    }
+
+    connectWebSocket() {
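+        // Match the page's security context: browsers block plain ws:// on
+        // pages served over HTTPS (mixed content).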
+        const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+        const wsUrl = `${wsProtocol}//${window.location.host}`;
+
+        this.ws = new WebSocket(wsUrl);
+
+        this.ws.onopen = () => {
+            this.updateStatus('Connected to server', 'success');
+        };
+
+        this.ws.onmessage = (event) => {
+            const data = JSON.parse(event.data);
+            if (data.type === 'transcription' && data.text) {
+                this.appendTranscription(data.text);
+            }
+        };
+
+        this.ws.onclose = () => {
+            this.updateStatus('Disconnected from server', 'error');
+            // Reconnect automatically after 3 seconds
+            setTimeout(() => this.connectWebSocket(), 3000);
+        };
+
+        this.ws.onerror = (error) => {
+            this.updateStatus('WebSocket error', 'error');
+        };
+    }
+
+    async startRecording() {
+        try {
+            this.stream = await navigator.mediaDevices.getUserMedia({
+                audio: {
+                    sampleRate: 16000,
+                    channelCount: 1,
+                    echoCancellation: true,
+                    noiseSuppression: true
+                }
+            });
+
+            this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
+                sampleRate: 16000
+            });
+
+            const source = this.audioContext.createMediaStreamSource(this.stream);
+
+            // The worklet is inlined via a data: URL so no separate processor
+            // script has to be served.
+            await this.audioContext.audioWorklet.addModule('data:text/javascript,' + encodeURIComponent(`
+                class AudioProcessor extends AudioWorkletProcessor {
+                    constructor() {
+                        super();
+                        this.bufferSize = 4096; // 4096 samples at 16 kHz ~ 256 ms per chunk
+                        this.buffer = new Float32Array(this.bufferSize);
+                        this.bufferIndex = 0;
+                    }
+
+                    process(inputs) {
+                        const input = inputs[0];
+                        if (input.length > 0) {
+                            const audioData = input[0];
+
+                            for (let i = 0; i < audioData.length; i++) {
+                                this.buffer[this.bufferIndex] = audioData[i];
+                                this.bufferIndex++;
+
+                                if (this.bufferIndex >= this.bufferSize) {
+                                    // Convert float samples to 16-bit PCM
+                                    const int16Array = new Int16Array(this.bufferSize);
+                                    for (let j = 0; j < this.bufferSize; j++) {
+                                        int16Array[j] = Math.max(-32768, Math.min(32767, this.buffer[j] * 32768));
+                                    }
+
+                                    // Wrap the chunk in a WAV container
+                                    const wavBuffer = this.createWAVBuffer(int16Array);
+                                    this.port.postMessage(wavBuffer);
+
+                                    this.bufferIndex = 0;
+                                }
+                            }
+                        }
+                        return true;
+                    }
+
+                    createWAVBuffer(samples) {
+                        const length = samples.length;
+                        const buffer = new ArrayBuffer(44 + length * 2);
+                        const view = new DataView(buffer);
+
+                        // WAV header
+                        const writeString = (offset, string) => {
+                            for (let i = 0; i < string.length; i++) {
+                                view.setUint8(offset + i, string.charCodeAt(i));
+                            }
+                        };
+
+                        writeString(0, 'RIFF');
+                        view.setUint32(4, 36 + length * 2, true);
+                        writeString(8, 'WAVE');
+                        writeString(12, 'fmt ');
+                        view.setUint32(16, 16, true);        // fmt chunk size
+                        view.setUint16(20, 1, true);         // PCM format
+                        view.setUint16(22, 1, true);         // mono
+                        view.setUint32(24, 16000, true);     // sample rate
+                        view.setUint32(28, 16000 * 2, true); // byte rate
+                        view.setUint16(32, 2, true);         // block align
+                        view.setUint16(34, 16, true);        // bits per sample
+                        writeString(36, 'data');
+                        view.setUint32(40, length * 2, true);
+
+                        // Convert samples to bytes
+                        let offset = 44;
+                        for (let i = 0; i < length; i++) {
+                            view.setInt16(offset, samples[i], true);
+                            offset += 2;
+                        }
+
+                        return buffer;
+                    }
+                }
+                registerProcessor('audio-processor', AudioProcessor);
+            `));
+
+            this.processor = new AudioWorkletNode(this.audioContext, 'audio-processor');
+
+            this.processor.port.onmessage = (event) => {
+                if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+                    this.ws.send(event.data);
+                }
+            };
+
+            source.connect(this.processor);
+
+            this.isRecording = true;
+            this.startBtn.disabled = true;
+            this.stopBtn.disabled = false;
+            this.startBtn.textContent = 'Recording...';
+            this.startBtn.classList.add('recording');
+            this.updateStatus('🔴 Recording...', 'success');
+
+        } catch (error) {
+            this.updateStatus('Error accessing microphone: ' + error.message, 'error');
+            console.error('Error starting recording:', error);
+        }
+    }
+
+    stopRecording() {
+        if (this.stream) {
+            this.stream.getTracks().forEach(track => track.stop());
+        }
+
+        if (this.audioContext) {
+            this.audioContext.close();
+        }
+
+        this.isRecording = false;
+        this.startBtn.disabled = false;
+        this.stopBtn.disabled = true;
+        this.startBtn.textContent = 'Start Recording';
+        this.startBtn.classList.remove('recording');
+        this.updateStatus('Recording stopped', 'success');
+    }
+
+    clearTranscription() {
+        this.transcription.textContent = 'Transcribed text will appear here...';
+    }
+
+    appendTranscription(text) {
+        if (this.transcription.textContent === 'Transcribed text will appear here...') {
+            this.transcription.textContent = '';
+        }
+        this.transcription.textContent += text + ' ';
+        this.transcription.scrollTop = this.transcription.scrollHeight;
+    }
+
+    updateStatus(message, type = '') {
+        this.status.textContent = message;
+        this.status.className = `status ${type}`;
+    }
+}
+
+document.addEventListener('DOMContentLoaded', () => {
+    new SpeechToTextApp();
+});
\ No newline at end of file
diff --git a/public/index.html b/public/index.html
new file mode 100644
index 0000000..1e56940
--- /dev/null
+++ b/public/index.html
@@ -0,0 +1,99 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Speech-to-Text POC</title>
+    <style>
+        /* styles for the container, buttons, status, and transcription box (not recovered) */
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>🎙️ Speech-to-Text POC</h1>
+
+        <div class="controls">
+            <button id="startBtn">Start Recording</button>
+            <button id="stopBtn" disabled>Stop Recording</button>
+            <button id="clearBtn">Clear</button>
+        </div>
+
+        <div id="status" class="status">Ready to record</div>
+
+        <div id="transcription" class="transcription">
+            Transcribed text will appear here...
+        </div>
+    </div>
+
+    <script src="app.js"></script>
+</body>
+</html>
\ No newline at end of file
diff --git a/requirements.md b/requirements.md
new file mode 100644
index 0000000..90f47eb
--- /dev/null
+++ b/requirements.md
@@ -0,0 +1,24 @@
+### 🧩 **Requirement: Speech-to-Text POC (No 3rd-Party APIs)**
+
+#### **Goal**
+
+Build a simple proof of concept (POC) that captures live microphone audio from the browser, sends it to a backend server, converts the audio to text using an open-source/local library, and displays the text on the UI.
+
+#### **Key Points**
+
+* A basic `index.html` page to:
+
+  * Start/stop microphone recording.
+  * Stream audio to the backend.
+  * Display the transcribed text in real time or after processing.
+* A backend server (e.g., Node.js or Python) that:
+
+  * Receives the audio stream.
+  * Uses a **local speech-to-text library** (e.g., [Vosk](https://alphacephei.com/vosk/)) — **no external APIs**.
+  * Sends the transcribed text back to the frontend.
+
+#### **Note**
+
+* I am using the fish shell.
+* The solution should run locally and use the system's hardware.
+* Avoid any third-party cloud services.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..757a20a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+vosk==0.3.45
+soundfile==0.12.1
+numpy==1.24.3
\ No newline at end of file
diff --git a/server.js b/server.js
new file mode 100644
index 0000000..c6484a2
--- /dev/null
+++ b/server.js
@@ -0,0 +1,144 @@
+const express = require('express');
+const WebSocket = require('ws');
+const { spawn } = require('child_process');
+
+const app = express();
+const PORT = 3000;
+
+app.use(express.static('public'));
+
+const server = app.listen(PORT, () => {
+    console.log(`Server running on http://localhost:${PORT}`);
+    console.log('Using a Python subprocess with Vosk for local STT');
+});
+
+const wss = new WebSocket.Server({ server });
+
+class SpeechProcessor {
+    constructor() {
+        this.pythonProcess = null;
+        this.initializePythonProcess();
+    }
+
+    initializePythonProcess() {
+        try {
+            this.pythonProcess = spawn('python3', ['speech_processor.py'], {
+                stdio: ['pipe', 'pipe', 'pipe']
+            });
+
+            this.pythonProcess.stderr.on('data', (data) => {
+                console.error('Python process error:', data.toString());
+            });
+
+            this.pythonProcess.on('close', (code) => {
+                console.log(`Python process closed with code ${code}`);
+                // Restart the process if it dies
+                setTimeout(() => this.initializePythonProcess(), 1000);
+            });
+
+            console.log('Python speech processor initialized');
+        } catch (error) {
+            console.error('Failed to initialize Python process:', error);
+        }
+    }
+
+    async processAudio(audioBuffer) {
+        return new Promise((resolve, reject) => {
+            if (!this.pythonProcess) {
+                reject(new Error('Python process not available'));
+                return;
+            }
+
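+            // Frame format on the Node <-> Python pipe (both directions):
+            //   [4-byte big-endian length][payload]
+            // The request payload is WAV bytes; the response payload is
+            // UTF-8 JSON such as {"success": true, "text": "..."}.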
+            // Send audio data length first
+            const lengthBuffer = Buffer.allocUnsafe(4);
+            lengthBuffer.writeUInt32BE(audioBuffer.length, 0);
+            this.pythonProcess.stdin.write(lengthBuffer);
+
+            // Send audio data
+            this.pythonProcess.stdin.write(audioBuffer);
+
+            // Read response
+            let responseLength = null;
+            let responseData = Buffer.alloc(0);
+            let expecting = 'length';
+
+            const onData = (data) => {
+                responseData = Buffer.concat([responseData, data]);
+
+                if (expecting === 'length' && responseData.length >= 4) {
+                    responseLength = responseData.readUInt32BE(0);
+                    responseData = responseData.slice(4);
+                    expecting = 'data';
+                }
+
+                if (expecting === 'data' && responseData.length >= responseLength) {
+                    const jsonData = responseData.slice(0, responseLength);
+                    this.pythonProcess.stdout.removeListener('data', onData);
+
+                    try {
+                        const result = JSON.parse(jsonData.toString());
+                        resolve(result);
+                    } catch (error) {
+                        reject(error);
+                    }
+                }
+            };
+
+            this.pythonProcess.stdout.on('data', onData);
+
+            // Time out after 10 seconds
+            setTimeout(() => {
+                this.pythonProcess.stdout.removeListener('data', onData);
+                reject(new Error('Speech processing timeout'));
+            }, 10000);
+        });
+    }
+}
+
+const speechProcessor = new SpeechProcessor();
+
+wss.on('connection', (ws) => {
+    console.log('Client connected');
+
+    ws.on('message', async (data) => {
+        try {
+            if (Buffer.isBuffer(data)) {
+                // Raw audio data received
+                const result = await speechProcessor.processAudio(data);
+
+                if (result.success && result.text) {
+                    ws.send(JSON.stringify({
+                        type: 'transcription',
+                        text: result.text
+                    }));
+                    console.log('Transcription:', result.text);
+                } else if (!result.success) {
+                    console.error('STT Error:', result.error);
+                    ws.send(JSON.stringify({
+                        type: 'error',
+                        message: result.error
+                    }));
+                }
+            } else {
+                // JSON message received
+                const message = JSON.parse(data);
+                console.log('Received message:', message);
+            }
+        } catch (error) {
+            console.error('Error processing message:', error);
+            ws.send(JSON.stringify({
+                type: 'error',
+                message: 'Error processing audio'
+            }));
+        }
+    });
+
+    ws.on('close', () => {
+        console.log('Client disconnected');
+    });
+
+    ws.on('error', (error) => {
+        console.error('WebSocket error:', error);
+    });
+});
\ No newline at end of file
diff --git a/speech_processor.py b/speech_processor.py
new file mode 100644
index 0000000..7c95c23
--- /dev/null
+++ b/speech_processor.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+import json
+import os
+import sys
+import tempfile
+
+import numpy as np
+import soundfile as sf
+import vosk
+
+# The Docker image mounts the model at /app/vosk-model; allow an override so
+# the script also works for local (non-Docker) runs.
+MODEL_PATH = os.environ.get("VOSK_MODEL_PATH", "/app/vosk-model")
+
+# Global model - load once
+model = None
+recognizer = None
+
+def initialize_vosk():
+    """Initialize the Vosk model."""
+    global model, recognizer
+
+    if not os.path.exists(MODEL_PATH):
+        return {"success": False, "error": f"Vosk model not found at {MODEL_PATH}"}
+
+    try:
+        vosk.SetLogLevel(-1)  # Reduce log verbosity
+        model = vosk.Model(MODEL_PATH)
+        recognizer = vosk.KaldiRecognizer(model, 16000)
+        return {"success": True}
+    except Exception as e:
+        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}
+
+def process_audio_chunk(audio_data):
+    """Process audio data and return a transcription."""
+    global recognizer
+
+    if not recognizer:
+        init_result = initialize_vosk()
+        if not init_result["success"]:
+            return init_result
+
+    try:
+        # Write audio data to a temporary file
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+            temp_file.write(audio_data)
+            temp_filename = temp_file.name
+
+        # Read the audio file with soundfile
+        try:
+            samples, sample_rate = sf.read(temp_filename)
+
+            # Resample to 16 kHz if needed
+            if sample_rate != 16000:
+                # Simple linear-interpolation resampling (for better quality, use librosa)
+                samples = np.interp(
+                    np.linspace(0, len(samples), int(len(samples) * 16000 / sample_rate)),
+                    np.arange(len(samples)),
+                    samples
+                )
+
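+            # The KaldiRecognizer above was created for 16 kHz input and
+            # expects 16-bit mono PCM, hence the int16 scaling below.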
+            # Convert to 16-bit PCM bytes
+            audio_bytes = (samples * 32767).astype('int16').tobytes()
+
+            # Process with Vosk
+            if recognizer.AcceptWaveform(audio_bytes):
+                result = json.loads(recognizer.Result())
+                text = result.get('text', '')
+            else:
+                result = json.loads(recognizer.PartialResult())
+                text = result.get('partial', '')
+
+            # Clean up
+            os.unlink(temp_filename)
+
+            return {"success": True, "text": text}
+
+        except Exception as e:
+            os.unlink(temp_filename)
+            return {"success": False, "error": f"Audio processing error: {str(e)}"}
+
+    except Exception as e:
+        return {"success": False, "error": f"General error: {str(e)}"}
+
+def main():
+    """Main loop to process audio chunks from stdin."""
+    # Initialize Vosk on startup
+    init_result = initialize_vosk()
+    if not init_result["success"]:
+        error_response = json.dumps(init_result).encode('utf-8')
+        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
+        sys.stdout.buffer.write(error_response)
+        sys.stdout.buffer.flush()
+        sys.exit(1)
+
+    while True:
+        try:
+            # Read the length of the incoming data
+            length_data = sys.stdin.buffer.read(4)
+            if not length_data:
+                break
+
+            length = int.from_bytes(length_data, byteorder='big')
+
+            # Read the audio data
+            audio_data = sys.stdin.buffer.read(length)
+
+            # Process the audio
+            result = process_audio_chunk(audio_data)
+
+            # Send the result back
+            response = json.dumps(result).encode('utf-8')
+            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(response)
+            sys.stdout.buffer.flush()
+
+        except Exception as e:
+            error_result = {"success": False, "error": str(e)}
+            response = json.dumps(error_result).encode('utf-8')
+            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
+            sys.stdout.buffer.write(response)
+            sys.stdout.buffer.flush()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
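---

Reviewer note: the length-prefixed pipe protocol above can be smoke-tested without the web UI or Docker. The sketch below is illustrative only; `test.wav` is a hypothetical 16 kHz, 16-bit mono recording, and `VOSK_MODEL_PATH` is the override that speech_processor.py reads:

```python
import json
import os
import struct
import subprocess

# Launch the processor, pointing it at a locally extracted model.
env = {**os.environ, "VOSK_MODEL_PATH": "./vosk-model"}
proc = subprocess.Popen(
    ["python3", "speech_processor.py"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    env=env,
)

with open("test.wav", "rb") as f:  # hypothetical sample recording
    wav = f.read()

# Request frame: [4-byte big-endian length][WAV bytes]
proc.stdin.write(struct.pack(">I", len(wav)) + wav)
proc.stdin.flush()

# Response frame: [4-byte big-endian length][UTF-8 JSON]
(resp_len,) = struct.unpack(">I", proc.stdout.read(4))
print(json.loads(proc.stdout.read(resp_len)))

proc.kill()
```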