commit 9989eeb879 ("init commit")
@@ -0,0 +1,12 @@
{
  "permissions": {
    "allow": [
      "Bash(mkdir:*)",
      "Bash(ls:*)",
      "Bash(unzip:*)",
      "Bash(mv:*)",
      "Bash(docker-compose up:*)"
    ],
    "deny": []
  }
}
@@ -0,0 +1,11 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore
.git
.gitignore
README.md
.env
.nyc_output
coverage
.vscode
@@ -0,0 +1,52 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

vosk-model

# dependencies
/node_modules
/.pnp
.pnp.js
/.yarn/*
!/.yarn/releases
!/.yarn/plugins
!/.yarn/sdks

# testing
/coverage

# next.js
/.next/
/out/
public/sitemap.xml
.vercel

# production
/build
*.xml

# rss feed
/public/feed.xml

# search
/public/search.json

# misc
.DS_Store
.idea

# debug
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# local env files
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Contentlayer
.contentlayer
@@ -0,0 +1,51 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

This is a speech-to-text proof of concept that runs entirely locally, without third-party APIs. The system captures live microphone audio from a browser, sends it to a backend server, and converts it to text using open-source libraries like Vosk.

## Architecture

The project consists of two main components:
- **Frontend**: Basic HTML page with JavaScript for microphone capture and audio streaming
- **Backend**: Server (Node.js or Python) that receives audio streams and performs speech-to-text conversion using local libraries

## Development Environment

- The user runs the fish shell
- All processing must be local (no cloud services)
- The system should use local hardware for speech recognition

## Key Implementation Requirements

- Real-time or near-real-time audio streaming from browser to backend
- Local speech-to-text processing using libraries like Vosk
- Display transcribed text on the frontend UI
- Start/stop recording functionality
- WebSocket or similar real-time communication between frontend and backend

## Development Commands

### Docker (Recommended)
- `docker-compose up --build` - Build and start the application
- `docker-compose down` - Stop the application

### Local Development
- `yarn install` - Install dependencies (yarn is configured)
- `yarn start` - Start the server
- `yarn dev` - Start with nodemon for development

## Technology Stack

- **Backend**: Node.js with Express and a WebSocket server
- **Frontend**: HTML5 + JavaScript with AudioWorklet for audio capture
- **Speech Recognition**: Vosk library (Python) for local processing
- **Communication**: WebSocket for real-time audio streaming and transcription

## Setup Requirements

- Download a Vosk model to the `./vosk-model/` directory
- Server runs on http://localhost:3000
- WebSocket API available at `ws://localhost:3000` for external clients
@@ -0,0 +1,41 @@
FROM node:18

# Install Python and required dependencies
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    python3-dev \
    python3-venv \
    build-essential \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy Python requirements and install in a virtual environment
COPY requirements.txt ./
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
RUN pip install -r requirements.txt

# Copy package files (globs keep the build from failing if optional files are absent)
COPY package.json yarn.lock* .yarnrc* ./

# Install Node.js dependencies
RUN yarn install --frozen-lockfile

# Copy application code
COPY . .

# Make Python script executable
RUN chmod +x speech_processor.py

# Expose port
EXPOSE 3000

# Ensure virtual environment is active for runtime
ENV PATH="/opt/venv/bin:$PATH"

# Start the application
CMD ["yarn", "start"]
@@ -0,0 +1,136 @@
# Speech-to-Text POC

A speech-to-text proof of concept that processes audio locally using Vosk, without requiring cloud APIs. The system exposes a WebSocket API that any client can connect to for real-time speech recognition.

## Features

- **Local Processing**: Uses Vosk for offline speech recognition
- **WebSocket API**: Server exposes `ws://localhost:3000` for any client to connect
- **Web Interface**: Browser-based demo for testing
- **Docker Support**: Complete containerized solution
- **No Cloud Dependencies**: Everything runs locally

## Quick Start

1. **Download the Vosk model:**
   ```bash
   curl -L -o vosk-model-small-en-us-0.15.zip https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
   unzip vosk-model-small-en-us-0.15.zip
   mv vosk-model-small-en-us-0.15 vosk-model
   ```

2. **Start with Docker:**
   ```bash
   docker-compose up --build
   ```

3. **Test the web interface:**
   - Open `http://localhost:3000` in your browser
   - Click "Start Recording" and speak
   - See transcriptions appear in real time

## WebSocket API Usage

The server exposes a WebSocket endpoint at `ws://localhost:3000` that accepts:

- **Input**: Raw WAV audio data (16kHz, 16-bit, mono)
- **Output**: JSON messages with transcriptions

### Example Client Usage

```javascript
const WebSocket = require('ws');
const fs = require('fs');

const ws = new WebSocket('ws://localhost:3000');

ws.on('open', () => {
  // Send a WAV audio file
  const audioData = fs.readFileSync('audio.wav');
  ws.send(audioData);
});

ws.on('message', (data) => {
  const message = JSON.parse(data);
  if (message.type === 'transcription') {
    console.log('Text:', message.text);
  }
});
```

See `client-example.js` for a complete Node.js client implementation.
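
Since `client-example.js` exports the `STTClient` class, you can also drive it from your own script. A minimal sketch (the WAV path is a placeholder you supply):

```javascript
// Sketch: reuse the bundled STTClient from client-example.js.
// './test-audio.wav' is a placeholder for any 16kHz, 16-bit, mono WAV file.
const STTClient = require('./client-example');

const client = new STTClient('ws://localhost:3000');
// Give the socket a moment to open; sendAudioFile checks readyState first.
setTimeout(() => client.sendAudioFile('./test-audio.wav'), 500);
// Transcriptions are printed by STTClient's own message handler.
```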

## Local Development Setup

### Prerequisites
- Node.js 14+
- Python 3.8+
- Vosk model (downloaded as above)

### Installation

1. **Install Node.js dependencies:**
   ```bash
   yarn install
   ```

2. **Install Python dependencies:**
   ```bash
   python3 -m venv venv
   source venv/bin/activate  # in fish: source venv/bin/activate.fish; on Windows: venv\Scripts\activate
   pip install -r requirements.txt
   ```

3. **Start the server:**
   ```bash
   yarn start
   ```
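
To confirm the WebSocket endpoint is reachable, you can run the bundled example client in a second terminal; it should log `Connected to STT server`:

```bash
node client-example.js
```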

## Architecture

- **Backend**: Node.js Express server with WebSocket support
- **Speech Processing**: Python subprocess using the Vosk library
- **Frontend**: HTML5 + JavaScript with AudioWorklet for microphone capture
- **Communication**: WebSocket for bidirectional real-time communication

## Supported Audio Formats

- **Input**: WAV files (16kHz, 16-bit, mono preferred; a conversion sketch follows this list)
- **Browser**: Automatic conversion from microphone input
- **API**: Raw audio buffers or WAV format
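
If your source audio is in another format or sample rate, a tool such as ffmpeg (assuming you have it installed) can produce a matching WAV file:

```bash
# Convert any input (input.mp3 is a placeholder) to 16kHz, 16-bit PCM, mono WAV
ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le audio.wav
```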

## Performance Notes

- **Model Size**: Small model (~39MB) for fast loading
- **Latency**: Near real-time processing, depending on audio chunk size
- **Accuracy**: Good for clear speech; may drop with background noise
- **Resource Usage**: Lightweight, suitable for local deployment

## Troubleshooting

### Common Issues

1. **Model not found**: Ensure the Vosk model is extracted to the `./vosk-model/` directory
2. **Python errors**: Check that the virtual environment is activated and dependencies are installed
3. **WebSocket connection fails**: Verify the server is running on port 3000
4. **No audio**: Check browser microphone permissions

### Docker Issues

- **Build failures**: Ensure you have enough disk space for the image
- **Model mounting**: Verify `./vosk-model/` exists before running docker-compose (a quick check follows this list)
- **Permission errors**: Check file permissions on the vosk-model directory
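
A quick sanity check before starting the stack:

```bash
# Succeeds only if the model directory sits next to docker-compose.yml
test -d ./vosk-model && echo "vosk-model found" || echo "vosk-model missing (see Quick Start, step 1)"
```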

## Development

- **Server logs**: `docker-compose logs -f` to see real-time logs
- **Rebuild**: `docker-compose up --build` after code changes
- **Stop**: `docker-compose down` to stop all services

## Model Information

- **Current**: Vosk Small English US (0.15)
- **Size**: ~39MB
- **Languages**: English (US)
- **Trade-off**: Optimized for speed over accuracy
- **Alternatives**: See [Vosk Models](https://alphacephei.com/vosk/models) for other languages/sizes
@@ -0,0 +1,71 @@
// Example client that can connect to the WebSocket STT API
const WebSocket = require('ws');
const fs = require('fs');

class STTClient {
  constructor(serverUrl = 'ws://localhost:3000') {
    this.ws = new WebSocket(serverUrl);
    this.setupWebSocket();
  }

  setupWebSocket() {
    this.ws.on('open', () => {
      console.log('Connected to STT server');
    });

    this.ws.on('message', (data) => {
      const message = JSON.parse(data);

      if (message.type === 'transcription') {
        console.log('Transcription:', message.text);
      } else if (message.type === 'error') {
        console.error('STT Error:', message.message);
      }
    });

    this.ws.on('close', () => {
      console.log('Disconnected from STT server');
    });

    this.ws.on('error', (error) => {
      console.error('WebSocket error:', error);
    });
  }

  // Send an audio file for transcription
  sendAudioFile(filePath) {
    if (this.ws.readyState === WebSocket.OPEN) {
      const audioData = fs.readFileSync(filePath);
      this.ws.send(audioData);
      console.log(`Sent audio file: ${filePath}`);
    } else {
      console.error('WebSocket not connected');
    }
  }

  // Send a raw audio buffer
  sendAudioBuffer(audioBuffer) {
    if (this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(audioBuffer);
    } else {
      console.error('WebSocket not connected');
    }
  }

  close() {
    this.ws.close();
  }
}

// Example usage
if (require.main === module) {
  const client = new STTClient();

  // Example: send an audio file
  // client.sendAudioFile('./test-audio.wav');

  // Keep the process alive
  process.stdin.resume();
}

module.exports = STTClient;
@@ -0,0 +1,16 @@
services:
  stt-app:
    build: .
    ports:
      - "3000:3000"
    volumes:
      - ./public:/app/public
      - ./vosk-model:/app/vosk-model:ro
    environment:
      - NODE_ENV=development
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000"]
      interval: 30s
      timeout: 10s
      retries: 3
@@ -0,0 +1,27 @@
{
  "name": "stt-simple",
  "version": "1.0.0",
  "description": "Simple Speech-to-Text POC using local libraries",
  "main": "server.js",
  "scripts": {
    "start": "node server.js",
    "dev": "nodemon server.js"
  },
  "dependencies": {
    "ws": "^8.14.2",
    "express": "^4.18.2",
    "node-wav": "^0.0.2",
    "stream": "^0.0.2"
  },
  "devDependencies": {
    "nodemon": "^3.0.2"
  },
  "keywords": [
    "speech-to-text",
    "vosk",
    "websockets"
  ],
  "author": "",
  "license": "MIT",
  "packageManager": "yarn@1.22.22+sha512.a6b2f7906b721bba3d67d4aff083df04dad64c399707841b7acf00f6b133b7ac24255f2652fa22ae3534329dc6180534e98d17432037ff6fd140556e2bb3137e"
}
@@ -0,0 +1,205 @@
class SpeechToTextApp {
  constructor() {
    this.ws = null;
    this.audioContext = null;
    this.processor = null;
    this.stream = null;
    this.isRecording = false;

    this.startBtn = document.getElementById('startBtn');
    this.stopBtn = document.getElementById('stopBtn');
    this.clearBtn = document.getElementById('clearBtn');
    this.status = document.getElementById('status');
    this.transcription = document.getElementById('transcription');

    this.initializeEventListeners();
    this.connectWebSocket();
  }

  initializeEventListeners() {
    this.startBtn.addEventListener('click', () => this.startRecording());
    this.stopBtn.addEventListener('click', () => this.stopRecording());
    this.clearBtn.addEventListener('click', () => this.clearTranscription());
  }

  connectWebSocket() {
    const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const wsUrl = `${wsProtocol}//${window.location.host}`;

    this.ws = new WebSocket(wsUrl);

    this.ws.onopen = () => {
      this.updateStatus('Connected to server', 'success');
    };

    this.ws.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.type === 'transcription' && data.text) {
        this.appendTranscription(data.text);
      }
    };

    this.ws.onclose = () => {
      this.updateStatus('Disconnected from server', 'error');
      setTimeout(() => this.connectWebSocket(), 3000);
    };

    this.ws.onerror = (error) => {
      this.updateStatus('WebSocket error', 'error');
    };
  }

  async startRecording() {
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: 16000,
          channelCount: 1,
          echoCancellation: true,
          noiseSuppression: true
        }
      });

      this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
        sampleRate: 16000
      });

      const source = this.audioContext.createMediaStreamSource(this.stream);

      // The AudioWorklet module is inlined as a data: URL so no extra file is needed
      await this.audioContext.audioWorklet.addModule('data:text/javascript,' + encodeURIComponent(`
        class AudioProcessor extends AudioWorkletProcessor {
          constructor() {
            super();
            this.bufferSize = 4096;
            this.buffer = new Float32Array(this.bufferSize);
            this.bufferIndex = 0;
          }

          process(inputs) {
            const input = inputs[0];
            if (input.length > 0) {
              const audioData = input[0];

              for (let i = 0; i < audioData.length; i++) {
                this.buffer[this.bufferIndex] = audioData[i];
                this.bufferIndex++;

                if (this.bufferIndex >= this.bufferSize) {
                  // Convert float samples to 16-bit PCM
                  const int16Array = new Int16Array(this.bufferSize);
                  for (let j = 0; j < this.bufferSize; j++) {
                    int16Array[j] = Math.max(-32768, Math.min(32767, this.buffer[j] * 32768));
                  }

                  // Wrap the chunk in a WAV header and post it to the main thread
                  const wavBuffer = this.createWAVBuffer(int16Array);
                  this.port.postMessage(wavBuffer);

                  this.bufferIndex = 0;
                }
              }
            }
            return true;
          }

          createWAVBuffer(samples) {
            const length = samples.length;
            const buffer = new ArrayBuffer(44 + length * 2);
            const view = new DataView(buffer);

            // WAV header
            const writeString = (offset, string) => {
              for (let i = 0; i < string.length; i++) {
                view.setUint8(offset + i, string.charCodeAt(i));
              }
            };

            writeString(0, 'RIFF');
            view.setUint32(4, 36 + length * 2, true);
            writeString(8, 'WAVE');
            writeString(12, 'fmt ');
            view.setUint32(16, 16, true);
            view.setUint16(20, 1, true);
            view.setUint16(22, 1, true);
            view.setUint32(24, 16000, true);
            view.setUint32(28, 16000 * 2, true);
            view.setUint16(32, 2, true);
            view.setUint16(34, 16, true);
            writeString(36, 'data');
            view.setUint32(40, length * 2, true);

            // Convert samples to bytes
            let offset = 44;
            for (let i = 0; i < length; i++) {
              view.setInt16(offset, samples[i], true);
              offset += 2;
            }

            return buffer;
          }
        }
        registerProcessor('audio-processor', AudioProcessor);
      `));

      this.processor = new AudioWorkletNode(this.audioContext, 'audio-processor');

      this.processor.port.onmessage = (event) => {
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(event.data);
        }
      };

      source.connect(this.processor);

      this.isRecording = true;
      this.startBtn.disabled = true;
      this.stopBtn.disabled = false;
      this.startBtn.textContent = 'Recording...';
      this.startBtn.classList.add('recording');
      this.updateStatus('🔴 Recording...', 'success');

    } catch (error) {
      this.updateStatus('Error accessing microphone: ' + error.message, 'error');
      console.error('Error starting recording:', error);
    }
  }

  stopRecording() {
    if (this.stream) {
      this.stream.getTracks().forEach(track => track.stop());
    }

    if (this.audioContext) {
      this.audioContext.close();
    }

    this.isRecording = false;
    this.startBtn.disabled = false;
    this.stopBtn.disabled = true;
    this.startBtn.textContent = 'Start Recording';
    this.startBtn.classList.remove('recording');
    this.updateStatus('Recording stopped', 'success');
  }

  clearTranscription() {
    this.transcription.textContent = 'Transcribed text will appear here...';
  }

  appendTranscription(text) {
    if (this.transcription.textContent === 'Transcribed text will appear here...') {
      this.transcription.textContent = '';
    }
    this.transcription.textContent += text + ' ';
    this.transcription.scrollTop = this.transcription.scrollHeight;
  }

  updateStatus(message, type = '') {
    this.status.textContent = message;
    this.status.className = `status ${type}`;
  }
}

document.addEventListener('DOMContentLoaded', () => {
  new SpeechToTextApp();
});
@@ -0,0 +1,99 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Speech-to-Text POC</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            background: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1 {
            text-align: center;
            color: #333;
        }
        .controls {
            text-align: center;
            margin: 30px 0;
        }
        button {
            background: #007bff;
            color: white;
            border: none;
            padding: 15px 30px;
            border-radius: 5px;
            cursor: pointer;
            font-size: 16px;
            margin: 0 10px;
        }
        button:hover {
            background: #0056b3;
        }
        button:disabled {
            background: #ccc;
            cursor: not-allowed;
        }
        .recording {
            background: #dc3545 !important;
        }
        .status {
            text-align: center;
            margin: 20px 0;
            font-weight: bold;
        }
        .transcription {
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 5px;
            padding: 20px;
            min-height: 200px;
            margin: 20px 0;
            font-size: 16px;
            line-height: 1.5;
        }
        .error {
            color: #dc3545;
            background: #f8d7da;
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
        .success {
            color: #155724;
            background: #d4edda;
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>🎙️ Speech-to-Text POC</h1>

        <div class="controls">
            <button id="startBtn">Start Recording</button>
            <button id="stopBtn" disabled>Stop Recording</button>
            <button id="clearBtn">Clear Text</button>
        </div>

        <div id="status" class="status">Ready to record</div>

        <div id="transcription" class="transcription">
            Transcribed text will appear here...
        </div>
    </div>

    <script src="app.js"></script>
</body>
</html>
@@ -0,0 +1,24 @@
### 🧩 **Requirement: Speech-to-Text POC (No 3rd-Party APIs)**

#### **Goal**

Build a simple proof of concept (POC) that captures live microphone audio from the browser, sends it to a backend server, converts the audio to text using an open-source/local library, and displays the text on the UI.

#### **Key Points**

* A basic `index.html` page to:

  * Start/stop microphone recording.
  * Stream audio to the backend.
  * Display the transcribed text in real time or after processing.

* A backend server (e.g., Node.js or Python) that:

  * Receives the audio stream.
  * Uses a **local speech-to-text library** (e.g., [Vosk](https://alphacephei.com/vosk/)); **no external APIs**.
  * Sends the transcribed text back to the frontend.

#### **Note**

* I am using the fish shell.
* The solution should run locally and utilize system hardware.
* Avoid any third-party cloud services.
@@ -0,0 +1,3 @@
vosk==0.3.45
soundfile==0.12.1
numpy==1.24.3
@@ -0,0 +1,144 @@
const express = require('express');
const WebSocket = require('ws');
const { spawn } = require('child_process');

const app = express();
const PORT = 3000;

app.use(express.static('public'));

const server = app.listen(PORT, () => {
  console.log(`Server running on http://localhost:${PORT}`);
  console.log('Using a local Python subprocess with Vosk for STT');
});

const wss = new WebSocket.Server({ server });
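
// Framing between Node and the Python worker, as implemented below: every
// message is a 4-byte big-endian length prefix followed by that many payload
// bytes (raw WAV audio going to Python, UTF-8 JSON coming back).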
class SpeechProcessor {
  constructor() {
    this.pythonProcess = null;
    this.initializePythonProcess();
  }

  initializePythonProcess() {
    try {
      this.pythonProcess = spawn('python3', ['speech_processor.py'], {
        stdio: ['pipe', 'pipe', 'pipe']
      });

      this.pythonProcess.stderr.on('data', (data) => {
        console.error('Python process error:', data.toString());
      });

      this.pythonProcess.on('close', (code) => {
        console.log(`Python process closed with code ${code}`);
        // Restart the process if it dies
        setTimeout(() => this.initializePythonProcess(), 1000);
      });

      console.log('Python speech processor initialized');
    } catch (error) {
      console.error('Failed to initialize Python process:', error);
    }
  }

  async processAudio(audioBuffer) {
    return new Promise((resolve, reject) => {
      if (!this.pythonProcess) {
        reject(new Error('Python process not available'));
        return;
      }

      // Send audio data length first
      const lengthBuffer = Buffer.allocUnsafe(4);
      lengthBuffer.writeUInt32BE(audioBuffer.length, 0);
      this.pythonProcess.stdin.write(lengthBuffer);

      // Send audio data
      this.pythonProcess.stdin.write(audioBuffer);

      // Read the length-prefixed JSON response
      let responseLength = null;
      let responseData = Buffer.alloc(0);
      let expecting = 'length';

      const onData = (data) => {
        responseData = Buffer.concat([responseData, data]);

        if (expecting === 'length' && responseData.length >= 4) {
          responseLength = responseData.readUInt32BE(0);
          responseData = responseData.slice(4);
          expecting = 'data';
        }

        if (expecting === 'data' && responseData.length >= responseLength) {
          const jsonData = responseData.slice(0, responseLength);
          this.pythonProcess.stdout.removeListener('data', onData);
          clearTimeout(timeout); // the response arrived; cancel the watchdog

          try {
            const result = JSON.parse(jsonData.toString());
            resolve(result);
          } catch (error) {
            reject(error);
          }
        }
      };

      this.pythonProcess.stdout.on('data', onData);

      // Time out after 10 seconds (cleared on success above)
      const timeout = setTimeout(() => {
        this.pythonProcess.stdout.removeListener('data', onData);
        reject(new Error('Speech processing timeout'));
      }, 10000);
    });
  }
}

const speechProcessor = new SpeechProcessor();

wss.on('connection', (ws) => {
  console.log('Client connected');

  ws.on('message', async (data) => {
    try {
      if (Buffer.isBuffer(data)) {
        // Raw audio data received
        const result = await speechProcessor.processAudio(data);

        if (result.success && result.text) {
          ws.send(JSON.stringify({
            type: 'transcription',
            text: result.text
          }));
          console.log('Transcription:', result.text);
        } else if (!result.success) {
          console.error('STT Error:', result.error);
          ws.send(JSON.stringify({
            type: 'error',
            message: result.error
          }));
        }
      } else {
        // JSON message received
        const message = JSON.parse(data);
        console.log('Received message:', message);
      }
    } catch (error) {
      console.error('Error processing message:', error);
      ws.send(JSON.stringify({
        type: 'error',
        message: 'Error processing audio'
      }));
    }
  });

  ws.on('close', () => {
    console.log('Client disconnected');
  });

  ws.on('error', (error) => {
    console.error('WebSocket error:', error);
  });
});
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
import vosk
import sys
import json
import tempfile
import os
import numpy as np
import soundfile as sf

# Global model - load once
model = None
recognizer = None

def initialize_vosk():
    """Initialize the Vosk model."""
    global model, recognizer

    model_path = "/app/vosk-model"
    if not os.path.exists(model_path):
        return {"success": False, "error": "Vosk model not found at /app/vosk-model"}

    try:
        vosk.SetLogLevel(-1)  # Reduce log verbosity
        model = vosk.Model(model_path)
        recognizer = vosk.KaldiRecognizer(model, 16000)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": f"Failed to initialize Vosk: {str(e)}"}

def process_audio_chunk(audio_data):
    """Process one WAV chunk and return a transcription result."""
    global recognizer

    if not recognizer:
        init_result = initialize_vosk()
        if not init_result["success"]:
            return init_result

    try:
        # Write audio data to a temporary file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_file.write(audio_data)
            temp_filename = temp_file.name

        try:
            # Read the audio file with soundfile
            samples, sample_rate = sf.read(temp_filename)

            # Downmix stereo to mono if needed (sf.read returns 2D for stereo)
            if samples.ndim > 1:
                samples = samples.mean(axis=1)

            # Resample to 16kHz if needed (simple linear interpolation;
            # for better quality, use librosa)
            if sample_rate != 16000:
                samples = np.interp(
                    np.linspace(0, len(samples), int(len(samples) * 16000 / sample_rate)),
                    np.arange(len(samples)),
                    samples
                )

            # Convert to 16-bit PCM bytes
            audio_bytes = (samples * 32767).astype('int16').tobytes()

            # Process with Vosk
            if recognizer.AcceptWaveform(audio_bytes):
                result = json.loads(recognizer.Result())
                text = result.get('text', '')
            else:
                result = json.loads(recognizer.PartialResult())
                text = result.get('partial', '')

            return {"success": True, "text": text}

        except Exception as e:
            return {"success": False, "error": f"Audio processing error: {str(e)}"}
        finally:
            # Clean up the temporary file exactly once
            if os.path.exists(temp_filename):
                os.unlink(temp_filename)

    except Exception as e:
        return {"success": False, "error": f"General error: {str(e)}"}

def main():
    """Main loop: read length-prefixed audio chunks from stdin and write
    length-prefixed JSON results to stdout."""
    # Initialize Vosk on startup
    init_result = initialize_vosk()
    if not init_result["success"]:
        error_response = json.dumps(init_result).encode('utf-8')
        sys.stdout.buffer.write(len(error_response).to_bytes(4, byteorder='big'))
        sys.stdout.buffer.write(error_response)
        sys.stdout.buffer.flush()
        sys.exit(1)

    while True:
        try:
            # Read the length of the incoming data
            length_data = sys.stdin.buffer.read(4)
            if not length_data:
                break

            length = int.from_bytes(length_data, byteorder='big')

            # Read the audio data
            audio_data = sys.stdin.buffer.read(length)

            # Process the audio
            result = process_audio_chunk(audio_data)

            # Send the result back
            response = json.dumps(result).encode('utf-8')
            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
            sys.stdout.buffer.write(response)
            sys.stdout.buffer.flush()

        except Exception as e:
            error_result = {"success": False, "error": str(e)}
            response = json.dumps(error_result).encode('utf-8')
            sys.stdout.buffer.write(len(response).to_bytes(4, byteorder='big'))
            sys.stdout.buffer.write(response)
            sys.stdout.buffer.flush()

if __name__ == "__main__":
    main()