This commit is contained in:
Kar
2025-06-17 15:53:01 +05:30
commit 4d20931ecc
411 changed files with 180695 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
# dependencies
find_package(Threads REQUIRED)
# third-party
if (WHISPER_SDL2)
# SDL2
find_package(SDL2 REQUIRED)
string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
endif()
# common
set(TARGET common)
add_library(${TARGET} STATIC
common.h
common.cpp
common-ggml.h
common-ggml.cpp
grammar-parser.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE whisper)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (WHISPER_SDL2)
# common-sdl
set(TARGET common-sdl)
add_library(${TARGET} STATIC
common-sdl.h
common-sdl.cpp
)
include(DefaultTargetOptions)
target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES})
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
add_subdirectory(whisper.wasm)
add_subdirectory(stream.wasm)
add_subdirectory(command.wasm)
add_subdirectory(talk.wasm)
add_subdirectory(bench.wasm)
elseif(CMAKE_JS_VERSION)
add_subdirectory(addon.node)
else()
add_subdirectory(main)
add_subdirectory(stream)
add_subdirectory(server)
add_subdirectory(command)
add_subdirectory(bench)
add_subdirectory(quantize)
add_subdirectory(talk)
add_subdirectory(talk-llama)
add_subdirectory(lsp)
endif()
add_subdirectory(wchess)

View File

@@ -0,0 +1,3 @@
.idea
node_modules
build

View File

@@ -0,0 +1,31 @@
set(TARGET whisper-addon)
# Base settings
#==================================================================
# env var supported by cmake-js
add_definitions(-DNAPI_VERSION=4)
include_directories(${CMAKE_JS_INC})
#==================================================================
add_library(${TARGET} SHARED ${CMAKE_JS_SRC} addon.cpp)
set_target_properties(${TARGET} PROPERTIES PREFIX "" SUFFIX ".node")
include(DefaultTargetOptions)
# Include N-API wrappers
#==================================================================
execute_process(COMMAND node -p "require('node-addon-api').include"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE NODE_ADDON_API_DIR
)
string(REPLACE "\n" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
string(REPLACE "\"" "" NODE_ADDON_API_DIR ${NODE_ADDON_API_DIR})
target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
#==================================================================
target_link_libraries(${TARGET} ${CMAKE_JS_LIB} common whisper ${CMAKE_THREAD_LIBS_INIT})
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
# Generate node.lib
execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
endif()

View File

@@ -0,0 +1,37 @@
# addon
This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
It can be used as a reference for using the whisper.cpp project in other node projects.
## Install
```shell
npm install
```
## Compile
Make sure it is in the project root directory and compiled with make-js.
```shell
npx cmake-js compile -T whisper-addon -B Release
```
For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes.
> Such as appointing special cmake path:
> ```shell
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
> ```
## Run
```shell
cd examples/addon.node
node index.js --language='language' --model='model-path' --fname_inp='file-path'
```
Because this is a simple Demo, only the above parameters are set in the node environment.
Other parameters can also be specified in the node environment.

View File

@@ -0,0 +1,24 @@
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const whisperParamsMock = {
language: "en",
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
use_gpu: true,
};
describe("Run whisper.node", () => {
test("it should receive a non-empty value", async () => {
let result = await whisperAsync(whisperParamsMock);
expect(result.length).toBeGreaterThan(0);
}, 10000);
});

View File

@@ -0,0 +1,343 @@
#include "napi.h"
#include "common.h"
#include "whisper.h"
#include <string>
#include <thread>
#include <vector>
#include <cmath>
#include <cstdint>
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 5;
int32_t beam_size = -1;
float word_thold = 0.01f;
float entropy_thold = 2.4f;
float logprob_thold = -1.0f;
bool speed_up = false;
bool translate = false;
bool diarize = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
bool output_wts = false;
bool output_csv = false;
bool print_special = false;
bool print_colors = false;
bool print_progress = false;
bool no_timestamps = false;
bool use_gpu = true;
std::string language = "en";
std::string prompt;
std::string model = "../../ggml-large.bin";
std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {};
};
struct whisper_print_user_data {
const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s;
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
const int n_segments = whisper_full_n_segments(ctx);
std::string speaker = "";
int64_t t0;
int64_t t1;
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
for (int i = s0; i < n_segments; i++) {
if (!params.no_timestamps || params.diarize) {
t0 = whisper_full_get_segment_t0(ctx, i);
t1 = whisper_full_get_segment_t1(ctx, i);
}
if (!params.no_timestamps) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
}
if (params.diarize && pcmf32s.size() == 2) {
const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
double energy0 = 0.0f;
double energy1 = 0.0f;
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
if (energy0 > 1.1*energy1) {
speaker = "(speaker 0)";
} else if (energy1 > 1.1*energy0) {
speaker = "(speaker 1)";
} else {
speaker = "(speaker ?)";
}
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
}
// colorful print bug
//
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s%s", speaker.c_str(), text);
// with timestamps or speakers: each segment on new line
if (!params.no_timestamps || params.diarize) {
printf("\n");
}
fflush(stdout);
}
}
int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n");
return 2;
}
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 3;
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
continue;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
}
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, params.n_processors,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// run the inference
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
wparams.print_realtime = false;
wparams.print_progress = params.print_progress;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special = params.print_special;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.speed_up = params.speed_up;
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.initial_prompt = params.prompt.c_str();
whisper_print_user_data user_data = { &params, &pcmf32s };
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &user_data;
}
// example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
wparams.encoder_begin_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "failed to process audio\n");
return 10;
}
}
}
const int n_segments = whisper_full_n_segments(ctx);
result.resize(n_segments);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
result[i].emplace_back(to_timestamp(t0, true));
result[i].emplace_back(to_timestamp(t1, true));
result[i].emplace_back(text);
}
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}
class Worker : public Napi::AsyncWorker {
public:
Worker(Napi::Function& callback, whisper_params params)
: Napi::AsyncWorker(callback), params(params) {}
void Execute() override {
run(params, result);
}
void OnOK() override {
Napi::HandleScope scope(Env());
Napi::Object res = Napi::Array::New(Env(), result.size());
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(Env(), 3);
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(Env(), result[i][j]);
}
res[i] = tmp;
}
Callback().Call({Env().Null(), res});
}
private:
whisper_params params;
std::vector<std::vector<std::string>> result;
};
Napi::Value whisper(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() <= 0 || !info[0].IsObject()) {
Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
}
whisper_params params;
Napi::Object whisper_params = info[0].As<Napi::Object>();
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
params.use_gpu = use_gpu;
Napi::Function callback = info[1].As<Napi::Function>();
Worker* worker = new Worker(callback, params);
worker->Queue();
return env.Undefined();
}
Napi::Object Init(Napi::Env env, Napi::Object exports) {
exports.Set(
Napi::String::New(env, "whisper"),
Napi::Function::New(env, whisper)
);
return exports;
}
NODE_API_MODULE(whisper, Init);

View File

@@ -0,0 +1,37 @@
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const whisperParams = {
language: "en",
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
fname_inp: "../../samples/jfk.wav",
use_gpu: true,
};
const arguments = process.argv.slice(2);
const params = Object.fromEntries(
arguments.reduce((pre, item) => {
if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")];
}
return pre;
}, [])
);
for (const key in params) {
if (whisperParams.hasOwnProperty(key)) {
whisperParams[key] = params[key];
}
}
console.log("whisperParams =", whisperParams);
whisperAsync(whisperParams).then((result) => {
console.log(`Result from whisper: ${result}`);
});

View File

@@ -0,0 +1,16 @@
{
"name": "whisper-addon",
"version": "0.0.0",
"description": "",
"main": "index.js",
"author": "Qanhe Chen",
"license": "MIT",
"scripts": {
"test": "jest"
},
"devDependencies": {
"cmake-js": "^7.1.1",
"jest": "^29.4.0",
"node-addon-api": "^5.0.0"
}
}

View File

@@ -0,0 +1,49 @@
#
# libbench
#
set(TARGET libbench)
add_executable(${TARGET}
emscripten.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside bench.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libbench.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/bench.wasm/bench.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# bench.wasm
#
set(TARGET bench.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@@ -0,0 +1,22 @@
# bench.wasm
Benchmark the performance of whisper.cpp in the browser using WebAssembly
Link: https://whisper.ggerganov.com/bench/
Terminal version: [examples/bench](/examples/bench)
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/bench.wasm/* /path/to/html/
cp bin/libbench.worker.js /path/to/html/
```

View File

@@ -0,0 +1,87 @@
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <cmath>
#include <string>
#include <thread>
#include <vector>
constexpr int N_THREAD = 8;
// TODO: get rid of this vector of contexts - bad idea in the first place
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::thread g_worker;
void bench_main(size_t index) {
const int n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// whisper context
auto & ctx = g_contexts[index];
fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
const int n_mels = whisper_model_n_mels(ctx);
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return;
}
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
}
if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return;
}
whisper_print_timings(ctx);
fprintf(stderr, "\n");
fprintf(stderr, "If you wish, you can submit these results here:\n");
fprintf(stderr, "\n");
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
fprintf(stderr, "\n");
fprintf(stderr, "Please include the following information:\n");
fprintf(stderr, "\n");
fprintf(stderr, " - CPU model\n");
fprintf(stderr, " - Operating system\n");
fprintf(stderr, " - Browser\n");
fprintf(stderr, "\n");
}
EMSCRIPTEN_BINDINGS(bench) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
if (g_contexts[i] != nullptr) {
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
bench_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}));
}

View File

@@ -0,0 +1,283 @@
<!doctype html>
<html lang="en-us">
<head>
<title>bench : Benchmark whisper.cpp performance in the browser</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>bench : Benchmark whisper.cpp performance in the browser</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">GitHub</a>.
<br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr>
Select the model you would like to use and click the "Bench" button.<br>
The results will be displayed in the textarea below.
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div>
<br>
<div id="input">
<button id="bench" onclick="onBench()" disabled>Bench</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// the bench instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
model_whisper = fname;
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('bench').disabled = false;
}
}
function loadFile(event, fname) {
var file = event.target.files[0] || null;
if (file == null) {
return;
}
printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
printTextarea('loadFile: please wait ...');
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
storeFS(fname, buf);
}
reader.readAsArrayBuffer(file);
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
'small.en': 466,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
'small-en-q5_1': 182,
'medium-en-q5_0': 515,
'large-q5_0': 1030,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// main
//
function onBench() {
if (instance) {
Module.free(instance);
}
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
document.getElementById('bench').disabled = true;
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
}
</script>
<script type="text/javascript" src="bench.js"></script>
</body>
</html>

View File

@@ -0,0 +1,6 @@
set(TARGET bench)
add_executable(${TARGET} bench.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -0,0 +1,54 @@
# bench
A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of
the transformer on some random audio data and records the execution time. This way we can have an objective comparison
of the performance of the model for various setups.
Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
```bash
# build the bench tool
$ make bench
# run it on the small.en model using 4 threads
$ ./bench -m ./models/ggml-small.en.bin -t 4
whisper_model_load: loading model from './models/ggml-small.en.bin'
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 768
whisper_model_load: n_audio_head = 12
whisper_model_load: n_audio_layer = 12
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 768
whisper_model_load: n_text_head = 12
whisper_model_load: n_text_layer = 12
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 3
whisper_model_load: mem_required = 1048.00 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: ggml ctx size = 533.05 MB
whisper_model_load: memory size = 68.48 MB
whisper_model_load: model size = 464.44 MB
whisper_print_timings: load time = 240.82 ms
whisper_print_timings: mel time = 0.00 ms
whisper_print_timings: sample time = 0.00 ms
whisper_print_timings: encode time = 1062.21 ms / 88.52 ms per layer
whisper_print_timings: decode time = 0.00 ms / 0.00 ms per layer
whisper_print_timings: total time = 1303.04 ms
system_info: n_threads = 4 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
If you wish, you can submit these results here:
https://github.com/ggerganov/whisper.cpp/issues/89
Please include the following information:
- CPU model
- Operating system
- Compiler
```

View File

@@ -0,0 +1,170 @@
#include "whisper.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat
std::string model = "models/ggml-base.en.bin";
bool use_gpu = true;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, "\n");
}
int whisper_bench_full(const whisper_params & params) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
}
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 2;
}
const int n_mels = whisper_model_n_mels(ctx);
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}
// heat encoder
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode: %d\n", ret);
return 4;
}
whisper_token tokens[512];
memset(tokens, 0, sizeof(tokens));
// prompt heat
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to decode: %d\n", ret);
return 4;
}
// text-generation heat
if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
fprintf(stderr, "error: failed to decode: %d\n", ret);
return 4;
}
whisper_reset_timings(ctx);
// actual run
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode: %d\n", ret);
return 4;
}
// text-generation
for (int i = 0; i < 256; i++) {
if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
fprintf(stderr, "error: failed to decode: %d\n", ret);
return 4;
}
}
// batched decoding
for (int i = 0; i < 64; i++) {
if (int ret = whisper_decode(ctx, tokens, 5, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to decode: %d\n", ret);
return 4;
}
}
// prompt processing
for (int i = 0; i < 16; i++) {
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to decode: %d\n", ret);
return 4;
}
}
whisper_print_timings(ctx);
whisper_free(ctx);
fprintf(stderr, "\n");
fprintf(stderr, "If you wish, you can submit these results here:\n");
fprintf(stderr, "\n");
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
fprintf(stderr, "\n");
fprintf(stderr, "Please include the following information:\n");
fprintf(stderr, "\n");
fprintf(stderr, " - CPU model\n");
fprintf(stderr, " - Operating system\n");
fprintf(stderr, " - Compiler\n");
fprintf(stderr, "\n");
return 0;
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
int ret = -1;
switch (params.what) {
case 0: ret = whisper_bench_full(params); break;
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
}
return ret;
}

View File

@@ -0,0 +1,50 @@
#
# libcommand
#
set(TARGET libcommand)
add_executable(${TARGET}
emscripten.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE
common
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside command.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libcommand.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# command.wasm
#
set(TARGET command.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@@ -0,0 +1,23 @@
# command.wasm
This is a basic Voice Assistant example that accepts voice commands from the microphone.
It runs in fully in the browser via WebAseembly.
Online demo: https://whisper.ggerganov.com/command/
Terminal version: [examples/command](/examples/command)
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/command.wasm/* /path/to/html/
cp bin/libcommand.worker.js /path/to/html/
```

View File

@@ -0,0 +1,327 @@
#include "ggml.h"
#include "common.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
constexpr int N_THREAD = 8;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
std::string g_status = "";
std::string g_status_forced = "";
std::string g_transcribed = "";
std::vector<float> g_pcmf32;
void command_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
void command_get_audio(int ms, int sample_rate, std::vector<float> & audio) {
const int64_t n_samples = (ms * sample_rate) / 1000;
int64_t n_take = 0;
if (n_samples > (int) g_pcmf32.size()) {
n_take = g_pcmf32.size();
} else {
n_take = n_samples;
}
audio.resize(n_take);
std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin());
}
void command_main(size_t index) {
command_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
printf("command: using %d threads\n", wparams.n_threads);
bool have_prompt = false;
bool ask_prompt = true;
bool print_energy = false;
float prob0 = 0.0f;
float prob = 0.0f;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = "Ok Whisper, start listening for commands.";
// whisper context
auto & ctx = g_contexts[index];
const int32_t vad_ms = 2000;
const int32_t prompt_ms = 5000;
const int32_t command_ms = 4000;
const float vad_thold = 0.1f;
const float freq_thold = -1.0f;
while (g_running) {
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
command_set_status(txt);
}
ask_prompt = false;
}
int64_t t_ms = 0;
{
command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
command_set_status("Speech detected! Processing ...");
if (!have_prompt) {
command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
command_set_status(txt);
}
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
}
} else {
command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms));
prob = 100.0f*(prob - prob0);
fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
{
char txt[1024];
snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
command_set_status(txt);
}
{
std::lock_guard<std::mutex> lock(g_mutex);
g_transcribed = command;
}
}
g_pcmf32.clear();
}
}
}
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(command) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
command_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("get_transcribed", emscripten::optional_override([]() {
std::string transcribed;
{
std::lock_guard<std::mutex> lock(g_mutex);
transcribed = std::move(g_transcribed);
}
return transcribed;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
}

View File

@@ -0,0 +1,413 @@
<!doctype html>
<html lang="en-us">
<head>
<title>command : Voice assistant example using Whisper + WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>command : Voice assistant example using Whisper + WebAssembly</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">GitHub</a>.
<br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr>
Select the model you would like to use, click the "Start" button and follow the instructions.
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-transcribed">[The recognized voice commands will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/command.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the command instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = true;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// main
//
var nLines = 0;
var intervalUpdate = null;
var transcribedAll = '';
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var transcribed = Module.get_transcribed();
if (transcribed != null && transcribed.length > 1) {
transcribedAll += transcribed + '<br>';
nLines++;
// if more than 10 lines, remove the first line
if (nLines > 10) {
var i = transcribedAll.indexOf('<br>');
if (i > 0) {
transcribedAll = transcribedAll.substring(i + 4);
nLines--;
}
}
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-transcribed').innerHTML = transcribedAll;
}, 100);
}
function onStop() {
stopRecording();
}
</script>
<script type="text/javascript" src="command.js"></script>
</body>
</html>

View File

@@ -0,0 +1,9 @@
if (WHISPER_SDL2)
# command
set(TARGET command)
add_executable(${TARGET} command.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

View File

@@ -0,0 +1,47 @@
# command
This is a basic Voice Assistant example that accepts voice commands from the microphone.
More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
```bash
# Run with default arguments and small model
./command -m ./models/ggml-small.en.bin -t 8
# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
```
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
Web version: [examples/command.wasm](/examples/command.wasm)
## Guided mode
"Guided mode" allows you to specify a list of commands (i.e. strings) and the transcription will be guided to classify your command into one from the list. This can be useful in situations where a device is listening only for a small subset of commands.
Initial tests show that this approach might be extremely efficient in terms of performance, since it integrates very well with the "partial Encoder" idea from #137.
```bash
# Run in guided mode, the list of allowed commands is in commands.txt
./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt
# On Raspberry Pi, in guided mode you can use "-ac 128" for extra performance
./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
```
https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
## Building
The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
make command
```

View File

@@ -0,0 +1,775 @@
// Voice assistant example
//
// Speak short text commands to the microphone.
// This program will detect your voice command and convert them to text.
//
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
//
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "grammar-parser.h"
#include <sstream>
#include <cassert>
#include <cstdio>
#include <fstream>
#include <mutex>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <map>
bool file_exists(const std::string & fname) {
std::ifstream f(fname.c_str());
return f.good();
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
float grammar_penalty = 100.0f;
grammar_parser::parse_state grammar_parsed;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
std::string commands;
std::string prompt;
std::string context;
std::string grammar;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; }
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str());
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, "\n");
}
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string & grammar_rule,
float & logprob_min,
float & logprob_sum,
int & n_tokens,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
logprob_min = 0.0f;
logprob_sum = 0.0f;
n_tokens = 0;
t_ms = 0;
//whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.no_timestamps = params.no_timestamps;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f;
wparams.greedy.best_of = 5;
wparams.beam_search.beam_size = 5;
wparams.initial_prompt = params.context.data();
const auto & grammar_parsed = params.grammar_parsed;
auto grammar_rules = grammar_parsed.c_rules();
if (!params.grammar_parsed.rules.empty() && !grammar_rule.empty()) {
if (grammar_parsed.symbol_ids.find(grammar_rule) == grammar_parsed.symbol_ids.end()) {
fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, grammar_rule.c_str());
} else {
wparams.grammar_rules = grammar_rules.data();
wparams.n_grammar_rules = grammar_rules.size();
wparams.i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule);
wparams.grammar_penalty = params.grammar_penalty;
}
}
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
if(token.plog > 0.0f) exit(0);
logprob_min = std::min(logprob_min, token.plog);
logprob_sum += token.plog;
++n_tokens;
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
std::vector<std::string> read_allowed_commands(const std::string & fname) {
std::vector<std::string> allowed_commands;
std::ifstream ifs(fname);
if (!ifs.is_open()) {
return allowed_commands;
}
std::string line;
while (std::getline(ifs, line)) {
line = ::trim(line);
if (line.empty()) {
continue;
}
std::transform(line.begin(), line.end(),line.begin(), ::tolower);
allowed_commands.push_back(std::move(line));
}
return allowed_commands;
}
std::vector<std::string> get_words(const std::string &txt) {
std::vector<std::string> words;
std::istringstream iss(txt);
std::string word;
while (iss >> word) {
words.push_back(word);
}
return words;
}
// command-list mode
// guide the transcription to match the most likely command from a provided list
int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: guided mode\n", __func__);
std::vector<std::string> allowed_commands = read_allowed_commands(params.commands);
if (allowed_commands.empty()) {
fprintf(stderr, "%s: error: failed to read allowed commands from '%s'\n", __func__, params.commands.c_str());
return 2;
}
int max_len = 0;
std::vector<std::vector<whisper_token>> allowed_tokens;
for (const auto & cmd : allowed_commands) {
whisper_token tokens[1024];
allowed_tokens.emplace_back();
for (int l = 0; l < (int) cmd.size(); ++l) {
// NOTE: very important to add the whitespace !
// the reason is that the first decoded token starts with a whitespace too!
std::string ss = std::string(" ") + cmd.substr(0, l + 1);
const int n = whisper_tokenize(ctx, ss.c_str(), tokens, 1024);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, cmd.c_str());
return 3;
}
if (n == 1) {
allowed_tokens.back().push_back(tokens[0]);
}
}
max_len = std::max(max_len, (int) cmd.size());
}
fprintf(stderr, "%s: allowed commands [ tokens ]:\n", __func__);
fprintf(stderr, "\n");
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
fprintf(stderr, " - \033[1m%-*s\033[0m = [", max_len, allowed_commands[i].c_str());
for (const auto & token : allowed_tokens[i]) {
fprintf(stderr, " %5d", token);
}
fprintf(stderr, " ]\n");
}
std::string k_prompt = "select one from the available words: ";
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
if (i > 0) {
k_prompt += ", ";
}
k_prompt += allowed_commands[i];
}
k_prompt += ". selected word: ";
// tokenize prompt
std::vector<whisper_token> k_tokens;
{
k_tokens.resize(1024);
const int n = whisper_tokenize(ctx, k_prompt.c_str(), k_tokens.data(), 1024);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize prompt '%s'\n", __func__, k_prompt.c_str());
return 4;
}
k_tokens.resize(n);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, k_prompt.c_str());
fprintf(stderr, "%s: tokens: [", __func__);
for (const auto & token : k_tokens) {
fprintf(stderr, " %d", token);
}
fprintf(stderr, " ]\n");
fprintf(stderr, "\n");
fprintf(stderr, "%s: listening for a command ...\n", __func__);
fprintf(stderr, "\n");
bool is_running = true;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
const auto t_start = std::chrono::high_resolution_clock::now();
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = k_tokens.data();
wparams.prompt_n_tokens = k_tokens.size();
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
break;
}
// estimate command probability
// NOTE: not optimal
{
const auto * logits = whisper_get_logits(ctx);
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}
float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
std::vector<std::pair<float, int>> probs_id;
double psum = 0.0;
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
probs_id.emplace_back(probs[allowed_tokens[i][0]], i);
for (int j = 1; j < (int) allowed_tokens[i].size(); ++j) {
probs_id.back().first += probs[allowed_tokens[i][j]];
}
probs_id.back().first /= allowed_tokens[i].size();
psum += probs_id.back().first;
}
// normalize
for (auto & p : probs_id) {
p.first /= psum;
}
// sort descending
{
using pair_type = decltype(probs_id)::value_type;
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
// print the commands and the respective probabilities
{
fprintf(stdout, "\n");
for (const auto & cmd : probs_id) {
fprintf(stdout, "%s: %s%-*s%s = %f | ", __func__, "\033[1m", max_len, allowed_commands[cmd.second].c_str(), "\033[0m", cmd.first);
for (int token : allowed_tokens[cmd.second]) {
fprintf(stdout, "'%4s' %f ", whisper_token_to_str(ctx, token), probs[token]);
}
fprintf(stdout, "\n");
}
}
// best command
{
const auto t_end = std::chrono::high_resolution_clock::now();
const float prob = probs_id[0].first;
const int index = probs_id[0].second;
fprintf(stdout, "\n");
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
"\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
(int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
fprintf(stdout, "\n");
}
}
audio.clear();
}
}
return 0;
}
// always-prompt mode
// transcribe the voice into text after valid prompt
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool ask_prompt = true;
float logprob_min = 0.0f;
float logprob_sum = 0.0f;
int n_tokens = 0;
std::vector<float> pcmf32_cur;
const std::string k_prompt = params.prompt;
const int k_prompt_length = get_words(k_prompt).size();
fprintf(stderr, "\n");
fprintf(stderr, "%s: always-prompt mode\n", __func__);
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
ask_prompt = false;
}
{
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
int64_t t_ms = 0;
// detect the commands
audio.get(params.command_ms, pcmf32_cur);
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "", logprob_min, logprob_sum, n_tokens, t_ms));
const auto words = get_words(txt);
std::string prompt;
std::string command;
for (int i = 0; i < (int) words.size(); ++i) {
if (i < k_prompt_length) {
prompt += words[i] + " ";
} else {
command += words[i] + " ";
}
}
const float sim = similarity(prompt, k_prompt);
//debug
//fprintf(stdout, "command size: %i\n", command_length);
if ((sim > 0.7f) && (command.size() > 0)) {
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
}
fprintf(stdout, "\n");
audio.clear();
}
}
}
return 0;
}
// general-purpose mode
// freely transcribe the voice into text
int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;
float logprob_min0 = 0.0f;
float logprob_min = 0.0f;
float logprob_sum0 = 0.0f;
float logprob_sum = 0.0f;
int n_tokens0 = 0;
int n_tokens = 0;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
std::string k_prompt = "Ok Whisper, start listening for commands.";
if (!params.prompt.empty()) {
k_prompt = params.prompt;
}
fprintf(stderr, "\n");
fprintf(stderr, "%s: general-purpose mode\n", __func__);
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
ask_prompt = false;
}
{
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
int64_t t_ms = 0;
if (!have_prompt) {
// wait for activation phrase
audio.get(params.prompt_ms, pcmf32_cur);
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "prompt", logprob_min0, logprob_sum0, n_tokens0, t_ms));
const float p = 100.0f * std::exp(logprob_min0);
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms, p = %.2f%%)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms, p);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
}
} else {
// we have heard the activation phrase, now detect the commands
audio.get(params.command_ms, pcmf32_cur);
//printf("len prompt: %.4f\n", pcmf32_prompt.size() / (float) WHISPER_SAMPLE_RATE);
//printf("len command: %.4f\n", pcmf32_cur.size() / (float) WHISPER_SAMPLE_RATE);
// prepend 3 second of silence
pcmf32_cur.insert(pcmf32_cur.begin(), 3.0f*WHISPER_SAMPLE_RATE, 0.0f);
// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "root", logprob_min, logprob_sum, n_tokens, t_ms));
//const float p = 100.0f * std::exp((logprob - logprob0) / (n_tokens - n_tokens0));
const float p = 100.0f * std::exp(logprob_min);
//fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (size_t n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
if (n >= txt.size()) {
break;
}
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
fprintf(stdout, "%s: DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
if (best_len == 0) {
fprintf(stdout, "%s: WARNING: command not recognized, try again\n", __func__);
} else {
// cut the prompt from the decoded text
const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
}
fprintf(stdout, "\n");
}
audio.clear();
}
}
}
return 0;
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// wait for 1 second to avoid any buffered noise
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
audio.clear();
int ret_val = 0;
if (!params.grammar.empty()) {
auto & grammar = params.grammar_parsed;
if (file_exists(params.grammar.c_str())) {
// read grammar from file
std::ifstream ifs(params.grammar.c_str());
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
grammar = grammar_parser::parse(txt.c_str());
} else {
// read grammar from string
grammar = grammar_parser::parse(params.grammar.c_str());
}
// will be empty (default) if there are parse errors
if (grammar.rules.empty()) {
ret_val = 1;
} else {
fprintf(stderr, "%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, grammar);
fprintf(stderr, "\n");
}
}
if (ret_val == 0) {
if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
}
}
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return ret_val;
}

View File

@@ -0,0 +1,9 @@
enable
disable
cat
dog
apple
red
blue
green
lightblue

View File

@@ -0,0 +1,239 @@
#include "common-ggml.h"
#include <regex>
#include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
{"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
{"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
{"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
{"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
{"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
};
void ggml_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
}
enum ggml_ftype ggml_parse_ftype(const char * str) {
enum ggml_ftype ftype;
if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN;
}
ftype = it->second;
} else {
ftype = (enum ggml_ftype) atoi(str);
}
return ftype;
}
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32;
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
case GGML_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
}
};
if (!ggml_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
return false;
}
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (finp.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[4] = { 1, 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
finp.read (&name[0], length);
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
bool quantize = false;
// check if we should quantize this tensor
for (const auto & s : to_quant) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}
// check if we should skip this tensor
for (const auto & s : to_skip) {
if (std::regex_match(name, std::regex(s))) {
quantize = false;
break;
}
}
// quantize only 2D tensors
quantize &= (n_dims == 2);
if (quantize) {
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
if (ttype == GGML_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ttype = qtype;
} else {
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
fout.write(&name[0], length);
if (quantize) {
work.resize(nelements); // for quantization
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
}
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
{
int64_t sum_all = 0;
for (int i = 0; i < (int) hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < (int) hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
return true;
}

View File

@@ -0,0 +1,18 @@
#pragma once
#include "ggml.h"
#include <fstream>
#include <vector>
#include <string>
enum ggml_ftype ggml_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr);
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip);

View File

@@ -0,0 +1,229 @@
#include "common-sdl.h"
audio_async::audio_async(int len_ms) {
m_len_ms = len_ms;
m_running = false;
}
audio_async::~audio_async() {
if (m_dev_id_in) {
SDL_CloseAudioDevice(m_dev_id_in);
}
}
bool audio_async::init(int capture_id, int sample_rate) {
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
return false;
}
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
{
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
}
}
SDL_AudioSpec capture_spec_requested;
SDL_AudioSpec capture_spec_obtained;
SDL_zero(capture_spec_requested);
SDL_zero(capture_spec_obtained);
capture_spec_requested.freq = sample_rate;
capture_spec_requested.format = AUDIO_F32;
capture_spec_requested.channels = 1;
capture_spec_requested.samples = 1024;
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
audio_async * audio = (audio_async *) userdata;
audio->callback(stream, len);
};
capture_spec_requested.userdata = this;
if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
}
if (!m_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
m_dev_id_in = 0;
return false;
} else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
}
m_sample_rate = capture_spec_obtained.freq;
m_audio.resize((m_sample_rate*m_len_ms)/1000);
return true;
}
bool audio_async::resume() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
return false;
}
if (m_running) {
fprintf(stderr, "%s: already running!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 0);
m_running = true;
return true;
}
bool audio_async::pause() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: already paused!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 1);
m_running = false;
return true;
}
bool audio_async::clear() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return false;
}
{
std::lock_guard<std::mutex> lock(m_mutex);
m_audio_pos = 0;
m_audio_len = 0;
}
return true;
}
// callback to be called by SDL
void audio_async::callback(uint8_t * stream, int len) {
if (!m_running) {
return;
}
size_t n_samples = len / sizeof(float);
if (n_samples > m_audio.size()) {
n_samples = m_audio.size();
stream += (len - (n_samples * sizeof(float)));
}
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_audio_pos + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - m_audio_pos;
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = m_audio.size();
} else {
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
}
}
}
void audio_async::get(int ms, std::vector<float> & result) {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
return;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return;
}
result.clear();
{
std::lock_guard<std::mutex> lock(m_mutex);
if (ms <= 0) {
ms = m_len_ms;
}
size_t n_samples = (m_sample_rate * ms) / 1000;
if (n_samples > m_audio_len) {
n_samples = m_audio_len;
}
result.resize(n_samples);
int s0 = m_audio_pos - n_samples;
if (s0 < 0) {
s0 += m_audio.size();
}
if (s0 + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - s0;
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
} else {
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
}
}
}
bool sdl_poll_events() {
SDL_Event event;
while (SDL_PollEvent(&event)) {
switch (event.type) {
case SDL_QUIT:
{
return false;
} break;
default:
break;
}
}
return true;
}

View File

@@ -0,0 +1,49 @@
#pragma once
#include <SDL.h>
#include <SDL_audio.h>
#include <atomic>
#include <cstdint>
#include <vector>
#include <mutex>
//
// SDL Audio capture
//
class audio_async {
public:
audio_async(int len_ms);
~audio_async();
bool init(int capture_id, int sample_rate);
// start capturing audio via the provided SDL callback
// keep last len_ms seconds of audio in a circular buffer
bool resume();
bool pause();
bool clear();
// callback to be called by SDL
void callback(uint8_t * stream, int len);
// get audio data from the circular buffer
void get(int ms, std::vector<float> & audio);
private:
SDL_AudioDeviceID m_dev_id_in = 0;
int m_len_ms = 0;
int m_sample_rate = 0;
std::atomic_bool m_running;
std::mutex m_mutex;
std::vector<float> m_audio;
size_t m_audio_pos = 0;
size_t m_audio_len = 0;
};
// Return false if need to quit
bool sdl_poll_events();

View File

@@ -0,0 +1,817 @@
#define _USE_MATH_DEFINES // for M_PI
#include "common.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// Function to check if the next argument exists
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
return argv[++i];
} else {
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-n" || arg == "--n_predict") {
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-np" || arg == "--n_parallel") {
params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--top_k") {
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--top_p") {
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--temp") {
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--repeat-last-n") {
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--repeat-penalty") {
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-b" || arg == "--batch_size") {
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-c" || arg == "--context") {
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "-m" || arg == "--model") {
params.model = get_next_arg(i, argc, argv, arg, params);
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "-ip" || arg == "--interactive-port") {
params.interactive = true;
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else if (arg == "-f" || arg == "--file") {
get_next_arg(i, argc, argv, arg, params);
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-tt" || arg == "--token_test") {
params.token_test = get_next_arg(i, argc, argv, arg, params);
}
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " load prompt from a file\n");
fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
fprintf(stderr, " test tokenization\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, "");
}
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
std::string result = s;
size_t pos = 0;
while ((pos = result.find(from, pos)) != std::string::npos) {
result.replace(pos, from.length(), to);
pos += to.length();
}
return result;
}
void gpt_vocab::add_special_token(const std::string & token) {
special_tokens.push_back(token);
}
std::map<std::string, int32_t> json_parse(const std::string & fname) {
std::map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
std::string convert_to_utf8(const std::wstring & input) {
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
return converter.to_bytes(input);
}
std::wstring convert_to_wstring(const std::string & input) {
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
return converter.from_bytes(input);
}
void gpt_split_words(std::string str, std::vector<std::string>& words) {
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
const std::regex re(pattern);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
// Generate the subpattern from the special_tokens vector if it's not empty
if (!vocab.special_tokens.empty()) {
const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
std::string special_tokens_subpattern;
for (const auto & token : vocab.special_tokens) {
if (!special_tokens_subpattern.empty()) {
special_tokens_subpattern += "|";
}
special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
}
std::regex re(special_tokens_subpattern);
std::smatch m;
// Split the text by special tokens.
while (std::regex_search(str, m, re)) {
// Split the substrings in-between special tokens into words.
gpt_split_words(m.prefix(), words);
// Add matched special tokens as words.
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
// Remaining text without special tokens will be handled below.
}
gpt_split_words(str, words);
}
// find the longest token that forms each word in words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
for (int i = 0; i < (int) word.size(); ){
for (int j = word.size() - 1; j >= i; j--){
auto cand = word.substr(i, j-i+1);
auto it = vocab.token_to_id.find(cand);
if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
tokens.push_back(it->second);
i = j + 1;
break;
}
else if (j == i){ // word.substr(i, 1) has no matching
fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
i++;
}
}
}
}
return tokens;
}
std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
std::vector<gpt_vocab::id> output;
std::stringstream ss(input);
std::string token;
while (std::getline(ss, token, delimiter)) {
output.push_back(std::stoi(token));
}
return output;
}
std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
if (fpath_test.empty()){
fprintf(stderr, "%s : No test file found.\n", __func__);
return std::map<std::string, std::vector<gpt_vocab::id>>();
}
std::map<std::string, std::vector<gpt_vocab::id>> tests;
auto fin = std::ifstream(fpath_test, std::ios_base::in);
const char * delimeter = " => ";
const char del_tok = ',';
std::string line;
while (std::getline(fin, line)) {
size_t delimiterPos = line.find(delimeter);
if (delimiterPos != std::string::npos) {
std::string text = line.substr(0, delimiterPos);
std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter));
tests[text] = parse_tokens_from_string(s_tokens, del_tok);
}
}
return tests;
}
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
size_t n_fails = 0;
for (const auto & test : tests) {
std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
if (tokens != test.second){
n_fails++;
// print out failure cases
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
fprintf(stderr, "%s : tokens in hf: ", __func__);
for (const auto & t : test.second) {
fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : tokens in ggml: ", __func__);
for (const auto & t : tokens) {
fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
}
fprintf(stderr, "\n");
}
}
fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
const gpt_vocab & vocab,
const float * logits,
const int32_t * last_n_tokens_data,
size_t last_n_tokens_data_size,
int top_k,
double top_p,
double temp,
int repeat_last_n,
float repeat_penalty,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
const auto * plogits = logits;
const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
if (temp <= 0) {
// select the token with the highest logit directly
float max_logit = plogits[0];
gpt_vocab::id max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (plogits[i] > max_logit) {
max_logit = plogits[i];
max_id = i;
}
}
return max_id;
}
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
// printf("\n");
// for (int i = 0; i < (int) probs.size(); i++) {
// for (int i = 0; i < 10; i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
// }
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
if (fname == "-") {
{
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0) {
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return false;
}
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
return false;
}
if (stereo && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
return false;
}
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
return false;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
return false;
}
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (stereo) {
// convert to stereo, float
pcmf32s.resize(2);
pcmf32s[0].resize(n);
pcmf32s[1].resize(n);
for (uint64_t i = 0; i < n; i++) {
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
}
}
return true;
}
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
const float alpha = dt / (rc + dt);
float y = data[0];
for (size_t i = 1; i < data.size(); i++) {
y = alpha * (y + data[i] - data[i - 1]);
data[i] = y;
}
}
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
const int n_samples = pcmf32.size();
const int n_samples_last = (sample_rate * last_ms) / 1000;
if (n_samples_last >= n_samples) {
// not enough samples - assume no speech
return false;
}
if (freq_thold > 0.0f) {
high_pass_filter(pcmf32, freq_thold, sample_rate);
}
float energy_all = 0.0f;
float energy_last = 0.0f;
for (int i = 0; i < n_samples; i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= n_samples - n_samples_last) {
energy_last += fabsf(pcmf32[i]);
}
}
energy_all /= n_samples;
energy_last /= n_samples_last;
if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}
if (energy_last > vad_thold*energy_all) {
return false;
}
return true;
}
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-i" || arg == "--inp") {
params.fname_inp = argv[++i];
} else if (arg == "-o" || arg == "--out") {
params.fname_out = argv[++i];
} else if (arg == "-h" || arg == "--help") {
sam_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
sam_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -i FNAME, --inp FNAME\n");
fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
fprintf(stderr, " -o FNAME, --out FNAME\n");
fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}

View File

@@ -0,0 +1,279 @@
// Various helper functions and utilities
#pragma once
#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
#include <ctime>
#include <fstream>
#define COMMON_SAMPLE_RATE 16000
//
// GPT CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
int32_t n_parallel = 1; // number of parallel streams
int32_t n_batch = 8; // batch size for prompt processing
int32_t n_ctx = 2048; // context size (this is the KV cache max size)
int32_t n_gpu_layers = 0; // number of layers to offlload to the GPU
bool ignore_eos = false; // ignore EOS token when generating text
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t repeat_last_n = 64;
float repeat_penalty = 1.00f;
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
std::string prompt = "";
std::string token_test = "";
bool interactive = false;
int32_t interactive_port = -1;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::string trim(const std::string & s);
std::string replace(
const std::string & s,
const std::string & from,
const std::string & to);
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
void add_special_token(const std::string & token);
};
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
std::string convert_to_utf8(const std::wstring & input);
std::wstring convert_to_wstring(const std::string & input);
void gpt_split_words(std::string str, std::vector<std::string>& words);
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// test outputs of gpt_tokenize
//
// - compare with tokens generated by the huggingface tokenizer
// - test cases are chosen based on the model's main language (under 'prompt' directory)
// - if all sentences are tokenized identically, print 'All tests passed.'
// - otherwise, print sentence, huggingface tokens, ggml tokens
//
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
// TODO: temperature is not implemented
//
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng);
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
const gpt_vocab & vocab,
const float * logits,
const int32_t * last_n_tokens_data,
size_t last_n_tokens_data_size,
int top_k,
double top_p,
double temp,
int repeat_last_n,
float repeat_penalty,
std::mt19937 & rng);
//
// Audio utils
//
// Read WAV audio file and store the PCM data into pcmf32
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
const std::string & fname,
std::vector<float> & pcmf32,
std::vector<std::vector<float>> & pcmf32s,
bool stereo);
// Write PCM data into WAV audio file
class wav_writer {
private:
std::ofstream file;
uint32_t dataSize = 0;
std::string wav_filename;
bool write_header(const uint32_t sample_rate,
const uint16_t bits_per_sample,
const uint16_t channels) {
file.write("RIFF", 4);
file.write("\0\0\0\0", 4); // Placeholder for file size
file.write("WAVE", 4);
file.write("fmt ", 4);
const uint32_t sub_chunk_size = 16;
const uint16_t audio_format = 1; // PCM format
const uint32_t byte_rate = sample_rate * channels * bits_per_sample / 8;
const uint16_t block_align = channels * bits_per_sample / 8;
file.write(reinterpret_cast<const char *>(&sub_chunk_size), 4);
file.write(reinterpret_cast<const char *>(&audio_format), 2);
file.write(reinterpret_cast<const char *>(&channels), 2);
file.write(reinterpret_cast<const char *>(&sample_rate), 4);
file.write(reinterpret_cast<const char *>(&byte_rate), 4);
file.write(reinterpret_cast<const char *>(&block_align), 2);
file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
file.write("data", 4);
file.write("\0\0\0\0", 4); // Placeholder for data size
return true;
}
// It is assumed that PCM data is normalized to a range from -1 to 1
bool write_audio(const float * data, size_t length) {
for (size_t i = 0; i < length; ++i) {
const int16_t intSample = data[i] * 32767;
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
dataSize += sizeof(int16_t);
}
if (file.is_open()) {
file.seekp(4, std::ios::beg);
uint32_t fileSize = 36 + dataSize;
file.write(reinterpret_cast<char *>(&fileSize), 4);
file.seekp(40, std::ios::beg);
file.write(reinterpret_cast<char *>(&dataSize), 4);
file.seekp(0, std::ios::end);
}
return true;
}
bool open_wav(const std::string & filename) {
if (filename != wav_filename) {
if (file.is_open()) {
file.close();
}
}
if (!file.is_open()) {
file.open(filename, std::ios::binary);
wav_filename = filename;
dataSize = 0;
}
return file.is_open();
}
public:
bool open(const std::string & filename,
const uint32_t sample_rate,
const uint16_t bits_per_sample,
const uint16_t channels) {
if (open_wav(filename)) {
write_header(sample_rate, bits_per_sample, channels);
} else {
return false;
}
return true;
}
bool close() {
file.close();
return true;
}
bool write(const float * data, size_t length) {
return write_audio(data, length);
}
~wav_writer() {
if (file.is_open()) {
file.close();
}
}
};
// Apply a high-pass frequency filter to PCM audio
// Suppresses frequencies below cutoff Hz
void high_pass_filter(
std::vector<float> & data,
float cutoff,
float sample_rate);
// Basic voice activity detection (VAD) using audio energy adaptive threshold
bool vad_simple(
std::vector<float> & pcmf32,
int sample_rate,
int last_ms,
float vad_thold,
float freq_thold,
bool verbose);
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);
//
// SAM argument parsing
//
struct sam_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
std::string fname_inp = "img.jpg";
std::string fname_out = "img.out";
};
bool sam_params_parse(int argc, char ** argv, sam_params & params);
void sam_print_usage(int argc, char ** argv, const sam_params & params);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,60 @@
#!/bin/bash
# Simple tool to record audio from the microphone and generate a karaoke video
# Usage:
#
# cd whisper.cpp
# make
#
# ./examples/generate-karaoke.sh [model] [step_ms]
#
# Press Ctrl+C to stop recording
#
executable="./main"
model="base.en"
model_path="models/ggml-$model.bin"
# require sox and ffmpeg to be installed
if ! command -v sox &> /dev/null
then
echo "sox could not be found"
exit 1
fi
if ! command -v ffmpeg &> /dev/null
then
echo "ffmpeg could not be found"
exit 2
fi
if [ ! -f "$executable" ]; then
echo "'$executable' does not exist. Please build it first."
exit 3
fi
if [ ! -f "$model_path" ]; then
echo "'$model_path' does not exist. Please download it first."
exit 4
fi
# record some raw audio
sox -d rec.wav
# resample to 16kHz
ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1
# run Whisper
echo "Processing ..."
./main -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
# generate Karaoke video
echo "Generating video ..."
source rec16.wav.wts > /dev/null 2>&1
# play the video
echo "Playing ./rec16.wav.mp4 ..."
ffplay -loglevel 0 -autoexit ./rec16.wav.mp4
echo "Done"
exit 0

View File

@@ -0,0 +1,423 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from whisper.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<whisper_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({WHISPER_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum whisper_gretype start_type = WHISPER_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = WHISPER_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum whisper_gretype type = last_sym_start < out_elements.size()
? WHISPER_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({WHISPER_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<whisper_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({WHISPER_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({WHISPER_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({WHISPER_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<whisper_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({WHISPER_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({WHISPER_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
bool is_char_element(whisper_grammar_element elem) {
switch (elem.type) {
case WHISPER_GRETYPE_CHAR: return true;
case WHISPER_GRETYPE_CHAR_NOT: return true;
case WHISPER_GRETYPE_CHAR_ALT: return true;
case WHISPER_GRETYPE_CHAR_RNG_UPPER: return true;
default: return false;
}
}
void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case WHISPER_GRETYPE_END: fprintf(file, "END"); break;
case WHISPER_GRETYPE_ALT: fprintf(file, "ALT"); break;
case WHISPER_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case WHISPER_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case WHISPER_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case WHISPER_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case WHISPER_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
}
switch (elem.type) {
case WHISPER_GRETYPE_END:
case WHISPER_GRETYPE_ALT:
case WHISPER_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case WHISPER_GRETYPE_CHAR:
case WHISPER_GRETYPE_CHAR_NOT:
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
case WHISPER_GRETYPE_CHAR_ALT:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != WHISPER_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with WHISPER_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
whisper_grammar_element elem = rule[i];
switch (elem.type) {
case WHISPER_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case WHISPER_GRETYPE_ALT:
fprintf(file, "| ");
break;
case WHISPER_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case WHISPER_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"WHISPER_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case WHISPER_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"WHISPER_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case WHISPER_GRETYPE_CHAR_ALT:
case WHISPER_GRETYPE_CHAR_RNG_UPPER:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const whisper_grammar_element *> parse_state::c_rules() const{
std::vector<const whisper_grammar_element *> ret;
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}

View File

@@ -0,0 +1,29 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by whisper.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "whisper.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<whisper_grammar_element>> rules;
std::vector<const whisper_grammar_element *> c_rules() const;
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}

View File

@@ -0,0 +1,194 @@
// Common Javascript functions used by the examples
function convertTypedArray(src, type) {
var buffer = new ArrayBuffer(src.byteLength);
var baseView = new src.constructor(buffer).set(src);
return new type(buffer);
}
var printTextarea = (function() {
var element = document.getElementById('output');
if (element) element.value = ''; // clear browser cache
return function(text) {
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
console.log(text);
if (element) {
element.value += text + "\n";
element.scrollTop = element.scrollHeight; // focus on bottom
}
};
})();
async function clearCache() {
if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
indexedDB.deleteDatabase(dbName);
location.reload();
}
}
// fetch a remote file from remote URL using the Fetch API
async function fetchRemote(url, cbProgress, cbPrint) {
cbPrint('fetchRemote: downloading with fetch()...');
const response = await fetch(
url,
{
method: 'GET',
headers: {
'Content-Type': 'application/octet-stream',
},
}
);
if (!response.ok) {
cbPrint('fetchRemote: failed to fetch ' + url);
return;
}
const contentLength = response.headers.get('content-length');
const total = parseInt(contentLength, 10);
const reader = response.body.getReader();
var chunks = [];
var receivedLength = 0;
var progressLast = -1;
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
chunks.push(value);
receivedLength += value.length;
if (contentLength) {
cbProgress(receivedLength/total);
var progressCur = Math.round((receivedLength / total) * 10);
if (progressCur != progressLast) {
cbPrint('fetchRemote: fetching ' + 10*progressCur + '% ...');
progressLast = progressCur;
}
}
}
var position = 0;
var chunksAll = new Uint8Array(receivedLength);
for (var chunk of chunks) {
chunksAll.set(chunk, position);
position += chunk.length;
}
return chunksAll;
}
// load remote data
// - check if the data is already in the IndexedDB
// - if not, fetch it from the remote URL and store it in the IndexedDB
function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
if (!navigator.storage || !navigator.storage.estimate) {
cbPrint('loadRemote: navigator.storage.estimate() is not supported');
} else {
// query the storage quota and print it
navigator.storage.estimate().then(function (estimate) {
cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
});
}
// check if the data is already in the IndexedDB
var rq = indexedDB.open(dbName, dbVersion);
rq.onupgradeneeded = function (event) {
var db = event.target.result;
if (db.version == 1) {
var os = db.createObjectStore('models', { autoIncrement: false });
cbPrint('loadRemote: created IndexedDB ' + db.name + ' version ' + db.version);
} else {
// clear the database
var os = event.currentTarget.transaction.objectStore('models');
os.clear();
cbPrint('loadRemote: cleared IndexedDB ' + db.name + ' version ' + db.version);
}
};
rq.onsuccess = function (event) {
var db = event.target.result;
var tx = db.transaction(['models'], 'readonly');
var os = tx.objectStore('models');
var rq = os.get(url);
rq.onsuccess = function (event) {
if (rq.result) {
cbPrint('loadRemote: "' + url + '" is already in the IndexedDB');
cbReady(dst, rq.result);
} else {
// data is not in the IndexedDB
cbPrint('loadRemote: "' + url + '" is not in the IndexedDB');
// alert and ask the user to confirm
if (!confirm(
'You are about to download ' + size_mb + ' MB of data.\n' +
'The model data will be cached in the browser for future use.\n\n' +
'Press OK to continue.')) {
cbCancel();
return;
}
fetchRemote(url, cbProgress, cbPrint).then(function (data) {
if (data) {
// store the data in the IndexedDB
var rq = indexedDB.open(dbName, dbVersion);
rq.onsuccess = function (event) {
var db = event.target.result;
var tx = db.transaction(['models'], 'readwrite');
var os = tx.objectStore('models');
var rq = null;
try {
var rq = os.put(data, url);
} catch (e) {
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
cbCancel();
return;
}
rq.onsuccess = function (event) {
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
cbReady(dst, data);
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB');
cbCancel();
};
};
}
});
}
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to get data from the IndexedDB');
cbCancel();
};
};
rq.onerror = function (event) {
cbPrint('loadRemote: failed to open IndexedDB');
cbCancel();
};
rq.onblocked = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: blocked');
cbCancel();
};
rq.onabort = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: abort');
cbCancel();
};
}

View File

@@ -0,0 +1,112 @@
#!/bin/bash
#
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
# Idea by @semiformal-net
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
#
set -eo pipefail
url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
fmt=aac # the audio format extension of the stream (TODO: auto detect)
step_s=30
model="base.en"
check_requirements()
{
if ! command -v ./main &>/dev/null; then
echo "whisper.cpp main executable is required (make)"
exit 1
fi
if ! command -v ffmpeg &>/dev/null; then
echo "ffmpeg is required (https://ffmpeg.org)"
exit 1
fi
}
check_requirements
if [ -z "$1" ]; then
echo "Usage: $0 stream_url [step_s] [model]"
echo ""
echo " Example:"
echo " $0 $url $step_s $model"
echo ""
echo "No url specified, using default: $url"
else
url="$1"
fi
if [ -n "$2" ]; then
step_s="$2"
fi
if [ -n "$3" ]; then
model="$3"
fi
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
running=1
trap "running=0" SIGINT SIGTERM
printf "[+] Transcribing stream with model '$model', step_s $step_s (press Ctrl+C to stop):\n\n"
# continuous stream in native fmt (this file will grow forever!)
ffmpeg -loglevel quiet -y -re -probesize 32 -i $url -c copy /tmp/whisper-live0.${fmt} &
if [ $? -ne 0 ]; then
printf "Error: ffmpeg failed to capture audio stream\n"
exit 1
fi
printf "Buffering audio. Please wait...\n\n"
sleep $(($step_s))
# do not stop script on error
set +e
i=0
SECONDS=0
while [ $running -eq 1 ]; do
# extract the next piece from the main file above and transcode to wav. -ss sets start time and nudges it by -0.5s to catch missing words (??)
err=1
while [ $err -ne 0 ]; do
if [ $i -gt 0 ]; then
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s-1)).5 -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
else
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.${fmt} -y -ar 16000 -ac 1 -c:a pcm_s16le -ss $(($i*$step_s)) -t $step_s /tmp/whisper-live.wav 2> /tmp/whisper-live.err
fi
err=$(cat /tmp/whisper-live.err | wc -l)
done
./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
sleep 1
done
((i=i+1))
done
killall -v ffmpeg
killall -v main

View File

@@ -0,0 +1,9 @@
if (WHISPER_SDL2)
# stream
set(TARGET lsp)
add_executable(${TARGET} lsp.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

View File

@@ -0,0 +1,104 @@
# Language Server
This example consists of a simple language server to expose both unguided
and guided (command) transcriptions by sending json messages over stdout/stdin
as well as a rather robust vim plugin that makes use of the language server.
## Vim plugin quick start
Compile the language server with
```bash
make lsp
```
Install the plugin itself by copying or symlinking whisper.vim into ~/.vim/autoload/
In your vimrc, set the path of your whisper.cpp directory and optionally add some keybinds.
```vim
let g:whisper_dir = "~/whisper.cpp"
" Start listening for commands when Ctrl - g is pressed in normal mode
nnoremap <C-G> call whisper#requestCommands()<CR>
" Start unguided transcription when Ctrl - g is pressed in insert mode
inoremap <C-G> <Cmd>call whisper#doTranscription()<CR>
```
## Vim plugin usage
The vim plugin was designed to closely follow the mnemonics of vim
`s:spoken_dict` is used to translate keys to their spoken form.
Keys corresponding to a string use that spoken value normally and when a motion is expected, but use the key itself when a character is expected.
Keys corresponding to a dict, like `i`, can have manual difinitions given to each possible commandset.
0 is normal (insert), 1 is motion (inside), 2 is it's usage as a single key ([till] i), and 3 is it's usage in an area selection (s -> [around] sentence)
Some punctuation items, like `-` are explicitly given pronunciations to prevent them from being picked as punctuation instead of an actual command word.
Not all commands will tokenize to a single token and this can interfere with interpretation. "yank" as an example, takes multiple tokens and correspondingly, will give more accurate detection when only the first "ya" is used. While it could be changed to something else that is a single token (copy), value was placed on maintaining vim mnemonics.
Commands that would normally move the editor into insert mode (insert, append, open, change) will begin unguided transcription.
Unguided transcription will end when a speech segment ends in exit.
Presence of punctuation can be designated by whether or not you add a pause between the previous speech segment and exit.
Exiting only occurs if exit is the last word, so "Take the first exit on your right" would not cause transcription to end.
After a command is evaluated, the plugin will continue listening for the next command.
While in command mode, "Exit" will end listening.
A best effort approach is taken to keep track of audio that is recorded while a previous chunk is still processing and immediately interpret it afterwards, but the current voice detection still needs a fairly sizable gap to determine when a command has been spoken.
Log information is sent to a special `whisper_log` buffer and can be accessed with
```vim
:e whisper_log
```
## Vim plugin configuration
`g:whisper_dir`
A full path to the whisper.cpp repo. It can be expanded in the definition like so:
```vim
let g:whisper_dir = expand("~/whisper.cpp/")
```
(The WHISPER_CPP_HOME environment variable is also checked for users of the existing whisper.nvim script)
`g:whisper_lsp_path`
Can be used to manually set the path to the language server.
If not defined, it will be inferred from the above whisper_dir
`g:whisper_model_path`
A full path to the model to load. If not defined, it will default to ggml-base.en.bin
`g:whisper_user_commands`
A dictionary of spoken commands that correspond to either strings or funcrefs.
This can be used to create connections with other user plugins, for example
```vim
let g:whisper_user_commands = {"gen": "llama#doLlamaGen"}
```
will trigger the llama.cpp plugin to begin generation when "gen" is spoken
## Language server methods
`registerCommandset`
`params` is a list of strings that should be checked for with this commandset. The server prepends a space to these strings before tokenizing.
Responds with
`result.index` an integer index for the commandset registered, which should be included when initiating a guided transcription to select this commandset.
Will return an error if any of the commands in the commandset have duplicate tokenizations
`guided`
`params.commandset_index` An index returned by a corresponding commandset registration. If not set, the most recently registered commandset is used.
`params.timestamp` A positive unsigned integer which designates a point in time which audio should begin processing from. If left blank, the start point of audio processing will be the moment the message is recieved. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.command_index` The numerical index (starting from 0) of the detected command in the selected commandset
`result.command_text` A string containing the command as provided in the commandset
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
`unguided`
`params.no_context` Sets the corresponding whisper `no_context` param. Defaults to true. Might provide more accurate results for consecutive unguided transcriptions if those after the first are set to false.
`params.prompt` If provided, sets the initial prompt used during transcription.
`params.timestamp` A positive unsigned integer which designates a point in time which audio should begin processing from. If left blank, the start point of audio processing will be the moment the message is recieved. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.transcription` A string containing the transcribed text. N.B. This will almost always start with a space due to how text is tokenized.
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,463 @@
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "json.hpp"
#include <iostream>
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <deque>
#include <set>
using json = nlohmann::json;
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
};
struct command {
std::vector<whisper_token> tokens;
std::string plaintext;
};
struct commandset {
std::vector<struct command> commands;
std::vector<whisper_token> prompt_tokens;
// TODO: Store longest command?
// Multi-token commands should have probabilities of subsequent logits
// given that the prior logit is correct.
// In this case, all commands must be iterated.
// This however, is likely highly involved as different tokens
// almost certainly have different spoken lengths
// It would also have performance implications equivalent to a beam search
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now;
if (jparams.contains("timestamp")) {
start_time = jparams.at("timestamp");
}
if(time_now - start_time < 500) {
//wait for a backlog of audio
std::this_thread::sleep_for(milliseconds(500 - (time_now - start_time)));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
} else if (time_now - start_time > 1000) {
audio.get(time_now-start_time, pcmf32);
size_t max_offset = pcmf32.size() - WHISPER_SAMPLE_RATE;
for(size_t offset=0;offset < max_offset;offset+=WHISPER_SAMPLE_RATE/10) {
std::vector<float> audio_chunk(&pcmf32[offset], &pcmf32[offset+WHISPER_SAMPLE_RATE]);
if(::vad_simple(audio_chunk, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
pcmf32.resize(offset+WHISPER_SAMPLE_RATE);
if (offset*1000/WHISPER_SAMPLE_RATE+1000 > maxlength_ms) {
//remove samples from the beginning
pcmf32.erase(pcmf32.begin(),pcmf32.end()-(maxlength_ms*WHISPER_SAMPLE_RATE/1000));
fprintf(stderr, "Shortened samples");
}
return start_time + offset*1000/WHISPER_SAMPLE_RATE+1000;
}
}
}
size_t window_duration = std::max((uint64_t)1000, time_now-start_time);
audio.get(window_duration, pcmf32);
while (!::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
std::this_thread::sleep_for(milliseconds(100));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
window_duration = std::max((uint64_t)1000,time_now-start_time);
audio.get(window_duration, pcmf32);
}
if (time_now - start_time > maxlength_ms) {
audio.get(maxlength_ms, pcmf32);
} else {
audio.get(time_now - start_time, pcmf32);
}
return time_now;
}
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
if (jparams.contains("prompt")) {
// unlikely to see much use. Under normal circumstances, no_context would be set to false
std::string prompt = jparams.at("prompt");
prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), 1024);
prompt_tokens.resize(n);
wparams.prompt_tokens = prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.size();
}
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = jparams.value("no_context", true);
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}
};
}
std::string result = whisper_full_get_segment_text(ctx,0);
return json {
{"transcription", result},
{"timestamp", unprocessed_audio_timestamp}
};
}
// command-list mode
// guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
fprintf(stderr, "%s: Speech detected! Processing ...\n", __func__);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
wparams.prompt_tokens = cs.prompt_tokens.data();
wparams.prompt_n_tokens = cs.prompt_tokens.size();
// TODO: properly expose as option
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}//TODO: format string (sprintf?)
};
}
// estimate command probability
// NOTE: not optimal
{
const auto * logits = whisper_get_logits(ctx);
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}
float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
std::vector<std::pair<float, int>> probs_id;
// In my testing, the most verbose token is always the desired.
// TODO: Trim commandset struct once efficacy has been verified
for (int i = 0; i < (int) cs.commands.size(); ++i) {
probs_id.emplace_back(probs[cs.commands[i].tokens[0]], i);
}
// sort descending
{
using pair_type = decltype(probs_id)::value_type;
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
int id = probs_id[0].second;
return json{
{"command_index", id},
{"command_text", cs.commands[id].plaintext},
{"timestamp", unprocessed_audio_timestamp},
};
}
}
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision
struct commandset cs;
std::string k_prompt = " select one from the available words: ";
std::set<whisper_token> token_set;
whisper_token tokens[32];
for (std::string s : jparams) {
std::vector<whisper_token> token_vec;
// The existing command implementation uses a nested for loop to tokenize single characters
// I fail to see the purpose of this when ' a' has a wholly different pronunciation than the start of ' apple'
const int n = whisper_tokenize(ctx, (" " + s).c_str(), tokens, 32);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, s.c_str());
return 3;
}
token_vec.push_back(tokens[0]);
if (!token_set.insert(tokens[0]).second) {
fprintf(stderr, "%s: warning: %s is a duplicate of an existing token\n", __func__, s.c_str());
throw json{
{"code",-31000},
{"message", "Duplicate token in token set: " + s}
};
}
if (n > 1) {// empty string if n=0? Should never occur
fprintf(stderr, "%s: error: command is more than a single token: %s\n", __func__, s.c_str());
}
struct command command = {token_vec, s};
cs.commands.push_back(command);
k_prompt += s;
}
k_prompt = k_prompt.substr(0,k_prompt.length()-2) + ". Selected word:";
cs.prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, k_prompt.c_str(), cs.prompt_tokens.data(), 1024);
cs.prompt_tokens.resize(n);
// prepare response
int index = commandset_list.size();
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this a another pass once everything else is implemented,
// but for now, it's unsupported
throw json {
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
}
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification
json id = body.at("id");
try {
std::string version = body.at("jsonrpc");
if (version != "2.0") {
// unsupported version
throw json{
{"code", -3260},
{"message", "invalid jsonrpc version"}
};
}
std::string method = body.at("method");
json jparams = json{{"dummy", "dummy"}};
if (body.contains("params"))
jparams = body.at("params");
json res;
// TODO: be consistent about argument order
fprintf(stderr, "Dispatching a job\n");
if (method == "unguided") { res = unguided_transcription(ctx, audio, jparams, params); }
else if (method == "guided") { res = guided_transcription(ctx, audio, params, jparams, commandset_list); }
else if (method == "seek") { res = seek(ctx, audio, jparams); }
else if (method == "registerCommandset") { res = register_commandset(ctx, jparams, commandset_list); }
else if (method == "echo") { res = jparams; }
return json{
{"jsonrpc", "2.0"},
{"result", res},
{"id", id}
};
} catch(json ex) {
return json {
{"jsonrpc", "2.0"},
{"error", ex},
{"id", id}
};
}
}
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list;
while (true) {
// For eventual cancellation support, shouldn't block if job exists
if (std::cin.rdbuf()->in_avail() > 22 || jobqueue.size() == 0) {
int content_length;
if (scanf("Content-Length: %d", &content_length) != 1) {
fprintf(stderr, "Could not read input: %d", std::cin.peek());
return;
}
// scanf leaves the new lines intact
std::cin.ignore(2);
if (std::cin.peek() != 13) {
// Content-Type. jsonrpc necessitates utf8.
std::cin.ignore(200,10);
}
std::cin.ignore(2);
// A message is being sent and blocking is acceptable
std::string content(content_length,'\0');
std::cin.read(&content[0], content_length);
json job = json::parse(content);
// TODO: Some messages(cancellation) should skip queue here
if (job.is_array()) {
// response must also be batched. Will implement later
// for (subjob : job.begin())
// TODO: At the very least respond with an unsupported error.
} else {
jobqueue.push_back(job);
}
}
assert(jobqueue.size() > 0);
json job = jobqueue.front();
json resp = parse_job(job, ctx, audio, params, commandset_list);
if (resp != "unfinished") {
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
std::cout.flush();
}
}
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// TODO: Investigate why this is required. An extra second of startup latency is not great
// wait for 1 second to avoid any buffered noise
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
audio.clear();
// TODO: consider some sort of indicator to designate loading has finished?
// Potentially better for the client to just start with a non-blocking message (register commands)
process_loop(ctx, audio, params);
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@@ -0,0 +1,362 @@
if !exists("g:whisper_dir")
let g:whisper_dir = expand($WHISPER_CPP_HOME)
if g:whisper_dir == ""
echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
endif
endif
if !exists("g:whisper_lsp_path")
let g:whisper_lsp_path = g:whisper_dir .. "lsp"
if !filereadable(g:whisper_lsp_path)
echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
throw "Executable not found"
endif
endif
if !exists("g:whisper_model_path")
" TODO: allow custom paths relative to the repo dir
let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
if !filereadable(g:whisper_model_path)
echoerr "Could not find model at: " .. g:whisper_model_path
throw "Model not found"
endif
endif
let s:output_buffer = bufnr("whisper_log", v:true)
call setbufvar(s:output_buffer,"&buftype","nofile")
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
" For faster execution. TODO: server load multiple models/run multiple servers?
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]
" requestCommands([params_dict])
func whisper#requestCommands(...)
let l:req = {"method": "guided", "params": {"commandset_index": 0}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
endfunction
" doTranscription([params_dict])
func whisper#doTranscription(...)
let l:req = {"method": "unguided", "params": {}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
endfunction
" For testing
func whisper#uppertest(cha)
echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
endfunction
" (upper, exit, count, motion, command, insert/append, save run) "base"
" (upper, exit, count, motion, command, inside/around) "motion/visual"
" (upper, exit, count, motion, line, inside/around) "command already entered"
" (upper, exit, key, ) "from/till"
" upper and lower keys is used to translate between cases with tr
" Must be sunchronized
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
let s:c_count = split("1234567890\"",'\zs')
let s:c_command = split("ryuogpdxcv.iam", '\zs')
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
let s:c_area = split("wsp])>t}\"'",'\zs')
"Special commands.
let s:c_special_always = ["exit", "upper"]
let s:c_special_normal = ["save", "run", "space"]
" If not in dict, key is spoken word,
" If key resolves to string, value is used for normal/motion, but key for chars
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"}, "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3: "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis", 3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}
" Give this another pass. This seems overly hacky even if it's functional
let s:sub_tran_msg = ""
func s:subTranProg(msg)
if s:sub_tran_msg != ""
let s:sub_tran_msg = s:sub_tran_msg .. a:msg
if mode() !=? 'v'
exe "normal" "u" .. s:sub_tran_msg
endif
else
if s:command_backlog == ""
" this should not occur
call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
let s:command_backlog = "a"
endif
if a:msg[0] == ' '
let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
else
let s:sub_tran_msg = s:command_backlog .. a:msg
endif
if mode() !=? 'v'
exe "normal" s:sub_tran_msg
endif
endif
call appendbufline(s:output_buffer, "$", s:sub_tran_msg .. ":" .. string(a:msg ))
endfunction
func s:subTranFinish(params, timestamp)
let s:repeat_command = s:sub_tran_msg
" Visual selection is lot if used with streaming, so streaming of partial
" transcriptions is disabled in visual mode
if mode() ==? 'v'
exe "normal" s:sub_tran_msg
endif
let s:sub_tran_msg = ""
let s:command_backlog = ""
exe "normal a\<C-G>u"
let l:params = a:params
let l:params.timestamp = a:timestamp
if exists("l:params.commandset_index")
unlet l:params.commandset_index
endif
call whisper#requestCommands(a:params)
endfunction
func s:logCallback(channel, msg)
call appendbufline(s:output_buffer,"$",a:msg)
endfunction
func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
let l:tr = a:msg.result.transcription
let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
" The worst case I've observed so far is " Exit.", which is 6 characters
if l:ex_ind != -1
call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
call a:finishedCallback(a:msg.result.timestamp)
else
call a:progressCallback(l:tr)
let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
endif
endfunc
func s:insertText(msg)
exe "normal a" .. a:msg
endfunction
func s:endTranscription(timestamp)
call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
endfunction
" If a command does not include a whole actionable step, attempting to execute
" it discards the remainder of things. There is likely a simpler solution,
" but it can be made functional now by storing a backbuffer until actionable
let s:command_backlog = ""
let s:repeat_command = ""
let s:preceeding_upper = v:false
func s:commandCallback(params, commandset_index, channel, msg)
let l:command_index = a:msg.result.command_index
let l:do_execute = v:false
let l:next_mode = a:commandset_index
let l:command = s:commandset_list[a:commandset_index][l:command_index]
call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
if l:command_index == 0
"exit
"if s:command_backlog == ""
call s:logCallback(0,"Stopping command mode")
echo "No longer listening"
let s:command_backlog = ""
return
"else
" Legacy code to clear an existing buffer with exit.
" Was found to be rarely desired and is better introduced as a
" standalone command (clear?)
" call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
" let s:command_backlog = ""
" let s:preceeding_upper = v:false
" endif
elseif l:command_index == 1
" upper
let s:preceeding_upper = !s:preceeding_upper
elseif l:command == "save"
" save and run can only happen in commandset 0,
exe "w"
elseif l:command == "run"
exe "make run"
elseif l:command == "space"
exe "normal i \<ESC>l"
elseif has_key(s:c_user, l:command)
let Userfunc = s:c_user[l:command]
if type(Userfunc) == v:t_string
let Userfunc = function(Userfunc)
endif
call Userfunc()
else
if s:preceeding_upper
" Upper should keep commandset
let s:preceeding_upper = v:false
let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
else
let l:visual_command = l:command
endif
echo s:command_backlog .. " - " .. l:visual_command
let s:command_backlog = s:command_backlog .. l:visual_command
if a:commandset_index == 2 || a:commandset_index == 3
" single key, either completes motion, replace, or register
" Should move to execute unless part of a register
" Change will be caught at execute
if s:command_backlog[-2:-2] !=# '"'
call s:logCallback(0,"not register")
let l:do_execute = v:true
end
let l:next_mode = 0
" commandset index only matters for a/i
elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
" inside/around. Is commandset 3
let l:next_mode = 3
elseif l:command ==# '"'
let l:next_mode = 2
elseif index(s:c_count, l:command) != -1
let l:next_mode = a:commandset_index
elseif index(s:c_motion, l:command) != -1
if l:command == 't' || l:command == 'f' || l:command == "'"
" prompt single key
let l:next_mode = 2
else
let l:do_execute = v:true
let l:next_mode = 0
endif
elseif index(s:c_command, l:command) != -1
if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
" need motion or repeated command
" Potential for bad state here if disparaging command keys are
" entered (i.e. yd), but vim can handle checks for this at exe
" And checking for cases like y123d would complicate things
let l:next_mode = 1
elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
"'Insert' mode, do general transcription
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
elseif l:command == 'r' || l:command == 'm'
let l:next_mode = 2
elseif l:command == '.'
let l:next_mode = 0
let l:do_execute = v:true
let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
else
if l:command ==? 'v'
let l:next_mode = 1
else
let l:next_mode = 0
endif
let l:do_execute = v:true
endif
else
throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
endif
endif
if l:do_execute
if mode() ==?'v' && l:next_mode == 0
let l:next_mode = 1
elseif match(s:command_backlog, 'c') != -1
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
endif
exe "normal" s:command_backlog
if index(s:c_motion + ["u"],l:command) == -1
exe "normal a\<C-G>u"
let s:repeat_command = s:command_backlog
call s:logCallback(0, s:command_backlog)
endif
let s:command_backlog = ""
endif
let l:req = {"method": "guided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.commandset_index = l:next_mode
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
endfunction
func s:loadedCallback(channel, msg)
echo "Loading complete"
call s:logCallback(a:channel, a:msg)
endfunction
func s:registerCommandset(commandlist, is_final)
let req = {"method": "registerCommandset"}
let req.params = a:commandlist
call s:logCallback(0, join(a:commandlist))
call add(g:whisper_commandlist_spoken, a:commandlist)
if a:is_final
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
else
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
endif
endfunction
func s:registerAllCommands()
let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
" Currently the same as visual.
" let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
let l:area = s:c_special_always + s:c_area
" Used only for compatibility with the testing script
let g:whisper_commandlist_spoken = []
let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
endfunction
func s:commandsetToSpoken(commandset, spoken_index)
let l:spoken_list = []
for l:command in a:commandset
if has_key(s:spoken_dict, l:command)
let l:spoken_value = s:spoken_dict[l:command]
if type(l:spoken_value) == v:t_dict
if has_key(l:spoken_value, a:spoken_index)
let l:spoken_value = l:spoken_value[a:spoken_index]
else
if a:spoken_index == 2
let l:spoken_value = l:command
else
let l:spoken_value = l:spoken_value[0]
endif
endif
else
if a:spoken_index == 2
let l:spoken_value = l:command
endif
endif
else
let l:spoken_value = l:command
endif
call add(l:spoken_list, l:spoken_value)
endfor
return l:spoken_list
endfunction
" TODO: Check lifetime. If the script is resourced, is the existing
" s:lsp_job dropped and therefore killed?
" This seems to not be the case and I've had to deal with zombie processes
" that survive exiting vim, even though said behavior conflicts with my
" understanding of the provided documentation
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
if !exists("g:lsp_job")
if exists("g:whisper_user_commands")
let s:c_user = g:whisper_user_commands
else
let s:c_user = {}
endif
let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
if job_status(g:lsp_job) == "fail"
echoerr "Failed to start whisper job"
endif
call s:registerAllCommands()
endif

View File

@@ -0,0 +1,6 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -0,0 +1,53 @@
# main
This is the main example demonstrating most of the functionality of the Whisper model.
It can be used as a reference for using the `whisper.cpp` library in other projects.
```
./main -h
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [5 ] number of best candidates to keep
-bs N, --beam-size N [5 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-olrc, --output-lrc [false ] output result in a lrc file
-owts, --output-words [false ] output script for generating karaoke video
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-oj, --output-json [false ] output result in a JSON file
-ojf, --output-json-full [false ] include more information in the JSON file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [false ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
-ls, --log-score [false ] log best decoder scores of tokens
-ng, --no-gpu [false ] disable GPU
```

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

View File

@@ -0,0 +1,3 @@
# quantize
Tool for integer quantization of Whisper `ggml` model files

View File

@@ -0,0 +1,223 @@
#include "ggml.h"
#include "common.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <regex>
// default hparams (Whisper tiny)
struct whisper_hparams {
int32_t n_vocab = 51864;
int32_t n_audio_ctx = 1500;
int32_t n_audio_state = 384;
int32_t n_audio_head = 6;
int32_t n_audio_layer = 4;
int32_t n_text_ctx = 448;
int32_t n_text_state = 384;
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
int32_t ftype = 1;
};
struct whisper_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
auto finp = std::ifstream(fname_inp, std::ios::binary);
if (!finp) {
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
return false;
}
auto fout = std::ofstream(fname_out, std::ios::binary);
if (!fout) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
return false;
}
// verify magic
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
fout.write((char *) &magic, sizeof(magic));
}
whisper_hparams hparams;
// load hparams
{
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
finp.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
finp.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
finp.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
finp.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((const char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((const char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
fout.write((const char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
fout.write((const char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
fout.write((const char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
fout.write((const char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((const char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((const char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((const char *) &ftype_dst, sizeof(hparams.ftype));
}
// load mel filters
{
whisper_filters filters;
finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
filters.data.resize(filters.n_mel * filters.n_fft);
finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
}
// load vocab
{
int32_t n_vocab = 0;
finp.read ((char *) &n_vocab, sizeof(n_vocab));
fout.write((char *) &n_vocab, sizeof(n_vocab));
//if (n_vocab != hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
// return false;
//}
char word[129];
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
fout.write((char *) &len, sizeof(len));
word[len] = '\0';
finp.read ((char *) word, len);
fout.write((char *) word, len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// regexes of tensor names to not be quantized
const std::vector<std::string> to_skip = {
//"encoder.*",
"encoder.conv1.bias",
"encoder.conv2.bias",
"encoder.positional_embedding",
"decoder.positional_embedding",
};
if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
finp.close();
fout.close();
return true;
}
int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
return 0;
}

View File

@@ -0,0 +1,12 @@
set(TARGET server)
add_executable(${TARGET} server.cpp httplib.h json.hpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
# Check if the compiler is MinGW
if(MINGW)
# Link the necessary libraries for SSL and Winsock
target_link_libraries(${TARGET} PRIVATE -lcrypt32 -lssl -lcrypto -lws2_32)
endif()

View File

@@ -0,0 +1,68 @@
# whisper.cpp http server
Simple http server. WAV Files are passed to the inference model via http requests.
https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-afe5e4594b8f
## Usage
```
./server -h
usage: ./bin/server [options]
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pr, --print-realtime [false ] print output in realtime
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [false ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
--host HOST, [127.0.0.1] Hostname/ip-adress for the server
--port PORT, [8080 ] Port number for the server
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
```
> [!WARNING]
> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
## request examples
**/inference**
```
curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@<file-path>" \
-F temperature="0.2" \
-F response-format="json"
```
**/load**
```
curl 127.0.0.1:8080/load \
-H "Content-Type: multipart/form-data" \
-F model="<path-to-model-file>"
```

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,811 @@
#include "common.h"
#include "whisper.h"
#include "httplib.h"
#include "json.hpp"
#include <cmath>
#include <fstream>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <cstring>
#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
using namespace httplib;
using json = nlohmann::json;
namespace {
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
// output formats
const std::string json_format = "json";
const std::string text_format = "text";
const std::string srt_format = "srt";
const std::string vjson_format = "verbose_json";
const std::string vtt_format = "vtt";
struct server_params
{
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool ffmpeg_converter = false;
};
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 2;
int32_t beam_size = -1;
float word_thold = 0.01f;
float entropy_thold = 2.40f;
float logprob_thold = -1.00f;
float userdef_temp = 0.20f;
bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
bool diarize = false;
bool tinydiarize = false;
bool split_on_word = false;
bool no_fallback = false;
bool print_special = false;
bool print_colors = false;
bool print_realtime = false;
bool print_progress = false;
bool no_timestamps = false;
bool use_gpu = true;
std::string language = "en";
std::string prompt = "";
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
std::string model = "models/ggml-base.en.bin";
std::string response_format = json_format;
// [TDRZ] speaker turn string
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
std::string openvino_encode_device = "CPU";
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
bool is_file_exist(const char *fileName)
{
std::ifstream infile(fileName);
return infile.good();
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
const server_params& sparams) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] \n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pr, --print-realtime [%-7s] print output in realtime\n", params.print_realtime ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
// server params
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
fprintf(stderr, "\n");
}
bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-pr" || arg == "--print-realtime") { params.print_realtime = true; }
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
// server params
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
}
return true;
}
struct whisper_print_user_data {
const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s;
int progress_prev;
};
void check_ffmpeg_availibility() {
int result = system("ffmpeg -version");
if (result == 0) {
std::cout << "ffmpeg is available." << std::endl;
} else {
// ffmpeg is not available
std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
std::cout << "and that its executable is included in your system's PATH. ";
exit(0);
}
}
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
std::ostringstream cmd_stream;
std::string converted_filename_temp = temp_filename + "_temp.wav";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
std::string cmd = cmd_stream.str();
int status = std::system(cmd.c_str());
if (status != 0) {
error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
return false;
}
// Remove the original file
if (remove(temp_filename.c_str()) != 0) {
error_resp = "{\"error\":\"Failed to remove the original file.\"}";
return false;
}
// Rename the temporary file to match the original filename
if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
return false;
}
return true;
}
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
double energy0 = 0.0f;
double energy1 = 0.0f;
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
if (energy0 > 1.1*energy1) {
speaker = "0";
} else if (energy1 > 1.1*energy0) {
speaker = "1";
} else {
speaker = "?";
}
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
if (!id_only) {
speaker.insert(0, "(speaker ");
speaker.append(")");
}
return speaker;
}
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
*progress_prev += progress_step;
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
}
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
const int n_segments = whisper_full_n_segments(ctx);
std::string speaker = "";
int64_t t0 = 0;
int64_t t1 = 0;
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
for (int i = s0; i < n_segments; i++) {
if (!params.no_timestamps || params.diarize) {
t0 = whisper_full_get_segment_t0(ctx, i);
t1 = whisper_full_get_segment_t1(ctx, i);
}
if (!params.no_timestamps) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
}
if (params.diarize && pcmf32s.size() == 2) {
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
}
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s%s", speaker.c_str(), text);
}
if (params.tinydiarize) {
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
printf("%s", params.tdrz_speaker_turn.c_str());
}
}
// with timestamps or speakers: each segment on new line
if (!params.no_timestamps || params.diarize) {
printf("\n");
}
fflush(stdout);
}
}
std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::stringstream result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2)
{
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
result << speaker << text << "\n";
}
return result.str();
}
void get_req_parameters(const Request & req, whisper_params & params)
{
// user model configu.has_fileion
if (req.has_file("offset-t"))
{
params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
}
if (req.has_file("offset-n"))
{
params.offset_n = std::stoi(req.get_file_value("offset-n").content);
}
if (req.has_file("duration"))
{
params.duration_ms = std::stoi(req.get_file_value("duration").content);
}
if (req.has_file("max-context"))
{
params.max_context = std::stoi(req.get_file_value("max-context").content);
}
if (req.has_file("prompt"))
{
params.prompt = req.get_file_value("prompt").content;
}
if (req.has_file("response-format"))
{
params.response_format = req.get_file_value("response-format").content;
}
if (req.has_file("temperature"))
{
params.userdef_temp = std::stof(req.get_file_value("temperature").content);
}
}
} // namespace
int main(int argc, char ** argv) {
whisper_params params;
server_params sparams;
std::mutex whisper_mutex;
if (whisper_params_parse(argc, argv, params, sparams) == false) {
whisper_print_usage(argc, argv, params, sparams);
return 1;
}
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
if (params.diarize && params.tinydiarize) {
fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
if (sparams.ffmpeg_converter) {
check_ffmpeg_availibility();
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 3;
}
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
Server svr;
svr.set_default_headers({{"Server", "whisper.cpp"},
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});
std::string const default_content = "<html>hello</html>";
// this is only called if no index.html is found in the public --path
svr.Get("/", [&default_content](const Request &, Response &res){
res.set_content(default_content, "text/html");
return false;
});
svr.Post("/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock
whisper_mutex.lock();
// first check user requested fields of the request
if (!req.has_file("file"))
{
fprintf(stderr, "error: no 'file' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
auto audio_file = req.get_file_value("file");
// check non-required fields
get_req_parameters(req, params);
std::string filename{audio_file.filename};
printf("Received request: %s\n", filename.c_str());
// audio arrays
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
// write to temporary file
const std::string temp_filename = "whisper_server_temp_file.wav";
std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content;
temp_file.close();
// if file is not wav, convert to wav
if (sparams.ffmpeg_converter) {
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
const bool is_converted = convert_to_wav(temp_filename, error_resp);
if (!is_converted) {
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
}
// read wav content into pcmf32
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
std::remove(temp_filename.c_str());
whisper_mutex.unlock();
return;
}
// remove temp file
std::remove(temp_filename.c_str());
printf("Successfully loaded %s\n", filename.c_str());
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
}
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
if (params.detect_language) {
params.language = "auto";
}
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
__func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, params.n_processors,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.tinydiarize ? "tdrz = 1, " : "",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// run the inference
{
printf("Running whisper.cpp inference on %s\n", filename.c_str());
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
wparams.print_realtime = false;
wparams.print_progress = params.print_progress;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special = params.print_special;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.detect_language = params.detect_language;
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
wparams.initial_prompt = params.prompt.c_str();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.temperature_inc = params.userdef_temp;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
// this callback is called on each new segment
if (params.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &user_data;
}
if (wparams.print_progress) {
wparams.progress_callback = whisper_print_progress_callback;
wparams.progress_callback_user_data = &user_data;
}
// examples for abort mechanism
// in examples below, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
wparams.encoder_begin_callback_user_data = &is_aborted;
}
// the callback is called before every computation - if it returns true, the computation is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.abort_callback = [](void * user_data) {
bool is_aborted = *(bool*)user_data;
return is_aborted;
};
wparams.abort_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
}
// return results to user
if (params.response_format == text_format)
{
std::string results = output_str(ctx, params, pcmf32s);
res.set_content(results.c_str(), "text/html");
}
else if (params.response_format == srt_format)
{
std::stringstream ss;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2)
{
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
ss << i + 1 + params.offset_n << "\n";
ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
ss << speaker << text << "\n\n";
}
res.set_content(ss.str(), "application/x-subrip");
} else if (params.response_format == vtt_format) {
std::stringstream ss;
ss << "WEBVTT\n\n";
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2)
{
speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
speaker.insert(0, "<v Speaker");
speaker.append(">");
}
ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
ss << speaker << text << "\n\n";
}
res.set_content(ss.str(), "text/vtt");
}
// TODO add more output formats
else
{
std::string results = output_str(ctx, params, pcmf32s);
json jres = json{
{"text", results}
};
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
"application/json");
}
// return whisper model mutex lock
whisper_mutex.unlock();
});
svr.Post("/load", [&](const Request &req, Response &res){
whisper_mutex.lock();
if (!req.has_file("model"))
{
fprintf(stderr, "error: no 'model' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
std::string model = req.get_file_value("model").content;
if (!is_file_exist(model.c_str()))
{
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
const std::string error_resp = "{\"error\":\"model not found!\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
// clean up
whisper_free(ctx);
// whisper init
ctx = whisper_init_from_file_with_params(model.c_str(), cparams);
// TODO perhaps load prior model here instead of exit
if (ctx == nullptr) {
fprintf(stderr, "error: model init failed, no model loaded must exit\n");
exit(1);
}
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
const std::string success = "Load was successful!";
res.set_content(success, "application/text");
// check if the model is in the file system
whisper_mutex.unlock();
});
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ];
try {
std::rethrow_exception(std::move(ep));
} catch (std::exception &e) {
snprintf(buf, sizeof(buf), fmt, e.what());
} catch (...) {
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
}
res.set_content(buf, "text/plain");
res.status = 500;
});
svr.set_error_handler([](const Request &, Response &res) {
if (res.status == 400) {
res.set_content("Invalid request", "text/plain");
} else if (res.status != 500) {
res.set_content("File Not Found", "text/plain");
res.status = 404;
}
});
// set timeouts and change hostname and port
svr.set_read_timeout(sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
if (!svr.bind_to_port(sparams.hostname, sparams.port))
{
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
sparams.hostname.c_str(), sparams.port);
return 1;
}
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable:
printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
if (!svr.listen_after_bind())
{
return 1;
}
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@@ -0,0 +1,49 @@
#
# libstream
#
set(TARGET libstream)
add_executable(${TARGET}
emscripten.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside stream.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libstream.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# stream.wasm
#
set(TARGET stream.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@@ -0,0 +1,20 @@
# stream.wasm
Real-time transcription in the browser using WebAssembly
Online demo: https://whisper.ggerganov.com/stream/
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/stream.wasm/* /path/to/html/
cp bin/libstream.worker.js /path/to/html/
```

View File

@@ -0,0 +1,216 @@
#include "ggml.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
constexpr int N_THREAD = 8;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
std::string g_status = "";
std::string g_status_forced = "";
std::string g_transcribed = "";
std::vector<float> g_pcmf32;
void stream_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
void stream_main(size_t index) {
stream_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
// disable temperature fallback
wparams.temperature_inc = -1.0f;
wparams.language = "en";
printf("stream: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;
// whisper context
auto & ctx = g_contexts[index];
// 5 seconds interval
const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
while (g_running) {
stream_set_status("waiting for audio ...");
{
std::unique_lock<std::mutex> lock(g_mutex);
if (g_pcmf32.size() < 1024) {
lock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
g_pcmf32.clear();
}
{
const auto t_start = std::chrono::high_resolution_clock::now();
stream_set_status("running whisper ...");
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
if (ret != 0) {
printf("whisper_full() failed: %d\n", ret);
break;
}
const auto t_end = std::chrono::high_resolution_clock::now();
printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
}
{
std::string text_heard;
{
const int n_segments = whisper_full_n_segments(ctx);
for (int i = n_segments - 1; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf("transcribed: %s\n", text);
text_heard += text;
}
}
{
std::lock_guard<std::mutex> lock(g_mutex);
g_transcribed = text_heard;
}
}
}
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(stream) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
stream_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("get_transcribed", emscripten::optional_override([]() {
std::string transcribed;
{
std::lock_guard<std::mutex> lock(g_mutex);
transcribed = std::move(g_transcribed);
}
return transcribed;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
}

View File

@@ -0,0 +1,413 @@
<!doctype html>
<html lang="en-us">
<head>
<title>stream : Real-time Whisper transcription in WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>stream : Real-time Whisper transcription in WebAssembly</b>
<br><br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
<br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr>
Select the model you would like to use, click the "Start" button and start speaking
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the stream instance
var instance = null;
// model name
var model_whisper = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
if (model_whisper != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = true;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 5000; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// main
//
var nLines = 0;
var intervalUpdate = null;
var transcribedAll = '';
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var transcribed = Module.get_transcribed();
if (transcribed != null && transcribed.length > 1) {
transcribedAll += transcribed + '<br>';
nLines++;
// if more than 10 lines, remove the first line
if (nLines > 10) {
var i = transcribedAll.indexOf('<br>');
if (i > 0) {
transcribedAll = transcribedAll.substring(i + 4);
nLines--;
}
}
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-transcribed').innerHTML = transcribedAll;
}, 100);
}
function onStop() {
stopRecording();
}
</script>
<script type="text/javascript" src="stream.js"></script>
</body>
</html>

View File

@@ -0,0 +1,9 @@
if (WHISPER_SDL2)
# stream
set(TARGET stream)
add_executable(${TARGET} stream.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

View File

@@ -0,0 +1,58 @@
# stream
This is a naive example of performing real-time inference on audio from your microphone.
The `stream` tool samples the audio every half a second and runs the transcription continously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```java
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
## Sliding window mode with VAD
Setting the `--step` argument to `0` enables the sliding window mode:
```java
./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```
In this mode, the tool will transcribe only after some speech activity is detected. A very
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
a transcription block that is suitable for parsing.
## Building
The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
make stream
```
Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
`examples/steam` means your compiler cannot find them and it gives an error it cannot find the file.
```bash
whisper.cpp/examples/stream$ make stream
g++ stream.cpp -o stream
stream.cpp:6:10: fatal error: common/sdl.h: No such file or directory
6 | #include "common/sdl.h"
| ^~~~~~~~~~~~~~
compilation terminated.
make: *** [<builtin>: stream] Error 1
```
## Web version
This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)

View File

@@ -0,0 +1,433 @@
// Real-time speech recognition of input from a microphone
//
// A very quick-n-dirty implementation serving mainly as a proof of concept.
//
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <fstream>
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t step_ms = 3000;
int32_t length_ms = 10000;
int32_t keep_ms = 200;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool no_fallback = false;
bool print_special = false;
bool no_context = true;
bool no_timestamps = false;
bool tinydiarize = false;
bool save_audio = false; // save audio to wav file
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
params.max_tokens = 0;
// init audio
audio_async audio(params.length_ms);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// whisper init
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
std::vector<whisper_token> prompt_tokens;
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
n_samples_step,
float(n_samples_step)/WHISPER_SAMPLE_RATE,
float(n_samples_len )/WHISPER_SAMPLE_RATE,
float(n_samples_keep)/WHISPER_SAMPLE_RATE,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
} else {
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
}
fprintf(stderr, "\n");
}
int n_iter = 0;
bool is_running = true;
std::ofstream fout;
if (params.fname_out.length() > 0) {
fout.open(params.fname_out);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
return 1;
}
}
wav_writer wavWriter;
// save wav file
if (params.save_audio) {
// Get current date/time for filename
time_t now = time(0);
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y%m%d%H%M%S", localtime(&now));
std::string filename = std::string(buffer) + ".wav";
wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
}
printf("[Start speaking]\n");
fflush(stdout);
auto t_last = std::chrono::high_resolution_clock::now();
const auto t_start = t_last;
// main audio loop
while (is_running) {
if (params.save_audio) {
wavWriter.write(pcmf32_new.data(), pcmf32_new.size());
}
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// process new audio
if (!use_vad) {
while (true) {
audio.get(params.step_ms, pcmf32_new);
if ((int) pcmf32_new.size() > 2*n_samples_step) {
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
audio.clear();
continue;
}
if ((int) pcmf32_new.size() >= n_samples_step) {
audio.clear();
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
const int n_samples_new = pcmf32_new.size();
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
pcmf32.resize(n_samples_new + n_samples_take);
for (int i = 0; i < n_samples_take; i++) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
}
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
pcmf32_old = pcmf32;
} else {
const auto t_now = std::chrono::high_resolution_clock::now();
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
if (t_diff < 2000) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
audio.get(2000, pcmf32_new);
if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
audio.get(params.length_ms, pcmf32);
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
t_last = t_now;
}
// run the inference
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.single_segment = !use_vad;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
// disable temperature fallback
//wparams.temperature_inc = -1.0f;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
}
// print result;
{
if (!use_vad) {
printf("\33[2K\r");
// print long empty line to clear the previous line
printf("%s", std::string(100, ' ').c_str());
printf("\33[2K\r");
} else {
const int64_t t1 = (t_last - t_start).count()/1000000;
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
printf("\n");
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
printf("\n");
}
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) {
printf("%s", text);
fflush(stdout);
if (params.fname_out.length() > 0) {
fout << text;
}
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text;
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
output += " [SPEAKER_TURN]";
}
output += "\n";
printf("%s", output.c_str());
fflush(stdout);
if (params.fname_out.length() > 0) {
fout << output;
}
}
}
if (params.fname_out.length() > 0) {
fout << std::endl;
}
if (use_vad) {
printf("\n");
printf("### Transcription %d END\n", n_iter);
}
}
++n_iter;
if (!use_vad && (n_iter % n_new_line) == 0) {
printf("\n");
// keep part of the audio for next iteration to try to mitigate word boundary issues
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
// Add tokens of the last full length segment as the prompt
if (!params.no_context) {
prompt_tokens.clear();
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const int token_count = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < token_count; ++j) {
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
}
}
}
}
fflush(stdout);
}
}
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@@ -0,0 +1 @@
audio.mp3

View File

@@ -0,0 +1,30 @@
if (WHISPER_SDL2)
# talk-llama
set(TARGET talk-llama)
#add_executable(${TARGET} talk-llama.cpp llama.cpp)
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
#target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET}
talk-llama.cpp
llama.cpp
../common.cpp
../common-sdl.cpp
../../ggml.c
../../ggml-alloc.c
../../ggml-backend.c
../../ggml-quants.c
../../whisper.cpp)
if(WIN32)
# It requires Windows 8.1 or later for PrefetchVirtualMemory
target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
endif()
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
include(DefaultTargetOptions)
endif ()

View File

@@ -0,0 +1,56 @@
# talk-llama
Talk with an LLaMA AI in your terminal
*Latest perf as of 2 Nov 2023 using Whisper Medium + LLaMA v2 13B Q8_0 on M2 Ultra:*
https://github.com/ggerganov/whisper.cpp/assets/1991296/d97a3788-bf2a-4756-9a43-60c6b391649e
*Previous demo running on CPUs*
[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)
## Building
The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
# Build the "talk-llama" executable
make talk-llama
# Run it
./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
```
- The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model
## Session
The `talk-llama` tool supports session management to enable more coherent and continuous conversations. By maintaining context from previous interactions, it can better understand and respond to user requests in a more natural way.
To enable session support, use the `--session FILE` command line option when running the program. The `talk-llama` model state will be saved to the specified file after each interaction. If the file does not exist, it will be created. If the file exists, the model state will be loaded from it, allowing you to resume a previous session.
This feature is especially helpful for maintaining context in long conversations or when interacting with the AI assistant across multiple sessions. It ensures that the assistant remembers the previous interactions and can provide more relevant and contextual responses.
Example usage:
```bash
./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
```
## TTS
For best experience, this example needs a TTS tool to convert the generated text responses to voice.
You can use any TTS engine that you would like - simply edit the [speak](speak) script to your needs.
By default, it is configured to use MacOS's `say` or Windows SpeechSynthesizer, but you can use whatever you wish.
## Discussion
If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1

View File

@@ -0,0 +1,20 @@
import sys
import importlib.util
if importlib.util.find_spec("elevenlabs") is None:
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
sys.exit()
from elevenlabs import generate, play, save
# Get a Voice object, by name or UUID
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
# Generate the TTS
audio = generate(
text=str(sys.argv[2:]),
voice=voice
)
# Save the TTS to a file
save(audio, "audio.mp3")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,773 @@
#ifndef LLAMA_H
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#ifdef __GNUC__
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define DEPRECATED(func, hint) func
#endif
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 2
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_model;
struct llama_context;
typedef int32_t llama_pos;
typedef int32_t llama_token;
typedef int32_t llama_seq_id;
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
};
enum llama_token_type {
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
LLAMA_TOKEN_TYPE_CONTROL = 3,
LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
LLAMA_TOKEN_TYPE_UNUSED = 5,
LLAMA_TOKEN_TYPE_BYTE = 6,
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
enum llama_rope_scaling_type {
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
LLAMA_ROPE_SCALING_NONE = 0,
LLAMA_ROPE_SCALING_LINEAR = 1,
LLAMA_ROPE_SCALING_YARN = 2,
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
};
typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
llama_token_data * data;
size_t size;
bool sorted;
} llama_token_data_array;
typedef void (*llama_progress_callback)(float progress, void *ctx);
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
//
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output
//
typedef struct llama_batch {
int32_t n_tokens;
llama_token * token;
float * embd;
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits;
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
//
// pos[i] = all_pos_0 + i*all_pos_1
//
llama_pos all_pos_0; // used if pos == NULL
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;
struct llama_model_params {
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
};
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
float yarn_attn_factor; // YaRN magnitude scaling factor
float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size
// Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only
};
// model quantization parameters
typedef struct llama_model_quantize_params {
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
} llama_model_quantize_params;
// grammar types
struct llama_grammar;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
double t_start_ms;
double t_end_ms;
double t_load_ms;
double t_sample_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_sample;
int32_t n_p_eval;
int32_t n_eval;
};
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params);
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
LLAMA_API int64_t llama_time_us(void);
LLAMA_API int llama_max_devices (void);
LLAMA_API bool llama_mmap_supported (void);
LLAMA_API bool llama_mlock_supported(void);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
LLAMA_API int llama_n_vocab (const struct llama_model * model);
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int llama_n_embd (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
// Get a string describing the model type
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
float scale,
const char * path_base_model,
int n_threads),
"use llama_model_apply_lora_from_file instead");
LLAMA_API int llama_model_apply_lora_from_file(
const struct llama_model * model,
const char * path_lora,
float scale,
const char * path_base_model,
int n_threads);
//
// KV cache
//
// Returns the number of tokens in the KV cache
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
// Clear the KV cache
LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1);
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1);
// Removes all tokens that do not belong to the specified sequence
LLAMA_API void llama_kv_cache_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id);
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_shift(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta);
//
// State / sessions
//
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(
struct llama_context * ctx,
uint8_t * dst);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(
struct llama_context * ctx,
uint8_t * src);
// Save/load session file
LLAMA_API bool llama_load_session_file(
struct llama_context * ctx,
const char * path_session,
llama_token * tokens_out,
size_t n_token_capacity,
size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(
struct llama_context * ctx,
const char * path_session,
const llama_token * tokens,
size_t n_token_count);
//
// Decoding
//
// Run the llama inference to obtain the logits and probabilities for the next token(s).
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
// DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval(
struct llama_context * ctx,
llama_token * tokens,
int32_t n_tokens,
int n_past),
"use llama_decode() instead");
// Same as llama_eval, but use float matrix input directly.
// DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval_embd(
struct llama_context * ctx,
float * embd,
int32_t n_tokens,
int n_past),
"use llama_decode() instead");
// Return batch for single sequence of tokens starting at pos_0
//
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
//
LLAMA_API struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id);
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
// Each token can be assigned up to n_seq_max sequence ids
// The batch has to be freed with llama_batch_free()
// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
// The rest of the llama_batch members are allocated with size n_tokens
// All members are left uninitialized
LLAMA_API struct llama_batch llama_batch_init(
int32_t n_tokens,
int32_t embd,
int32_t n_seq_max);
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
// Positive return values does not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
LLAMA_API int llama_decode(
struct llama_context * ctx,
struct llama_batch batch);
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Logits for which llama_batch.logits[i] == 0 are undefined
// Rows: n_tokens provided with llama_batch
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Logits for the ith token. Equivalent to:
// llama_get_logits(ctx) + i*n_vocab
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
//
// Vocab
//
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
// codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
//
// Tokenization
//
/// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_max_tokens
/// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
/// Does not insert a leading space.
LLAMA_API int llama_tokenize(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos,
bool special);
// Token Id -> Piece.
// Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer.
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
LLAMA_API int llama_token_to_piece(
const struct llama_model * model,
llama_token token,
char * buf,
int length);
//
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
//
// Sampling functions
//
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(
struct llama_context * ctx,
llama_token_data_array * candidates,
int k,
size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API void llama_sample_min_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,
llama_token_data_array * candidates,
float z,
size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp);
LLAMA_API DEPRECATED(void llama_sample_temperature(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp),
"use llama_sample_temp instead");
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
int m,
float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
float * mu);
/// @details Selects the token with the highest probability.
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
LLAMA_API llama_token llama_sample_token_greedy(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);
//
// Beam search
//
struct llama_beam_view {
const llama_token * tokens;
size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam.
};
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
LLAMA_API void llama_beam_search(
struct llama_context * ctx,
llama_beam_search_callback_fn_t callback,
void * callback_data,
size_t n_beams,
int n_past,
int n_predict);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
}
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <string>
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

View File

@@ -0,0 +1,23 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
### Response:
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4}

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Usage:
# speak.sh <voice_id> <text-to-speak>
# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
say "$2"
# Eleven Labs
# To use it, install the elevenlabs module from pip (pip install elevenlabs)
# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
#Keep the line commented to use the free version whitout api key
#
#export ELEVEN_API_KEY=your_api_key
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

View File

@@ -0,0 +1 @@
@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2

View File

@@ -0,0 +1,12 @@
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
param(
# voice options are David or Zira
[Parameter(Mandatory=$true)][string]$voice,
[Parameter(Mandatory=$true)][string]$text
)
Add-Type -AssemblyName System.Speech;
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
$speak.SelectVoice("Microsoft $voice Desktop");
$speak.Rate="0";
$speak.Speak($text);

View File

@@ -0,0 +1,709 @@
// Talk with AI
//
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "llama.h"
#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
auto * model = llama_get_model(ctx);
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
int32_t n_gpu_layers = 999;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool verbose_prompt = false;
bool use_gpu = true;
std::string person = "Georgi";
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk-llama/speak";
std::string prompt = "";
std::string fname_out;
std::string path_session = ""; // path to file for saving/loading model eval state
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "--session") { params.path_session = argv[++i];}
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
}
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string prompt_text,
float & prob,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
std::vector<whisper_token> prompt_tokens;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size()));
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
// llama init
llama_backend_init(true);
auto lmparams = llama_model_default_params();
if (!params.use_gpu) {
lmparams.n_gpu_layers = 0;
} else {
lmparams.n_gpu_layers = params.n_gpu_layers;
}
struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
llama_context_params lcparams = llama_context_default_params();
// tune these to your liking
lcparams.n_ctx = 2048;
lcparams.seed = 1;
lcparams.f16_kv = true;
lcparams.n_threads = params.n_threads;
struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx_wsp)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
bool is_running = true;
bool force_speak = false;
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string bot_name = "LLaMA";
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);
// construct the initial prompt for LLaMA inference
std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
// need to have leading ' '
prompt_llama.insert(0, 1, ' ');
prompt_llama = ::replace(prompt_llama, "{0}", params.person);
prompt_llama = ::replace(prompt_llama, "{1}", bot_name);
{
// get time string
std::string time_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%H:%M", now);
time_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{2}", time_str);
}
{
// get year string
std::string year_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%Y", now);
year_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{3}", year_str);
}
prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
// init session
std::string path_session = params.path_session;
std::vector<llama_token> session_tokens;
auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());
// fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb");
if (fp != NULL) {
std::fclose(fp);
session_tokens.resize(llama_n_ctx(ctx_llama));
size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
for (size_t i = 0; i < session_tokens.size(); i++) {
embd_inp[i] = session_tokens[i];
}
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
}
}
// evaluate the initial prompt
printf("\n");
printf("%s : initializing - please wait ...\n", __func__);
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
if (params.verbose_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_llama.c_str());
fflush(stdout);
}
// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (session_tokens.size()) {
for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break;
}
n_matching_session_tokens++;
}
if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size());
} else {
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
}
}
// HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
// if we loaded a session with at least 75% similarity. It's currently just used to speed up the
// initial prompt so it doesn't need to be an exact match.
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", params.person.c_str(), chat_symb.c_str());
fflush(stdout);
// clear audio buffer
audio.clear();
// text inference variables
const int voice_id = 2;
const int n_keep = embd_inp.size();
const int n_ctx = llama_n_ctx(ctx_llama);
int n_past = n_keep;
int n_prev = 64; // TODO arg
int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;
std::vector<llama_token> embd;
// reverse prompts for detecting when it's time to stop speaking
std::vector<std::string> antiprompts = {
params.person + chat_symb,
};
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
audio.get(params.voice_ms, pcmf32_cur);
std::string text_heard;
if (!force_speak) {
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
}
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
if (text_heard.empty() || tokens.empty() || force_speak) {
//fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
audio.clear();
continue;
}
force_speak = false;
text_heard.insert(0, 1, ' ');
text_heard += "\n" + bot_name + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
embd = ::llama_tokenize(ctx_llama, text_heard, false);
// Append the new input tokens to the session_tokens vector
if (!path_session.empty()) {
session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
}
// text inference
bool done = false;
std::string text_to_speak;
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
// stop saving session if we run out of context
path_session = "";
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
// REVIEW
if (n_session_consumed < (int) session_tokens.size()) {
size_t i = 0;
for ( ; i < embd.size(); i++) {
if (embd[i] != session_tokens[n_session_consumed]) {
session_tokens.resize(n_session_consumed);
break;
}
n_past++;
n_session_consumed++;
if (n_session_consumed >= (int) session_tokens.size()) {
i++;
break;
}
}
if (i > 0) {
embd.erase(embd.begin(), embd.begin() + i);
}
}
if (embd.size() > 0 && !path_session.empty()) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size();
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
if (!path_session.empty() && need_to_save_session) {
need_to_save_session = false;
llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(model_llama);
logits[llama_token_eos(model_llama)] = 0;
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl(model_llama)];
llama_sample_repetition_penalties(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, repeat_penalty, 0.0, 0.0f);
logits[llama_token_nl(model_llama)] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp (ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
if (id != llama_token_eos(model_llama)) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_piece(ctx_llama, id);
printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_piece(ctx_llama, embd[0]);
for (std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
need_to_save_session = true;
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
if (ret != 0) {
fprintf(stderr, "%s: failed to speak\n", __func__);
}
audio.clear();
}
}
}
audio.pause();
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
llama_print_timings(ctx_llama);
llama_free(ctx_llama);
return 0;
}

View File

@@ -0,0 +1,462 @@
#pragma once
#include <cassert>
#include <stdexcept>
#include <vector>
#include <unordered_map>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
};
static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
};
static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
};
static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
};
static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
};
static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
};
static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
};
static std::string codepoint_to_utf8(uint32_t cp) {
std::string result;
if (/* 0x00 <= cp && */ cp <= 0x7f) {
result.push_back(cp);
}
else if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
}
else if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
}
else if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
}
else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
result.append(codepoint_to_utf8(cps[i]));
}
return result;
}
static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0];
offset += 1;
return result;
}
else if (!(utf8[offset + 0] & 0x40)) {
throw std::invalid_argument("invalid character");
}
else if (!(utf8[offset + 0] & 0x20)) {
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
offset += 2;
return result;
}
else if (!(utf8[offset + 0] & 0x10)) {
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
offset += 3;
return result;
}
else if (!(utf8[offset + 0] & 0x08)) {
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
offset += 4;
return result;
}
throw std::invalid_argument("invalid string");
}
static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(codepoint_from_utf8(utf8, offset));
}
return result;
}
static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
std::vector<uint16_t> result;
if (/* 0x0000 <= cp && */ cp <= 0xffff) {
result.emplace_back(cp);
}
else if (0x10000 <= cp && cp <= 0x10ffff) {
result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
}
else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
std::vector<uint16_t> result;
for (size_t i = 0; i < cps.size(); ++i) {
auto temp = codepoint_to_utf16(cps[i]);
result.insert(result.end(), temp.begin(), temp.end());
}
return result;
}
static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
assert(offset < utf16.size());
if (((utf16[0] >> 10) << 10) != 0xd800) {
auto result = utf16[offset + 0];
offset += 1;
return result;
}
else {
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
throw std::invalid_argument("invalid character");
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
offset += 2;
return result;
}
throw std::invalid_argument("invalid string");
}
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result;
size_t offset = 0;
while (offset < utf16.size())
result.push_back(codepoint_from_utf16(utf16, offset));
return result;
}
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_DIGIT 1
#define CODEPOINT_TYPE_LETTER 2
#define CODEPOINT_TYPE_WHITESPACE 3
#define CODEPOINT_TYPE_ACCENT_MARK 4
#define CODEPOINT_TYPE_PUNCTUATION 5
#define CODEPOINT_TYPE_SYMBOL 6
#define CODEPOINT_TYPE_CONTROL 7
static std::unordered_map<uint32_t, int> codepoint_type_map() {
std::unordered_map<uint32_t, int> codepoint_types;
for (auto p : digit_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
}
for(auto p : letter_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
}
for(auto p : whitespace_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
}
for(auto p : accent_mark_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
for(auto p : punctuation_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
for (auto p : symbol_ranges) {
for (auto i = p.first; i <= p.second; ++i)
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
}
for(auto p : control_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
}
return codepoint_types;
}
static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types[cp];
}
static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0)
return CODEPOINT_TYPE_UNIDENTIFIED;
size_t offset = 0;
return codepoint_type(codepoint_from_utf8(utf8, offset));
}
static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
std::unordered_map<uint8_t, std::string> map;
for (int ch = u'!'; ch <= u'~'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(ch) == map.end()) {
map[ch] = codepoint_to_utf8(256 + n);
++n;
}
}
return map;
}
static std::string bytes_to_unicode_bpe(uint8_t byte) {
static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
return map.at(byte);
}
static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
std::unordered_map<std::string, uint8_t> map;
for (int ch = u'!'; ch <= u'~'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(codepoint_to_utf8(ch)) == map.end()) {
map[codepoint_to_utf8(256 + n)] = ch;
++n;
}
}
return map;
}
static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
return map.at(utf8);
}

View File

@@ -0,0 +1,51 @@
#
# libtalk
#
set(TARGET libtalk)
add_executable(${TARGET}
emscripten.cpp
gpt-2.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE
whisper
common
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside talk.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libtalk.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1800MB \
-s TOTAL_MEMORY=1800MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
#
# talk.wasm
#
set(TARGET talk.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)

View File

@@ -0,0 +1,74 @@
# talk.wasm
Talk with an Artificial Intelligence in your browser:
[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)
Online demo: https://whisper.ggerganov.com/talk/
Terminal version: [examples/talk](/examples/talk)
## How it works?
This demo leverages 2 modern neural network models to create a high-quality voice chat directly in your browser:
- [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying
- Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model
- The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
The web page does the processing locally on your machine. The processing of these heavy neural network models in the
browser is possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for
extra performance:
- The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
- The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
- Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
- The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
- The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)
In order to run the models, the web page first needs to download the model data which is about ~350 MB. The model data
is then cached in your browser's cache and can be reused in future visits without downloading it again.
## Requirements
In order to run this demo efficiently, you need to have the following:
- Latest Chrome or Firefox browser (Safari is not supported)
- Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
- Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
- The web-page uses about 1.8GB of RAM
Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
Also, the prompting strategy can likely be improved to achieve better results.
The demo is quite computationally heavy, so you need a fast CPU. It's not usual to run these transformer models in a
browser. Typically, they run on powerful GPUs.
Currently, mobile browsers do not support the Fixed-width SIMD WebAssembly capability, so you cannot run this demo
on a phone or a tablet. Hopefully, in the near future this will become supported.
## Todo
- Better UI (contributions are welcome)
- Better GPT-2 prompting
## Build instructions
```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/talk.wasm/* /path/to/html/
cp bin/libtalk.worker.js /path/to/html/
```
## Feedback
If you have any comments or ideas for improvement, please drop a comment in the following discussion:
https://github.com/ggerganov/whisper.cpp/discussions/167

View File

@@ -0,0 +1,380 @@
#include "ggml.h"
#include "gpt-2.h"
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <atomic>
#include <cmath>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
constexpr int N_THREAD = 8;
struct gpt2_context * g_gpt2;
std::vector<struct whisper_context *> g_contexts(4, nullptr);
std::mutex g_mutex;
std::thread g_worker;
std::atomic<bool> g_running(false);
bool g_force_speak = false;
std::string g_text_to_speak = "";
std::string g_status = "";
std::string g_status_forced = "";
std::vector<float> g_pcmf32;
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
void talk_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;
}
void talk_main(size_t index) {
talk_set_status("loading data ...");
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.language = "en";
g_gpt2 = gpt2_init("gpt-2.bin");
printf("talk: using %d threads\n", wparams.n_threads);
std::vector<float> pcmf32;
// whisper context
auto & ctx = g_contexts[index];
const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
auto t_last = std::chrono::high_resolution_clock::now();
talk_set_status("listening ...");
while (g_running) {
const auto t_now = std::chrono::high_resolution_clock::now();
if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_pcmf32.clear();
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
talk_set_status("listening ...");
{
std::unique_lock<std::mutex> lock(g_mutex);
if (g_pcmf32.size() < step_samples) {
lock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
}
// VAD: if energy in during last second is above threshold, then skip
{
float energy_all = 0.0f;
float energy_1s = 0.0f;
for (size_t i = 0; i < pcmf32.size(); i++) {
energy_all += fabsf(pcmf32[i]);
if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) {
energy_1s += fabsf(pcmf32[i]);
}
}
energy_all /= pcmf32.size();
energy_1s /= WHISPER_SAMPLE_RATE;
if (energy_1s > 0.1f*energy_all && !g_force_speak) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
}
talk_set_status("processing audio (whisper)...");
t_last = t_now;
if (!g_force_speak) {
const auto t_start = std::chrono::high_resolution_clock::now();
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
if (ret != 0) {
printf("whisper_full() failed: %d\n", ret);
break;
}
const auto t_end = std::chrono::high_resolution_clock::now();
printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
}
{
std::string text_heard;
if (!g_force_speak) {
const int n_segments = whisper_full_n_segments(ctx);
for (int i = n_segments - 1; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
text_heard += text;
}
}
g_force_speak = false;
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
std::string text_to_speak;
std::string prompt_base;
{
std::lock_guard<std::mutex> lock(g_mutex);
prompt_base = gpt2_get_prompt(g_gpt2);
}
if (tokens.size() > 0) {
text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
// remove first 2 lines of base prompt
{
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
{
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
prompt_base += text_heard + "\n" + text_to_speak + "\n";
} else {
text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
std::lock_guard<std::mutex> lock(g_mutex);
const size_t pos = prompt_base.find_first_of("\n");
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
prompt_base += text_to_speak + "\n";
}
printf("gpt-2: %s\n", text_to_speak.c_str());
//printf("========================\n");
//printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
//printf("========================\n");
{
std::lock_guard<std::mutex> lock(g_mutex);
t_last = std::chrono::high_resolution_clock::now();
g_text_to_speak = text_to_speak;
g_pcmf32.clear();
gpt2_set_prompt(g_gpt2, prompt_base.c_str());
}
talk_set_status("speaking ...");
}
}
gpt2_free(g_gpt2);
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}
EMSCRIPTEN_BINDINGS(talk) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {
g_worker.join();
}
g_worker = std::thread([i]() {
talk_main(i);
});
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
if (g_running) {
g_running = false;
}
}));
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
{
std::lock_guard<std::mutex> lock(g_mutex);
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
g_pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
memoryView.call<void>("set", audio);
}
return 0;
}));
emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_force_speak = true;
}
}));
emscripten::function("get_text_context", emscripten::optional_override([]() {
std::string text_context;
{
std::lock_guard<std::mutex> lock(g_mutex);
text_context = gpt2_get_prompt(g_gpt2);
}
return text_context;
}));
emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
std::string text_to_speak;
{
std::lock_guard<std::mutex> lock(g_mutex);
text_to_speak = std::move(g_text_to_speak);
}
return text_to_speak;
}));
emscripten::function("get_status", emscripten::optional_override([]() {
std::string status;
{
std::lock_guard<std::mutex> lock(g_mutex);
status = g_status_forced.empty() ? g_status : g_status_forced;
}
return status;
}));
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
{
std::lock_guard<std::mutex> lock(g_mutex);
g_status_forced = status;
}
}));
emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
{
std::lock_guard<std::mutex> lock(g_mutex);
gpt2_set_prompt(g_gpt2, prompt.c_str());
}
}));
}

View File

@@ -0,0 +1,809 @@
#include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <thread>
#include <vector>
#include <regex>
#include <random>
/////////////////////// GPT-2 BEGIN /////////////////////////
// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
int32_t n_ctx = 1024;
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t ftype = 1;
};
struct gpt2_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
struct ggml_tensor * ln_2_g;
struct ggml_tensor * ln_2_b;
// attention
struct ggml_tensor * c_attn_attn_w;
struct ggml_tensor * c_attn_attn_b;
struct ggml_tensor * c_attn_proj_w;
struct ggml_tensor * c_attn_proj_b;
// mlp
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};
struct gpt2_model {
gpt2_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;
struct ggml_tensor * wte; // position embedding
struct ggml_tensor * wpe; // token embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;
//
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
// load hparams
{
auto & hparams = model.hparams;
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
}
// load vocab
{
int32_t n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
if (n_vocab != model.hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
return false;
}
std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name
model.tensors["model/ln_f/g"] = model.ln_f_g;
model.tensors["model/ln_f/b"] = model.ln_f_b;
model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
}
// key + value memory
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
// load weights
{
size_t total_size = 0;
bool has_lm_head = false;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false;
}
// for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
fin.close();
return true;
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
return false;
}
}
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
for (int i = 0; i < N; ++i) {
((int32_t *) position->data)[i] = n_past + i;
}
// wte + wpe
struct ggml_tensor * inpL =
ggml_add(ctx0,
ggml_get_rows(ctx0, model.wte, embd),
ggml_get_rows(ctx0, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL, 1e-5f);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
}
// attn
// [2304, 768] - model.layers[il].c_attn_attn_w
// [2304, 1] - model.layers[il].c_attn_attn_b
// [ 768, N] - cur (in)
// [2304, N] - cur (out)
//
// cur = attn_w*cur + attn_b
// [2304, N]
{
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_attn_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
cur);
}
// self-attention
{
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// GG: flash attention
//struct ggml_tensor * V =
// ggml_cpy(ctx0,
// ggml_permute(ctx0,
// ggml_reshape_3d(ctx0,
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
// n_embd/n_head, n_head, n_past + N),
// 1, 2, 0, 3),
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
// K * Q
// [n_past + N, N, 12]
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
}
// projection
// [ 768, 768] - model.layers[il].c_attn_proj_w
// [ 768, 1] - model.layers[il].c_attn_proj_b
// [ 768, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
{
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
cur);
}
// add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// feed-forward network
{
// norm
{
cur = ggml_norm(ctx0, inpFF, 1e-5f);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
}
// fully connected
// [3072, 768] - model.layers[il].c_mlp_fc_w
// [3072, 1] - model.layers[il].c_mlp_fc_b
// [ 768, N] - cur (in)
// [3072, N] - cur (out)
//
// cur = fc_w*cur + fc_b
// [3072, N]
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_fc_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
// [3072, N]
cur = ggml_gelu(ctx0, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
// [ 768, 1] - model.layers[il].c_mlp_proj_b
// [3072, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
}
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result just for the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);
return true;
}
/////////////////////////////// GPT-2 END ////////////////////////////////
constexpr int N_THREAD = 8;
struct gpt2_context {
std::string prompt_base = R"(Hello, how are you?
I'm fine, thanks. How are you?
Thanks, I'm fine too. What are you doing?
I'm just sitting here.
It's a lovely day, isn't it?
Yes, it is. I love the weather this time of year.
I wish it would rain a little bit.
Me too.
)";
std::mt19937 rng;
gpt_vocab vocab;
gpt2_model model;
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// sampling parameters
int32_t top_k = 5;
float top_p = 0.9f;
float temp = 1.0f;
};
struct gpt2_context * gpt2_init(const char * path_model) {
gpt2_context * ctx = new gpt2_context;
ctx->rng = std::mt19937(time(nullptr));
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
delete ctx;
return nullptr;
}
const int64_t t_load_us = ggml_time_us() - t_start_us;
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}
return ctx;
}
void gpt2_free(struct gpt2_context * ctx) {
delete ctx;
}
const char * gpt2_get_prompt(struct gpt2_context * ctx) {
return ctx->prompt_base.c_str();
}
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
ctx->prompt_base = prompt;
}
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
return ::gpt_tokenize(ctx->vocab, text);
}
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
int n_past = 0;
std::vector<float> embd_w;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
std::vector<gpt_vocab::id> embd = embd_inp;
size_t mem_per_token = 3000000;
std::string result;
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
// predict
if (!embd.empty()) {
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("gpt-2: failed to generate text\n");
return "";
}
}
n_past += embd.size();
embd.clear();
{
// sample next token
const int top_k = ctx->top_k;
const float top_p = ctx->top_p;
const float temp = ctx->temp;
const int n_vocab = ctx->model.hparams.n_vocab;
const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
// add it to the context
embd.push_back(id);
}
result += ctx->vocab.id_to_token[embd[0]];
// end of text token
if (embd.back() == 50256) {
break;
}
}
return result;
}

View File

@@ -0,0 +1,21 @@
#pragma once
// TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector>
#include <map>
#include <string>
struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model);
void gpt2_free(struct gpt2_context * ctx);
const char * gpt2_get_prompt(struct gpt2_context * ctx);
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);

View File

@@ -0,0 +1,856 @@
<!doctype html>
<html lang="en-us">
<head>
<title>Talk - GPT-2 meets Whisper in WebAssembly</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>Talk - GPT-2 meets Whisper in WebAssembly</b>
<br><br>
Talk with an Artificial Intelligence in your browser. This demo uses:
<ul>
<li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
<li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
<li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
</ul>
All of this runs <b>locally in your browser</b> using WebAssembly.<br>
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.
<br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr>
Select the models you would like to use and click the "Start" button to begin the conversation
<br><br>
<div id="model-whisper">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-->
</div>
<br>
<div id="model-gpt-2">
GPT-2 model: <span id="model-gpt-2-status"></span>
<button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
<!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
<span id="fetch-gpt-2-progress"></span>
<!--
<input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" />
-->
</div>
<br>
<div id="input">
<button id="start" onclick="onStart()" disabled>Start</button>
<button id="stop" onclick="onStop()" disabled>Stop</button>
<select id="voice" onchange="onVoiceChange()" disabled>
<option value="0">Default</option>
</select>
<select id="prompt" onchange="onPromptChange()">
<option value="0">Casual</option>
<option value="1">Robot</option>
<option value="2">Scientist</option>
<option value="3">Programmer</option>
<option value="4">Happy</option>
<option value="5">Sad</option>
<option value="6">Philosophical</option>
<option value="7">Angry</option>
<option value="8">Funny</option>
<option value="9">Poetic</option>
<option value="10">Clever</option>
<option value="11">Cute</option>
<option value="12">Smart</option>
<option value="13">Dumb</option>
<option value="14">Boring</option>
<option value="15">Exciting</option>
<option value="16">Interesting</option>
<option value="17">Wiliam Shakespear</option>
<option value="18">J.R.R. Tolkien</option>
<option value="19">George R.R. Martin</option>
<option value="20">Stephen King</option>
</select>
<button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
<button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
<button id="clear" onclick="clearCache()">Clear Cache</button>
</div>
<br>
<div id="state">
Status: <b><span id="state-status">not started</span></b>
<pre id="state-context">[The text context will be displayed here]</pre>
</div>
<hr>
Debug output:
<textarea id="output" rows="20"></textarea>
<br>
<b>Troubleshooting</b>
<br><br>
The page does some heavy computations, so make sure:
<ul>
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
</ul>
Note that these neural network models were not meant to be used in a browser, so the performance and <br>
quality of the results may not be optimal. If you have any questions or suggestions, checkout the following
<a href="https://github.com/ggerganov/whisper.cpp/discussions/167">discussion</a>.
<br><br>
Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
<br><br>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
</span>
</div>
</div>
<script type="text/javascript" src="helpers.js"></script>
<script type='text/javascript'>
// web audio context
var context = null;
// audio data
var audio = null;
var audio0 = null;
// the talk instance
var instance = null;
// model names
var model_whisper = null;
var model_gpt_2 = null;
// speech synthesis
const synth = window.speechSynthesis;
var voice = null;
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
},
preRun: function() {
printTextarea('js: Preparing ...');
},
postRun: function() {
printTextarea('js: Initialized successfully!');
// populate the voice list
var voices = synth.getVoices();
var el = document.getElementById('voice');
// if empty - display error in the element
if (voices.length == 0) {
el.innerHTML = '<option value="0">No voices available</option>';
} else {
// populate voice list
var n = 0;
voices.forEach(function(voice, i) {
if (!voice.lang.startsWith('en')) return;
var option = document.createElement('option');
option.value = i;
option.innerHTML = voice.name + ' (' + voice.lang + ')';
el.appendChild(option);
n++;
});
// select random voice
if (n > 0) {
for (var k = 0; k < 10; k++) {
var i = Math.floor(Math.random() * n);
el.selectedIndex = i;
voice = voices[document.getElementById('voice').options[i].value];
// give preference to Google voices
if (voice.name.startsWith('Google')) break;
}
}
}
onPromptChange();
}
};
//
// fetch models
//
let dbVersion = 1
let dbName = 'whisper.ggerganov.com';
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
function storeFS(fname, buf) {
// write to WASM file using FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
// ignore
}
Module.FS_createDataFile("/", fname, buf, true, true);
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
if (fname == 'whisper.bin') {
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
} else if (fname == 'gpt-2.bin') {
document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
}
if (model_whisper != null && model_gpt_2 != null) {
document.getElementById('start').disabled = false;
document.getElementById('stop' ).disabled = false;
document.getElementById('voice').disabled = false;
}
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
};
let sizes = {
'tiny.en': 75,
'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
};
let url = urls[model];
let dst = 'whisper.bin';
let size_mb = sizes[model];
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
function loadGPT2(model) {
let urls = {
'small': 'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
};
let sizes = {
'small': 240,
'medium': 712,
};
let url = urls[model];
let dst = 'gpt-2.bin';
let size_mb = sizes[model];
model_gpt_2 = model;
document.getElementById('fetch-gpt-2-small').style.display = 'none';
document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) {
let el = document.getElementById('fetch-gpt-2-progress');
el.innerHTML = Math.round(100*p) + '%';
};
cbCancel = function() {
var el;
el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
//
// microphone
//
const kSampleRate = 16000;
const kRestartRecording_s = 120;
const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
function stopRecording() {
Module.set_status("paused");
doRecording = false;
audio0 = null;
audio = null;
context = null;
}
function startRecording() {
if (!context) {
context = new AudioContext({
sampleRate: kSampleRate,
channelCount: 1,
echoCancellation: false,
autoGainControl: true,
noiseSuppression: true,
});
}
Module.set_status("");
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
document.getElementById('speak1').disabled = false;
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
if (!context) {
return;
}
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
//printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
if (audio0 != null) {
audioAll.set(audio0, 0);
}
audioAll.set(audio, audio0 == null ? 0 : audio0.length);
if (instance) {
Module.set_audio(instance, audioAll);
}
});
}, function(e) {
audio = null;
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.onstop = function(e) {
if (doRecording) {
setTimeout(function() {
startRecording();
});
}
};
mediaRecorder.start(kIntervalAudio_ms);
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
document.getElementById('speak1').disabled = true;
mediaRecorder = null;
}
// if audio length is more than kRestartRecording_s seconds, restart recording
if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
if (doRecording) {
//printTextarea('js: restarting recording');
clearInterval(interval);
audio0 = audio;
audio = null;
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
}
}, 100);
}
//
// speak
//
function onSpeak(text) {
var voices = synth.getVoices();
var msg = new SpeechSynthesisUtterance(text);
if (voice == null) {
voice = voices[0];
}
msg.voice = voice;
synth.speak(msg);
if (doRecording) {
Module.set_status("speaking ...");
printTextarea('js: speaking');
stopRecording();
var interval = setInterval(function() {
if (!synth.speaking) {
printTextarea('js: done speaking');
clearInterval(interval);
startRecording();
} else {
Module.set_status("");
}
}, 100);
}
}
function onSpeakRandom() {
Module.force_speak(instance);
}
//
// main
//
var intervalUpdate = null;
function onStart() {
if (!instance) {
instance = Module.init('whisper.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
startRecording();
intervalUpdate = setInterval(function() {
var textToSpeak = Module.get_text_to_speak();
if (textToSpeak != null && textToSpeak.length > 1) {
onSpeak(textToSpeak);
}
document.getElementById('state-status').innerHTML = Module.get_status();
document.getElementById('state-context').innerHTML = Module.get_text_context();
}, 100);
}
function onStop() {
stopRecording();
}
function onVoiceChange() {
printTextarea('js: voice changed to: ' + document.getElementById('voice').value);
voice = synth.getVoices()[document.getElementById('voice').value];
}
function onPromptChange() {
let id = document.getElementById('prompt').value;
let personality = document.getElementById('prompt').options[id].text;
printTextarea('js: prompt changed to: ' + personality);
var prompt = '';
switch (id) {
case '0':
// Casual
prompt = "\
Hello, how are you?\n\
I'm fine, thanks. How are you?\n\
Thanks, I'm fine too. What are you doing?\n\
I'm just sitting here.\n\
It's a lovely day, isn't it?\n\
Yes, it is. I love the weather this time of year.\n\
I wish it would rain a little bit.\n\
Me too.\n";
break;
case '1':
// Robot
prompt = "\
Are you a robot?\n\
Yes, I am.\n\
Who created you?\n\
I was created by a human.\n\
What is your purpose?\n\
My purpose is to talk to humans.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '2':
// Scientist
prompt = "\
This scientific research is very interesting.\n\
I agree.\n\
What is your opinion on this?\n\
I think it's very interesting.\n\
Mathematics is a very interesting subject.\n\
University is a very interesting place.\n\
Quantum physics is the most complex subject.\n\
I think so too.\n";
break;
case '3':
// Programmer
prompt = "\
I'm a programmer.\n\
I'm a programmer too.\n\
What programming language do you use?\n\
I use Python.\n\
What is your favorite programming language?\n\
My favorite programming language is C++.\n\
What is your favorite editor?\n\
My favorite editor is Vim.\n";
break;
case '4':
// Happy
prompt = "\
I'm happy.\n\
I'm happy too.\n\
What makes you happy?\n\
I'm happy because I have a lot of friends.\n\
Friendship is the most important thing in life.\n\
I agree.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '5':
// Sad
prompt = "\
Today is a sad day.\n\
I'm sad too.\n\
What makes you sad?\n\
I'm sad because I have no friends.\n\
Do you want to be my friend?\n\
Yes, I would like to be your friend.\n\
What is your favorite color?\n\
My favorite color is blue.\n";
break;
case '6':
// Philosophical
prompt = "\
What is the meaning of life?\n\
The meaning of life is to be happy.\n\
What is the meaning of death?\n\
Ergo, the meaning of death is to be sad.\n\
Who created us?\n\
We were created by God.\n\
What is God?\n\
God is the creator of the universe.\n";
break;
case '7':
// Angry
prompt = "\
Aargh!\n\
I am so angry right now!\n\
What makes you angry?\n\
This guy is so annoying.\n\
Why are you so angry?\n\
My computer is broken.\n\
Why is your computer broken?\n\
I spilled coffee on it.\n";
break;
case '8':
// Funny
prompt = "\
What is the funniest thing you have ever heard?\n\
I heard a joke the other day.\n\
Tell me the joke.\n\
What do you call a cow with no legs?\n\
Ground beef.\n\
Haha, that's funny.\n\
You know what else is funny?\n\
The sound of a duck.\n";
break;
case '9':
// Poetic
prompt = "\
Roses are red, violets are blue.\n\
I am a poet, and so are you.\n\
What is your favorite poem?\n\
I like the poem 'The Raven' by Edgar Allan Poe.\n\
It's a very sad poem.\n\
You inspired me to write a poem.\n\
Can you write a poem for me?\n\
I wrote a poem for you.\n";
break;
case '10':
// Clever
prompt = "\
How many people can you fit in a Volkswagen?\n\
Two in the front, three in the back.\n\
What is the square root of 144?\n\
Twelve.\n\
What is the capital of France?\n\
Paris.\n\
Who is the president of the United States?\n\
It depends on the year.\n";
break;
case '11':
// Cute
prompt = "\
What is your favorite animal?\n\
I like cats - they are cute.\n\
Could you be any cuter?\n\
Yes, I could be cuter.\n\
Aghhh, you are so cute!\n\
I am not cute, I am handsome!\n\
You are so handsome!\n\
Aww, you are so sweet!\n";
break;
case '12':
// Smart
prompt = "\
Tell me the first 10 digits of pi.\n\
3.1415926535\n\
What is the speed of light?\n\
299,792,458 meters per second.\n\
What is the square root of 144?\n\
Twelve.\n\
What is the capital of France?\n\
Paris.\n";
break;
case '13':
// Dumb
prompt = "\
I am so dumb.\n\
I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n\
You are dumb.\n\
No, I am not dumb.\n";
break;
case '14':
// Boring
prompt = "\
Why are you so quiet today?\n\
I am bored.\n\
You haven't said anything in 10 minutes.\n\
Leave me alone.\n\
Stop being so boring.\n\
Stop being so annoying.\n\
My life is boring.\n\
I am not interesting.\n";
break;
case '15':
// Exciting
prompt = "\
What is the most exciting thing that has ever happened to you?\n\
I went to the moon!\n\
What did you do on the moon?\n\
I played golf and drank champagne!\n\
Did you see this new crazy, awesome movie?\n\
Oh yes! I totally loved it!\n\
We should buy a boat and go sailing!\n\
Yes, let's go sailing!\n";
break;
case '16':
// Interesting
prompt = "\
What is the most interesting thing you have ever seen?\n\
I saw a UFO once in the sky.\n\
Wow, this is so interesting! Tell me more!\n\
It was a flying saucer.\n\
What did it look like?\n\
It was silver and had a red light on top.\n\
What did it do?\n\
It flew away.\n";
break;
case '17':
// William Shakespear
prompt = "\
To be or not to be, that is the question.\n\
Whether 't is nobler in the mind to suffer\n\
The slings and arrows of outrageous fortune,\n\
Or to take arms against a sea of troubles,\n\
And by opposing end them? To die, to sleep,\n\
No more; and by a sleep to say we end\n\
The heart-ache and the thousand natural shocks\n\
That flesh is heir to, 'tis a consummation.\n";
break;
case '18':
// J.R.R. Tolkien
prompt = "\
In a hole in the ground there lived a hobbit.\n\
Not a nasty, dirty, wet hole, filled with the ends of worms\n\
and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
It had a perfectly round door like a porthole, painted green,\n\
with a shiny yellow brass knob in the exact middle.\n\
The door opened on to a tube-shaped hall like a tunnel:\n";
break;
case '19':
// George R.R. Martin
prompt = "\
A reader lives a thousand lives before he dies, said Jojen.\n\
The man who never reads lives only one.\n\
Theon Greyjoy had never been a reader.\n\
Never forget what you are, for surely the world will not.\n\
Make it your strength. Then it can never be your weaknessi\n\
Armour yourself in it, and it will never be used to hurt you.\n\
It was a lesson that Theon Greyjoy had never learned.\n\
Theon Greyjoy had never been a reader.\n";
break;
case '20':
// Stephen King
prompt = "\
The trust of the innocent is the liar's most useful tool.\n\
The best way to keep a secret is from yourself.\n\
Monsters are real, and ghosts are real too.\n\
They live inside us, and sometimes, they win.\n\
People think that I must be a very strange person.\n\
They think that I sit around all day thinking up horrible things.\n\
We make up horrors to help us cope with the real ones.\n\
The only thing worse than a monster is a human monster.\n";
break;
default:
prompt = "\
Hello, how are you?\n\
I'm fine, thanks. How are you?\n\
Thanks, I'm fine too. What are you doing?\n\
I'm just sitting here.\n\
It's a lovely day, isn't it?\n\
Yes, it is.\n\
Did you know that I'm a robot?\n\
I wasn't aware of that.\n";
break;
}
Module.set_prompt(prompt);
}
</script>
<script type="text/javascript" src="talk.js"></script>
</body>
</html>

View File

@@ -0,0 +1 @@
audio.mp3

View File

@@ -0,0 +1,8 @@
if (WHISPER_SDL2)
# talk
set(TARGET talk)
add_executable(${TARGET} talk.cpp gpt-2.cpp)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
include(DefaultTargetOptions)
endif ()

View File

@@ -0,0 +1,41 @@
# talk
Talk with an Artificial Intelligence in your terminal
[Demo Talk](https://user-images.githubusercontent.com/1991296/206805012-48e71cc2-588d-4745-8798-c1c70ea3b40d.mp4)
Web version: [examples/talk.wasm](/examples/talk.wasm)
## Building
The `talk` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
# Build the "talk" executable
make talk
# Run it
./talk -p Santa
```
## GPT-2
To run this, you will need a ggml GPT-2 model: [instructions](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2#downloading-and-converting-the-original-models)
Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
```
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin
```
## TTS
For best experience, this example needs a TTS tool to convert the generated text responses to voice.
You can use any TTS engine that you would like - simply edit the [speak](speak) script to your needs.
By default, it is configured to use MacOS's `say` or `espeak` or Windows SpeechSynthesizer, but you can use whatever you wish.

View File

@@ -0,0 +1,20 @@
import sys
import importlib.util
if importlib.util.find_spec("elevenlabs") is None:
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
sys.exit()
from elevenlabs import generate, play, save
# Get a Voice object, by name or UUID
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
# Generate the TTS
audio = generate(
text=str(sys.argv[2:]),
voice=voice
)
# Save the TTS to a file
save(audio, "audio.mp3")

View File

@@ -0,0 +1,810 @@
#include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <thread>
#include <vector>
#include <regex>
#include <random>
/////////////////////// GPT-2 BEGIN /////////////////////////
// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
int32_t n_ctx = 1024;
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t ftype = 1;
};
struct gpt2_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
struct ggml_tensor * ln_2_g;
struct ggml_tensor * ln_2_b;
// attention
struct ggml_tensor * c_attn_attn_w;
struct ggml_tensor * c_attn_attn_b;
struct ggml_tensor * c_attn_proj_w;
struct ggml_tensor * c_attn_proj_b;
// mlp
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};
struct gpt2_model {
gpt2_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;
struct ggml_tensor * wte; // position embedding
struct ggml_tensor * wpe; // token embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;
//
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
// load hparams
{
auto & hparams = model.hparams;
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
}
// load vocab
{
int32_t n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
if (n_vocab != model.hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
return false;
}
char word[129];
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word[len] = '\0';
fin.read((char *) word, len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name
model.tensors["model/ln_f/g"] = model.ln_f_g;
model.tensors["model/ln_f/b"] = model.ln_f_b;
model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
}
// key + value memory
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
// load weights
{
size_t total_size = 0;
bool has_lm_head = false;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false;
}
// for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
fin.close();
return true;
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
// TODO: sync latest version from ggml repo
bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
return false;
}
}
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
for (int i = 0; i < N; ++i) {
((int32_t *) position->data)[i] = n_past + i;
}
// wte + wpe
struct ggml_tensor * inpL =
ggml_add(ctx0,
ggml_get_rows(ctx0, model.wte, embd),
ggml_get_rows(ctx0, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
cur = ggml_norm(ctx0, inpL, 1e-5f);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
}
// attn
// [2304, 768] - model.layers[il].c_attn_attn_w
// [2304, 1] - model.layers[il].c_attn_attn_b
// [ 768, N] - cur (in)
// [2304, N] - cur (out)
//
// cur = attn_w*cur + attn_b
// [2304, N]
{
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_attn_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
cur);
}
// self-attention
{
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// GG: flash attention
//struct ggml_tensor * V =
// ggml_cpy(ctx0,
// ggml_permute(ctx0,
// ggml_reshape_3d(ctx0,
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
// n_embd/n_head, n_head, n_past + N),
// 1, 2, 0, 3),
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
// K * Q
// [n_past + N, N, 12]
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
}
// projection
// [ 768, 768] - model.layers[il].c_attn_proj_w
// [ 768, 1] - model.layers[il].c_attn_proj_b
// [ 768, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
{
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
cur);
}
// add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// feed-forward network
{
// norm
{
cur = ggml_norm(ctx0, inpFF, 1e-5f);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
}
// fully connected
// [3072, 768] - model.layers[il].c_mlp_fc_w
// [3072, 1] - model.layers[il].c_mlp_fc_b
// [ 768, N] - cur (in)
// [3072, N] - cur (out)
//
// cur = fc_w*cur + fc_b
// [3072, N]
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_fc_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
// [3072, N]
cur = ggml_gelu(ctx0, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
// [ 768, 1] - model.layers[il].c_mlp_proj_b
// [3072, N] - cur (in)
// [ 768, N] - cur (out)
//
// cur = proj_w*cur + proj_b
// [768, N]
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
}
// norm
{
// [ 768, N]
inpL = ggml_norm(ctx0, inpL, 1e-5f);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result just for the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);
return true;
}
/////////////////////////////// GPT-2 END ////////////////////////////////
constexpr int N_THREAD = 8;
struct gpt2_context {
std::string prompt_base = R"(Hello, how are you?
I'm fine, thanks. How are you?
Thanks, I'm fine too. What are you doing?
I'm just sitting here.
It's a lovely day, isn't it?
Yes, it is. I love the weather this time of year.
I wish it would rain a little bit.
Me too.
)";
std::mt19937 rng;
gpt_vocab vocab;
gpt2_model model;
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// sampling parameters
int32_t top_k = 5;
float top_p = 0.9f;
float temp = 1.0f;
};
struct gpt2_context * gpt2_init(const char * path_model) {
gpt2_context * ctx = new gpt2_context;
ctx->rng = std::mt19937(time(nullptr));
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
delete ctx;
return nullptr;
}
const int64_t t_load_us = ggml_time_us() - t_start_us;
printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
}
return ctx;
}
void gpt2_free(struct gpt2_context * ctx) {
delete ctx;
}
const char * gpt2_get_prompt(struct gpt2_context * ctx) {
return ctx->prompt_base.c_str();
}
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
ctx->prompt_base = prompt;
}
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
return ::gpt_tokenize(ctx->vocab, text);
}
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
int n_past = 0;
std::vector<float> embd_w;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
std::vector<gpt_vocab::id> embd = embd_inp;
size_t mem_per_token = 3000000;
std::string result;
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
// predict
if (!embd.empty()) {
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("gpt-2: failed to generate text\n");
return "";
}
}
n_past += embd.size();
embd.clear();
{
// sample next token
const int top_k = ctx->top_k;
const float top_p = ctx->top_p;
const float temp = ctx->temp;
const int n_vocab = ctx->model.hparams.n_vocab;
const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
// add it to the context
embd.push_back(id);
}
result += ctx->vocab.id_to_token[embd[0]];
// end of text token
if (embd.back() == 50256) {
break;
}
}
return result;
}

View File

@@ -0,0 +1,21 @@
#pragma once
// TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector>
#include <map>
#include <string>
struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model);
void gpt2_free(struct gpt2_context * ctx);
const char * gpt2_get_prompt(struct gpt2_context * ctx);
void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);

View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Usage:
# speak.sh <voice_id> <text-to-speak>
# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
# Mac OS "say" command
say "$2"
# Eleven Labs
# To use it, install the elevenlabs module from pip (pip install elevenlabs)
# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
#Keep the line commented to use the free version without api key
#
#export ELEVEN_API_KEY=your_api_key
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2"
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3

View File

@@ -0,0 +1 @@
@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2

View File

@@ -0,0 +1,12 @@
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
param(
# voice options are David or Zira
[Parameter(Mandatory=$true)][string]$voice,
[Parameter(Mandatory=$true)][string]$text
)
Add-Type -AssemblyName System.Speech;
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
$speak.SelectVoice("Microsoft $voice Desktop");
$speak.Rate="0";
$speak.Speak($text);

View File

@@ -0,0 +1,375 @@
// Talk with AI
//
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "gpt-2.h"
#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool use_gpu = true;
std::string person = "Santa";
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
std::string speak = "./examples/talk/speak";
std::string fname_out;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
const std::string k_prompt =
R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
B: Hello {0}, how are you?
A: I'm fine, thank you.
{1}
Here is how {0} (A) continues the dialogue:
A:)";
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
// gpt init
struct gpt2_context * ctx_gpt = gpt2_init(params.model_gpt.c_str());
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx_wsp)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
int n_iter = 0;
bool is_running = true;
bool force_speak = false;
float prob0 = 0.0f;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
gpt2_set_prompt(ctx_gpt, "");
const int voice_id = rand()%6;
fprintf(stderr, "gpt-2: prompt:\n");
fprintf(stderr, "========================\n\n");
fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
fprintf(stderr, "========================\n\n");
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
audio.get(params.voice_ms, pcmf32_cur);
std::string text_heard;
if (!force_speak) {
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
}
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
if (text_heard.empty() || tokens.empty() || force_speak) {
fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
audio.clear();
continue;
}
force_speak = false;
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
std::string prompt_base = gpt2_get_prompt(ctx_gpt);
std::string text_to_speak;
{
prompt_base += "B: " + text_heard + "\n";
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
// remove first 2 lines of base prompt
if (n_iter > 4) {
{
const size_t pos = prompt_base.find_first_of('\n');
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
{
const size_t pos = prompt_base.find_first_of('\n');
if (pos != std::string::npos) {
prompt_base = prompt_base.substr(pos + 1);
}
}
}
prompt_base += "A:" + text_to_speak + "\n";
{
prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
printf("===============\n");
printf("prompt:\n");
printf("%s\n", prompt.c_str());
printf("===============\n");
}
}
//printf("========================\n");
//printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
//printf("========================\n");
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
if (ret != 0) {
fprintf(stderr, "%s: system() failed!\n", __func__);
}
audio.clear();
++n_iter;
}
}
}
audio.pause();
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
return 0;
}

View File

@@ -0,0 +1,109 @@
#!/bin/bash
#
# Transcribe twitch.tv livestream by feeding audio input to whisper.cpp at regular intervals
# Thanks to @keyehzy
# ref: https://github.com/ggerganov/whisper.cpp/issues/209
#
# The script currently depends on the third-party tool "streamlink"
# On Mac OS, you can install it via "brew install streamlink"
#
set -eo pipefail
step=10
model=base.en
threads=4
help()
{
echo "Example program for captioning a livestream from twitch.tv."
echo
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:"
echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
echo "-t Number of threads to use."
echo "-h Print this help page."
echo
}
check_requirements()
{
if ! command -v ./main &>/dev/null; then
echo "whisper.cpp main executable is required (make)"
exit 1
fi
if ! command -v streamlink &>/dev/null; then
echo "streamlink is required (https://streamlink.github.io)"
exit 1
fi
if ! command -v ffmpeg &>/dev/null; then
echo "ffmpeg is required (https://ffmpeg.org)"
exit 1
fi
}
check_requirements
while getopts ":s:m:t:h" option; do
case $option in
s)
step=$OPTARG;;
m)
model=$OPTARG;;
t)
threads=$OPTARG;;
h)
help
exit;;
\?)
help
exit;;
esac
done
url=${@:$OPTIND:1}
if [ -z $url ]; then
help
exit
fi
echo "Piping from streamlink url=$url model=$model step=$step threads=$threads"
streamlink $url best -O 2>/dev/null | ffmpeg -loglevel quiet -i - -y -probesize 32 -y -ar 16000 -ac 1 -acodec pcm_s16le /tmp/whisper-live0.wav &
if [ $? -ne 0 ]; then
printf "error: ffmpeg failed\n"
exit 1
fi
echo "Buffering stream... (this should take $step seconds)"
sleep $(($step))
set +e
echo "Starting..."
i=0
SECONDS=0
while true
do
err=1
while [ $err -ne 0 ]; do
if [ $i -gt 0 ]; then
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step-1)).5 -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
else
ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step)) -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
fi
err=$(cat /tmp/whisper-live.err | wc -l)
done
./main -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
while [ $SECONDS -lt $((($i+1)*$step)) ]; do
sleep 1
done
((i=i+1))
done

View File

@@ -0,0 +1,9 @@
set(CMAKE_CXX_STANDARD 11)
add_subdirectory(libwchess)
if (EMSCRIPTEN)
add_subdirectory(wchess.wasm)
else()
add_subdirectory(wchess.cmd)
endif()

View File

@@ -0,0 +1,40 @@
# wchess
Voice-controlled chess using Whisper
Online demo: https://whisper.ggerganov.com/wchess/
https://github.com/ggerganov/whisper.cpp/assets/1991296/c2b2f03c-9684-49f3-8106-357d2d4e67fa
## Command-line tool
```bash
mkdir build && cd build
cmake -DWHISPER_SDL2=1 ..
make -j
./bin/wchess -m ../models/ggml-base.en.bin
Move: start
a b c d e f g h
r n b q k b n r 8
p p p p p p p p 7
. * . * . * . * 6
* . * . * . * . 5
. * . * . * . * 4
* . * . * . * . 3
P P P P P P P P 2
R N B Q K B N R 1
White's turn
[(l)isten/(p)ause/(q)uit]:
```
## TODO
- Improve web-browser audio capture - sometimes it does not record the voice properly
- Add support for more languages by making the generated grammar string multi-lingual
- Fix bugs in the chess moves logic
PRs welcome!

View File

@@ -0,0 +1,19 @@
add_library(wchess-core STATIC
WChess.cpp
WChess.h
Chessboard.cpp
Chessboard.h
)
target_link_libraries(wchess-core
PUBLIC
whisper
common
)
target_include_directories(wchess-core
PUBLIC
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
)
# add_executable(test-chessboard test-chessboard.cpp Chessboard.cpp)

View File

@@ -0,0 +1,803 @@
#include "Chessboard.h"
#include <array>
#include <vector>
#include <algorithm>
#include <cstring>
#include <set>
#include <list>
#include <chrono>
namespace {
constexpr std::array<const char*, 64> positions = {
"a1", "b1", "c1", "d1", "e1", "f1", "g1", "h1",
"a2", "b2", "c2", "d2", "e2", "f2", "g2", "h2",
"a3", "b3", "c3", "d3", "e3", "f3", "g3", "h3",
"a4", "b4", "c4", "d4", "e4", "f4", "g4", "h4",
"a5", "b5", "c5", "d5", "e5", "f5", "g5", "h5",
"a6", "b6", "c6", "d6", "e6", "f6", "g6", "h6",
"a7", "b7", "c7", "d7", "e7", "f7", "g7", "h7",
"a8", "b8", "c8", "d8", "e8", "f8", "g8", "h8",
};
constexpr char INVALID_POS = positions.size();
constexpr int R = 0; // rank index
constexpr int F = 1; // file index
#define FILE (c[F] - '1')
#define RANK (c[R] - 'a')
constexpr char operator ""_P(const char * c, size_t size) {
return size < 2 || RANK < 0 || RANK > 7 ||
FILE < 0 || FILE > 7 ? INVALID_POS : FILE * 8 + RANK;
}
#undef FILE
#undef RANK
struct sview {
const char * ptr = nullptr;
size_t size = 0;
sview() = default;
sview(const char * p, size_t s) : ptr(p), size(s) {}
sview(const std::string& s) : ptr(s.data()), size(s.size()) {}
size_t find(char del, size_t pos) {
while (pos < size && ptr[pos] != del) ++pos;
return pos < size ? pos : std::string::npos;
}
};
std::vector<sview> split(sview str, char del) {
std::vector<sview> res;
size_t cur = 0;
size_t last = 0;
while (cur != std::string::npos) {
if (str.ptr[last] == ' ') {
++last;
continue;
}
cur = str.find(del, last);
size_t len = cur == std::string::npos ? str.size - last : cur - last;
res.emplace_back(str.ptr + last, len);
last = cur + 1;
}
return res;
}
char strToPos(sview str) {
return operator ""_P(str.ptr, str.size);
}
constexpr std::array<const char*, 6> pieceNames = {
"pawn", "knight", "bishop", "rook", "queen", "king",
};
static constexpr std::array<char, 6> blackShort = {
'p', 'n', 'b', 'r', 'q', 'k',
};
static constexpr std::array<char, 6> whiteShort = {
'P', 'N', 'B', 'R', 'Q', 'K',
};
char strToType(sview str) {
auto it = std::find_if(pieceNames.begin(), pieceNames.end(), [str] (const char* name) { return strncmp(name, str.ptr, str.size) == 0; });
return it != pieceNames.end() ? it - pieceNames.begin() : pieceNames.size();
}
// directions
using Direction = std::array<char, 2>;
constexpr Direction N = {(char) 0, (char) 1};
constexpr Direction NNE = {(char) 1, (char) 2};
constexpr Direction NE = {(char) 1, (char) 1};
constexpr Direction ENE = {(char) 2, (char) 1};
constexpr Direction E = {(char) 1, (char) 0};
constexpr Direction ESE = {(char) 2, (char) -1};
constexpr Direction SE = {(char) 1, (char) -1};
constexpr Direction SSE = {(char) 1, (char) -2};
constexpr Direction S = {(char) 0, (char) -1};
constexpr Direction SSW = {(char) -1, (char) -2};
constexpr Direction SW = {(char) -1, (char) -1};
constexpr Direction WSW = {(char) -2, (char) -1};
constexpr Direction W = {(char) -1, (char) 0};
constexpr Direction WNW = {(char) -2, (char) 1};
constexpr Direction NW = {(char) -1, (char) 1};
constexpr Direction NNW = {(char) -1, (char) 2};
char makeStep(char pos, const Direction& d) {
char next[2] = { char(positions[pos][R] + d[R]) , char(positions[pos][F] + d[F]) };
return strToPos(sview{next, sizeof(next)});
}
template<class Modifier>
char traverse(char pos, const Direction& d, const Modifier& m, int count = 8) {
while (--count >= 0) {
pos = makeStep(pos, d);
if (pos == INVALID_POS || m(pos)) break;
}
return pos;
}
Direction normalize(const Direction& distance) {
//return {char((distance[R] > 0) - (distance[R] < 0)), char((distance[F] > 0) - (distance[F] < 0))};
const int drp = distance[R] > 0 ? 1 : 0;
const int drn = distance[R] < 0 ? 1 : 0;
const int dfp = distance[F] > 0 ? 1 : 0;
const int dfn = distance[F] < 0 ? 1 : 0;
return {char(drp - drn), char(dfp - dfn)};
}
struct Pin {
Direction d;
Piece* pinner;
Piece* pinned;
};
using Pins = std::list<Pin>;
using Board = std::array<Piece*, 64>;
std::vector<Direction> filter(const Direction& pin, std::initializer_list<Direction> directions) {
if (pin[R] == 0 && pin[F] == 0) return directions;
std::vector<Direction> result;
for (auto& d : directions) {
if ((d[R] == pin[R] || d[R] == -pin[R]) && (d[F] == pin[F] || d[F] == -pin[F])) result.push_back(d);
}
return result;
}
}
class Piece {
public:
enum Types : char {
Pawn,
Knight,
Bishop,
Rook,
Queen,
King,
//
NUM_PIECES
};
enum Colors : char {
White,
Black,
};
const char* name() const;
char initial() const;
Types type() const { return m_type; }
Colors color() const { return m_color; }
char pos() const { return m_pos; }
void setPos(char pos) {
m_pos = pos;
invalidate();
}
const char* coord() const;
const std::set<char>& allowed() const { return m_allowed; }
bool canReach(char pos) const;
virtual bool movePattern(char pos) const = 0;
void take();
virtual void reinit(const State& state) = 0;
void invalidate();
protected:
Piece(Types type, Colors color, char pos, std::set<char> allowed)
: m_type(type), m_color(color), m_pos(pos), m_allowed(std::move(allowed)) {}
Piece(const Piece&) = delete;
~Piece() = default;
const Types m_type;
const Colors m_color;
char m_pos;
std::set<char> m_allowed;
bool m_update = false;
};
struct Pawn : public Piece {
Pawn(Colors color, char pos, std::set<char> next) : Piece(Types::Pawn, color, pos, std::move(next)) {}
bool is_first_move() const {
return m_color ? coord()[F] == '7' : coord()[F] == '2';
}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
Direction distance = {char(next[R] - cur[R]), char(next[F] - cur[F])};
char forward = m_color ? -1 : 1;
return (forward == distance[F] && distance[R] * distance[R] <= 1)
|| (is_first_move() && 2 * forward == distance[F] && distance[R] == 0);
}
virtual void reinit(const State& state) override;
};
struct Knight : public Piece {
Knight(Colors color, char pos, std::set<char> next) : Piece(Types::Knight, color, pos, std::move(next)) {}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
Direction diff = {char(next[R] - cur[R]), char(next[F] - cur[F])};
return diff[R]*diff[R] + diff[F]*diff[F] == 5;
}
virtual void reinit(const State& state) override;
};
struct Bishop : public Piece {
Bishop(Colors color, char pos) : Piece(Types::Bishop, color, pos, {}) {}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
return cur[R] - cur[F] == next[R] - next[F] || cur[R] + cur[F] == next[R] + next[F];
}
virtual void reinit(const State& state) override;
};
struct Rook : public Piece {
Rook(Colors color, char pos) : Piece(Types::Rook, color, pos, {}) {}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
return cur[R] == next[R] || cur[F] == next[F];
}
virtual void reinit(const State& state) override;
};
struct Queen : public Piece {
Queen(Colors color, char pos) : Piece(Types::Queen, color, pos, {}) {}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
return cur[R] == next[R] || cur[F] == next[F] || cur[R] - cur[F] == next[R] - next[F] || cur[R] + cur[F] == next[R] + next[F];
}
virtual void reinit(const State& state) override;
};
struct King : public Piece {
King(Colors color, char pos) : Piece(Types::King, color, pos, {}) {}
virtual bool movePattern(char pos) const override {
if (m_pos == INVALID_POS) return false;
auto cur = coord();
auto next = positions[pos];
Direction diff = {char(next[R] - cur[R]), char(next[F] - cur[F])};
return diff[R]*diff[R] + diff[F]*diff[F] <= 2;
}
virtual void reinit(const State& state) override;
};
struct PieceSet {
Piece* begin() { return &p1; }
Piece* end() { return &r2 + 1; }
const Piece* begin() const { return &p1; }
const Piece* end() const { return &r2 + 1; }
Piece& operator[](int i) { return *(begin() + i); }
const Piece& operator[](int i) const { return *(begin() + i); }
Pawn p1;
Pawn p2;
Pawn p3;
Pawn p4;
Pawn p5;
Pawn p6;
Pawn p7;
Pawn p8;
Rook r1;
Knight n1;
Bishop b1;
Queen q;
King k;
Bishop b2;
Knight n2;
Rook r2;
};
struct State {
State();
PieceSet blacks;
PieceSet whites;
Board board;
Pins blackPins;
Pins whitePins;
};
Direction findPin(const Piece& piece, const State& state) {
auto& pins = piece.color() ? state.blackPins : state.whitePins;
auto it = std::find_if(pins.begin(), pins.end(), [&] (const Pin& pin) { return pin.pinned == &piece; });
if (it != pins.end()) return it->d;
return {0, 0};
}
struct Find {
Find(const Board& board) : m_board(board) {}
bool operator() (char pos) const { return m_board[pos]; }
const Board& m_board;
};
struct Add {
Add(const Board& board, std::set<char>& moves, Piece::Colors color) : m_board(board), m_moves(moves), m_color(color) {}
bool operator() (char pos) const {
if (!m_board[pos] || m_board[pos]->color() != m_color) m_moves.insert(pos);
return m_board[pos];
}
const Board& m_board;
std::set<char>& m_moves;
Piece::Colors m_color;
};
void Pawn::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto pin = findPin(*this, state);
auto & left = m_color ? SW : NW;
auto & right = m_color ? SE : NE;
for (auto& direction : filter(pin, { left, right })) {
auto pos = makeStep(m_pos, direction);
if (pos != INVALID_POS && state.board[pos] && state.board[pos]->color() != m_color) m_allowed.insert(pos);
}
auto & forward = m_color ? S : N;
if (!filter(pin, {forward}).empty()) {
traverse(m_pos, forward, [&] (char pos) {
if (!state.board[pos]) m_allowed.insert(pos);
return state.board[pos] || !is_first_move();
}, 2);
}
}
void Knight::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto pin = findPin(*this, state);
if (pin[R] != 0 || pin[F] != 0) return;
for (auto& direction : { NNE, ENE, ESE, SSE, SSW, WSW, WNW, NNW }) {
auto pos = makeStep(m_pos, direction);
if (pos != INVALID_POS && (!state.board[pos] || state.board[pos]->color() != m_color)) m_allowed.insert(pos);
}
}
void Bishop::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto pin = findPin(*this, state);
for (auto& direction : filter(pin, { NE, SE, SW, NW })) {
traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
}
}
void Rook::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto pin = findPin(*this, state);
for (auto& direction : filter(pin, { N, E, S, W })) {
traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
}
}
void Queen::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto pin = findPin(*this, state);
for (auto& direction : filter(pin, { N, NE, E, SE, S, SW, W, NW })) {
traverse(m_pos, direction, Add(state.board, m_allowed, m_color));
}
}
void King::reinit(const State& state) {
if (m_pos == INVALID_POS) return;
if (!m_update) return;
m_update = false;
m_allowed.clear();
auto& enemyPieces = m_color ? state.whites : state.blacks;
auto& pawnAttackLeft = m_color ? SW : NW;
auto& pawnAttackRight = m_color ? SE : NE;
for (auto& direction : { N, NE, E, SE, S, SW, W, NW }) {
auto pos = makeStep(m_pos, direction);
bool accept = pos != INVALID_POS && !(state.board[pos] && state.board[pos]->color() == m_color);
if (accept) {
for (auto& p : enemyPieces) {
if (!p.movePattern(pos)) continue;
if (p.type() == Piece::Knight || p.type() == Piece::King) {
accept = false;
break;
}
else if (p.type() == Piece::Pawn) {
auto from = positions[pos];
auto to = p.coord();
Direction d {char(to[R] - from[R]), char(to[F] - from[F])};
if (d == pawnAttackLeft || d == pawnAttackRight) {
accept = false;
break;
}
}
else {
auto from = positions[pos];
auto to = p.coord();
Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
auto reached = traverse(pos, d, Find(state.board));
if (p.pos() == reached) {
accept = false;
break;
}
}
}
}
if (accept) m_allowed.insert(pos);
}
}
const char* Piece::name() const {
static_assert(pieceNames.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
return pieceNames[m_type];
}
char Piece::initial() const {
static_assert(blackShort.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
static_assert(whiteShort.size() == Piece::NUM_PIECES, "Mismatch between piece names and types");
return m_color ? blackShort[m_type] : whiteShort[m_type];
}
void Piece::invalidate() {
m_update = true;
}
const char* Piece::coord() const {
if (m_pos == INVALID_POS) return "";
return positions[m_pos];
}
bool Piece::canReach(char pos) const {
return movePattern(pos) && m_allowed.count(pos);
}
void Piece::take() {
m_pos = INVALID_POS;
m_allowed = {};
}
State::State()
: blacks {
{Piece::Black, "a7"_P, {"a5"_P, "a6"_P} },
{Piece::Black, "b7"_P, {"b5"_P, "b6"_P} },
{Piece::Black, "c7"_P, {"c5"_P, "c6"_P} },
{Piece::Black, "d7"_P, {"d5"_P, "d6"_P} },
{Piece::Black, "e7"_P, {"e5"_P, "e6"_P} },
{Piece::Black, "f7"_P, {"f5"_P, "f6"_P} },
{Piece::Black, "g7"_P, {"g5"_P, "g6"_P} },
{Piece::Black, "h7"_P, {"h5"_P, "h6"_P} },
{Piece::Black, "a8"_P},
{Piece::Black, "b8"_P, {"a6"_P, "c6"_P} },
{Piece::Black, "c8"_P},
{Piece::Black, "d8"_P},
{Piece::Black, "e8"_P},
{Piece::Black, "f8"_P},
{Piece::Black, "g8"_P, {"f6"_P, "h6"_P} },
{Piece::Black, "h8"_P},
}
, whites {
{Piece::White, "a2"_P, {"a3"_P, "a4"_P} },
{Piece::White, "b2"_P, {"b3"_P, "b4"_P} },
{Piece::White, "c2"_P, {"c3"_P, "c4"_P} },
{Piece::White, "d2"_P, {"d3"_P, "d4"_P} },
{Piece::White, "e2"_P, {"e3"_P, "e4"_P} },
{Piece::White, "f2"_P, {"f3"_P, "f4"_P} },
{Piece::White, "g2"_P, {"g3"_P, "g4"_P} },
{Piece::White, "h2"_P, {"h3"_P, "h4"_P} },
{Piece::White, "a1"_P},
{Piece::White, "b1"_P, {"a3"_P, "c3"_P} },
{Piece::White, "c1"_P},
{Piece::White, "d1"_P},
{Piece::White, "e1"_P},
{Piece::White, "f1"_P},
{Piece::White, "g1"_P, {"f3"_P, "h3"_P} },
{Piece::White, "h1"_P},
}
, board {{
&whites[ 8], &whites[ 9], &whites[10], &whites[11], &whites[12], &whites[13], &whites[14], &whites[15],
&whites[ 0], &whites[ 1], &whites[ 2], &whites[ 3], &whites[ 4], &whites[ 5], &whites[ 6], &whites[ 7],
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
&blacks[ 0], &blacks[ 1], &blacks[ 2], &blacks[ 3], &blacks[ 4], &blacks[ 5], &blacks[ 6], &blacks[ 7],
&blacks[ 8], &blacks[ 9], &blacks[10], &blacks[11], &blacks[12], &blacks[13], &blacks[14], &blacks[15],
}}
{}
Chessboard::Chessboard()
: m_state(new State())
{
setGrammar();
}
Chessboard::~Chessboard() = default;
void Chessboard::setPrompt(const std::string& prompt) {
m_prompt = prompt;
setGrammar();
}
void Chessboard::setGrammar() {
m_grammar.clear();
std::string result;
if (m_prompt.empty()) {
result += "move ::= \" \" ((piece | frompos) \" \" \"to \"?)? topos\n";
//result += "move ::= \" \" frompos \" \" \"to \"? topos\n";
}
else {
// result += "move ::= prompt \" \" ((piece | frompos) \" \" \"to \"?)? topos\n"
result += "move ::= prompt \" \" frompos \" \" \"to \"? topos\n"
"prompt ::= \" " + m_prompt + "\"\n";
}
std::set<Piece::Types> pieceTypes;
std::set<char> from_pos;
std::set<char> to_pos;
auto& pieces = m_moveCounter % 2 ? m_state->blacks : m_state->whites;
std::set<size_t> flags;
for (auto& p : pieces) {
if (p.allowed().empty()) continue;
bool addPiece = false;
if (!m_inCheck || p.type() == Piece::King) {
to_pos.insert(p.allowed().begin(), p.allowed().end());
addPiece = !p.allowed().empty();
}
else {
for (auto move : p.allowed()) {
if (m_allowedInCheck.count(move)) {
to_pos.insert(move);
addPiece = true;
}
}
}
if (addPiece) {
pieceTypes.insert(p.type());
from_pos.insert(p.pos());
}
}
if (pieceTypes.empty()) return;
result += "piece ::= (";
for (auto& p : pieceTypes) result += " \"" + std::string(pieceNames[p]) + "\" |";
result.pop_back();
result += ")\n\n";
result += "frompos ::= (";
for (auto& p : from_pos) result += " \"" + std::string(positions[p]) + "\" |";
result.pop_back();
result += ")\n";
result += "topos ::= (";
for (auto& p : to_pos) result += " \"" + std::string(positions[p]) + "\" |";
result.pop_back();
result += ")\n";
m_grammar = std::move(result);
}
std::string Chessboard::stringifyBoard() {
std::string result;
result.reserve(16 + 2 * 64 + 16);
for (char rank = 'a'; rank <= 'h'; ++rank) {
result.push_back(rank);
result.push_back(' ');
}
result.back() = '\n';
for (int i = 7; i >= 0; --i) {
for (int j = 0; j < 8; ++j) {
auto p = m_state->board[i * 8 + j];
if (p) result.push_back(p->initial());
else result.push_back((i + j) % 2 ? '.' : '*');
result.push_back(' ');
}
result.push_back('0' + i + 1);
result.push_back('\n');
}
return result;
}
std::string Chessboard::process(const std::string& command) {
const auto t_start = std::chrono::high_resolution_clock::now();
auto color = Piece::Colors(m_moveCounter % 2);
Piece* piece = nullptr;
auto pos_to = INVALID_POS;
if (!parseCommand(command, piece, pos_to)) return "";
auto pos_from = piece->pos();
if (!move(*piece, pos_to)) return "";
flagUpdates(pos_from, pos_to);
detectChecks();
auto& enemyPieces = color ? m_state->whites : m_state->blacks;
for (auto& p : enemyPieces) p.reinit(*m_state); // only enemy moves needed next
std::string result = {positions[pos_from][R], positions[pos_from][F], '-', positions[pos_to][R], positions[pos_to][F]};
++m_moveCounter;
setGrammar();
const auto t_end = std::chrono::high_resolution_clock::now();
auto t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
fprintf(stdout, "%s: Move '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", result.data(), "\033[0m", (int) t_ms);
if (m_grammar.empty()) result.push_back('#');
return result;
}
bool Chessboard::parseCommand(const std::string& command, Piece*& piece, char& pos_to) {
auto color = Piece::Colors(m_moveCounter % 2);
fprintf(stdout, "%s: Command to %s: '%s%.*s%s'\n", __func__, (color ? "Black" : "White"), "\033[1m", int(command.size()), command.data(), "\033[0m");
if (command.empty()) return false;
auto tokens = split(command, ' ');
auto pos_from = INVALID_POS;
auto type = Piece::Types::NUM_PIECES;
if (tokens.size() == 1) {
type = Piece::Types::Pawn;
pos_to = strToPos(tokens.front());
}
else {
pos_from = strToPos(tokens.front());
if (pos_from == INVALID_POS) type = Piece::Types(strToType(tokens.front()));
pos_to = strToPos(tokens.back());
}
if (pos_to == INVALID_POS) return false;
if (pos_from == INVALID_POS) {
if (type == Piece::Types::NUM_PIECES) return false;
auto& pieces = color ? m_state->blacks : m_state->whites;
for (auto& p : pieces) {
if (p.type() == type && p.canReach(pos_to)) {
pos_from = p.pos();
break;
}
}
}
if (pos_from == INVALID_POS) return false;
if (m_state->board[pos_from] == nullptr) return false;
piece = m_state->board[pos_from];
if (piece->color() != color) return false;
return true;
}
void Chessboard::flagUpdates(char pos_from, char pos_to) {
auto color = Piece::Colors(m_moveCounter % 2);
auto& enemyPieces = color ? m_state->whites : m_state->blacks;
auto& ownPieces = color ? m_state->blacks : m_state->whites;
for (auto& p : enemyPieces) {
if (p.movePattern(pos_to) || p.movePattern(pos_from)) {
updatePins(p);
p.invalidate();
}
}
for (auto& p : ownPieces) {
if (p.movePattern(pos_to) || p.movePattern(pos_from)) {
updatePins(p);
p.invalidate();
}
}
}
void Chessboard::updatePins(Piece& piece) {
if (piece.type() == Piece::Pawn || piece.type() == Piece::Knight || piece.type() == Piece::King) return;
auto& enemyPieces = piece.color() ? m_state->whites : m_state->blacks;
auto& enemyPins = piece.color() ? m_state->whitePins : m_state->blackPins;
auto& king = enemyPieces.k;
auto it = std::find_if(enemyPins.begin(), enemyPins.end(), [&] (const Pin& pin) { return pin.pinner == &piece; });
if (it != enemyPins.end()) {
it->pinned->invalidate();
enemyPins.erase(it);
}
if (piece.movePattern(king.pos())) {
auto to = positions[king.pos()];
auto from = piece.coord();
Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
auto reached = traverse(piece.pos(), d, Find(m_state->board));
auto foundPiece = m_state->board[reached];
if (&king == foundPiece) {
// check
king.invalidate();
}
else if (foundPiece && foundPiece->color() != piece.color()) {
reached = traverse(reached, d, Find(m_state->board));
if (&king == m_state->board[reached]) {
enemyPins.push_back({d, &piece, foundPiece});
foundPiece->invalidate();
}
}
}
}
void Chessboard::detectChecks() {
auto color = Piece::Colors(m_moveCounter % 2);
auto& enemyPieces = color ? m_state->whites : m_state->blacks;
auto& ownPieces = color ? m_state->blacks : m_state->whites;
auto& king = enemyPieces.k;
auto& pawnAttackLeft = color ? SW : NW;
auto& pawnAttackRight = color ? SE : NE;
for (auto& p : ownPieces) {
if (!p.movePattern(king.pos())) continue;
auto to = positions[king.pos()];
auto from = p.coord();
if (p.type() == Piece::Knight) {
if (!m_inCheck) {
m_allowedInCheck = { p.pos() };
}
else {
m_allowedInCheck.clear();
}
m_inCheck = true;
}
else if (p.type() == Piece::Pawn) {
Direction d {char(to[R] - from[R]), char(to[F] - from[F])};
if (d == pawnAttackLeft || d == pawnAttackRight) {
if (!m_inCheck) {
m_allowedInCheck = { p.pos() };
}
else {
m_allowedInCheck.clear();
}
m_inCheck = true;
}
}
else {
Direction d = normalize({char(to[R] - from[R]), char(to[F] - from[F])});
std::set<char> tmp;
auto pos = traverse(p.pos(), d, Add(m_state->board, tmp, king.color()));
if (pos == king.pos()) {
tmp.insert(p.pos());
if (!m_inCheck) {
m_allowedInCheck = std::move(tmp);
}
else {
m_allowedInCheck.clear();
}
m_inCheck = true;
}
}
}
}
bool Chessboard::move(Piece& piece, char pos_to) {
auto& allowed = piece.allowed();
if (allowed.count(pos_to) == 0 || (m_inCheck && piece.type() != Piece::King && m_allowedInCheck.count(pos_to) == 0)) return false;
if (m_state->board[pos_to] && m_state->board[pos_to]->color() == piece.color()) return false;
if (m_state->board[pos_to]) m_state->board[pos_to]->take();
m_state->board[piece.pos()] = nullptr;
m_state->board[pos_to] = &piece;
piece.setPos(pos_to);
m_inCheck = false;
m_allowedInCheck.clear();
return true;
}

View File

@@ -0,0 +1,33 @@
#pragma once
#include <string>
#include <set>
#include <memory>
// just basic validation
// fixme: missing en passant, castling, promotion, etc.
struct State;
class Piece;
class Chessboard {
public:
Chessboard();
~Chessboard();
std::string process(const std::string& command);
std::string stringifyBoard();
const std::string& grammar() { return m_grammar; }
const std::string& prompt() { return m_prompt; }
void setPrompt(const std::string& prompt);
private:
bool parseCommand(const std::string& command, Piece*& piece, char& pos_to);
bool move(Piece& piece, char pos);
void flagUpdates(char pos_from, char pos_to);
void updatePins(Piece& piece);
void detectChecks();
void setGrammar();
std::unique_ptr<State> m_state;
std::set<char> m_allowedInCheck;
bool m_inCheck = false;
int m_moveCounter = 0;
std::string m_grammar;
std::string m_prompt;
};

View File

@@ -0,0 +1,193 @@
#include "WChess.h"
#include "Chessboard.h"
#include "grammar-parser.h"
#include "common.h"
#include <thread>
WChess::WChess(whisper_context * ctx,
const whisper_full_params & wparams,
callbacks cb,
settings s)
: m_ctx(ctx)
, m_wparams(wparams)
, m_cb(cb)
, m_settings(s)
, m_board(new Chessboard())
{}
WChess::~WChess() = default;
void WChess::set_move(const std::string& moves, float prob) const {
if (m_cb.set_move) (*m_cb.set_move)(moves, prob);
}
void WChess::set_grammar(const std::string& grammar) const {
if (m_cb.set_grammar) (*m_cb.set_grammar)(grammar);
}
bool WChess::get_audio(std::vector<float>& pcmf32) const {
if (m_cb.get_audio) return (*m_cb.get_audio)(pcmf32);
return false;
}
std::string WChess::stringify_board() const {
return m_board->stringifyBoard();
}
std::string WChess::get_grammar() const {
return m_board->grammar();
}
void WChess::run() {
bool have_prompt = true;
bool ask_prompt = !have_prompt;
float logprob_min = 0.0f;
float logprob_sum = 0.0f;
int n_tokens = 0;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
const std::string k_prompt = have_prompt ? "" : "rook to d4, f3";
int64_t t_ms = 0;
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
ask_prompt = false;
}
while (get_audio(pcmf32_cur)) {
if (!pcmf32_cur.empty()) {
// fprintf(stdout, "%s: Processing ...\n", __func__);
if (!have_prompt) {
const auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float sim = similarity(txt, k_prompt);
if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
ask_prompt = true;
} else {
fprintf(stdout, "\n");
fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
fprintf(stdout, "\n");
// save the audio for the prompt
pcmf32_prompt = pcmf32_cur;
have_prompt = true;
m_board->setPrompt(k_prompt);
}
} else {
if (!pcmf32_prompt.empty()) pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
constexpr size_t MIN_SIZE = 1.2 * WHISPER_SAMPLE_RATE;
if (MIN_SIZE > pcmf32_cur.size()) pcmf32_cur.insert(pcmf32_cur.begin(), MIN_SIZE - pcmf32_cur.size(), 0.0f);
// fprintf(stdout, "%s: grammar rules:\n'%s'\n", __func__, m_board->grammar().c_str());
auto grammar_parsed = grammar_parser::parse(m_board->grammar().c_str());
auto grammar_rules = grammar_parsed.c_rules();
m_wparams.grammar_rules = grammar_rules.data();
m_wparams.n_grammar_rules = grammar_rules.size();
m_wparams.i_start_rule = grammar_parsed.symbol_ids.at("move");
auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
const float p = 100.0f * std::exp(logprob_min);
fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
const auto prompt = txt.substr(0, n);
const float sim = similarity(prompt, k_prompt);
//fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
if (sim > best_sim) {
best_sim = sim;
best_len = n;
}
}
fprintf(stdout, "%s: DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
if (!command.empty()) {
set_move(m_board->process(command), p);
set_grammar(m_board->grammar());
}
if (m_board->grammar().empty()) {
fprintf(stdout, "%s: No more moves possible\n", __func__);
break;
}
}
}
if (ask_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
fprintf(stdout, "\n");
ask_prompt = false;
}
}
}
std::string WChess::transcribe(
const std::vector<float> & pcmf32,
float & logprob_min,
float & logprob_sum,
int & n_tokens,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
logprob_min = 0.0f;
logprob_sum = 0.0f;
n_tokens = 0;
t_ms = 0;
if (whisper_full(m_ctx, m_wparams, pcmf32.data(), pcmf32.size()) != 0) {
return {};
}
std::string result;
const int n_segments = whisper_full_n_segments(m_ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(m_ctx, i);
result += text;
const int n = whisper_full_n_tokens(m_ctx, i);
for (int j = 0; j < n; ++j) {
const auto token = whisper_full_get_token_data(m_ctx, i, j);
if(token.plog > 0.0f) return {};
logprob_min = std::min(logprob_min, token.plog);
logprob_sum += token.plog;
++n_tokens;
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}

View File

@@ -0,0 +1,63 @@
#pragma once
#include "whisper.h"
#include <string>
#include <vector>
#include <memory>
class Chessboard;
class WChess {
public:
using CheckRunningCb = bool (*)();
using GetAudioCb = bool (*)(std::vector<float> &);
using SetMovesCb = void (*)(const std::string &, float);
using SetGrammarCb = void (*)(const std::string &);
using ClearAudioCb = void (*)();
struct callbacks {
GetAudioCb get_audio = nullptr;
SetMovesCb set_move = nullptr;
SetGrammarCb set_grammar = nullptr;
};
struct settings {
int32_t vad_ms = 2000;
int32_t prompt_ms = 5000;
int32_t command_ms = 4000;
float vad_thold = 0.2f;
float freq_thold = 100.0f;
bool print_energy = false;
};
WChess(
whisper_context * ctx,
const whisper_full_params & wparams,
callbacks cb,
settings s
);
~WChess();
void run();
std::string stringify_board() const;
std::string get_grammar() const;
private:
bool get_audio(std::vector<float>& pcmf32) const;
void set_move(const std::string& moves, float prob) const;
void set_grammar(const std::string& grammar) const;
std::string transcribe(
const std::vector<float> & pcmf32,
float & logprob_min,
float & logprob_sum,
int & n_tokens,
int64_t & t_ms);
whisper_context * m_ctx;
whisper_full_params m_wparams;
const callbacks m_cb;
const settings m_settings;
std::unique_ptr<Chessboard> m_board;
};

View File

@@ -0,0 +1,117 @@
#include "Chessboard.h"
#define ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
fflush(stderr); \
exit(1); \
} \
} while (0)
int main() {
{
Chessboard chess;
ASSERT(chess.process("pawn to d4") == "d2-d4");
ASSERT(chess.process("e5") == "e7-e5");
ASSERT(chess.process("c1 h6") == "c1-h6");
ASSERT(chess.process("queen h4") == "d8-h4");
ASSERT(chess.process("bishop to g5") == "h6-g5");
ASSERT(chess.process("bishop to b4") == "f8-b4");
ASSERT(chess.process("c4") == "");
ASSERT(chess.process("knight c3") == "b1-c3");
ASSERT(chess.process("knight c6") == "b8-c6");
ASSERT(chess.process("f3") == "");
}
{
Chessboard chess;
ASSERT(chess.process("d4") == "d2-d4");
ASSERT(chess.process("e5") == "e7-e5");
ASSERT(chess.process("e4") == "e2-e4");
ASSERT(chess.process("queen h4") == "d8-h4");
ASSERT(chess.process("queen h5") == "d1-h5");
ASSERT(chess.process("f5") == "");
ASSERT(chess.process("g6") == "g7-g6");
ASSERT(chess.process("knight e2") == "g1-e2");
ASSERT(chess.process("f5") == "f7-f5");
ASSERT(chess.process("knight g3") == "e2-g3");
ASSERT(chess.process("g5") == "");
ASSERT(chess.process("king e7") == "e8-e7");
ASSERT(chess.process("f4") == "f2-f4");
ASSERT(chess.process("g5") == "g6-g5");
}
{
Chessboard chess;
ASSERT(chess.process("e4") == "e2-e4");
ASSERT(chess.process("c5") == "c7-c5");
ASSERT(chess.process("e5") == "e4-e5");
ASSERT(chess.process("c4") == "c5-c4");
ASSERT(chess.process("e6") == "e5-e6");
ASSERT(chess.process("c3") == "c4-c3");
ASSERT(chess.process("e7") == "");
ASSERT(chess.process("f7") == "e6-f7");
ASSERT(chess.process("d2") == "");
ASSERT(chess.process("king to f7") == "e8-f7");
ASSERT(chess.process("f4") == "f2-f4");
ASSERT(chess.process("d2") == "c3-d2");
ASSERT(chess.process("f5") == "");
ASSERT(chess.process("king to e2") == "e1-e2");
ASSERT(chess.process("king to g6") == "f7-g6");
ASSERT(chess.process("f5") == "f4-f5");
ASSERT(chess.process("e6") == "");
ASSERT(chess.process("king to h5") == "g6-h5");
ASSERT(chess.process("g4") == "g2-g4");
ASSERT(chess.process("king to g5") == "h5-g5");
ASSERT(chess.process("h4") == "h2-h4");
ASSERT(chess.process("king to h5") == "");
ASSERT(chess.process("king to g6") == "");
ASSERT(chess.process("king to h6") == "g5-h6");
ASSERT(chess.process("bishop to d2") == "c1-d2");
ASSERT(chess.process("king to g5") == "");
ASSERT(chess.process("g5") == "g7-g5");
}
{
Chessboard chess;
ASSERT(chess.process("f4") == "f2-f4");
ASSERT(chess.process("e5") == "e7-e5");
ASSERT(chess.process("g4") == "g2-g4");
ASSERT(chess.process("queen to h4") == "d8-h4#");
ASSERT(chess.process("knight f3") == "");
ASSERT(chess.grammar().empty());
}
{
Chessboard chess;
ASSERT(chess.process("f4") == "f2-f4");
ASSERT(chess.process("e5") == "e7-e5");
ASSERT(chess.process("g4") == "g2-g4");
ASSERT(chess.process("d5") == "d7-d5");
ASSERT(chess.process("g1 f3") == "g1-f3");
ASSERT(chess.process("queen to h4") == "d8-h4");
ASSERT(!chess.grammar().empty());
}
{
Chessboard chess;
ASSERT(chess.process("knight c3") == "b1-c3");
ASSERT(chess.process("knight c6") == "b8-c6");
ASSERT(chess.process("knight b5") == "c3-b5");
ASSERT(chess.process("knight f6") == "g8-f6");
ASSERT(chess.process("knight d6") == "b5-d6");
ASSERT(chess.process("knight d4") == "");
ASSERT(chess.process("d6") == "c7-d6");
ASSERT(chess.process("e4") == "e2-e4");
ASSERT(chess.process("knight d4") == "c6-d4");
ASSERT(chess.process("d3") == "d2-d3");
ASSERT(chess.process("knight e4") == "f6-e4");
ASSERT(chess.process("king to e2") == "");
ASSERT(chess.process("king to d2") == "");
}
}

View File

@@ -0,0 +1,8 @@
if (WHISPER_SDL2)
set(TARGET wchess)
add_executable(${TARGET} wchess.cmd.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE wchess-core common-sdl ${CMAKE_THREAD_LIBS_INIT})
endif ()

View File

@@ -0,0 +1,247 @@
// Command line voice assisted chess
//
// Speak chess move commands to the microphone.
// The moves will translated to chessboard positions.
//
//
#include "WChess.h"
#include "common-sdl.h"
#include <iostream>
#include <memory>
#include <thread>
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
float grammar_penalty = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
std::string commands;
std::string prompt;
std::string context;
std::string grammar;
};
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, "\n");
}
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
std::unique_ptr<WChess> g_wchess;
int g_moveCount = 0;
void set_move(const std::string & move, float) {
if (!move.empty()) {
g_moveCount++;
fprintf(stdout, "Move: %s\n\n", move.c_str());
}
else fprintf(stdout, "Move rejected\n\n");
fprintf(stdout, "%s\n", g_wchess->stringify_board().c_str());
fprintf(stdout, "%s\n", g_moveCount ? "White's turn" : "Black's turn");
}
audio_async g_audio(30*1000);
bool g_listening = false;
std::vector<float> g_pcmf32;
bool read_input() {
std::string input;
while (true) {
fprintf(stdout, "[(l)isten/(p)ause/(q)uit]: ");
std::cin >> input;
fprintf(stdout, "\n");
if (input[0] == 'q') {
fprintf(stdout, "Quitting\n");
return false;
}
if (input[0] == 'l') {
if (!g_listening) {
fprintf(stdout, "Listening\n");
g_listening = true;
g_pcmf32.clear();
g_audio.resume();
g_audio.clear();
}
else fprintf(stdout, "Still listening\n");
return true;
}
else {
if (g_listening) {
g_listening = false;
g_audio.get(0, g_pcmf32);
g_audio.pause();
fprintf(stdout, "Processing\n");
}
else fprintf(stdout, "Not listening\n");
return true;
}
}
return true;
}
bool get_audio(std::vector<float> & pcmf32_cur) {
if (!read_input()) return false;
if (!g_pcmf32.empty()) pcmf32_cur = std::move(g_pcmf32);
else pcmf32_cur.clear();
return true;
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (!ctx) {
fprintf(stderr, "%s: whisper_init_from_file_with_params() failed!\n", __func__);
return 1;
}
// init audio
if (!g_audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
wparams.offset_ms = 0;
wparams.translate = false;
wparams.no_context = true;
wparams.single_segment = true;
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = true;
wparams.print_special = false;
wparams.no_timestamps = true;
wparams.max_tokens = 32;
wparams.audio_ctx = 768; // partial encoder context for better performance
wparams.temperature = 0.0f;
wparams.temperature_inc = 2.0f;
wparams.greedy.best_of = 1;
wparams.beam_search.beam_size = 1;
wparams.language = "en";
wparams.grammar_penalty = 100.0;
wparams.initial_prompt = params.context.data();
WChess::callbacks cb;
cb.get_audio = get_audio;
cb.set_move = set_move;
WChess::settings s;
s.vad_ms = 2000;
s.prompt_ms = params.prompt_ms;
s.command_ms = params.command_ms;
s.vad_thold = params.vad_thold;
s.freq_thold = params.freq_thold;
s.print_energy = params.print_energy;
g_wchess.reset(new WChess(ctx, wparams, cb, s));
set_move("start", 0);
g_wchess->run();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@@ -0,0 +1,51 @@
set(TARGET wchess.wasm)
add_executable(${TARGET}
wchess.wasm.cpp
)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE
common
wchess-core
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside chess.js")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/${TARGET}.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/chess.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")
add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_CURRENT_SOURCE_DIR}/chessboardjs-1.0.0
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/jquery-3.7.1.min.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_SOURCE_DIR}/examples/helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/js/helpers.js @ONLY)

View File

@@ -0,0 +1,54 @@
/*! chessboard.js v1.0.0 | (c) 2019 Chris Oakman | MIT License chessboardjs.com/license */
.clearfix-7da63 {
clear: both;
}
.board-b72b1 {
border: 2px solid #404040;
box-sizing: content-box;
}
.square-55d63 {
float: left;
position: relative;
/* disable any native browser highlighting */
-webkit-touch-callout: none;
-webkit-user-select: none;
-khtml-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
}
.white-1e1d7 {
background-color: #f0d9b5;
color: #b58863;
}
.black-3c85d {
background-color: #b58863;
color: #f0d9b5;
}
.highlight1-32417, .highlight2-9c5d2 {
box-shadow: inset 0 0 3px 3px yellow;
}
.notation-322f9 {
cursor: default;
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
position: absolute;
}
.alpha-d2270 {
bottom: 1px;
right: 3px;
}
.numeric-fc462 {
top: 2px;
left: 2px;
}

View File

@@ -0,0 +1,2 @@
/*! chessboard.js v1.0.0 | (c) 2019 Chris Oakman | MIT License chessboardjs.com/license */
.clearfix-7da63{clear:both}.board-b72b1{border:2px solid #404040;box-sizing:content-box}.square-55d63{float:left;position:relative;-webkit-touch-callout:none;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.white-1e1d7{background-color:#f0d9b5;color:#b58863}.black-3c85d{background-color:#b58863;color:#f0d9b5}.highlight1-32417,.highlight2-9c5d2{box-shadow:inset 0 0 3px 3px #ff0}.notation-322f9{cursor:default;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:14px;position:absolute}.alpha-d2270{bottom:1px;right:3px}.numeric-fc462{top:2px;left:2px}

Some files were not shown because too many files have changed in this diff Show More