This commit is contained in:
Kar
2025-06-17 15:53:01 +05:30
commit 4d20931ecc
411 changed files with 180695 additions and 0 deletions

1
whisper.cpp-1.5.2/models/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.bin

View File

@@ -0,0 +1,102 @@
## Whisper model files in custom ggml format
The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
are converted to custom `ggml` format in order to be able to load them in C/C++.
Conversion is performed using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.
You can either obtain the original models and generate the `ggml` files yourself using the conversion script,
or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the already converted models.
Currently, they are hosted on the following locations:
- https://huggingface.co/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
Sample download:
```text
$ ./download-ggml-model.sh base.en
Downloading ggml model base.en ...
models/ggml-base.en.bin 100%[=============================================>] 141.11M 5.41MB/s in 22s
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
You can now use it like this:
$ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
```
To convert the files yourself, use the convert-pt-to-ggml.py script. Here is an example usage.
The original PyTorch files are assumed to have been downloaded into `~/.cache/whisper`.
Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
```
mkdir models/whisper-medium
python models/convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
mv ./models/whisper-medium/ggml-model.bin models/ggml-medium.bin
rmdir models/whisper-medium
```
A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/ggerganov/whisper.cpp/tree/main
## Available models
| Model | Disk | SHA |
| --- | --- | --- |
| tiny | 75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| tiny.en | 75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
| base | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| base.en | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
| small | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| small.en | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
| medium | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large-v1 | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large-v2 | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large-v3 | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
## Model files for testing purposes
The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for
testing purposes. They are directly included in this repository for convenience and the GitHub Actions CI uses them to
run various sanitizer tests.
## Fine-tuned models
There are community efforts for creating fine-tuned Whisper models using extra training data. For example, this
[blog post](https://huggingface.co/blog/fine-tune-whisper) describes a method for fine-tuning using the Hugging Face (HF)
Transformers implementation of Whisper. The produced models are in a slightly different format compared to the original
OpenAI format. To read the HF models you can use the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script like this:
```bash
git clone https://github.com/openai/whisper
git clone https://github.com/ggerganov/whisper.cpp
# clone HF fine-tuned model (this is just an example)
git clone https://huggingface.co/openai/whisper-medium
# convert the model to ggml
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
```
## Distilled models
Initial support for https://huggingface.co/distil-whisper is available.
Currently, the chunk-based transcription strategy is not implemented, so there can be sub-optimal quality when using the distilled models with `whisper.cpp`.
```bash
# clone OpenAI whisper and whisper.cpp
git clone https://github.com/openai/whisper
git clone https://github.com/ggerganov/whisper.cpp
# get the models
cd whisper.cpp/models
git clone https://huggingface.co/distil-whisper/distil-medium.en
git clone https://huggingface.co/distil-whisper/distil-large-v2
# convert to ggml
python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
mv ggml-model.bin ggml-medium.en-distil.bin
python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
mv ggml-model.bin ggml-large-v2-distil.bin
```

View File

@@ -0,0 +1,117 @@
import argparse
import importlib.util
spec = importlib.util.spec_from_file_location('whisper_to_coreml', 'models/convert-whisper-to-coreml.py')
whisper_to_coreml = importlib.util.module_from_spec(spec)
spec.loader.exec_module(whisper_to_coreml)
from whisper import load_model
from copy import deepcopy
import torch
from transformers import WhisperForConditionalGeneration
from huggingface_hub import metadata_update
# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
# Substring replacements that turn Hugging Face Whisper parameter names into
# the names used by the original OpenAI Whisper implementation.
#
# rename_keys() applies the entries in insertion order, so the order matters:
# the specific patterns (e.g. "final_layer_norm", "encoder.layer_norm.") must
# stay ahead of the generic "layer_norm" catch-all at the end.
WHISPER_MAPPING = {
    "layers": "blocks",
    "fc1": "mlp.0",
    "fc2": "mlp.2",
    "final_layer_norm": "mlp_ln",
    # self-attention
    ".self_attn.q_proj": ".attn.query",
    ".self_attn.k_proj": ".attn.key",
    ".self_attn.v_proj": ".attn.value",
    ".self_attn_layer_norm": ".attn_ln",
    ".self_attn.out_proj": ".attn.out",
    # cross-attention
    ".encoder_attn.q_proj": ".cross_attn.query",
    ".encoder_attn.k_proj": ".cross_attn.key",
    ".encoder_attn.v_proj": ".cross_attn.value",
    ".encoder_attn_layer_norm": ".cross_attn_ln",
    ".encoder_attn.out_proj": ".cross_attn.out",
    # final norms and embeddings
    "decoder.layer_norm.": "decoder.ln.",
    "encoder.layer_norm.": "encoder.ln_post.",
    "embed_tokens": "token_embedding",
    "encoder.embed_positions.weight": "encoder.positional_embedding",
    "decoder.embed_positions.weight": "decoder.positional_embedding",
    # generic fallback; keep last
    "layer_norm": "ln_post",
}
# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
def rename_keys(s_dict):
    """Rename every key of ``s_dict`` in place according to ``WHISPER_MAPPING``.

    Each mapping pattern that occurs in the *original* key is substituted in
    the working name, in mapping order. Every rename is printed as
    ``old -> new``. The same (mutated) dictionary is returned.
    """
    for old_name in list(s_dict.keys()):
        renamed = old_name
        for pattern, replacement in WHISPER_MAPPING.items():
            # The membership test is done on the original key, not on the
            # partially rewritten one.
            if pattern not in old_name:
                continue
            renamed = renamed.replace(pattern, replacement)
        print(f"{old_name} -> {renamed}")
        s_dict[renamed] = s_dict.pop(old_name)
    return s_dict
# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
    """Save a Hugging Face Whisper model as a ``{"dims", "model_state_dict"}`` checkpoint.

    Loads the model with ``WhisperForConditionalGeneration.from_pretrained``,
    derives the ``dims`` dictionary from its config, renames the parameters of
    ``model.model`` with ``rename_keys`` and writes the result to
    ``whisper_state_path`` via ``torch.save``.
    """
    hf_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)
    cfg = hf_model.config

    # (dims field, HF config attribute) pairs, in the order they are stored
    dim_sources = (
        ('n_mels', 'num_mel_bins'),
        ('n_vocab', 'vocab_size'),
        ('n_audio_ctx', 'max_source_positions'),
        ('n_audio_state', 'd_model'),
        ('n_audio_head', 'encoder_attention_heads'),
        ('n_audio_layer', 'encoder_layers'),
        ('n_text_ctx', 'max_target_positions'),
        ('n_text_state', 'd_model'),
        ('n_text_head', 'decoder_attention_heads'),
        ('n_text_layer', 'decoder_layers'),
    )
    dims = {field: getattr(cfg, attr) for field, attr in dim_sources}

    # Work on a copy so the loaded model's own state dict is left untouched.
    renamed_state = rename_keys(deepcopy(hf_model.model.state_dict()))

    torch.save({"dims": dims, "model_state_dict": renamed_state}, whisper_state_path)
# Ported from models/convert-whisper-to-coreml.py
def str2bool(value):
    """Parse a command-line boolean for use as an ``argparse`` ``type=``.

    ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
    value, including ``"False"``, would be parsed as ``True``. This parser
    accepts the usual spellings (case-insensitive) instead.

    Returns:
        ``True`` for 1/true/t/yes/y/on, ``False`` for 0/false/f/no/n/off or
        an empty string. A ``bool`` is returned unchanged.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
    parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
    # str2bool (not bool) so that e.g. "--quantize False" really disables the option
    parser.add_argument("--encoder-only", type=str2bool, help="only convert encoder", default=False)
    parser.add_argument("--quantize", type=str2bool, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=str2bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
        raise ValueError("Invalid model name")

    # Step 1: write the Hugging Face model out as a checkpoint file, then
    # load that file back with whisper's load_model.
    pt_target_path = f"models/hf-{args.model_name}.pt"
    convert_hf_whisper(args.model_path, pt_target_path)

    whisper = load_model(pt_target_path).cpu()
    hparams = whisper.dims
    print(hparams)

    # Step 2: pick the encoder/decoder modules to convert.
    if args.optimize_ane:
        whisperANE = whisper_to_coreml.WhisperANE(hparams).eval()
        whisperANE.load_state_dict(whisper.state_dict())

        encoder = whisperANE.encoder
        decoder = whisperANE.decoder
    else:
        encoder = whisper.encoder
        decoder = whisper.decoder

    # Convert encoder
    encoder = whisper_to_coreml.convert_encoder(hparams, encoder, quantize=args.quantize)
    encoder.save(f"models/coreml-encoder-{args.model_name}.mlpackage")

    if not args.encoder_only:
        # Convert decoder
        decoder = whisper_to_coreml.convert_decoder(hparams, decoder, quantize=args.quantize)
        decoder.save(f"models/coreml-decoder-{args.model_name}.mlpackage")

    print("done converting")

View File

@@ -0,0 +1,208 @@
# Convert Hugging Face fine-tuned models to ggml format
#
# Usage:
#
# git clone https://github.com/openai/whisper
# git clone https://github.com/ggerganov/whisper.cpp
# git clone https://huggingface.co/openai/whisper-medium
#
# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
#
# This script is similar to "convert-pt-to-ggml.py"
#
# For more info:
#
# https://github.com/ggerganov/whisper.cpp/issues/157
#
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
from pathlib import Path
from transformers import WhisperForConditionalGeneration
# Hugging Face -> ggml/OpenAI tensor-name fragments used by the conversion
# loop below. 'encoder_attn.k_proj' is intentionally absent: the loop handles
# that name with a dedicated branch before consulting this table.
conv_map = {
    # per-layer self-attention
    'self_attn.k_proj': 'attn.key',
    'self_attn.q_proj': 'attn.query',
    'self_attn.v_proj': 'attn.value',
    'self_attn.out_proj': 'attn.out',
    'self_attn_layer_norm': 'attn_ln',
    # per-layer cross-attention
    'encoder_attn.q_proj': 'cross_attn.query',
    'encoder_attn.v_proj': 'cross_attn.value',
    'encoder_attn.out_proj': 'cross_attn.out',
    'encoder_attn_layer_norm': 'cross_attn_ln',
    # per-layer MLP
    'fc1': 'mlp.0',
    'fc2': 'mlp.2',
    'final_layer_norm': 'mlp_ln',
    # encoder-level tensors
    'encoder.layer_norm.bias': 'encoder.ln_post.bias',
    'encoder.layer_norm.weight': 'encoder.ln_post.weight',
    'encoder.embed_positions.weight': 'encoder.positional_embedding',
    # decoder-level tensors
    'decoder.layer_norm.bias': 'decoder.ln.bias',
    'decoder.layer_norm.weight': 'decoder.ln.weight',
    'decoder.embed_positions.weight': 'decoder.positional_embedding',
    'decoder.embed_tokens.weight': 'decoder.token_embedding.weight',
    'proj_out.weight': 'decoder.proj.weight',
}
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    The byte values in the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to
    the character with the same code point. Every other byte value (visited in
    ascending order) is assigned the next unused code point starting at 256, so
    no byte is ever represented by a whitespace or control character.

    Returns a dict with 256 entries mapping int byte values to 1-character strings.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    already_mapped = set(kept_as_is)

    byte_values = list(kept_as_is)
    code_points = list(kept_as_is)
    shifted = 0
    for byte in range(256):
        if byte in already_mapped:
            continue
        byte_values.append(byte)
        code_points.append(256 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
# ---------------------------------------------------------------------------
# Script body: read a Hugging Face Whisper checkpoint directory and write a
# single ggml binary containing, in order: magic + hparams, mel filters,
# tokenizer vocabulary, and every model tensor.
# ---------------------------------------------------------------------------
if len(sys.argv) < 4:
    print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
    sys.exit(1)

dir_model = Path(sys.argv[1])
dir_whisper = Path(sys.argv[2])
dir_out = Path(sys.argv[3])

# Only config.json and vocab.json are consumed by the conversion; they are
# read through context managers so the handles are closed deterministically.
with (dir_model / "config.json").open("r", encoding="utf8") as f:
    hparams = json.load(f)

model = WhisperForConditionalGeneration.from_pretrained(dir_model)

n_mels = hparams["num_mel_bins"]
with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
    filters = torch.from_numpy(f[f"mel_{n_mels}"])

dir_tokenizer = dir_model
with (dir_tokenizer / "vocab.json").open("r", encoding="utf8") as f:
    tokens = json.load(f)

# use 16-bit floats unless a 4th command-line argument is present
use_f16 = len(sys.argv) <= 4
fname_out = dir_out / ("ggml-model.bin" if use_f16 else "ggml-model-f32.bin")

# Text context length. "max_length" is kept as the first choice so existing
# conversions are unchanged; configs that do not carry it fall back to
# "max_target_positions" instead of failing with a KeyError.
if "max_length" in hparams:
    n_text_ctx = hparams["max_length"]
else:
    n_text_ctx = hparams["max_target_positions"]

# Tensors that are always stored as float32, even in f16 mode.
always_f32 = (
    "encoder.conv1.bias",
    "encoder.conv2.bias",
    "encoder.positional_embedding",
    "decoder.positional_embedding",
)

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

with open(fname_out, "wb") as fout:
    # --- header: magic followed by the hyper-parameters --------------------
    header = (
        0x67676d6c,  # magic: ggml in hex
        hparams["vocab_size"],
        hparams["max_source_positions"],
        hparams["d_model"],
        hparams["encoder_attention_heads"],
        hparams["encoder_layers"],
        n_text_ctx,
        hparams["d_model"],
        hparams["decoder_attention_heads"],
        hparams["decoder_layers"],
        hparams["num_mel_bins"],
        use_f16,
    )
    for value in header:
        fout.write(struct.pack("i", value))

    # --- mel filters: shape, then row-major float32 values -----------------
    fout.write(struct.pack("i", filters.shape[0]))
    fout.write(struct.pack("i", filters.shape[1]))
    for i in range(filters.shape[0]):
        for j in range(filters.shape[1]):
            fout.write(struct.pack("f", filters[i][j]))

    # --- tokenizer: count, then (length, raw bytes) per token in id order --
    fout.write(struct.pack("i", len(tokens)))

    tokens = sorted(tokens.items(), key=lambda x: x[1])
    for key in tokens:
        # map the byte-level BPE characters back to the raw bytes
        text = bytearray([byte_decoder[c] for c in key[0]])
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    # --- model tensors ------------------------------------------------------
    list_vars = model.state_dict()
    for name in list_vars.keys():
        # this seems to not be used
        # ref: https://github.com/huggingface/transformers/blob/9a5b84a0076a04fe9596da72e8668069d4f09ea0/src/transformers/models/whisper/modeling_whisper.py#L1099-L1106
        if name == "proj_out.weight":
            print('Skipping', name)
            continue

        src = name

        # drop the leading name component before translating the rest
        parts = name.split(".")[1:]

        if parts[1] == "layers":
            parts[1] = "blocks"
            inner = ".".join(parts[3:-1])
            if inner == "encoder_attn.k_proj":
                mapped = "attn.key" if parts[0] == "encoder" else "cross_attn.key"
            else:
                mapped = conv_map[inner]
            name = ".".join(parts[:3] + [mapped] + parts[-1:])
        else:
            name = ".".join(parts)
            name = conv_map[name] if name in conv_map else name

        print(src, ' -> ', name)
        data = list_vars[src].squeeze().numpy()

        # reshape conv bias from [n] to [n, 1]
        if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
            data = data.reshape(data.shape[0], 1)
            print(" Reshaped variable: " , name , " to shape: ", data.shape)

        n_dims = len(data.shape)
        print(name, n_dims, data.shape)

        # Small tensors are kept in f32 until f16 is fully supported in ggml.
        # ftype == 0 -> float32, ftype == 1 -> float16
        #
        # The cast is done once, straight from the source dtype: float32
        # output must not be routed through float16 first, as that would
        # discard precision that the f32 file is supposed to keep.
        if use_f16 and not (n_dims < 2 or name in always_f32):
            data = data.astype(np.float16)
            ftype = 1
        else:
            if use_f16:
                print(" Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

        # header: n_dims, name length, ftype, then the dims in reverse order
        str_ = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str_), ftype))
        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
        fout.write(str_)

        # data
        data.tofile(fout)

print("Done. Output file: " , fname_out)
print("")

View File

@@ -0,0 +1,342 @@
# Convert Whisper transformer model from PyTorch to ggml format
#
# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
#
# You need to clone the original repo in ~/path/to/repo/whisper/
#
# git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
#
# It is used to load various assets needed by the algorithm:
#
# - tokenizer
# - mel filters
#
# Also, you need to have the original models in ~/.cache/whisper/
# See the original repo for more details.
#
# This script loads the specified model and whisper assets and saves them in ggml format.
# The output is a single binary file containing the following information:
#
# - hparams
# - mel filters
# - tokenizer vocab
# - model variables
#
# For each variable, write the following:
#
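# - Number of dimensions (int)
# - Name length (int)
# - Data type (int): 0 = float32, 1 = float16
# - Dimensions (int[n_dims]), written in reverse order
# - Name (char[name_length])
# - Data (float32 or float16 values, according to the data type)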
#
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
import base64
from pathlib import Path
#from transformers import GPTJForCausalLM
#from transformers import GPT2TokenizerFast
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
#LANGUAGES = {
# "en": "english",
# "zh": "chinese",
# "de": "german",
# "es": "spanish",
# "ru": "russian",
# "ko": "korean",
# "fr": "french",
# "ja": "japanese",
# "pt": "portuguese",
# "tr": "turkish",
# "pl": "polish",
# "ca": "catalan",
# "nl": "dutch",
# "ar": "arabic",
# "sv": "swedish",
# "it": "italian",
# "id": "indonesian",
# "hi": "hindi",
# "fi": "finnish",
# "vi": "vietnamese",
# "iw": "hebrew",
# "uk": "ukrainian",
# "el": "greek",
# "ms": "malay",
# "cs": "czech",
# "ro": "romanian",
# "da": "danish",
# "hu": "hungarian",
# "ta": "tamil",
# "no": "norwegian",
# "th": "thai",
# "ur": "urdu",
# "hr": "croatian",
# "bg": "bulgarian",
# "lt": "lithuanian",
# "la": "latin",
# "mi": "maori",
# "ml": "malayalam",
# "cy": "welsh",
# "sk": "slovak",
# "te": "telugu",
# "fa": "persian",
# "lv": "latvian",
# "bn": "bengali",
# "sr": "serbian",
# "az": "azerbaijani",
# "sl": "slovenian",
# "kn": "kannada",
# "et": "estonian",
# "mk": "macedonian",
# "br": "breton",
# "eu": "basque",
# "is": "icelandic",
# "hy": "armenian",
# "ne": "nepali",
# "mn": "mongolian",
# "bs": "bosnian",
# "kk": "kazakh",
# "sq": "albanian",
# "sw": "swahili",
# "gl": "galician",
# "mr": "marathi",
# "pa": "punjabi",
# "si": "sinhala",
# "km": "khmer",
# "sn": "shona",
# "yo": "yoruba",
# "so": "somali",
# "af": "afrikaans",
# "oc": "occitan",
# "ka": "georgian",
# "be": "belarusian",
# "tg": "tajik",
# "sd": "sindhi",
# "gu": "gujarati",
# "am": "amharic",
# "yi": "yiddish",
# "lo": "lao",
# "uz": "uzbek",
# "fo": "faroese",
# "ht": "haitian creole",
# "ps": "pashto",
# "tk": "turkmen",
# "nn": "nynorsk",
# "mt": "maltese",
# "sa": "sanskrit",
# "lb": "luxembourgish",
# "my": "myanmar",
# "bo": "tibetan",
# "tl": "tagalog",
# "mg": "malagasy",
# "as": "assamese",
# "tt": "tatar",
# "haw": "hawaiian",
# "ln": "lingala",
# "ha": "hausa",
# "ba": "bashkir",
# "jw": "javanese",
# "su": "sundanese",
#}
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
# tokenizer = GPT2TokenizerFast.from_pretrained(path)
#
# specials = [
# "<|startoftranscript|>",
# *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
# "<|translate|>",
# "<|transcribe|>",
# "<|startoflm|>",
# "<|startofprev|>",
# "<|nocaptions|>",
# "<|notimestamps|>",
# ]
#
# tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
# return tokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Return a dict mapping each of the 256 byte values to a unicode character.

    Reversible BPE works on unicode strings, so every byte needs a printable
    stand-in. Bytes in "!".."~", "¡".."¬" and "®".."ÿ" stand for themselves;
    all remaining bytes (whitespace/control characters among them) are
    assigned, in ascending byte order, the code points 256, 257, ... so that
    none of them maps to a character the BPE code cannot handle.
    """
    identity = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    identity_lookup = frozenset(identity)

    # bytes without a printable form, in ascending order
    remapped = [value for value in range(2 ** 8) if value not in identity_lookup]

    mapping = {value: chr(value) for value in identity}
    for position, value in enumerate(remapped):
        mapping[value] = chr(2 ** 8 + position)
    return mapping
# ---------------------------------------------------------------------------
# Script body: load a PyTorch Whisper checkpoint plus the whisper repo assets
# and write hparams, mel filters, tokenizer vocab and model tensors to a
# single ggml binary file.
# ---------------------------------------------------------------------------
if len(sys.argv) < 4:
    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
    sys.exit(1)

fname_inp = Path(sys.argv[1])
dir_whisper = Path(sys.argv[2])
dir_out = Path(sys.argv[3])

# try to load PyTorch binary data
try:
    with open(fname_inp, "rb") as f_model:
        model_bytes = f_model.read()
    with io.BytesIO(model_bytes) as fp:
        # NOTE: torch.load unpickles the file; only convert checkpoints
        # obtained from a source you trust.
        checkpoint = torch.load(fp, map_location="cpu")
except Exception as exc:
    print("Error: failed to load PyTorch model file:" , fname_inp)
    # surface the underlying cause instead of discarding it
    print("Reason:", repr(exc))
    sys.exit(1)

hparams = checkpoint["dims"]
print("hparams:", hparams)

list_vars = checkpoint["model_state_dict"]

# load mel filters
n_mels = hparams["n_mels"]
with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
    filters = torch.from_numpy(f[f"mel_{n_mels}"])

# load tokenizer
# for backwards compatibility, also check for older hf_transformers format tokenizer files
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
multilingual = hparams["n_vocab"] >= 51865
assets_dir = dir_whisper / "whisper" / "assets"
tokenizer = assets_dir / ("multilingual.tiktoken" if multilingual else "gpt2.tiktoken")
tokenizer_type = "tiktoken"
if not tokenizer.is_file():
    tokenizer = assets_dir / ("multilingual" if multilingual else "gpt2") / "vocab.json"
    tokenizer_type = "hf_transformers"
    if not tokenizer.is_file():
        print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
        sys.exit(1)

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

if tokenizer_type == "tiktoken":
    # each non-empty line is "<base64 token> <rank>"
    with open(tokenizer, "rb") as f:
        contents = f.read()
    tokens = {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }
elif tokenizer_type == "hf_transformers":
    with open(tokenizer, "r", encoding="utf8") as f:
        _tokens_raw = json.load(f)
    if '<|endoftext|>' in _tokens_raw:
        # ensures exact same model as tokenizer_type == tiktoken
        # details: https://github.com/ggerganov/whisper.cpp/pull/725
        del _tokens_raw['<|endoftext|>']
    tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}

# use 16-bit floats unless a 4th command-line argument is present
use_f16 = len(sys.argv) <= 4
fname_out = dir_out / ("ggml-model.bin" if use_f16 else "ggml-model-f32.bin")

# Tensors that are always stored as float32, even in f16 mode.
always_f32 = (
    "encoder.conv1.bias",
    "encoder.conv2.bias",
    "encoder.positional_embedding",
    "decoder.positional_embedding",
)

with fname_out.open("wb") as fout:
    # header: magic followed by the hyper-parameters
    header = (
        0x67676d6c,  # magic: ggml in hex
        hparams["n_vocab"],
        hparams["n_audio_ctx"],
        hparams["n_audio_state"],
        hparams["n_audio_head"],
        hparams["n_audio_layer"],
        hparams["n_text_ctx"],
        hparams["n_text_state"],
        hparams["n_text_head"],
        hparams["n_text_layer"],
        hparams["n_mels"],
        use_f16,
    )
    for value in header:
        fout.write(struct.pack("i", value))

    # write mel filters
    fout.write(struct.pack("i", filters.shape[0]))
    fout.write(struct.pack("i", filters.shape[1]))
    for i in range(filters.shape[0]):
        for j in range(filters.shape[1]):
            fout.write(struct.pack("f", filters[i][j]))

    # write tokenizer: count, then (length, raw bytes) per token
    fout.write(struct.pack("i", len(tokens)))

    for key in tokens:
        fout.write(struct.pack("i", len(key)))
        fout.write(key)

    # write model variables
    for name in list_vars.keys():
        data = list_vars[name].squeeze().numpy()
        print("Processing variable: " , name , " with shape: ", data.shape)

        # reshape conv bias from [n] to [n, 1]
        if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
            data = data.reshape(data.shape[0], 1)
            print(f" Reshaped variable: {name} to shape: ", data.shape)

        n_dims = len(data.shape)

        # looks like the whisper models are in f16 by default
        # so we need to convert the small tensors to f32 until we fully support f16 in ggml
        # ftype == 0 -> float32, ftype == 1 -> float16
        if use_f16 and not (n_dims < 2 or name in always_f32):
            # Make the bytes match the declared type: a checkpoint stored in
            # another dtype would otherwise be written under the f16 label.
            # For tensors that are already float16 the values are unchanged.
            data = data.astype(np.float16)
            ftype = 1
        else:
            if use_f16:
                print(" Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

        # header: n_dims, name length, ftype, then the dims in reverse order
        str_ = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str_), ftype))
        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
        fout.write(str_)

        # data
        data.tofile(fout)

print("Done. Output file: " , fname_out)
print("")

View File

@@ -0,0 +1,331 @@
import argparse
import torch
import torch.nn.functional as F
import coremltools as ct
from torch import Tensor
from torch import nn
from typing import Dict
from typing import Optional
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
from coremltools.models.neural_network.quantization_utils import quantize_weights
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model
# Use for changing dim of input in encoder and decoder embeddings
def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
                         missing_keys, unexpected_keys, error_msgs):
    """
    Load-state-dict pre-hook: give 2-D attention/MLP weights two trailing
    singleton dimensions so nn.Linear weights fit nn.Conv2d parameters.

    A key is affected when it contains both 'attn' and '.weight', or when it
    ends with 'mlp.0.weight' / 'mlp.2.weight'. ``state_dict`` is modified in
    place; only ``state_dict`` is used, the other hook arguments are ignored.
    """
    mlp_suffixes = ('mlp.0.weight', 'mlp.2.weight')
    for name, tensor in state_dict.items():
        attention_weight = 'attn' in name and '.weight' in name
        if not (attention_weight or name.endswith(mlp_suffixes)):
            continue
        if len(tensor.shape) == 2:
            # (out, in) -> (out, in, 1, 1)
            state_dict[name] = tensor[:, :, None, None]
def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
                                           strict, missing_keys,
                                           unexpected_keys, error_msgs):
    """
    Load-state-dict pre-hook: replace ``<prefix>bias`` with
    ``<prefix>bias / <prefix>weight`` in ``state_dict`` and return the dict.

    Only ``state_dict`` and ``prefix`` are used; the remaining hook arguments
    are ignored.
    """
    bias_key = prefix + 'bias'
    weight_key = prefix + 'weight'
    rescaled_bias = state_dict[bias_key] / state_dict[weight_key]
    state_dict[bias_key] = rescaled_bias
    return state_dict
class LayerNormANE(LayerNormANEBase):
    """LayerNormANEBase that rescales the bias of any state dict it loads.

    The only addition over the base class is a load-state-dict pre-hook,
    ``correct_for_bias_scale_order_inversion``, which divides the incoming
    bias by the incoming weight before the parameters are loaded.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        bias_fixup_hook = correct_for_bias_scale_order_inversion
        self._register_load_state_dict_pre_hook(bias_fixup_hook)
class MultiHeadAttentionANE(MultiHeadAttention):
    """Multi-head attention whose projections are 1x1 ``nn.Conv2d`` layers.

    After the parent constructor runs, ``query``, ``key``, ``value`` and
    ``out`` are replaced by ``nn.Conv2d(n_state, n_state, kernel_size=1)``
    modules; the ``key`` projection is created without a bias.
    """

    def __init__(self, n_state: int, n_head: int):
        super().__init__(n_state, n_head)
        self.query = nn.Conv2d(n_state, n_state, kernel_size=1)
        self.key = nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)
        self.value = nn.Conv2d(n_state, n_state, kernel_size=1)
        self.out = nn.Conv2d(n_state, n_state, kernel_size=1)

    def forward(self,
                x: Tensor,
                xa: Optional[Tensor] = None,
                mask: Optional[Tensor] = None,
                kv_cache: Optional[dict] = None):
        """Attend from ``x`` to itself (``xa is None``) or to ``xa``.

        Returns a tuple ``(self.out(wv), qk)`` where ``wv`` and ``qk`` come
        from ``qkv_attention_ane``.
        """
        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)

        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            # The cache is keyed by the projection modules themselves.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv, qk = self.qkv_attention_ane(q, k, v, mask)

        return self.out(wv), qk

    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
        """Compute per-head attention on 4-D tensors and concatenate the heads.

        ``q`` must be 4-D; its dim 1 is split into ``n_head`` chunks and its
        dim 3 is used as the sequence length when slicing ``mask``.
        Returns ``(attn, qk)``: the concatenated attention output and the
        concatenated (pre-softmax, mask-added) scores, detached and cast to
        float.
        """

        _, dim, _, seqlen = q.size()

        dim_per_head = dim // self.n_head

        # scale the queries by 1/sqrt(dim_per_head) before the dot product
        scale = float(dim_per_head)**-0.5

        q = q * scale

        # one chunk per head; k is transposed so its channel axis is last
        mh_q = q.split(dim_per_head, dim=1)
        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
        mh_v = v.split(dim_per_head, dim=1)

        mh_qk = [
            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
            for qi, ki in zip(mh_q, mh_k)
        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads

        if mask is not None:
            for head_idx in range(self.n_head):
                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]

        # softmax over dim 1, i.e. the 'k' axis of the einsum above
        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)

        return attn, torch.cat(mh_qk, dim=1).float().detach()
class ResidualAttentionBlockANE(ResidualAttentionBlock):
    """ResidualAttentionBlock rebuilt from the ANE variants of its sub-modules.

    The constructor replaces the attention modules with
    ``MultiHeadAttentionANE``, the layer norms with ``LayerNormANE`` and the
    MLP with two 1x1 ``nn.Conv2d`` layers around a GELU. No ``forward`` is
    defined here, so the parent's is used.
    """

    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__(n_state, n_head, cross_attention)
        self.attn = MultiHeadAttentionANE(n_state, n_head)
        self.attn_ln = LayerNormANE(n_state)
        # cross-attention modules exist only when requested
        self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
        self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None

        # hidden width of the MLP is four times the model width
        n_mlp = n_state * 4
        self.mlp = nn.Sequential(
            nn.Conv2d(n_state, n_mlp, kernel_size=1),
            nn.GELU(),
            nn.Conv2d(n_mlp, n_state, kernel_size=1)
        )
        self.mlp_ln = LayerNormANE(n_state)
class AudioEncoderANE(AudioEncoder):
    """AudioEncoder whose blocks and final layer norm are the ANE variants.

    ``conv1``, ``conv2`` and ``positional_embedding`` are used in ``forward``
    but not defined here; they are presumably created by the parent
    constructor (verify against ``whisper.model.AudioEncoder``).
    """

    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)

        self.blocks = nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNormANE(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio

        Returns the output of ``ln_post`` without any final transpose (see
        the TODO at the end of the method).
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))

        # the conv output (minus the batch dim) must match the transposed
        # positional embedding
        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"

        # Add positional embedding and add dummy dim for ANE
        x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2)

        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)

        # """
        # TODO:
        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
        # However, even doing this, the results are still wrong. Kind of less wrong compared to
        # not transposing, but still wrong.
        # Also, I don't know why the original OpenAI implementation does not need to transpose
        # transpose to (batch_size, n_ctx, n_state)
        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
        # """
        # x = x.transpose(1,3)

        return x
class TextDecoderANE(TextDecoder):
    """TextDecoder whose blocks and final layer norm are the ANE variants.

    ``token_embedding``, ``positional_embedding`` and ``mask`` are used in
    ``forward`` but not defined here; they are presumably created by the
    parent constructor (verify against ``whisper.model.TextDecoder``).
    """

    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)

        self.blocks= nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        )
        self.ln= LayerNormANE(n_state)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on

        Returns the logits tensor computed against the token embedding matrix.
        """
        # number of positions already decoded: dim 3 of the first cached tensor
        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        x = x.to(xa.dtype)

        # Reformat for ANE
        mask = self.mask[None, None, :, :].permute(0,3,1,2)
        x = x.transpose(1,2).unsqueeze(2)

        for block in self.blocks:
            x = block(x, xa, mask=mask, kv_cache=kv_cache)

        x = self.ln(x)

        # Reformat back from ANE
        # NOTE(review): squeeze(0) only removes dim 0 when its size is 1 - confirm the
        # expected batch size / singleton axis here.
        x = x.permute(0,2,3,1).squeeze(0)

        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
        # NOTE(review): torch.cat below joins the per-chunk results along dim 0 before the
        # view; this looks like it assumes equally sized chunks and a single token/batch
        # entry - TODO confirm for vocab sizes not divisible by the chunk count.
        if self.token_embedding.weight.shape[0] >= 51865:
            # split in 11 chunks - 4715 each
            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
        else:
            # split in 12 chunks - 4322 each
            assert(self.token_embedding.weight.shape[0] == 51864)
            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//12, dim=0)
            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)

        return logits
class WhisperANE(Whisper):
    """Whisper model whose encoder and decoder are replaced by the ANE variants."""

    def __init__(self, dims: ModelDimensions):
        super().__init__(dims)

        d = self.dims
        self.encoder = AudioEncoderANE(
            d.n_mels, d.n_audio_ctx, d.n_audio_state, d.n_audio_head, d.n_audio_layer
        )
        self.decoder = TextDecoderANE(
            d.n_vocab, d.n_text_ctx, d.n_text_state, d.n_text_head, d.n_text_layer
        )

        # linear_to_conv2d_map runs before every load_state_dict call on this module.
        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Encode ``mel`` and run the decoder on ``tokens`` with the encoded audio."""
        audio_features = self.encoder(mel)
        return self.decoder(tokens, audio_features)

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """Attach forward hooks that cache the key/value projection outputs.

        Hooks are registered on the ``key`` and ``value`` sub-modules of every
        ``MultiHeadAttentionANE`` layer in the decoder.

        Returns
        -------
        cache : dict
            a shallow copy of the given cache (or a new dict) that the hooks fill,
            keyed by the hooked module
        hooks : list
            the handles returned by ``register_forward_hook``
        """
        cache = {} if cache is None else dict(cache)
        hooks = []

        def save_to_cache(module, _, output):
            first_call = module not in cache
            if first_call or output.shape[3] > self.decoder.positional_embedding.shape[0]:
                # save as-is, for the first token or cross attention
                cache[module] = output
            else:
                # append the new positions along axis 3
                cache[module] = torch.cat([cache[module], output], dim=3).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            if not isinstance(layer, MultiHeadAttentionANE):
                return
            for projection in (layer.key, layer.value):
                hooks.append(projection.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks
def convert_encoder(hparams, model, quantize=False):
    """Trace the encoder on a random mel input and convert it with coremltools.

    The example input has shape (1, hparams.n_mels, 3000). When ``quantize`` is
    truthy the converted model is passed through ``quantize_weights`` with
    ``nbits=16`` before being returned.
    """
    model.eval()

    mel_shape = (1, hparams.n_mels, 3000)
    example_mel = torch.randn(mel_shape)
    traced_model = torch.jit.trace(model, example_mel)

    # convert will fail if weights are quantized, not sure why
    target_format = None if quantize else "mlprogram"
    converted = ct.convert(
        traced_model,
        convert_to=target_format,
        inputs=[ct.TensorType(name="logmel_data", shape=mel_shape)],
        outputs=[ct.TensorType(name="output")],
        compute_units=ct.ComputeUnit.ALL,
    )

    if not quantize:
        return converted
    return quantize_weights(converted, nbits=16)
def convert_decoder(hparams, model, quantize=False):
    """Trace the decoder on random example inputs and convert it with coremltools.

    The example inputs are one random token of shape (1, 1) and random audio
    features of shape (1, hparams.n_audio_state, 1, 1500). When ``quantize`` is
    truthy the converted model is passed through ``quantize_weights`` with
    ``nbits=16`` before being returned.
    """
    model.eval()

    tokens_shape = (1, 1)
    audio_shape = (1, hparams.n_audio_state, 1, 1500)

    # The audio tensor is drawn before the token tensor, as in the original order.
    audio_data = torch.randn(audio_shape)
    token_data = torch.randint(50257, tokens_shape).long()
    example_inputs = (token_data, audio_data)
    traced_model = torch.jit.trace(model, example_inputs)

    input_specs = [
        ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
        ct.TensorType(name="audio_data", shape=audio_shape),
    ]
    # convert will fail if weights are quantized, not sure why
    target_format = None if quantize else "mlprogram"
    converted = ct.convert(traced_model, convert_to=target_format, inputs=input_specs)

    return quantize_weights(converted, nbits=16) if quantize else converted
def parse_bool_arg(value):
    """Convert a command-line flag value such as "True" or "False" to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True. This parser understands the
    usual spellings instead; the empty string is treated as False, matching ``bool("")``.

    Raises
    ------
    argparse.ArgumentTypeError
        if the value is not a recognised boolean spelling
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
    # Each flag may be given bare ("--quantize") or with an explicit value ("--quantize True").
    parser.add_argument("--encoder-only", type=parse_bool_arg, nargs="?", const=True, help="only convert encoder", default=False)
    parser.add_argument("--quantize", type=parse_bool_arg, nargs="?", const=True, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=parse_bool_arg, nargs="?", const=True, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
    hparams = whisper.dims
    print(hparams)

    if args.optimize_ane:
        # Build the ANE variant and copy the weights of the loaded model into it.
        whisperANE = WhisperANE(hparams).eval()
        whisperANE.load_state_dict(whisper.state_dict())

        encoder = whisperANE.encoder
        decoder = whisperANE.decoder
    else:
        encoder = whisper.encoder
        decoder = whisper.decoder

    # Convert encoder
    encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
    encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")

    if not args.encoder_only:
        # Convert decoder
        decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
        decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")

    print("done converting")

View File

@@ -0,0 +1,53 @@
import argparse
import torch
from whisper import load_model
import os
from openvino.tools import mo
from openvino.runtime import serialize
import shutil
def convert_encoder(hparams, encoder, mname):
    """Export the encoder to ONNX, convert it to OpenVINO IR and serialize it.

    The IR is written next to this script as ``ggml-<mname>-encoder-openvino.xml``.
    The intermediate ONNX files live in an ``onnx_encoder`` folder next to this
    script, which is removed at the end.
    """
    encoder.eval()

    script_dir = os.path.dirname(__file__)
    dummy_mel = torch.zeros((1, hparams.n_mels, 3000))

    # create a directory to store the onnx model, and other collateral that is saved
    # during onnx export procedure
    onnx_folder = os.path.join(script_dir, "onnx_encoder")
    if not os.path.isdir(onnx_folder):
        os.makedirs(onnx_folder)
    onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx")

    torch.onnx.export(
        encoder,
        dummy_mel,
        onnx_path,
        input_names=["mel"],
        output_names=["output_features"],
    )

    # use model optimizer to convert onnx to OpenVINO IR format
    ir_model = mo.convert_model(onnx_path, compress_to_fp16=True)
    xml_name = "".join(("ggml-", mname, "-encoder-openvino.xml"))
    serialize(ir_model, xml_path=os.path.join(script_dir, xml_name))

    # cleanup
    if os.path.isdir(onnx_folder):
        shutil.rmtree(onnx_folder)
if __name__ == "__main__":
    # Command-line entry point: load the requested Whisper model on the CPU and
    # convert its encoder.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)",
        required=True,
    )
    args = parser.parse_args()

    supported_models = [
        "tiny", "tiny.en",
        "base", "base.en",
        "small", "small.en",
        "medium", "medium.en",
        "large-v1", "large-v2", "large-v3",
    ]
    if args.model not in supported_models:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
    hparams = whisper.dims
    encoder = whisper.encoder

    # Convert encoder to onnx
    convert_encoder(hparams, encoder, args.model)

View File

@@ -0,0 +1,82 @@
#!/bin/bash
# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.
# Base URL of the Hugging Face dataset repository hosting the Core ML models.
src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
# Path prefix appended to $src; a model is fetched from "$src/$pfx-<model>.mlmodel".
pfx="resolve/main/ggml"
# get the path of this script
# Print the directory that contains this script.
# Uses realpath when it is available, otherwise falls back to cd + pwd -P.
# All expansions are quoted so that paths containing spaces are handled.
function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
        echo "$(dirname "$(realpath "$0")")"
    else
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
    fi
}
# Directory of this script; the script changes into it before downloading.
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models
# Print every entry of the global `models` array on one line, framed by blank lines.
function list_models {
    printf '\n'
    printf '%s' " Available models:"
    for model in "${models[@]}"; do
        printf ' %s' "$model"
    done
    printf '\n\n'
}
# exactly one argument (the model name) is required
if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
fi

model=$1

# reject anything that is not in the list of known models
# (the argument is passed via %s so it is never interpreted as a printf format)
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
fi

# download Core ML model

printf "Downloading Core ML model %s from '%s' ...\n" "$model" "$src"

# abort if the models directory cannot be entered, rather than downloading elsewhere
cd "$models_path" || exit 1

if [ -f "ggml-$model.mlmodel" ]; then
    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
fi

if [ -x "$(command -v wget)" ]; then
    wget --quiet --show-progress -O "ggml-$model.mlmodel" "$src/$pfx-$model.mlmodel"
    status=$?
elif [ -x "$(command -v curl)" ]; then
    # --fail makes curl return an error on HTTP failures instead of saving the error page
    curl -L --fail --output "ggml-$model.mlmodel" "$src/$pfx-$model.mlmodel"
    status=$?
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ "$status" -ne 0 ]; then
    # remove the partial file so that the next run does not skip the download
    rm -f "ggml-$model.mlmodel"
    printf "Failed to download Core ML model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '%s' saved in 'models/ggml-%s.mlmodel'\n" "$model" "$model"
printf "Run the following command to compile it:\n\n"
printf " $ xcrun coremlc compile ./models/ggml-%s.mlmodel ./models\n\n" "$model"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-%s.bin -f samples/jfk.wav\n" "$model"
printf "\n"

View File

@@ -0,0 +1,64 @@
@echo off
rem Entry point: validate the single model argument and dispatch to the download subroutine.

rem models_path is the directory of this script, root_path is its parent directory
pushd %~dp0
set models_path=%CD%
for %%d in (%~dp0..) do set root_path=%%~fd
popd

rem count the command-line arguments
set argc=0
for %%x in (%*) do set /A argc+=1

rem names of the models that can be downloaded
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3

rem exactly one argument is expected, otherwise print the usage and the model list
if %argc% neq 1 (
echo.
echo Usage: download-ggml-model.cmd model
CALL :list_models
goto :eof
)

set model=%1

rem download the model when the argument matches one of the known names
for %%b in (%models%) do (
if "%%b"=="%model%" (
CALL :download_model
goto :eof
)
)

rem no name matched
echo Invalid model: %model%
CALL :list_models
goto :eof
:download_model
rem Subroutine: download the ggml file of the selected model into models_path.
rem Nothing is downloaded when the file already exists.
echo Downloading ggml model %model%...
cd "%models_path%"
if exist "ggml-%model%.bin" (
echo Model %model% already exists. Skipping download.
goto :eof
)
rem the transfer is done by the PowerShell Start-BitsTransfer cmdlet
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Start-BitsTransfer -Source https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -Destination ggml-%model%.bin"
rem a non-zero ERRORLEVEL from PowerShell is reported as a failed download
if %ERRORLEVEL% neq 0 (
echo Failed to download ggml model %model%
echo Please try again later or download the original Whisper model files and convert them yourself.
goto :eof
)
echo Done! Model %model% saved in %root_path%\models\ggml-%model%.bin
echo You can now use it like this:
echo main.exe -m %root_path%\models\ggml-%model%.bin -f %root_path%\samples\jfk.wav
goto :eof
:list_models
rem Subroutine: print every name in the models variable, one per line, framed by blank lines.
echo.
echo Available models:
(for %%a in (%models%) do (
echo %%a
))
echo.
exit /b

View File

@@ -0,0 +1,111 @@
#!/bin/bash
# This script downloads Whisper model files that have already been converted to ggml format.
# This way you don't have to convert them yourself.
# Alternative download location (disabled):
#src="https://ggml.ggerganov.com"
#pfx="ggml-model-whisper"
# Base URL and path prefix in use; a model is fetched from "$src/$pfx-<model>.bin".
src="https://huggingface.co/ggerganov/whisper.cpp"
pfx="resolve/main/ggml"
# get the path of this script
# Print the directory that contains this script.
# Falls back to cd + pwd -P when realpath is not available.
function get_script_path() {
    if [ ! -x "$(command -v realpath)" ]; then
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
    else
        echo "$(dirname "$(realpath "$0")")"
    fi
}
# Directory of this script; the script changes into it before downloading.
models_path="$(get_script_path)"

# Whisper models, one row per model family (order is the order used when listing)
models=(
    "tiny.en" "tiny" "tiny-q5_1" "tiny.en-q5_1"
    "base.en" "base" "base-q5_1" "base.en-q5_1"
    "small.en" "small.en-tdrz" "small" "small-q5_1" "small.en-q5_1"
    "medium" "medium.en" "medium-q5_0" "medium.en-q5_0"
    "large-v1" "large-v2" "large-v3" "large-q5_0"
)
# list available models
# Print every entry of the global `models` array on one line, framed by blank lines.
function list_models {
    printf '\n'
    printf '%s' " Available models:"
    for model in "${models[@]}"; do
        printf ' %s' "$model"
    done
    printf '\n\n'
}
# exactly one argument (the model name) is required
if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
fi

model=$1

# reject anything that is not in the list of known models
# (the argument is passed via %s so it is never interpreted as a printf format)
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
fi

# check if model contains `tdrz` and update the src and pfx accordingly
if [[ $model == *"tdrz"* ]]; then
    src="https://huggingface.co/akashmjn/tinydiarize-whisper.cpp"
    pfx="resolve/main/ggml"
fi

# download ggml model

printf "Downloading ggml model %s from '%s' ...\n" "$model" "$src"

# abort if the models directory cannot be entered, rather than downloading elsewhere
cd "$models_path" || exit 1

if [ -f "ggml-$model.bin" ]; then
    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
fi

if [ -x "$(command -v wget)" ]; then
    wget --no-config --quiet --show-progress -O "ggml-$model.bin" "$src/$pfx-$model.bin"
    status=$?
elif [ -x "$(command -v curl)" ]; then
    # --fail makes curl return an error on HTTP failures instead of saving the error page
    curl -L --fail --output "ggml-$model.bin" "$src/$pfx-$model.bin"
    status=$?
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ "$status" -ne 0 ]; then
    # remove the partial file so that the next run does not skip the download
    rm -f "ggml-$model.bin"
    printf "Failed to download ggml model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '%s' saved in 'models/ggml-%s.bin'\n" "$model" "$model"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-%s.bin -f samples/jfk.wav\n" "$model"
printf "\n"

View File

@@ -0,0 +1,29 @@
#!/bin/bash
#
# This generates:
# - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
# - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
#

wd=$(dirname "$0")
# run from the parent directory of this script; abort instead of moving and
# deleting files relative to the wrong directory
cd "$wd/../" || exit 1

python3 models/convert-whisper-to-coreml.py --model tiny.en

# The same steps are applied to the encoder and then to the decoder: rename the
# converted package, generate the Objective-C interface, rename the generated
# whisper_<part>_impl files to their hyphenated names and fix the references
# to the old names inside them.
for part in encoder decoder; do
    mv -v "models/coreml-${part}-tiny.en.mlpackage" "models/whisper-${part}-impl.mlpackage"
    xcrun coremlc generate "models/whisper-${part}-impl.mlpackage" coreml/
    mv "coreml/whisper_${part}_impl.h" "coreml/whisper-${part}-impl.h"
    mv "coreml/whisper_${part}_impl.m" "coreml/whisper-${part}-impl.m"
    sed -i '' "s/whisper_${part}_impl\.h/whisper-${part}-impl.h/g" "coreml/whisper-${part}-impl.m"
    sed -i '' "s/whisper_${part}_impl\.m/whisper-${part}-impl.m/g" "coreml/whisper-${part}-impl.m"
    sed -i '' "s/whisper_${part}_impl\.h/whisper-${part}-impl.h/g" "coreml/whisper-${part}-impl.h"
done

# the renamed packages are only needed for the generation step
rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage

View File

@@ -0,0 +1,36 @@
#!/bin/bash

# Usage: ./generate-coreml-model.sh <model-name>
#
# Converts the encoder of the given model to Core ML and compiles it into
# models/ggml-<model-name>-encoder.mlmodelc.
if [ $# -eq 0 ]; then
    echo "No model name supplied"
    echo "Usage for Whisper models: ./generate-coreml-model.sh <model-name>"
    echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
    exit 1
elif [[ "$1" == "-h5" && $# != 3 ]]; then
    echo "No model name and model path supplied for a HuggingFace model"
    echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
    exit 1
fi

mname="$1"

wd=$(dirname "$0")
# run from the parent directory of this script; abort if it cannot be entered
cd "$wd/../" || exit 1

# model name and path are quoted so that values containing spaces stay one argument
if [[ $mname == "-h5" ]]; then
    mname="$2"
    mpath="$3"
    echo "$mpath"
    python3 models/convert-h5-to-coreml.py --model-name "$mname" --model-path "$mpath" --encoder-only True
else
    python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True
fi

# compile the package and move the result to its final name, replacing an older one
xcrun coremlc compile "models/coreml-encoder-${mname}.mlpackage" models/
rm -rf "models/ggml-${mname}-encoder.mlmodelc"
mv -v "models/coreml-encoder-${mname}.mlmodelc" "models/ggml-${mname}-encoder.mlmodelc"

# TODO: decoder (sometime in the future maybe)
#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
#rm -rf models/ggml-${mname}-decoder.mlmodelc
#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc

View File

@@ -0,0 +1,109 @@
import struct
import torch
import numpy as np
from collections import OrderedDict
from pathlib import Path
import sys
# Convert a ggml Whisper model file back into a PyTorch state dict.
# The result is written to <dir-output>/torch-model.pt.
if len(sys.argv) < 3:
    print(
        "Usage: convert-ggml-to-pt.py model.bin dir-output\n")
    sys.exit(1)

fname_inp = Path(sys.argv[1])
dir_out = Path(sys.argv[2])
fname_out = dir_out / "torch-model.pt"

# Open the ggml file
with open(fname_inp, "rb") as f:
    # Read magic number and hyperparameters (12 native ints, 48 bytes)
    magic_number, n_vocab, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer, n_text_ctx, n_text_state, n_text_head, n_text_layer, n_mels, use_f16 = struct.unpack("12i", f.read(48))
    print(f"Magic number: {magic_number}")
    print(f"Vocab size: {n_vocab}")
    print(f"Audio context size: {n_audio_ctx}")
    print(f"Audio state size: {n_audio_state}")
    print(f"Audio head size: {n_audio_head}")
    print(f"Audio layer size: {n_audio_layer}")
    print(f"Text context size: {n_text_ctx}")
    print(f"Text state size: {n_text_state}")
    print(f"Text head size: {n_text_head}")
    print(f"Text layer size: {n_text_layer}")
    print(f"Mel size: {n_mels}")

    # Read mel filters: two ints for the shape, then the values as native floats
    filters_shape_0 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 0: {filters_shape_0}")
    filters_shape_1 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 1: {filters_shape_1}")

    # The values are stored row by row, so one read and a reshape replaces the
    # per-element loop. They are not used below but must be consumed so that
    # the file position reaches the token table.
    n_filter_values = filters_shape_0 * filters_shape_1
    mel_filters = np.frombuffer(f.read(4 * n_filter_values), dtype=np.float32).reshape(
        filters_shape_0, filters_shape_1
    )

    # Read tokenizer tokens: a count, then (length, bytes) for every token.
    # The tokens are not used below either; reading them advances the file position.
    bytes_data = f.read(4)
    num_tokens = struct.unpack("i", bytes_data)[0]
    tokens = {}

    for _ in range(num_tokens):
        token_len = struct.unpack("i", f.read(4))[0]
        token = f.read(token_len)
        tokens[token] = {}

    # Read model variables
    model_state_dict = OrderedDict()
    while True:
        try:
            n_dims, name_length, ftype = struct.unpack("iii", f.read(12))
        except struct.error:
            break  # End of file

        # dimensions are stored in the reverse of the order used for the array shape
        dims = [struct.unpack("i", f.read(4))[0] for _ in range(n_dims)]
        dims = dims[::-1]
        name = f.read(name_length).decode("utf-8")
        if ftype == 1:  # f16
            data = np.fromfile(f, dtype=np.float16, count=np.prod(dims)).reshape(dims)
        else:  # f32
            data = np.fromfile(f, dtype=np.float32, count=np.prod(dims)).reshape(dims)

        # the conv biases are stored with an extra trailing axis; keep the first column
        if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
            data = data[:, 0]

        model_state_dict[name] = torch.from_numpy(data)

# Now you have the model's state_dict stored in model_state_dict
# You can load this state_dict into a model with the same architecture
from whisper import Whisper, ModelDimensions

dims = ModelDimensions(
    n_mels=n_mels,
    n_audio_ctx=n_audio_ctx,
    n_audio_state=n_audio_state,
    n_audio_head=n_audio_head,
    n_audio_layer=n_audio_layer,
    n_text_ctx=n_text_ctx,
    n_text_state=n_text_state,
    n_text_head=n_text_head,
    n_text_layer=n_text_layer,
    n_vocab=n_vocab,
)
model = Whisper(dims)  # Replace with your model's class
model.load_state_dict(model_state_dict)

# Save the model in PyTorch format; create the output directory when it is missing
dir_out.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), fname_out)

View File

@@ -0,0 +1,2 @@
openvino-dev[pytorch,onnx]
openai-whisper