From 1ccc15d3a09388ff2ebaf0849c2eabb7387f865c Mon Sep 17 00:00:00 2001
From: FFengIll
Date: Thu, 14 Sep 2023 15:29:39 +0800
Subject: [PATCH 1/2] feat: get the vocab from the tokenizer instead of
 vocab.txt

---
 models/convert-to-ggml.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/models/convert-to-ggml.py b/models/convert-to-ggml.py
index 7ef5b80..bbb5ddc 100644
--- a/models/convert-to-ggml.py
+++ b/models/convert-to-ggml.py
@@ -22,8 +22,6 @@
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)
 
-with open(dir_model + "/vocab.txt", "r", encoding="utf-8") as f:
-    vocab = f.readlines()
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -63,10 +61,20 @@
 fout.write(struct.pack("i", hparams["num_hidden_layers"]))
 fout.write(struct.pack("i", ftype))
 
-for i in range(hparams["vocab_size"]):
-    text = vocab[i][:-1] # strips newline at the end
-    #print(f"{i}:{text}")
-    data = bytes(text, 'utf-8')
+vocab_list = []
+
+# print(tokenizer.get_vocab())
+vocab = tokenizer.get_vocab()
+if not isinstance(vocab, dict):
+    raise TypeError
+items = list(vocab.items())
+items.sort(key=lambda x: x[1])
+vocab_list = [i[0] for i in items]
+
+for idx, k in enumerate(vocab_list):
+    text = k
+    # print(f"{i}:{text}")
+    data = bytes(text, "utf-8")
     fout.write(struct.pack("i", len(data)))
     fout.write(data)

From 4aa1a17c06c63919bfab17f33a4cbe6aea52951c Mon Sep 17 00:00:00 2001
From: FFengIll
Date: Tue, 19 Sep 2023 10:31:23 +0800
Subject: [PATCH 2/2] feat: use `vocab_size` to loop over the vocab and avoid
 errors

---
 models/convert-to-ggml.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/models/convert-to-ggml.py b/models/convert-to-ggml.py
index bbb5ddc..f980064 100644
--- a/models/convert-to-ggml.py
+++ b/models/convert-to-ggml.py
@@ -1,9 +1,9 @@
-import sys
-import struct
 import json
-import torch
-import numpy as np
+import struct
+import sys
 
+import numpy as np
+import torch
 from transformers import AutoModel, AutoTokenizer
 
 if len(sys.argv) < 3:
@@ -40,9 +40,9 @@
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
 
-print (model)
+print(model)
 
-print(tokenizer.encode('I believe the meaning of life is'))
+print(tokenizer.encode("I believe the meaning of life is"))
 
 list_vars = model.state_dict()
 for name in list_vars.keys():
@@ -52,7 +52,7 @@
 
 print(hparams)
 
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
 fout.write(struct.pack("i", hparams["vocab_size"]))
 fout.write(struct.pack("i", hparams["max_position_embeddings"]))
 fout.write(struct.pack("i", hparams["hidden_size"]))
@@ -67,12 +67,13 @@
 vocab = tokenizer.get_vocab()
 if not isinstance(vocab, dict):
     raise TypeError
-items = list(vocab.items())
-items.sort(key=lambda x: x[1])
-vocab_list = [i[0] for i in items]
 
-for idx, k in enumerate(vocab_list):
-    text = k
+# id:key
+reversed_vocab = {idx: key for key, idx in vocab.items()}
+
+# use vocab_size to confirm size
+for idx in range(hparams["vocab_size"]):
+    text = reversed_vocab[idx]
     # print(f"{i}:{text}")
     data = bytes(text, "utf-8")
     fout.write(struct.pack("i", len(data)))
@@ -80,26 +81,26 @@
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
-    if name in ['embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias']:
+    if name in ["embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"]:
         continue
print("Processing variable: " + name + " with shape: ", data.shape) - n_dims = len(data.shape); + n_dims = len(data.shape) # ftype == 0 -> float32, ftype == 1 -> float16 if ftype == 1 and name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - l_type = 1 + print(" Converting to float16") + data = data.astype(np.float16) + l_type = 1 else: l_type = 0 # header - str = name.encode('utf-8') + str = name.encode("utf-8") fout.write(struct.pack("iii", n_dims, len(str), l_type)) for i in range(n_dims): fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); + fout.write(str) # data data.tofile(fout)