 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-
-
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
 
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
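
Note on the deleted helper: bytes_to_unicode() builds the GPT-2 style reversible mapping between all 256 byte values and printable unicode characters, which the old conversion loop below needed in order to turn vocab strings back into raw bytes. A minimal standalone sketch of that idea (not part of the converter; the function name and the asserts are illustrative only):

def bytes_to_unicode_sketch() -> dict[int, str]:
    # printable byte ranges are kept as-is; every other byte is shifted up past 255
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

mapping = bytes_to_unicode_sketch()
assert len(mapping) == 256 and len(set(mapping.values())) == 256  # a true bijection, so it is reversible
assert mapping[ord(" ")] == "Ġ"  # e.g. a space is stored as 'Ġ' in GPT-2 style vocab files
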
@@ -131,6 +108,8 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
 
 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@@ -155,31 +134,15 @@ def parse_args() -> argparse.Namespace:
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    try:
-                        text.append(byte_decoder[c])
-                    except KeyError:
-                        text.extend(c.encode('utf-8'))
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token. (It's normal for MPT.)")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)
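
Aside on the new tokenizer path: tokens are now written exactly as the strings stored in tokenizer.vocab, missing ids are padded with "[PAD{i}]", and every entry gets a dummy score and a NORMAL token type so the three lists stay aligned. A toy, self-contained illustration (the example vocab and vocab_size are invented; gguf.TokenType.NORMAL is the type used in the hunk above):

import gguf  # assumes gguf-py is importable, as the converter's sys.path.insert arranges

reverse_vocab = {0: "<|endoftext|>", 1: "Ġhello"}  # id -> encoded token string (toy data)
vocab_size = 4                                     # embedding rows can exceed the real vocab (common for MPT)

tokens, scores, toktypes = [], [], []
for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)                      # dummy score, matching the converter
    toktypes.append(gguf.TokenType.NORMAL)

print(tokens)  # ['<|endoftext|>', 'Ġhello', '[PAD2]', '[PAD3]']
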
@@ -239,10 +202,6 @@ def parse_args() -> argparse.Namespace:
 
         print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
 
-
-        # if new_name == "wte.weight" and data.shape[0] == 50432 and vocab_size == 50254:
-        #     data = data[0:vocab_size,:]
-
         gguf_writer.add_tensor(new_name, data)
 
         # note: MPT output is tied to (same as) wte in original model;