@@ -785,20 +785,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, pad_count + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
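The padding branch above is pure bookkeeping: when the model header asks for more tokens than the tokenizer supplies, it appends placeholder entries named <dummy00001> upward and bumps the vocab size to match n_vocab. A minimal standalone sketch of the same scheme; SimpleVocab and pad_vocab_to are hypothetical stand-ins for this file's vocab classes, for illustration only:

    # Minimal sketch of the padding scheme; SimpleVocab is a hypothetical
    # stand-in for convert.py's BpeVocab/SentencePieceVocab classes.
    from dataclasses import dataclass, field

    @dataclass
    class SimpleVocab:
        vocab_size: int
        added_tokens_list: list[str] = field(default_factory=list)

    def pad_vocab_to(vocab: SimpleVocab, n_vocab: int) -> None:
        pad_count = n_vocab - vocab.vocab_size
        assert pad_count > 0, "model vocab is not larger than tokenizer vocab"
        # Names are 1-based and zero-padded to five digits, as in the patch.
        vocab.added_tokens_list.extend(f'<dummy{i:05}>' for i in range(1, pad_count + 1))
        vocab.vocab_size = n_vocab

    v = SimpleVocab(vocab_size=32000)
    pad_vocab_to(v, 32003)
    print(v.added_tokens_list)  # ['<dummy00001>', '<dummy00002>', '<dummy00003>']
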
@@ -875,8 +884,12 @@ def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -903,8 +916,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
         return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out: Path, ftype: GGMLFileType, params: Params,
+        model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1124,6 +1143,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
     args = parser.parse_args(args_in)
     if args.dump_single:
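Once parsed, args.padvocab defaults to False, so existing conversions are unaffected; the vocab-size-mismatch error (with its new hint) still fires unless the flag is given. A hypothetical retry after hitting that error, assuming the patched script is invoked as convert.py, with an illustrative model path and output type:

    python convert.py models/my-model --outtype f16 --padvocab
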
@@ -1171,7 +1191,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges = args.vocabtype == 'bpe',
                                           n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1194,7 +1215,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.padvocab)
     print(f"Wrote {outfile}")
 
 
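For callers driving OutputFile directly rather than through main(), the switch is an explicit keyword. A hedged sketch, assuming outfile, ftype, params, model, vocab, and special_vocab were produced by this file's loaders as above:

    # Sketch only: all objects below are assumed to come from convert.py's
    # own loaders, as in main(); pad_vocab=True enables the dummy-token fill.
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
                         concurrency=DEFAULT_CONCURRENCY,
                         endianess=gguf.GGUFEndian.LITTLE,
                         pad_vocab=True)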