diff --git a/Makefile b/Makefile
index 40187c4a25e62..1273eb5511b58 100644
--- a/Makefile
+++ b/Makefile
@@ -569,6 +569,9 @@ perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+cmap-example: examples/cmap-example/cmap-example.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

diff --git a/common/common.cpp b/common/common.cpp
index 0f55c33a713a7..1a6156473ab93 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -626,6 +626,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+// Some items were missing from this help list, so the wording needs checking (they were all inserted at the end, so they need repositioning too):
+// --embedding, --beams, --ppl-stride, --ppl-output-type, --memory-f32, --no-mmap, --mlock, --use-color, --nprobs, --alias, --infill, --prompt-file
+// some corresponding changes to the sequence of the printf() calls may be needed
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -672,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
     printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
     printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    printf(" -l T, --logit-bias T T = TOKEN_ID(plus/minus)BIAS\n");
     printf(" modifies the likelihood of token appearing in the completion,\n");
     printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
@@ -687,7 +690,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    printf(" --no-penalize-nl do not penalize newline token\n");
+    printf(" --no-penalize-nl do not penalize newline token (by default the newline token IS penalized)\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
     printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
@@ -734,6 +737,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
+    printf(" --ppl-stride stride for perplexity calculations. 0 (default): the pre-existing approach is used.\n");
+    printf(" --ppl-output-type 0 (default): ppl output as usual, 1: output num_tokens, ppl, one pair per line\n");
+    printf(" --embedding 0 (default): if 1, output only the sentence embedding\n");
+    printf(" --beams N 0 (default): if non-zero, use beam search of width N.\n");
+    printf(" --memory-f32 0 (default): if 1, use f32 instead of f16 for memory key+value\n");
+    printf(" --no-mmap 0 (default): if 1, do not memory-map the model (mmap gives faster loads)\n");
+    printf(" --mlock 0 (default): if 1, keep the model locked in RAM\n");
+    printf(" --use-color 0 (default): if 1, use color to distinguish generations from inputs\n");
+    printf(" --nprobs N if > 0, output the probabilities of the top N tokens\n");
+    printf(" --alias model alias (default: 'unknown')\n");
+    printf(" --infill 0 (default): if 1, use infill mode\n");
+    printf(" --prompt-file name of external prompt file\n");
     printf("\n");
 }
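The TODO above notes that these late additions still need repositioning inside gpt_print_usage(). One way to make reordering (and future insertions) cheap is to drive the usage text from a table rather than a fixed sequence of printf() calls. The following is only a sketch of that idea, not part of the patch, and the entries shown are an abbreviated subset:

// Sketch (not part of the patch): a table-driven gpt_print_usage() variant,
// so repositioning a help line means moving an array entry, not a printf call.
#include <cstdio>

struct help_entry {
    const char * flag;
    const char * text;
};

static const help_entry k_help[] = {
    // abbreviated subset of the options listed above
    { "--ppl-stride",      "stride for perplexity calculations (0 = pre-existing approach)" },
    { "--ppl-output-type", "0: ppl output as usual, 1: num_tokens, ppl, one pair per line"  },
    { "--infill",          "0 (default): if 1, use infill mode"                             },
};

static void print_usage_table(void) {
    for (const help_entry & e : k_help) {
        printf(" %-18s %s\n", e.flag, e.text);
    }
}

int main() {
    print_usage_table();
    return 0;
}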
diff --git a/common/common.h b/common/common.h
index c802152791797..ee5c1909414b9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -35,21 +35,21 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads = get_num_physical_cores(); // user-defined or the number of physical cores
+    int32_t n_threads_batch = -1; // num threads for batch proc (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 512; // batch size for prompt proc (>=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t n_gpu_layers = -1; // num layers stored in VRAM (-1 for default)
+    int32_t n_gpu_layers_draft = -1; // num layers stored in VRAM for the draft model (-1 for default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_probs = 0; // if > 0, output probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -61,7 +61,7 @@ struct gpt_params {
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
     float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable, -1 = ctx size)
     float frequency_penalty = 0.00f; // 0.0 = disabled
     float presence_penalty = 0.00f; // 0.0 = disabled
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
@@ -75,10 +75,11 @@ struct gpt_params {
     std::string cfg_negative_prompt; // string to help guidance
     float cfg_scale = 1.f; // How strong is guidance
 
+    std::string help = ""; // universal help parameter
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string prompt = "";
+    std::string prompt = ""; // user-provided single prompt
     std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
@@ -90,11 +91,11 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int ppl_stride = 0; // stride for ppl calcs. 0: the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int ppl_output_type = 0; // 0: ppl output as usual, 1: ppl output = num_tokens, ppl, one per line
                              // (which is more convenient to use for plotting)
     //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool hellaswag = false; // compute HellaSwag score from datafile given in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -109,7 +110,7 @@ struct gpt_params {
     bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool interactive_first = false; // wait for user input immediately
     bool multiline_input = false; // reverse the usage of `\`
-    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+    bool simple_io = false; // improves compat. with subprocesses and limited consoles
     bool cont_batching = false; // insert new sequences for decoding on-the-fly
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index de4cf7a691768..b32706b0337e0 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -30,6 +30,7 @@ else()
     add_subdirectory(embd-input)
     add_subdirectory(llama-bench)
     add_subdirectory(beam-search)
+    add_subdirectory(cmap-example)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

diff --git a/examples/cmap-example/CMakeLists.txt b/examples/cmap-example/CMakeLists.txt
new file mode 100644
index 0000000000000..c5820f7b1bfcd
--- /dev/null
+++ b/examples/cmap-example/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET cmap-example)
+add_executable(${TARGET} cmap-example.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

diff --git a/examples/cmap-example/cmap-example.cpp b/examples/cmap-example/cmap-example.cpp
new file mode 100644
index 0000000000000..d06699c864b94
--- /dev/null
+++ b/examples/cmap-example/cmap-example.cpp
@@ -0,0 +1,124 @@
+// example of a C/C++ equivalent data structure to the python dict in readcommonh.py
+
+#include <cstdio>
+#include <fstream>
+#include <string>
+#include <vector>
+// there may be good reasons not to sort the parameters, but here we use a sorted map
+#include <map>
+
+std::vector<std::string> split_string(const std::string& str, const std::string& delimiter) {
+    std::vector<std::string> tokens;
+    std::size_t start = 0, end = 0;
+    bool inside_tags = false; // flag to track if we are inside "<>"
+
+    while ((end = str.find(delimiter, start)) != std::string::npos) {
+        std::string token = str.substr(start, end - start);
+
+        if (!inside_tags && !token.empty()) { // exclude empty substrings and anything inside "<>"
+            tokens.push_back(token);
+        }
+        // deal with cases where the split character occurs inside <>
+        // update the inside_tags flag based on "<" and ">"
+        size_t open_tag_pos = str.find("<", start);
+        size_t close_tag_pos = str.find(">", start);
+        if (open_tag_pos != std::string::npos && close_tag_pos != std::string::npos && open_tag_pos < end) {
+            inside_tags = true;
+        } else if (close_tag_pos != std::string::npos && close_tag_pos < end) {
+            inside_tags = false;
+        }
+        start = end + delimiter.length();
+    }
+    tokens.push_back(str.substr(start));
+    return tokens;
+}
+void print_parameters(const std::map<std::string, std::vector<std::string>>& parameters) {
+    for (const auto& pair : parameters) {
+        const std::string& key = pair.first;
+        const std::vector<std::string>& value = pair.second; // usually has multiple elements
+        printf("key: %25s: values: ", key.c_str());
+        for (const std::string& element : value) {
+            printf("%s ", element.c_str());
+        }
+        printf("\n");
+    }
+}
+
+std::map<std::string, std::vector<std::string>> extract_parameters() {
+    std::ifstream file("common/common.h");
+    std::string line;
+    std::vector<std::string> lines;
+    while (std::getline(file, line)) {
+        lines.push_back(line);
+    }
+
+    std::map<std::string, std::vector<std::string>> parameters;
+    // fix up failure to match logit_bias; may also need to add lora_adapter; now dealt with below and ready for deletion
+    // parameters["logit_bias"] = {"std::unordered_map", "logit_bias", "=", "0", "//", "way", "to", "alter", "prob", "of", "word", "being", "chosen"};
+    // parameters["lora_adapter"] = {"std::vector", "lora_adapter", "=", "", "//", "lora", "adapter", "path", "with", "user-defined", "scale"};
+
+    // this for loop finds all the params inside struct gpt_params
+    bool inside = false;
+    for (const std::string& line : lines) {
+        std::vector<std::string> nws_elements = split_string(line, " ");
+        printf("nwe = ");
+        for (const std::string& element : nws_elements) {
+            printf("%s ", element.c_str());
+        }
+        printf("\n");
+
+        if (nws_elements.size() > 1 && nws_elements[0] == "struct" && nws_elements[1] == "gpt_params") {
+            inside = true;
+        }
+
+        if (nws_elements.size() > 2 && inside) {
+            // cannot use nws_elements[0] as key because types do not generate unique keys and so overwrite
+            // here we deliberately keep the whole line so we can manually change the key when it differs (remove eventually)
+            std::vector<std::string> copy = nws_elements; // create a copy of nws_elements
+            parameters[nws_elements[1]] = copy;           // assign the copy to parameters
+
+            // remove spurious entry caused by the eccentric status of logit_bias
+            if (parameters.count("float>") && parameters["float>"][2] == "logit_bias;") {
+                parameters.erase("float>");
+            }
+            // remove spurious entry caused by the eccentric status of lora_adapter
+            if (parameters.count("float>>") && parameters["float>>"][2] == "lora_adapter;") {
+                parameters.erase("float>>");
+            }
+        }
+
+        // terminate the harvest; TODO: not robust, needs a better terminator; this is just a crude hack for now
+        if (nws_elements.size() > 2 && nws_elements[2] == "infill") {
+            inside = false;
+            break;
+        }
+    }
+    // display them here (unnecessary operationally; kept during development)
+    print_parameters(parameters);
+
+    // return the results (will eventually become a void function)
+    return parameters;
+}
+
+int main() {
+
+    // replicate readcommonh.py: extract_parameters() already prints the map once;
+    // printing again here is deliberate, to exercise print_parameters() directly
+    std::map<std::string, std::vector<std::string>> parameters = extract_parameters();
+    print_parameters(parameters);
+
+    return 0;
+}
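As a quick sanity check of split_string()'s angle-bracket handling, a small driver like the following (hypothetical, to be compiled together with the split_string() definition above) shows what happens when a templated declaration is split on spaces:

// Hypothetical driver (not part of the patch); link against split_string() above.
#include <cstdio>
#include <string>
#include <vector>

std::vector<std::string> split_string(const std::string& str, const std::string& delimiter);

int main() {
    const std::string decl = "std::map<std::string, std::vector<std::string>> parameters;";
    for (const std::string & tok : split_string(decl, " ")) {
        printf("[%s] ", tok.c_str());
    }
    printf("\n");
    // prints: [std::map<std::string,] [parameters;]
    // the middle token is dropped because inside_tags is already set when it is seen;
    // the flag is only updated after a token is emitted, so the first token still
    // carries the opening '<', and the final substring is always appended
    return 0;
}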
diff --git a/examples/cmap-example/find-implemented-args.py b/examples/cmap-example/find-implemented-args.py
new file mode 100644
index 0000000000000..c48c8c5a3d82c
--- /dev/null
+++ b/examples/cmap-example/find-implemented-args.py
@@ -0,0 +1,228 @@
+# search the specified directory for files that use command-line arguments;
+# these are almost always of the form params.argument; "logit_bias" is one exception
+# have yet to investigate fully what "lora_adapter" in server.cpp does, since it is not apparently
+# accessible from the command-line arg/parameter sequence
+# there is also an issue with -ngl, which does not appear in some help menus even when apparently implemented, e.g. in parallel.cpp
+
+import os
+import re
+import collections
+import readcommonh
+
+# regenerate the help file (file_to, usually 'help_list.txt', hence the default) in case the source (file_from) has changed
+def update_file(file_from, file_to = "help_list.txt"):
+    # Open the file_from file
+    with open(file_from, "r") as file:
+        lines = file.readlines()
+
+    # Find lines starting with "printf(" and ending with ");" (assumes file_from is written in C/C++)
+    pattern = r'printf\("\s(.*?)\);'
+    matched_lines = [re.search(pattern, line).group(1) for line in lines if re.search(pattern, line)]
+
+    # Save matched lines to file_to
+    with open(file_to, "w") as file:
+        for line in matched_lines:
+            file.write(line + '\n')
+
+# helper fn to make the hyphenated words in a file snake_case for searching
+def replace_dashes_with_underscores(filename):
+    with open(filename, 'r') as file:
+        content = file.read()
+
+    # Match '-' surrounded by word characters on both sides and replace with '_'
+    replaced_content = re.sub(r'(\w)-(\w)', r'\1_\2', content)
+
+    with open(filename, 'w') as file:
+        file.write(replaced_content)
+
+# helper fn to make the underscored words in a parameter hyphenated for printing
+def replace_underscores_with_dashes(parameter):
+    # Match '_' surrounded by word characters on both sides and replace with '-'
+    return re.sub(r'(\w)_(\w)', r'\1-\2', parameter)
+
+
+# find all instances of "params." or "params->" in the *.cpp files in a directory
+def find_arguments(directory):
+    arguments = {}
+
+    # Use os.walk() to traverse through files in directory and subdirectories
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.cpp'):
+                filepath = os.path.join(root, file)
+                with open(filepath, 'r') as file:
+                    content = file.read()
+
+                # Search for "params." or "params->" and read the attribute without trailing detritus
+                # matches = re.findall(r'(?:^|\s)params\.(.*)(?=[\). <,;}]|\Z)', content)
+                matches = set(re.findall(r'(?:^|\b)params(?:->|\.)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))
+
+                # Add the matches to the dictionary
+                arguments[filepath] = matches
+
+    return arguments
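Since cmap-example exists to port this tooling to C++, the regex harvest in find_arguments() has a fairly direct std::regex counterpart. A rough sketch under that assumption (ECMAScript regexes have no \Z, so $ stands in for it; the input string and use_blas name are made up for illustration):

// Sketch (not part of the patch): a std::regex counterpart to find_arguments().
#include <cstdio>
#include <regex>
#include <set>
#include <string>

std::set<std::string> find_params_in(const std::string & content) {
    std::set<std::string> matches;
    // same idea as the Python pattern above, with $ in place of \Z
    static const std::regex pattern(R"(\bparams(?:->|\.)([A-Za-z0-9_]*)(?=[\). <,;}]|$))");
    for (std::sregex_iterator it(content.begin(), content.end(), pattern), end; it != end; ++it) {
        matches.insert((*it)[1].str());
    }
    return matches;
}

int main() {
    const std::string code = "if (params.n_batch >= 32) { use_blas(params.n_batch); }";
    for (const std::string & name : find_params_in(code)) {
        printf("found: %s\n", name.c_str()); // prints: found: n_batch
    }
    return 0;
}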
+# output a list of the params.attributes for each file
+def output_results(result):
+    sorted_result = collections.OrderedDict(sorted(result.items()))
+    all_of_them = set()
+    for filename, arguments in sorted_result.items():
+        arguments.add("help")
+        print(f"Filename: \033[32m{filename.split('/')[-1]}\033[0m, arguments: {arguments}\n")
+        for argument in arguments:
+            if argument not in all_of_them:
+                all_of_them.add(argument)
+    print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
+    return sorted_result
+
+# put all the words after "//" in a dict entry back together with spaces
+def concatenate(v):
+    concatenated_element = ""
+    for i, element in enumerate(v):
+        if element == "//":
+            concatenated_element = " ".join(v[i:])
+    return concatenated_element
+
+def title_print(filename):
+    title = filename.split('/')[-1]
+    print("\n\n" + "#" * (10 + len(title)))
+    print(f"Filename: \033[32m{title}\033[0m")
+    print("#" * (10 + len(title)))
+
+# list all the equivalences between the declarations in common.h and the help text defined in common.cpp
+# these are used to substitute the searched params.attributes (keys) with help attributes (values)
+def substitution_list(parameters):
+    # store untrapped parameters as identicals in case we need to change them later
+    sub_dict = {"n_threads": "threads",
+                "n_ctx": "ctx_size",
+                "n_draft": "draft",
+                "n_threads_batch": "threads_batch",
+                "n_chunks": "chunks",
+                "n_batch": "batch_size",
+                "n_sequences": "sequences",
+                "n_parallel": "parallel",
+                "n_beams": "beams",
+                "n_keep": "keep",
+                "n_probs": "nprobs",
+                "path_prompt_cache": "prompt_cache",
+                "prompt_file": "prompt_file",
+                "input_prefix": "in_prefix",
+                "input_suffix": "in_suffix",
+                "input_prefix_bos": "in_prefix_bos",
+                "antiprompt": "reverse_prompt",
+                "mul_mat_q": "no_mul_mat_q",
+                "use_mmap": "no_mmap",
+                "use_mlock": "mlock",
+                "model_alias": "alias",
+                "tfs_z": "tfs",
+                "use_color": "color",
+                "logit_bias": "logit_bias",
+                "ignore_eos": "ignore_eos",
+                "mirostat_tau": "mirostat_ent",
+                "mirostat_eta": "mirostat_lr",
+                "penalize_nl": "no_penalize_nl",
+                "typical_p": "typical",
+                "mem_size": "mem_size",
+                "mem_buffer": "mem_buffer",
+                "no_alloc": "no_alloc"
+                }
+    new_parameters = []
+    for parameter in parameters:
+        if parameter in sub_dict:
+            # we need both forms for future reference
+            new_parameters.append(parameter)
+            new_parameters.append(sub_dict[parameter])
+        else:
+            new_parameters.append(parameter)
+    return new_parameters
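The same equivalence table translates directly to the C++ side; a minimal sketch (assumed naming, only a few entries shown) of how sub_dict and the substitution loop might look in cmap-example:

// Sketch (assumed naming, not part of the patch): sub_dict as a C++ map,
// mirroring substitution_list() for the planned cmap-example tooling.
#include <map>
#include <string>
#include <vector>

static const std::map<std::string, std::string> k_sub_dict = {
    { "n_threads",   "threads"  },
    { "n_ctx",       "ctx_size" },
    { "use_mmap",    "no_mmap"  },
    { "model_alias", "alias"    },
    // ... remaining entries exactly as in substitution_list()
};

// keep the original common.h name and append its help-text twin, as the Python version does
std::vector<std::string> substitute(const std::vector<std::string> & parameters) {
    std::vector<std::string> out;
    for (const std::string & p : parameters) {
        out.push_back(p);
        auto it = k_sub_dict.find(p);
        if (it != k_sub_dict.end()) {
            out.push_back(it->second);
        }
    }
    return out;
}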
+# check the lines of the help file against the implemented arguments
+def find_parameters(file, sorted_result):
+    with open(file, "r") as helpfile:
+        lines = helpfile.read().split("\n")
+    for filename, arguments in sorted_result.items():
+        # we try to fix up some variant labelling in help_list.txt
+        arguments = substitution_list(arguments)
+        parameters = []
+        for line in lines:
+            for argument in arguments:
+                # build the pattern carefully to avoid spurious matches
+                # pattern = r"(?:--{}\s)|(?:params\.{}[\s.,\.();])".format(argument, argument.split('n_')[-1])
+                pattern = r"(?:--{}\s)|(?:params\.{}(?=[\s.,\.\(\);]|\.+\w))".format(argument, argument.split('n_')[-1])
+                # pattern = r"(?<=params\.)\w+(?=\.\w+|\.|,|;|\}|\{|\(|\)|\.)"
+                # bit of a hack: search only the first 50 characters to exclude --attributes at the end of help comment lines
+                if re.search(pattern, line[:50]):
+                    parameters.append(line)
+
+        all_parameters = set(parameters)
+
+        title_print(filename)
+        print("\nCommand-line arguments available and gpt_params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")
+
+        if not all_parameters:
+            print("  \033[032mNone\033[0m\n")
+
+        # first do it the original way
+        help_count = 0
+        for parameter in all_parameters:
+            # reverse the hyphen/underscore substitution just for printing
+            replaced_param = replace_underscores_with_dashes(parameter)
+            if not parameter.startswith(" "):
+                help_count += 1
+                print(f"{help_count:>2} help: \033[33m{replaced_param:<30}\033[0m")
+            else:
+                print(f"   help: \033[33m{replaced_param:<30}\033[0m")
+
+        # now do it the new way
+        print("\nNow we extract the original gpt_params definition from common.h with the defaults for implemented arguments:\n")
+        gpt_count = 0
+        for k, v in readcommonh.parameters.items():
+            if k in arguments:
+                # print(f"gpt_params: \033[33m{k:>20}\033[0m values: {v}")
+                concatenated_element = concatenate(v)
+                gpt_count += 1
+                print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")
+
+        # searching the other way round is quicker:
+        print("\nSearching the other way round is more efficient:\n")
+        key_count = 0
+        for argument in set(arguments):
+            if argument in readcommonh.parameters:
+                key_count += 1
+                print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(readcommonh.parameters[argument]):<60}; default: {readcommonh.parameters[argument][1]:<10}")
+        if help_count == gpt_count and gpt_count == key_count:
+            print(f"\n\033[032mNo unresolved help-list incompatibilities with \033[33m{filename.split('/')[-1]}\033[0m")
+        else:
+            print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")
+
+# Specify the directory you want to search for cpp files
+directory = '/Users/edsilm2/llama.cpp/examples'
+
+if __name__ == '__main__':
+
+    # update the source help file from the C++ source (this works exactly as required)
+    update_file("common/common.cpp", "help_list.txt")
+
+    # get the parameters from the common.h file utility we import
+    print(readcommonh.parameters)
+    # So now we've got the gpt_parameters in this parameters dict
+
+    # First we alter all the hyphenated help words in help_list.txt to underscores;
+    # we later reverse these changes before printing the help lines
+    replace_dashes_with_underscores('help_list.txt')
+
+    print("\n####################### find parameters #################################")
+    # Call the find function to collect all the params.attributes and output the result
+    result = find_arguments(directory)
+
+    print("\n######################################## output_results #################################")
+    # sort the results and output them
+    sorted_results = output_results(result)
+
+    print("\n######################## find help context parameters #################################")
+    # analyse the files and what they contain
+    find_parameters("help_list.txt", sorted_results)
diff --git a/examples/cmap-example/readcommonh.py b/examples/cmap-example/readcommonh.py
new file mode 100644
index 0000000000000..422edea8fd252
--- /dev/null
+++ b/examples/cmap-example/readcommonh.py
@@ -0,0 +1,39 @@
+# read common.h and extract the parameter name list
+
+import re
+
+# Read the file into separate lines
+with open('common/common.h', 'r') as file:
+    lines = file.read().split('\n')
+
+parameters = {}
+# we add the logit_bias parameter, which otherwise is not found
+parameters['logit_bias'] = ['logit_bias', '0', '//', 'way', 'to', 'alter', 'prob', 'of', 'particular', 'words']
+
+inside = False
+for line in lines:
+    # non_whitespace_elements = re.findall(r"\S+", line)
+    non_whitespace_elements = re.findall(r"[^\s}{=;]+", line)
+    print(f"nwe = \033[33m{non_whitespace_elements}\033[0m")
+    if non_whitespace_elements and non_whitespace_elements[0] == "struct":
+        inside = True
+    if len(non_whitespace_elements) > 2 and inside:
+        # note: cannot use nwe[0] because types do not generate unique keys and so overwrite
+        # here we deliberately add back the key so we can make a manual change when it is different
+        parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
+        # remove spurious entry caused by the eccentric status of logit_bias
+        if "float>" in parameters and parameters["float>"][1] == 'logit_bias':
+            del parameters["float>"]
+
+    # this is a bit of a hack to terminate the harvest
+    if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
+        inside = False
+        break
+
+for k, v in parameters.items():
+    print(f"key: {k:<20}; values: {v}")
+    concatenated_element = ""
+    for i, element in enumerate(v):
+        if element == "//":
+            concatenated_element = " ".join(v[i:])
+            # break
+    print(" " * 10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")
(default: %d, 0 = disabled)\n", params.top_k + --top_p N top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p + --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z + --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p + --repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n + --repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty + --presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty + --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty + --mirostat N use Mirostat sampling.\n" + Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat + --mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta + --mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau + -l T, --logit_bias T T = TOKEN_ID(plus/minus)BIAS\n" + modifies the likelihood of token appearing in the completion,\n" + i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',\n" + or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'\n" + --grammar GRAMMAR BNF_like grammar to constrain generations (see samples in grammars/ dir)\n" + --grammar_file FNAME file to read grammar from\n" + --cfg_negative_prompt PROMPT\n" + negative prompt to use for guidance. (default: empty)\n" + --cfg_negative_prompt_file FNAME\n" + negative prompt file to use for guidance. 
(default: empty)\n" + --cfg_scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale + --rope_scale N RoPE context linear scaling factor, inverse of --rope_freq_scale\n" + --rope_freq_base N RoPE base frequency, used by NTK_aware scaling (default: loaded from model)\n" + --rope_freq_scale N RoPE frequency linear scaling factor (default: loaded from model)\n" + --ignore_eos ignore end of stream token and continue generating (implies --logit_bias 2_inf)\n" + --no_penalize_nl do not penalize newline token (default is DO penalise nl token)\n" + --memory_f32 use f32 instead of f16 for memory key+value (default: disabled)\n" + not recommended: doubles context memory required and no measurable increase in quality\n" + --temp N temperature (default: %.1f)\n", (double)params.temp + --logits_all return logits for all tokens in the batch (default: disabled)\n" + --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n" + --hellaswag_tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks + --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep + --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft + --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks + -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel + -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences + -cb, --cont_batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n" + --mlock force system to keep model in RAM rather than swapping or compressing\n" + --no_mmap do not memory_map model (slower load but may reduce pageouts if not using mlock)\n" + --numa attempt optimizations that help on some NUMA systems\n" + if run without this previously, it is recommended to drop the system page cache before using this\n" + see https://github.com/ggerganov/llama.cpp/issues/1437\n" + -ngl N, --n_gpu_layers N\n" + number of layers to store in VRAM\n" + -ngld N, --n_gpu_layers_draft N\n" + number of layers to store in VRAM for the draft model\n" + -ts SPLIT --tensor_split SPLIT\n" + how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1\n" + -mg i, --main_gpu i the GPU to use for scratch and small tensors\n" + -nommq, --no_mul_mat_q\n" + use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n" + Not recommended since this is both slower and uses more VRAM.\n" + --verbose_prompt print prompt before generation\n" + --lora FNAME apply LoRA adapter (implies --no_mmap)\n" + --lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)\n" + --lora_base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n" + -m FNAME, --model FNAME\n" + model path (default: %s)\n", params.model.c_str() + -md FNAME, --model_draft FNAME\n" + draft model for speculative decoding (default: %s)\n", params.model.c_str() + -ld LOGDIR, --logdir LOGDIR\n" + path under which to save YAML logs (no logging if unset)\n" + --ppl_stride stride for ppl calcs. 
+ --ppl_stride stride for perplexity calculations. 0 (default): the pre_existing approach is used.\n"
+ --ppl_output_type 0 (default): ppl output as usual, 1: output num_tokens, ppl, one pair per line\n"
+ --embedding 0 (default): if 1, output only the sentence embedding\n"
+ --beams N 0 (default): if non_zero, use beam search of width N.\n"
+ --memory_f32 0 (default): if 1, use f32 instead of f16 for memory key+value\n"
+ --no_mmap 0 (default): if 1, do not memory_map the model (mmap gives faster loads)\n"
+ --mlock 0 (default): if 1, keep the model locked in RAM\n"
+ --use_color 0 (default): if 1, use color to distinguish generations from inputs\n"
+ --nprobs N if > 0, output the probabilities of the top N tokens\n"
+ --alias model alias (default: 'unknown')\n"
+ --infill 0 (default): if 1, use infill mode\n"
+ --prompt_file name of external prompt file\n"