added DRY sampler implementation

l3utterfly · l3utterfly · commit 99b77600f199 · 2024-04-23T13:44:22.000+09:00
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -260,13 +260,18 @@ static llama_token_data_array llama_sampling_prepare_impl(
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
 
+    // repetition penalties
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
     const float   penalty_repeat  = params.penalty_repeat;
     const float   penalty_freq    = params.penalty_freq;
     const float   penalty_present = params.penalty_present;
-
     const bool    penalize_nl     = params.penalize_nl;
 
+    // DRY sampler parameters
+    const float   dry_multiplier        = params.dry_multiplier;
+    const float   dry_base              = params.dry_base;
+    const int     dry_allowed_length    = params.dry_allowed_length;
+
     auto & prev = ctx_sampling->prev;
     auto & cur  = ctx_sampling->cur;
 
@@ -302,10 +307,20 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (penalty_tokens_used_size) {
         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
 
+        // repetition penalties
         llama_sample_repetition_penalties(ctx_main, &cur_p,
                 penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
                 penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
 
+        // DRY penalties (multiplier > 0 means enabled)
+        if(dry_multiplier > 0.0f) {
+            llama_sample_dry(ctx_main, &cur_p,
+                            penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                            penalty_tokens_used_size, dry_base, dry_multiplier, dry_allowed_length,
+                            params.dry_sequence_breakers.data(), params.dry_sequence_breakers.size());
+        }
+        
+
         if (!penalize_nl) {
             for (size_t idx = 0; idx < cur_p.size; idx++) {
                 if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
diff --git a/common/sampling.h b/common/sampling.h
@@ -38,7 +38,10 @@ typedef struct llama_sampling_params {
     int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float       mirostat_tau          = 5.00f;    // target entropy
     float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = false;     // consider newlines as a repeatable token
+    bool        penalize_nl           = false;    // consider newlines as a repeatable token
+    float       dry_multiplier        = 0.0f;     // 0.0f = disabled, recommended value: 0.8f
+    float       dry_base              = 1.75f;
+    int         dry_allowed_length    = 2;
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -59,6 +62,7 @@ typedef struct llama_sampling_params {
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 
     std::vector<llama_token> penalty_prompt_tokens;
+    std::vector<llama_token> dry_sequence_breakers; // sequence breakers for the DRY sampler
     bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;
 
diff --git a/llama.cpp b/llama.cpp
@@ -12832,6 +12832,64 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
+void llama_sample_dry(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, int last_token_size, float dry_base, float dry_multiplier, int dry_allowed_length, const llama_token * seq_breakers, int seq_breakers_size) {
+    // DRY sampler: penalize candidates that would extend a repetition of the recent history.
+    // For each candidate, look at every earlier occurrence of it in `last_tokens` and measure how
+    // many tokens ending at that occurrence mirror the tail of `last_tokens` (the occurrence itself
+    // counts as 1). If the longest such match exceeds `dry_allowed_length`, the candidate's logit is
+    // lowered by dry_multiplier * dry_base^(match_length - dry_allowed_length).
+    //   candidates   - token candidates; logits are modified in place
+    //   last_tokens  - `last_token_size` previous tokens, most recent last
+    //   seq_breakers - `seq_breakers_size` tokens that are never penalized and that end a match
+    (void) ctx; // not needed by this sampler
+
+    if (last_token_size <= 0) {
+        return; // nothing to match against
+    }
+
+    const llama_token * seq_breakers_end = seq_breakers + std::max(seq_breakers_size, 0);
+    const auto is_seq_breaker = [&](llama_token token) {
+        return std::find(seq_breakers, seq_breakers_end, token) != seq_breakers_end;
+    };
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token candidate = candidates->data[i].id;
+
+        // if our candidate itself is part of the sequence breakers, we don't apply the dry penalty
+        if (is_seq_breaker(candidate)) {
+            continue;
+        }
+
+        int max_match_length = 0;
+
+        // loop through each previous token (signed index: unsigned `j - match_length` can never be negative)
+        for (int j = 0; j < last_token_size; ++j) {
+            if (last_tokens[j] != candidate) {
+                continue;
+            }
+
+            // greedily match backwards: the token `match_length` positions before j is compared with the token
+            // `match_length` positions before the end; stops at the start of history, a breaker or a mismatch
+            int match_length = 1;
+            for (; match_length <= j; ++match_length) {
+                const llama_token compare_token = last_tokens[j - match_length];
+                const llama_token head_token    = last_tokens[last_token_size - match_length];
+                if (is_seq_breaker(compare_token) || compare_token != head_token) {
+                    break;
+                }
+            }
+
+            max_match_length = std::max(max_match_length, match_length);
+        }
+
+        // apply the penalty only to matches longer than the allowed length
+        if (max_match_length > dry_allowed_length) {
+            const float penalty = dry_multiplier * pow(dry_base, max_match_length - dry_allowed_length);
+            candidates->data[i].logit -= penalty;
+        }
+    }
+}
+
 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
     if (z >= 1.0f || candidates->size <= 2) {
         return;
diff --git a/llama.h b/llama.h
@@ -918,6 +918,18 @@ extern "C" {
                            float   p,
                           size_t   min_keep);
 
+    /// @details DRY sampler: lowers the logit of candidates that would extend a token sequence already present in last_tokens. Described in: https://github.com/oobabooga/text-generation-webui/pull/5677
+    LLAMA_API void llama_sample_dry(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,         // logits are modified in place
+               const llama_token * last_tokens,        // previous tokens, most recent last
+                             int   last_token_size,    // number of entries in last_tokens
+                           float   dry_base,           // base of the exponential penalty
+                           float   dry_multiplier,     // scale factor applied to the penalty
+                             int   dry_allowed_length, // matches up to this length are not penalized
+               const llama_token * seq_breakers,       // tokens that are never penalized and that end a match
+                             int   seq_breakers_size); // number of entries in seq_breakers
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,