@@ -853,43 +853,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 
-                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
-                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
-                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
-                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
 
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
-                    hparams.n_swa = 2047;
-                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-mini-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-medium-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
 
+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
 
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                }
-
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
-                }
-
-                if (hparams.n_swa > hparams.n_ctx_train) {
-                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
+                    hparams.n_swa = 0;
                     hparams.n_swa_pattern = 1;
                 }
             } break;
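Note on the hunk above: the hard-coded Phi-3 heuristics are gone; the loader now reads the optional `sliding_window` key and, if it is present and non-zero, force-disables SWA for Phi models. Below is a minimal, self-contained sketch of that behaviour for a GGUF that stores `sliding_window = 2047`; the `hparams_t` struct and the `read_sliding_window()` helper are hypothetical stand-ins for the real `llama_hparams` / `llama_model_loader` machinery, not llama.cpp code.

```cpp
// Standalone sketch (assumptions noted above), mirroring the new Phi-3 branch.
#include <cstdint>
#include <cstdio>

enum swa_type_t { SWA_TYPE_NONE, SWA_TYPE_STANDARD };

struct hparams_t {
    uint32_t   n_swa         = 0;
    uint32_t   n_swa_pattern = 1;
    swa_type_t swa_type      = SWA_TYPE_STANDARD;
};

// stand-in for ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false)
static bool read_sliding_window(uint32_t & n_swa) {
    n_swa = 2047; // pretend the GGUF metadata contains sliding_window = 2047
    return true;
}

int main() {
    hparams_t hparams;

    const bool found_swa = read_sliding_window(hparams.n_swa);

    // mirror of the new branch: even when the key is present, SWA is switched off
    if (found_swa && hparams.n_swa > 0) {
        hparams.swa_type      = SWA_TYPE_NONE;
        hparams.n_swa         = 0;
        hparams.n_swa_pattern = 1;
    }

    printf("swa_type=%d n_swa=%u n_swa_pattern=%u\n", hparams.swa_type, hparams.n_swa, hparams.n_swa_pattern);
    return 0; // prints: swa_type=0 n_swa=0 n_swa_pattern=1
}
```

With these values the model later takes the plain unified KV cache path rather than the iswa one.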
@@ -7368,8 +7341,9 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };
 
-struct llm_build_phi3_iswa : public llm_graph_context {
-    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 
@@ -7383,7 +7357,14 @@ struct llm_build_phi3_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
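The two hunks above turn the builder into `template<bool iswa>` and pick the attention-input type at compile time. A minimal sketch of that pattern follows, using hypothetical stand-in types and builders instead of `llm_graph_input_attn_kv_unified(_iswa)` and `build_attn_inp_kv_unified(_iswa)()`.

```cpp
// Minimal sketch (not llama.cpp code) of the compile-time selection used above.
#include <cstdio>
#include <type_traits>

struct attn_inp_unified      { const char * name() const { return "unified"; } };
struct attn_inp_unified_iswa { const char * name() const { return "unified_iswa"; } };

// the two "builders" return different pointer types, like the real graph-context calls
static attn_inp_unified      * make_unified_inp() { static attn_inp_unified      v; return &v; }
static attn_inp_unified_iswa * make_iswa_inp()    { static attn_inp_unified_iswa v; return &v; }

template<bool iswa>
void build() {
    // pick the input type at compile time
    using inp_attn_type = std::conditional_t<iswa, attn_inp_unified_iswa, attn_inp_unified>;

    inp_attn_type * inp_attn = nullptr;

    // `if constexpr` discards the non-matching branch, so the assignment with the
    // incompatible pointer type is never instantiated for this specialization
    if constexpr (iswa) {
        inp_attn = make_iswa_inp();
    } else {
        inp_attn = make_unified_inp();
    }

    printf("built %s attention input\n", inp_attn->name());
}

int main() {
    build<true>();   // SWA variant
    build<false>();  // non-SWA variant
    return 0;
}
```

The `if constexpr` matters because the two builder calls return different pointer types; a plain `if` would force both assignments to type-check in every instantiation.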
@@ -13232,7 +13213,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
     LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
-    if (hparams.n_swa > 0) {
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(hparams.n_swa_pattern != 1);
+
         res = new llama_kv_cache_unified_iswa(
                 *this,
                 params.type_k,
@@ -13245,6 +13228,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 cparams.n_batch,
                 padding);
     } else {
+        GGML_ASSERT(hparams.n_swa_pattern == 1);
+
         res = new llama_kv_cache_unified(
                 *this,
                 nullptr,
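The asserts added above encode the invariant that `n_swa_pattern == 1` means every layer uses full attention (plain unified cache suffices), while any other pattern implies interleaved sliding-window layers (iswa cache required). A rough illustration of that invariant is sketched below; `is_swa_layer()` is a hypothetical helper written for this example, not the actual `llama_hparams::is_swa()`.

```cpp
// Illustrative sketch only: one plausible layer-pattern convention, not llama.cpp's.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// hypothetical convention: every n_swa_pattern-th layer is full attention, the rest use SWA
static bool is_swa_layer(uint32_t il, uint32_t n_swa_pattern) {
    if (n_swa_pattern <= 1) {
        return false; // a pattern of 1 == all layers are full attention
    }
    return il % n_swa_pattern < n_swa_pattern - 1;
}

int main() {
    const uint32_t n_layer = 8;

    for (uint32_t pattern : {1u, 4u}) {
        printf("n_swa_pattern = %u:", pattern);
        for (uint32_t il = 0; il < n_layer; ++il) {
            printf(" %c", is_swa_layer(il, pattern) ? 'S' : 'F'); // S = sliding window, F = full
        }
        printf("\n");
    }
    return 0;
}
```

With `n_swa_pattern = 1` every layer prints `F`, matching the non-iswa branch; with `4` the mixed `S`/`F` layout is what the iswa cache is built to handle.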
@@ -13353,7 +13338,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
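In `build_graph` the runtime `swa_type` flag now selects one of the two compile-time instantiations. A small self-contained sketch of this runtime-to-compile-time dispatch follows; the `graph_builder` / `phi3_builder` / `make_builder` names are hypothetical, not llama.cpp API.

```cpp
// Sketch of dispatching a runtime flag to a template instantiation (assumed names).
#include <cstdio>
#include <memory>

struct graph_builder {
    virtual ~graph_builder() = default;
    virtual void build() const = 0;
};

template<bool iswa>
struct phi3_builder : graph_builder {
    void build() const override {
        printf("building phi3 graph, iswa = %s\n", iswa ? "true" : "false");
    }
};

// runtime condition picks the compile-time specialization, as in the hunk above
static std::unique_ptr<graph_builder> make_builder(bool use_swa) {
    if (use_swa) {
        return std::make_unique<phi3_builder<true>>();
    }
    return std::make_unique<phi3_builder<false>>();
}

int main() {
    make_builder(true)->build();   // SWA-enabled variant
    make_builder(false)->build();  // current Phi-3 path after this change (SWA disabled)
    return 0;
}
```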