@@ -4666,126 +4666,6 @@ struct llm_build_context {
             ctx0 = nullptr;
         }
     }
-    struct ggml_cgraph * build_orion() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                // if (model.layers[il].bq) {
-                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                //     cb(Qcur, "Qcur", il);
-                // }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                // if (model.layers[il].bk) {
-                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                //     cb(Kcur, "Kcur", il);
-                // }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                // if (model.layers[il].bv) {
-                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                //     cb(Vcur, "Vcur", il);
-                // }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-
 
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -6589,6 +6469,125 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(