@@ -21,6 +21,7 @@ struct falcon_hparams {
     int32_t n_head    = 71;
     int32_t n_head_kv = 1;
     int32_t n_layer   = 32;
+    int32_t version   = 7; // 7 for Falcon-7B, 40 for Falcon-40B
     int32_t ftype     = 1;
 };
 
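The loader now expects a version word between n_layer and ftype, so files produced by older conversion scripts will fail the 7/40 check in the next hunk. As a reference for the file layout (a hypothetical writer-side sketch; the conversion script itself is not part of this diff), the header fields must be emitted in the same order they are read:

    // hypothetical converter-side writes mirroring the read order in falcon_model_load
    fout.write((const char *) &hparams.n_head,    sizeof(hparams.n_head));
    fout.write((const char *) &hparams.n_head_kv, sizeof(hparams.n_head_kv));
    fout.write((const char *) &hparams.n_layer,   sizeof(hparams.n_layer));
    fout.write((const char *) &hparams.version,   sizeof(hparams.version)); // new field: 7 or 40
    fout.write((const char *) &hparams.ftype,     sizeof(hparams.ftype));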
@@ -87,8 +88,14 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         fin.read((char *) &hparams.n_head,    sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_head_kv, sizeof(hparams.n_head_kv));
         fin.read((char *) &hparams.n_layer,   sizeof(hparams.n_layer));
+        fin.read((char *) &hparams.version,   sizeof(hparams.version));
         fin.read((char *) &hparams.ftype,     sizeof(hparams.ftype));
 
+        if (hparams.version != 7 && hparams.version != 40) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad Falcon version: %d)\n", __func__, fname.c_str(), hparams.version);
+            return false;
+        }
+
         const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
@@ -162,7 +169,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             n_layer *
             (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // input_layernorm_b
 
-        if (n_head_kv > 1) { // Falcon-40B
+        if (hparams.version == 40) { // Falcon-40B
             ctx_size +=
                 n_layer *
                 (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
@@ -245,7 +252,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.input_layernorm_b =
                 ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 
-            if (n_head_kv > 1) { // Falcon-40B
+            if (hparams.version == 40) { // for Falcon-40B only
                 layer.attention_norm =
                     ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
                 layer.attention_norm_b =
@@ -261,21 +268,23 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
             layer.ffn_down = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
 
             // map by name
-            // Falcon-7B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".input_layernorm.bias"] = layer.input_layernorm_b;
-
-            // Falcon-40B:
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.weight"] = layer.input_layernorm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_mlp.bias"] = layer.input_layernorm_b;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.weight"] = layer.attention_norm;
-            model.tensors["transformer.h." + std::to_string(i) +
-                          ".ln_attn.bias"] = layer.attention_norm_b;
+            if (hparams.version == 40) {
+                // Falcon-40B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_mlp.bias"] = layer.input_layernorm_b;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.weight"] = layer.attention_norm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".ln_attn.bias"] = layer.attention_norm_b;
+            } else {
+                // Falcon-7B:
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.weight"] = layer.input_layernorm;
+                model.tensors["transformer.h." + std::to_string(i) +
+                              ".input_layernorm.bias"] = layer.input_layernorm_b;
+            }
 
             model.tensors["transformer.h." + std::to_string(i) +
                           ".self_attention.query_key_value.weight"] =
@@ -346,6 +355,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
         }
 
         auto tensor = model.tensors[name.data()];
+        fprintf(stderr, "LOOKING AT %s\n", name.data());
         if (ggml_nelements(tensor) != nelements) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
            return false;
@@ -415,6 +425,7 @@ bool falcon_eval(
     const int n_head    = hparams.n_head;
     const int n_head_kv = hparams.n_head_kv;
     const int n_vocab   = hparams.n_vocab;
+    const int version   = hparams.version;
     const size_t head_dim = n_embd / n_head;
 
     static size_t buf_size = 256u*1024*1024;
@@ -477,7 +488,7 @@ bool falcon_eval(
                 layernorm_output),
             ggml_repeat(ctx0, model.layers[il].input_layernorm_b, layernorm_output));
 
-        if (n_head_kv > 1) { // Falcon-40B only
+        if (version == 40) { // Falcon-40B only
            cur = ggml_norm(ctx0, inpL);
 
            cur = ggml_add(ctx0,
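The version gate here matches the tensor mapping added earlier in this diff: Falcon-7B shares a single input_layernorm between the attention and MLP branches, while Falcon-40B carries a separate pair per layer, so the extra ggml_norm is only built for version == 40. In checkpoint terms (as mapped above):

    // per-layer normalization tensors, by version:
    //   version == 7  (Falcon-7B) : input_layernorm.{weight,bias} -> layer.input_layernorm(_b)
    //   version == 40 (Falcon-40B): ln_mlp.{weight,bias}          -> layer.input_layernorm(_b)
    //                               ln_attn.{weight,bias}         -> layer.attention_norm(_b)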
@@ -493,25 +504,40 @@ bool falcon_eval(
         // compute QKV
         cur = ggml_mul_mat(ctx0, model.layers[il].query_key_value, cur);
 
-        struct ggml_tensor * Qcur = ggml_view_4d(
-            ctx0, cur, head_dim, n_head / n_head_kv, n_head_kv, N,
+        // Below is the "qkv" view which splits up QKV into kv groups,
+        // each group containing n_head / n_head_kv query heads,
+        // one key head and one value head (hence the + 2). We don't
+        // really need this view, as we access Q, K, V through cur
+        // directly by applying offsets and strides.
+
+        /* struct ggml_tensor * qkv = ggml_view_4d(
+            ctx0, cur, head_dim, n_head / n_head_kv + 2, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
-            0);
+            0); */
+
+        // Note that the strides for Kcur and Vcur are set up so that the
+        // resulting views are misaligned with the tensor's storage
+        // (by applying the K/V offset we shift the tensor's original
+        // view so that it sticks out past the end of the viewed QKV
+        // tensor's allocated memory, so to speak). This is fine because
+        // no actual accesses happen to that out-of-range memory, but it
+        // can require some trickery when trying to accurately dump these
+        // views for debugging.
 
         struct ggml_tensor * Kcur = ggml_view_4d(
-            ctx0, cur, head_dim, 1, N, n_head_kv,
+            ctx0, cur, head_dim, 1, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-            head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+            head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
             head_dim * (n_head / n_head_kv) * sizeof_wtype);
 
         struct ggml_tensor * Vcur = ggml_view_4d(
-            ctx0, cur, head_dim, 1, N, n_head_kv,
+            ctx0, cur, head_dim, 1, n_head_kv, N,
             head_dim * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 2) * sizeof_wtype,
-            head_dim * (n_head / n_head_kv + 2) * N * sizeof_wtype,
+            head_dim * (n_head / n_head_kv + 2) * n_head_kv * sizeof_wtype,
             head_dim * (n_head / n_head_kv + 1) * sizeof_wtype);
 
         // TODO: The crazy piecewise copying below works (well, until GGML_MAX_NODES is hit),
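To make the offsets and strides above concrete, here is a worked example of the fused QKV row layout (my own annotation; head_dim = 64 is an assumed value, consistent with n_embd / n_head for the 7B defaults in falcon_hparams):

    // One token's QKV row, in elements rather than bytes, for
    // n_head = 71, n_head_kv = 1, head_dim = 64 (assumed):
    //   row length = (n_head / n_head_kv + 2) * n_head_kv * head_dim = 73 * 64 = 4672
    //   Q heads occupy elements [0,       71 * 64) -> offset 0
    //   K head occupies elements [71 * 64, 72 * 64) -> offset head_dim * (n_head / n_head_kv)
    //   V head occupies elements [72 * 64, 73 * 64) -> offset head_dim * (n_head / n_head_kv + 1)
    // Multiplying these element offsets by sizeof_wtype gives exactly the byte
    // offsets passed as the last argument of the Kcur and Vcur views above.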
@@ -540,7 +566,7 @@ bool falcon_eval(
                     Q->nb[2] * i);
 
                 struct ggml_tensor * src = ggml_view_1d(
-                    ctx0, Qcur, head_dim, src_offset);
+                    ctx0, cur, head_dim, src_offset);
 
                 struct ggml_tensor * dst = ggml_view_1d(
                     ctx0, Q, head_dim, dst_offset);
@@ -552,7 +578,9 @@ bool falcon_eval(
 
         // using mode = 2 for neox mode
         Q = ggml_rope_inplace(ctx0, Q, n_past, head_dim, 2);
+        Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);
         Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
+        Kcur = ggml_permute(ctx0, Kcur, 0, 1, 3, 2);
 
         // store key and value to memory
         {
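A note on the new permute sandwich around the Kcur rope (my reading of ggml's rope, which takes the position index from the third dimension; this is not spelled out in the commit): Kcur is now laid out as (head_dim, 1, n_head_kv, N) with the token index in the last dimension, so N is swapped into position 2 for the rotation and swapped back afterwards:

    // shape bookkeeping, written as (ne0, ne1, ne2, ne3):
    // Kcur                                : (head_dim, 1, n_head_kv, N)
    // ggml_permute(ctx0, Kcur, 0, 1, 3, 2): (head_dim, 1, N, n_head_kv)  // positions in ne2
    // ggml_rope_inplace(..., 2)           : same shape, rotary embedding applied per position
    // ggml_permute(ctx0, Kcur, 0, 1, 3, 2): (head_dim, 1, n_head_kv, N)  // layout restored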
@@ -583,14 +611,7 @@ bool falcon_eval(
 
         // K * Q
 
-        // TODO Unfortunately this ggml_repeat does not do what we need it to do:
-        // [ K1, K2 ] will be broadcast into [ [K1, K2], [K1, K2] ], while we actually
-        // need them to become [ [K1, K1], [K2, K2] ] ... And I suppose there will be same
-        // problem with V below as well.
-        // Here too perhaps GGML conversion could do some preprocessing to obtain
-        // a more GGML-friendly memory format.
-
-        K = ggml_cont(ctx0, ggml_repeat(ctx0, K, repeat_dummy));
+        K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy));
         Q = ggml_permute(ctx0, Q, 0, 2, 1, 3);
 
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
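ggml_repeat2 (an operator carried by this tree rather than upstream ggml, as far as I can tell) is the interleaved broadcast the deleted TODO was asking for. Illustrated for two kv heads expanded across four query heads:

    // expanding [ K1, K2 ] across four query heads:
    //   ggml_repeat : K1 K2 K1 K2   (tiles the whole tensor)
    //   ggml_repeat2: K1 K1 K2 K2   (repeats each head in place, matching the
    //                                grouping of query heads onto kv heads)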
@@ -611,17 +632,17 @@ bool falcon_eval(
         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V = ggml_permute(
             ctx0,
-            ggml_reshape_4d(
+            ggml_reshape_3d(
                 ctx0,
                 ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
                     il * n_ctx *
                     ggml_element_size(model.memory_v) *
                     n_head_kv *
                     head_dim),
-                head_dim, 1, n_head_kv, n_past + N),
-            0, 3, 2, 1);
+                head_dim, n_head_kv, n_past + N),
+            0, 2, 1, 3);
 
-        V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat(ctx0, V, repeat_dummy)));
+        V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy)));
 
         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
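For the V path, dropping the singleton dimension and permuting with (0, 2, 1, 3) yields the same view the old reshape_4d + permute(0, 3, 2, 1) produced, now written symmetrically with the K path (my annotation):

    // shape bookkeeping for the new V path:
    // ggml_view_1d    : head_dim * n_head_kv * (n_past + N) elements of the V cache
    // ggml_reshape_3d : (head_dim, n_head_kv, n_past + N)
    // permute(0,2,1,3): (head_dim, n_past + N, n_head_kv)
    // then ggml_repeat2 broadcasts across query heads and the transpose sets up
    // KQV = transpose(V) * KQ_soft_max, as in the K path above.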