diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 16de0c7fd..e3a6cea1d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -978,10 +978,13 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] } } else if (extra->split_dim == 1) { + void * extra_ptr; + memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr)); if (tensor->ne[2] > 1) { auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); std::vector<char> host_buffer; int ne1 = 0; + int extra_ne1 = 0; for (int i = 0; i < extra->n_device; ++i) { auto split = extra->splits[i]; if (!split) continue; @@ -990,8 +993,18 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] if (host_buffer.size() < size) host_buffer.resize(size); for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { auto dst = host_buffer.data() + i02*split->ne[1]*row_size; - auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1]; - memcpy(dst, src, split->ne[1]*row_size); + if (extra_ptr) { + auto & ranges = *(const std::vector<std::vector<std::pair<int, int>>> *)extra_ptr; + for (auto & p : ranges[i]) { + auto this_src = (const char *)data + i02*tensor->nb[2] + p.first*tensor->nb[1]; + auto this_size = p.second*tensor->nb[1]; + memcpy(dst, this_src, this_size); + dst += this_size; + } + } else { + auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1]; + memcpy(dst, src, split->ne[1]*row_size); + } } CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), size, cudaMemcpyHostToDevice, cudaStreamPerThread)); ne1 += split->ne[1]; @@ -999,8 +1012,6 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] } else { int n_interleave = 1; if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; - void * extra_ptr; - memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr)); if (extra_ptr) { auto & ranges = *(const std::vector<std::vector<std::pair<int, int>>> *)extra_ptr; GGML_ASSERT(extra->n_device == int(ranges.size())); diff --git 
a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 0ead55224..120c7846d 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -1271,7 +1271,8 @@ llm_expert_gating_func_type gating_op, auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr; auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr; auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr; - if (!split_up_exps && !split_gate_exps && !split_down_exps) { + auto split_up_gate_exps = up_gate_exps ? (ggml_split_tensor_t *)up_gate_exps->extra : nullptr; + if (!split_up_exps && !split_gate_exps && !split_up_gate_exps && !split_down_exps) { auto cur = input; if (ffn_norm) { auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm; @@ -1393,26 +1394,33 @@ llm_expert_gating_func_type gating_op, } return cur; } - GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps); - GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device); - std::vector results(split_up_exps->n_device, nullptr); + GGML_ASSERT(((split_up_exps && split_gate_exps) || split_up_gate_exps) && split_down_exps); + int n_device = split_down_exps->n_device; + if (split_up_gate_exps) { + GGML_ASSERT(split_up_gate_exps->n_device == n_device); + } else { + GGML_ASSERT(split_up_exps->n_device == n_device && split_gate_exps->n_device == n_device); + } + std::vector results(n_device, nullptr); GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) || ( split_up_shexp && split_gate_shexp && split_down_shexp)); auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra; - GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device); + GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == n_device); auto split_exp_probs_b = exp_probs_b ? 
(ggml_split_tensor_t *)exp_probs_b->extra : nullptr; - GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == n_device); auto split_gate_inp_b = gate_inp_b ? (ggml_split_tensor_t *)gate_inp_b->extra : nullptr; auto split_exps_down_b = down_exps_b ? (ggml_split_tensor_t *)down_exps_b->extra : nullptr; auto split_exps_gate_b = gate_exps_b ? (ggml_split_tensor_t *)gate_exps_b->extra : nullptr; auto split_exps_up_b = up_exps_b ? (ggml_split_tensor_t *)up_exps_b->extra : nullptr; + auto split_exps_up_gate_b = up_gate_exps_b ? (ggml_split_tensor_t *)up_gate_exps_b->extra : nullptr; int last_id = -1; bool down_bias_added = false; - for (int id = 0; id < split_up_exps->n_device; ++id) { - GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) || - (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id])); - if (!split_up_exps->splits[id]) continue; + for (int id = 0; id < n_device; ++id) { + bool has_up_gate = split_up_gate_exps ? split_up_gate_exps->splits[id] != nullptr : split_up_exps->splits[id] != nullptr && split_gate_exps->splits[id]!= nullptr ; + GGML_ASSERT((has_up_gate && split_down_exps->splits[id]) || + (!has_up_gate && !split_down_exps->splits[id])); + if (!has_up_gate) continue; int il_cb = 1000*(id + 1) + il; auto cur = get_input_tensor_sm_graph(ctx, input, id); cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, false); @@ -1425,19 +1433,21 @@ llm_expert_gating_func_type gating_op, GGML_ASSERT(!split_exps_up_b || split_exps_up_b->splits[id]); auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, split_gate_inp->splits[id], split_gate_inp_b ? split_gate_inp_b->splits[id] : nullptr, - split_up_exps->splits[id], split_exps_up_b ? split_exps_up_b->splits[id] : nullptr, - split_gate_exps->splits[id], split_exps_gate_b ? 
split_exps_gate_b->splits[id] : nullptr, + split_up_exps ? split_up_exps->splits[id] : nullptr, split_exps_up_b ? split_exps_up_b->splits[id] : nullptr, + split_gate_exps ? split_gate_exps->splits[id] : nullptr, split_exps_gate_b ? split_exps_gate_b->splits[id] : nullptr, split_down_exps->splits[id], !down_bias_added && split_exps_down_b ? split_exps_down_b->splits[id] : nullptr, split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph, false); + gating_op, cb, il, graph, false, + split_up_gate_exps ? split_up_gate_exps->splits[id] : nullptr, + split_exps_up_gate_b ? split_exps_up_gate_b->splits[id] : nullptr); cb(routed_out, "routed_out", il_cb); if (split_up_shexp) { - GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device); - GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device); - GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device); + GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == n_device); + GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == n_device); + GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == n_device); auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur, split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr, split_gate_shexp->splits[id], split_gate_b_shexp ? 
split_gate_b_shexp->splits[id] : nullptr, nullptr, @@ -1477,7 +1487,7 @@ llm_expert_gating_func_type gating_op, cb(results[last_id], "ffn_inp_added", il); } - auto cur = ggml_reduce(ctx, results.data(), split_up_exps->n_device, GGML_OP_ADD); + auto cur = ggml_reduce(ctx, results.data(), n_device, GGML_OP_ADD); cb(cur, "moe_ffn_combined", il); ggml_build_forward_expand(graph, cur); diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 87c6c615f..4728f9bc7 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -3215,9 +3215,6 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t auto ug_meta = ml.get_tensor_meta(ug_name.c_str()); //printf("Checking for tensor %s: %s\n", ug_name.c_str(), ug_meta ? "found" : "not found"); if (ug_meta) { - if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { - GGML_ABORT("Merged ffn_up_exps/ffn_gate_exps are not supported for split mode graph!"); - } layer.ffn_up_gate_exps = create_tensor(ffn_ctx, ug_name, { n_embd, 2*n_ff_exp, n_expert}, flags); } else { merged = flags == 0 && ml.merge_up_gate_exps && merge_up_gate_exps(tn, i, 0); @@ -3441,6 +3438,25 @@ static void check_delta_split(ggml_tensor * t, llama_split_tensor & l_split) { std::memcpy(t->op_params, &data, sizeof(data)); } +static void prepare_up_gate_split(ggml_tensor * t, llama_split_tensor & split) { + auto extra = (ggml_split_tensor_t *)t->extra; + GGML_ASSERT(extra); + split.ranges.resize(extra->n_device); + int idim = extra->split_dim; + int nrows = t->ne[idim]/2; + int ntot = 0; + for (int is = 0; is < extra->n_device; ++is) { + if (!extra->splits[is]) continue; + auto & ranges = split.ranges[is]; + ranges.resize(2); + int nrows_is = extra->splits[is]->ne[1]/2; + ranges[0] = {ntot, nrows_is}; + ranges[1] = {ntot + nrows, nrows_is}; + ntot += nrows_is; + } + check_delta_split(t, split); +} + // ttype = 0 -> q, k, v, always multiplied with 
head_k_dim/head_v_dim // ttype = 1 -> q, k, v, v, always multiplied with head_k_dim/head_v_dim // ttype = 2 -> v @@ -4002,10 +4018,10 @@ bool create_tensors_helper::create_tensors() { } std::vector ffn_split; - if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) { - bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && - split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && - split_tensors.find(layer.ffn_up_exps) != split_tensors.end(); + if (layer.ffn_down_exps && ((layer.ffn_up_exps && layer.ffn_gate_exps) || layer.ffn_up_gate_exps)) { + bool has_up_gate = split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && split_tensors.find(layer.ffn_up_exps) != split_tensors.end(); + has_up_gate |= split_tensors.find(layer.ffn_up_gate_exps) != split_tensors.end(); + bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && has_up_gate; if (use_split) { int ffn_granularity = 16; @@ -4017,16 +4033,29 @@ bool create_tensors_helper::create_tensors() { LLAMA_LOG_DEBUG(" split_ffn_exps:"); for ([[maybe_unused]] auto s : ffn_split) LLAMA_LOG_DEBUG(" %d", s); LLAMA_LOG_DEBUG("\n"); prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used); - prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used); - prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used); + if (layer.ffn_up_gate_exps) { + auto up_gate_split = ffn_split; + for (auto & v : up_gate_split) v *= 2; + prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps, up_gate_split, mem_used); + prepare_up_gate_split(layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps); + if (layer.ffn_up_gate_exps_b) { + prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b, up_gate_split, mem_used); + 
prepare_up_gate_split(layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b); + } + } else { + prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used); + prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used); + } if (layer.ffn_down_exps_b) { prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used); } - if (layer.ffn_up_exps_b) { - prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used); - } - if (layer.ffn_gate_exps_b) { - prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used); + if (!layer.ffn_up_gate_exps) { + if (layer.ffn_up_exps_b) { + prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used); + } + if (layer.ffn_gate_exps_b) { + prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used); + } } } } diff --git a/src/llama-model.h b/src/llama-model.h index 78556b935..b776df26e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -250,6 +250,7 @@ struct llama_layer { llama_split_tensor split_ffn_gate; llama_split_tensor split_ffn_down; llama_split_tensor split_ffn_norm; + llama_split_tensor split_ffn_up_gate; // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; @@ -262,6 +263,7 @@ struct llama_layer { llama_split_tensor split_ffn_up_exps; llama_split_tensor split_ffn_gate_exps; llama_split_tensor split_ffn_down_exps; + llama_split_tensor split_ffn_up_gate_exps; // ff MoE bias struct ggml_tensor * ffn_gate_inp_b = nullptr; @@ -288,6 +290,7 @@ struct llama_layer { llama_split_tensor split_ffn_gate_exps_b; llama_split_tensor split_ffn_down_exps_b; llama_split_tensor split_ffn_up_exps_b; + llama_split_tensor split_ffn_up_gate_exps_b; // ff bias struct ggml_tensor * ffn_gate_b = nullptr;