Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions ggml/src/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -978,10 +978,13 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
}
}
else if (extra->split_dim == 1) {
void * extra_ptr;
memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr));
if (tensor->ne[2] > 1) {
auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
std::vector<char> host_buffer;
int ne1 = 0;
int extra_ne1 = 0;
for (int i = 0; i < extra->n_device; ++i) {
auto split = extra->splits[i];
if (!split) continue;
Expand All @@ -990,17 +993,25 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
if (host_buffer.size() < size) host_buffer.resize(size);
for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) {
auto dst = host_buffer.data() + i02*split->ne[1]*row_size;
auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1];
memcpy(dst, src, split->ne[1]*row_size);
if (extra_ptr) {
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
for (auto & p : ranges[i]) {
auto this_src = (const char *)data + i02*tensor->nb[2] + p.first*tensor->nb[1];
auto this_size = p.second*tensor->nb[1];
memcpy(dst, this_src, this_size);
dst += this_size;
}
} else {
auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1];
memcpy(dst, src, split->ne[1]*row_size);
}
}
CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), size, cudaMemcpyHostToDevice, cudaStreamPerThread));
ne1 += split->ne[1];
}
} else {
int n_interleave = 1;
if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second;
void * extra_ptr;
memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr));
if (extra_ptr) {
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
GGML_ASSERT(extra->n_device == int(ranges.size()));
Expand Down
44 changes: 27 additions & 17 deletions src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1271,7 +1271,8 @@ llm_expert_gating_func_type gating_op,
auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr;
auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr;
auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr;
if (!split_up_exps && !split_gate_exps && !split_down_exps) {
auto split_up_gate_exps = up_gate_exps ? (ggml_split_tensor_t *)up_gate_exps->extra : nullptr;
if (!split_up_exps && !split_gate_exps && !split_up_gate_exps && !split_down_exps) {
auto cur = input;
if (ffn_norm) {
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
Expand Down Expand Up @@ -1393,26 +1394,33 @@ llm_expert_gating_func_type gating_op,
}
return cur;
}
GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
std::vector<ggml_tensor *> results(split_up_exps->n_device, nullptr);
GGML_ASSERT(((split_up_exps && split_gate_exps) || split_up_gate_exps) && split_down_exps);
int n_device = split_down_exps->n_device;
if (split_up_gate_exps) {
GGML_ASSERT(split_up_gate_exps->n_device == n_device);
} else {
GGML_ASSERT(split_up_exps->n_device == n_device && split_gate_exps->n_device == n_device);
}
std::vector<ggml_tensor *> results(n_device, nullptr);
GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
( split_up_shexp && split_gate_shexp && split_down_shexp));
auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == n_device);
auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == n_device);

auto split_gate_inp_b = gate_inp_b ? (ggml_split_tensor_t *)gate_inp_b->extra : nullptr;
auto split_exps_down_b = down_exps_b ? (ggml_split_tensor_t *)down_exps_b->extra : nullptr;
auto split_exps_gate_b = gate_exps_b ? (ggml_split_tensor_t *)gate_exps_b->extra : nullptr;
auto split_exps_up_b = up_exps_b ? (ggml_split_tensor_t *)up_exps_b->extra : nullptr;
auto split_exps_up_gate_b = up_gate_exps_b ? (ggml_split_tensor_t *)up_gate_exps_b->extra : nullptr;
int last_id = -1;
bool down_bias_added = false;
for (int id = 0; id < split_up_exps->n_device; ++id) {
GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) ||
(!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
if (!split_up_exps->splits[id]) continue;
for (int id = 0; id < n_device; ++id) {
bool has_up_gate = split_up_gate_exps ? split_up_gate_exps->splits[id] != nullptr : split_up_exps->splits[id] != nullptr && split_gate_exps->splits[id]!= nullptr ;
GGML_ASSERT((has_up_gate && split_down_exps->splits[id]) ||
(!has_up_gate && !split_down_exps->splits[id]));
if (!has_up_gate) continue;
int il_cb = 1000*(id + 1) + il;
auto cur = get_input_tensor_sm_graph(ctx, input, id);
cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, false);
Expand All @@ -1425,19 +1433,21 @@ llm_expert_gating_func_type gating_op,
GGML_ASSERT(!split_exps_up_b || split_exps_up_b->splits[id]);
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
split_gate_inp->splits[id], split_gate_inp_b ? split_gate_inp_b->splits[id] : nullptr,
split_up_exps->splits[id], split_exps_up_b ? split_exps_up_b->splits[id] : nullptr,
split_gate_exps->splits[id], split_exps_gate_b ? split_exps_gate_b->splits[id] : nullptr,
split_up_exps ? split_up_exps->splits[id] : nullptr, split_exps_up_b ? split_exps_up_b->splits[id] : nullptr,
split_gate_exps ? split_gate_exps->splits[id] : nullptr, split_exps_gate_b ? split_exps_gate_b->splits[id] : nullptr,
split_down_exps->splits[id], !down_bias_added && split_exps_down_b ? split_exps_down_b->splits[id] : nullptr,
split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph, false);
gating_op, cb, il, graph, false,
split_up_gate_exps ? split_up_gate_exps->splits[id] : nullptr,
split_exps_up_gate_b ? split_exps_up_gate_b->splits[id] : nullptr);
cb(routed_out, "routed_out", il_cb);

if (split_up_shexp) {
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == n_device);
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == n_device);
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == n_device);
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr,
Expand Down Expand Up @@ -1477,7 +1487,7 @@ llm_expert_gating_func_type gating_op,
cb(results[last_id], "ffn_inp_added", il);
}

auto cur = ggml_reduce(ctx, results.data(), split_up_exps->n_device, GGML_OP_ADD);
auto cur = ggml_reduce(ctx, results.data(), n_device, GGML_OP_ADD);
cb(cur, "moe_ffn_combined", il);
ggml_build_forward_expand(graph, cur);

Expand Down
57 changes: 43 additions & 14 deletions src/llama-load-tensors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3215,9 +3215,6 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t
auto ug_meta = ml.get_tensor_meta(ug_name.c_str());
//printf("Checking for tensor %s: %s\n", ug_name.c_str(), ug_meta ? "found" : "not found");
if (ug_meta) {
if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
GGML_ABORT("Merged ffn_up_exps/ffn_gate_exps are not supported for split mode graph!");
}
layer.ffn_up_gate_exps = create_tensor(ffn_ctx, ug_name, { n_embd, 2*n_ff_exp, n_expert}, flags);
} else {
merged = flags == 0 && ml.merge_up_gate_exps && merge_up_gate_exps(tn, i, 0);
Expand Down Expand Up @@ -3441,6 +3438,25 @@ static void check_delta_split(ggml_tensor * t, llama_split_tensor & l_split) {
std::memcpy(t->op_params, &data, sizeof(data));
}

// Builds per-device copy ranges for a fused up+gate experts tensor that has been
// split across devices along dimension `split_dim`.
//
// The fused tensor stores the "up" half in rows [0, nrows) and the "gate" half in
// rows [nrows, 2*nrows) of the split dimension. When the tensor is split across
// devices, each device needs its slice of BOTH halves, so two source ranges
// {row_offset, row_count} are recorded per device. The ranges are stashed into
// t->op_params via check_delta_split() so the backend set_tensor path can gather
// the two halves into one contiguous per-device buffer.
//
//  t     - the fused up+gate experts tensor (t->extra holds the split layout)
//  split - receives one vector of {first_row, n_rows} ranges per device
static void prepare_up_gate_split(ggml_tensor * t, llama_split_tensor & split) {
    auto extra = (ggml_split_tensor_t *)t->extra;
    GGML_ASSERT(extra);
    split.ranges.resize(extra->n_device);
    int idim = extra->split_dim;
    int nrows = t->ne[idim]/2;   // rows in each half (up / gate) of the fused tensor
    int ntot = 0;                // rows of the "up" half consumed so far across devices
    for (int is = 0; is < extra->n_device; ++is) {
        if (!extra->splits[is]) continue;   // this device holds no part of the tensor
        auto & ranges = split.ranges[is];
        ranges.resize(2);
        // Bug fix: index the per-device split (is), not the split dimension (idim),
        // and take the extent along the split dimension. The original
        // extra->splits[idim]->ne[1]/2 read the same split on every iteration.
        int nrows_is = extra->splits[is]->ne[idim]/2;
        ranges[0] = {ntot, nrows_is};           // this device's slice of the "up" half
        ranges[1] = {ntot + nrows, nrows_is};   // matching slice of the "gate" half
        ntot += nrows_is;
    }
    check_delta_split(t, split);
}

// ttype = 0 -> q, k, v, always multiplied with head_k_dim/head_v_dim
// ttype = 1 -> q, k, v, v, always multiplied with head_k_dim/head_v_dim
// ttype = 2 -> v
Expand Down Expand Up @@ -4002,10 +4018,10 @@ bool create_tensors_helper::create_tensors() {
}

std::vector<int> ffn_split;
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
if (layer.ffn_down_exps && ((layer.ffn_up_exps && layer.ffn_gate_exps) || layer.ffn_up_gate_exps)) {
bool has_up_gate = split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
has_up_gate |= split_tensors.find(layer.ffn_up_gate_exps) != split_tensors.end();
bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && has_up_gate;

if (use_split) {
int ffn_granularity = 16;
Expand All @@ -4017,16 +4033,29 @@ bool create_tensors_helper::create_tensors() {
LLAMA_LOG_DEBUG(" split_ffn_exps:"); for ([[maybe_unused]] auto s : ffn_split) LLAMA_LOG_DEBUG(" %d", s);
LLAMA_LOG_DEBUG("\n");
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
if (layer.ffn_up_gate_exps) {
auto up_gate_split = ffn_split;
for (auto & v : up_gate_split) v *= 2;
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps);
if (layer.ffn_up_gate_exps_b) {
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b);
}
} else {
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
}
if (layer.ffn_down_exps_b) {
prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used);
}
if (layer.ffn_up_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
}
if (layer.ffn_gate_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
if (!layer.ffn_up_gate_exps) {
if (layer.ffn_up_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
}
if (layer.ffn_gate_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
}
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ struct llama_layer {
llama_split_tensor split_ffn_gate;
llama_split_tensor split_ffn_down;
llama_split_tensor split_ffn_norm;
llama_split_tensor split_ffn_up_gate;

// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
Expand All @@ -262,6 +263,7 @@ struct llama_layer {
llama_split_tensor split_ffn_up_exps;
llama_split_tensor split_ffn_gate_exps;
llama_split_tensor split_ffn_down_exps;
llama_split_tensor split_ffn_up_gate_exps;

// ff MoE bias
struct ggml_tensor * ffn_gate_inp_b = nullptr;
Expand All @@ -288,6 +290,7 @@ struct llama_layer {
llama_split_tensor split_ffn_gate_exps_b;
llama_split_tensor split_ffn_down_exps_b;
llama_split_tensor split_ffn_up_exps_b;
llama_split_tensor split_ffn_up_gate_exps_b;

// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
Expand Down