Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions ggml/src/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -978,10 +978,13 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
}
}
else if (extra->split_dim == 1) {
void * extra_ptr;
memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr));
if (tensor->ne[2] > 1) {
auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
std::vector<char> host_buffer;
int ne1 = 0;
int extra_ne1 = 0;
for (int i = 0; i < extra->n_device; ++i) {
auto split = extra->splits[i];
if (!split) continue;
Expand All @@ -990,17 +993,25 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
if (host_buffer.size() < size) host_buffer.resize(size);
for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) {
auto dst = host_buffer.data() + i02*split->ne[1]*row_size;
auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1];
memcpy(dst, src, split->ne[1]*row_size);
if (extra_ptr) {
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
for (auto & p : ranges[i]) {
auto this_src = (const char *)data + i02*tensor->nb[2] + p.first*tensor->nb[1];
auto this_size = p.second*tensor->nb[1];
memcpy(dst, this_src, this_size);
dst += this_size;
}
} else {
auto src = (const char *)data + i02*tensor->nb[2] + ne1*tensor->nb[1];
memcpy(dst, src, split->ne[1]*row_size);
}
}
CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), size, cudaMemcpyHostToDevice, cudaStreamPerThread));
ne1 += split->ne[1];
}
} else {
int n_interleave = 1;
if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second;
void * extra_ptr;
memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr));
if (extra_ptr) {
auto & ranges = *(const std::vector<std::vector<std::pair<int,int>>> *)extra_ptr;
GGML_ASSERT(extra->n_device == int(ranges.size()));
Expand Down
44 changes: 27 additions & 17 deletions src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1271,7 +1271,8 @@ llm_expert_gating_func_type gating_op,
auto split_up_b_shexp = up_b_shexp ? (ggml_split_tensor_t *)up_b_shexp : nullptr;
auto split_gate_b_shexp = gate_b_shexp ? (ggml_split_tensor_t *)gate_b_shexp : nullptr;
auto split_down_b_shexp = down_b_shexp ? (ggml_split_tensor_t *)down_b_shexp : nullptr;
if (!split_up_exps && !split_gate_exps && !split_down_exps) {
auto split_up_gate_exps = up_gate_exps ? (ggml_split_tensor_t *)up_gate_exps->extra : nullptr;
if (!split_up_exps && !split_gate_exps && !split_up_gate_exps && !split_down_exps) {
auto cur = input;
if (ffn_norm) {
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
Expand Down Expand Up @@ -1393,26 +1394,33 @@ llm_expert_gating_func_type gating_op,
}
return cur;
}
GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
std::vector<ggml_tensor *> results(split_up_exps->n_device, nullptr);
GGML_ASSERT(((split_up_exps && split_gate_exps) || split_up_gate_exps) && split_down_exps);
int n_device = split_down_exps->n_device;
if (split_up_gate_exps) {
GGML_ASSERT(split_up_gate_exps->n_device == n_device);
} else {
GGML_ASSERT(split_up_exps->n_device == n_device && split_gate_exps->n_device == n_device);
}
std::vector<ggml_tensor *> results(n_device, nullptr);
GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
( split_up_shexp && split_gate_shexp && split_down_shexp));
auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == n_device);
auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == n_device);

auto split_gate_inp_b = gate_inp_b ? (ggml_split_tensor_t *)gate_inp_b->extra : nullptr;
auto split_exps_down_b = down_exps_b ? (ggml_split_tensor_t *)down_exps_b->extra : nullptr;
auto split_exps_gate_b = gate_exps_b ? (ggml_split_tensor_t *)gate_exps_b->extra : nullptr;
auto split_exps_up_b = up_exps_b ? (ggml_split_tensor_t *)up_exps_b->extra : nullptr;
auto split_exps_up_gate_b = up_gate_exps_b ? (ggml_split_tensor_t *)up_gate_exps_b->extra : nullptr;
int last_id = -1;
bool down_bias_added = false;
for (int id = 0; id < split_up_exps->n_device; ++id) {
GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) ||
(!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
if (!split_up_exps->splits[id]) continue;
for (int id = 0; id < n_device; ++id) {
bool has_up_gate = split_up_gate_exps ? split_up_gate_exps->splits[id] != nullptr : split_up_exps->splits[id] != nullptr && split_gate_exps->splits[id]!= nullptr ;
GGML_ASSERT((has_up_gate && split_down_exps->splits[id]) ||
(!has_up_gate && !split_down_exps->splits[id]));
if (!has_up_gate) continue;
int il_cb = 1000*(id + 1) + il;
auto cur = get_input_tensor_sm_graph(ctx, input, id);
cur = do_split_norm(ctx, cur, ffn_norm, lctx.model.hparams, cb, id, il_cb, false);
Expand All @@ -1425,19 +1433,21 @@ llm_expert_gating_func_type gating_op,
GGML_ASSERT(!split_exps_up_b || split_exps_up_b->splits[id]);
auto routed_out = llm_build_moe_ffn(ctx, lctx, cur,
split_gate_inp->splits[id], split_gate_inp_b ? split_gate_inp_b->splits[id] : nullptr,
split_up_exps->splits[id], split_exps_up_b ? split_exps_up_b->splits[id] : nullptr,
split_gate_exps->splits[id], split_exps_gate_b ? split_exps_gate_b->splits[id] : nullptr,
split_up_exps ? split_up_exps->splits[id] : nullptr, split_exps_up_b ? split_exps_up_b->splits[id] : nullptr,
split_gate_exps ? split_gate_exps->splits[id] : nullptr, split_exps_gate_b ? split_exps_gate_b->splits[id] : nullptr,
split_down_exps->splits[id], !down_bias_added && split_exps_down_b ? split_exps_down_b->splits[id] : nullptr,
split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph, false);
gating_op, cb, il, graph, false,
split_up_gate_exps ? split_up_gate_exps->splits[id] : nullptr,
split_exps_up_gate_b ? split_exps_up_gate_b->splits[id] : nullptr);
cb(routed_out, "routed_out", il_cb);

if (split_up_shexp) {
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_exps->n_device);
GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == n_device);
GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == n_device);
GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == n_device);
auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
split_gate_shexp->splits[id], split_gate_b_shexp ? split_gate_b_shexp->splits[id] : nullptr, nullptr,
Expand Down Expand Up @@ -1477,7 +1487,7 @@ llm_expert_gating_func_type gating_op,
cb(results[last_id], "ffn_inp_added", il);
}

auto cur = ggml_reduce(ctx, results.data(), split_up_exps->n_device, GGML_OP_ADD);
auto cur = ggml_reduce(ctx, results.data(), n_device, GGML_OP_ADD);
cb(cur, "moe_ffn_combined", il);
ggml_build_forward_expand(graph, cur);

Expand Down
57 changes: 43 additions & 14 deletions src/llama-load-tensors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3215,9 +3215,6 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t
auto ug_meta = ml.get_tensor_meta(ug_name.c_str());
//printf("Checking for tensor %s: %s\n", ug_name.c_str(), ug_meta ? "found" : "not found");
if (ug_meta) {
if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
GGML_ABORT("Merged ffn_up_exps/ffn_gate_exps are not supported for split mode graph!");
}
layer.ffn_up_gate_exps = create_tensor(ffn_ctx, ug_name, { n_embd, 2*n_ff_exp, n_expert}, flags);
} else {
merged = flags == 0 && ml.merge_up_gate_exps && merge_up_gate_exps(tn, i, 0);
Expand Down Expand Up @@ -3441,6 +3438,25 @@ static void check_delta_split(ggml_tensor * t, llama_split_tensor & l_split) {
std::memcpy(t->op_params, &data, sizeof(data));
}

// Builds per-device copy ranges for a fused up+gate experts tensor that has been
// split across devices along dimension `split_dim`.
//
// The fused tensor stores the "up" half in rows [0, nrows) and the "gate" half in
// rows [nrows, 2*nrows) of the split dimension. When the tensor is split across
// devices, each device needs its slice of BOTH halves, so two source ranges
// {row_offset, row_count} are recorded per device. The ranges are stashed into
// t->op_params via check_delta_split() so the backend set_tensor path can gather
// the two halves into one contiguous per-device buffer.
//
//  t     - the fused up+gate experts tensor (t->extra holds the split layout)
//  split - receives one vector of {first_row, n_rows} ranges per device
static void prepare_up_gate_split(ggml_tensor * t, llama_split_tensor & split) {
    auto extra = (ggml_split_tensor_t *)t->extra;
    GGML_ASSERT(extra);
    split.ranges.resize(extra->n_device);
    int idim = extra->split_dim;
    int nrows = t->ne[idim]/2;   // rows in each half (up / gate) of the fused tensor
    int ntot = 0;                // rows of the "up" half consumed so far across devices
    for (int is = 0; is < extra->n_device; ++is) {
        if (!extra->splits[is]) continue;   // this device holds no part of the tensor
        auto & ranges = split.ranges[is];
        ranges.resize(2);
        // Bug fix: index the per-device split (is), not the split dimension (idim),
        // and take the extent along the split dimension. The original
        // extra->splits[idim]->ne[1]/2 read the same split on every iteration.
        int nrows_is = extra->splits[is]->ne[idim]/2;
        ranges[0] = {ntot, nrows_is};           // this device's slice of the "up" half
        ranges[1] = {ntot + nrows, nrows_is};   // matching slice of the "gate" half
        ntot += nrows_is;
    }
    check_delta_split(t, split);
}

// ttype = 0 -> q, k, v, always multiplied with head_k_dim/head_v_dim
// ttype = 1 -> q, k, v, v, always multiplied with head_k_dim/head_v_dim
// ttype = 2 -> v
Expand Down Expand Up @@ -4002,10 +4018,10 @@ bool create_tensors_helper::create_tensors() {
}

std::vector<int> ffn_split;
if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
if (layer.ffn_down_exps && ((layer.ffn_up_exps && layer.ffn_gate_exps) || layer.ffn_up_gate_exps)) {
bool has_up_gate = split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() && split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
has_up_gate |= split_tensors.find(layer.ffn_up_gate_exps) != split_tensors.end();
bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() && has_up_gate;

if (use_split) {
int ffn_granularity = 16;
Expand All @@ -4017,16 +4033,29 @@ bool create_tensors_helper::create_tensors() {
LLAMA_LOG_DEBUG(" split_ffn_exps:"); for ([[maybe_unused]] auto s : ffn_split) LLAMA_LOG_DEBUG(" %d", s);
LLAMA_LOG_DEBUG("\n");
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
if (layer.ffn_up_gate_exps) {
auto up_gate_split = ffn_split;
for (auto & v : up_gate_split) v *= 2;
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps, layer.split_ffn_up_gate_exps);
if (layer.ffn_up_gate_exps_b) {
prepare_split_tensors(1, ctx_split, layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b, up_gate_split, mem_used);
prepare_up_gate_split(layer.ffn_up_gate_exps_b, layer.split_ffn_up_gate_exps_b);
}
} else {
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
}
if (layer.ffn_down_exps_b) {
prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used);
}
if (layer.ffn_up_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
}
if (layer.ffn_gate_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
if (!layer.ffn_up_gate_exps) {
if (layer.ffn_up_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
}
if (layer.ffn_gate_exps_b) {
prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
}
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ struct llama_layer {
llama_split_tensor split_ffn_gate;
llama_split_tensor split_ffn_down;
llama_split_tensor split_ffn_norm;
llama_split_tensor split_ffn_up_gate;

// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
Expand All @@ -262,6 +263,7 @@ struct llama_layer {
llama_split_tensor split_ffn_up_exps;
llama_split_tensor split_ffn_gate_exps;
llama_split_tensor split_ffn_down_exps;
llama_split_tensor split_ffn_up_gate_exps;

// ff MoE bias
struct ggml_tensor * ffn_gate_inp_b = nullptr;
Expand All @@ -288,6 +290,7 @@ struct llama_layer {
llama_split_tensor split_ffn_gate_exps_b;
llama_split_tensor split_ffn_down_exps_b;
llama_split_tensor split_ffn_up_exps_b;
llama_split_tensor split_ffn_up_gate_exps_b;

// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
Expand Down