Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1500,6 +1500,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
//}
return true;
}
if (arg == "--fit") {
params.fit = true;
return true;
}
if (arg == "--fit-margin") {
CHECK_ARG;
int32_t margin = std::stoi(argv[i]);
if (margin < 0) {
fprintf(stderr, "error: Invalid value for --fit-margin: %d (must be >= 0)\n", margin);
invalid_param = true;
} else {
params.fit_margin = margin;
}
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
Expand Down Expand Up @@ -2479,6 +2494,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"});
options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"});
options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"});
options.push_back({ "*", " --fit-margin N", "safety margin in MiB when auto-fitting model offloading"});
options.push_back({ "*", " --fit", "automatically determine which tensors to offload to the GPU(s)"});
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
" - distribute: spread execution evenly over all nodes\n"
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
Expand Down Expand Up @@ -3311,6 +3328,8 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
mparams.main_gpu = params.main_gpu;
mparams.max_gpu = params.max_gpu;
mparams.ncmoe = params.ncmoe;
mparams.fit = params.fit;
mparams.fit_margin = params.fit_margin;
mparams.type_k = kv_cache_type_from_str(params.cache_type_k);
mparams.type_v = kv_cache_type_from_str(params.cache_type_v);
mparams.max_ctx_size = params.n_ctx;
Expand Down Expand Up @@ -4357,6 +4376,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
fprintf(stream, "ncmoe: %d # default: 0\n", params.ncmoe);
fprintf(stream, "fit: %d # default: false\n", params.fit);
fprintf(stream, "fit_margin: %d # default: 0\n", params.fit_margin);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
Expand Down
2 changes: 2 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
int32_t ncmoe = 0; // number of layers in which MoE tensors are left in VRAM
int32_t fit_margin = 0; // safety margin for auto-fit in MiB
bool fit = false; // automatically fit model (for now just using MoE tensor overrides)
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
Expand Down
2 changes: 2 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ extern "C" {
int32_t n_seq_max;
int32_t n_ubatch;
int32_t amb;
int32_t fit_margin;
bool fit;

// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
Expand Down
44 changes: 22 additions & 22 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,111 +235,111 @@ struct LLM_KV {
};

// Tensor-type identifiers for all supported architectures.
// The `// N` comments are manual index markers (every 5th enumerator) for
// cross-referencing raw integer values in logs/debuggers; they carry no
// semantic weight, but they MUST match the implicit enumerator values.
// The previous comments drifted out of sync (stale from before
// SSM_A_NOSCAN, SSM_BETA_ALPHA/SSM_ALPHA, ATTN_KQ_A_MQA, etc. were
// inserted) — corrected here by recounting from 0.
enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD, // 0
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM, // 5
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K, // 10
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2, // 15
    LLM_TENSOR_ATTN_OUT_NORM,
    LLM_TENSOR_ATTN_POST_NORM,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_ATTN_SINKS,
    LLM_TENSOR_ATTN_GATE, // 20
    LLM_TENSOR_FFN_GATE_INP,
    LLM_TENSOR_FFN_GATE_INP_SHEXP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_POST_NORM,
    LLM_TENSOR_FFN_GATE, // 25
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_ACT,
    LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
    LLM_TENSOR_FFN_GATE_EXP, // 30
    LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_FFN_NORM_EXPS,
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS, // 35
    LLM_TENSOR_FFN_GATE_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
    LLM_TENSOR_FFN_EXP_PROBS_B, // 40
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D, // 45
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_A_NOSCAN,
    LLM_TENSOR_SSM_D, // 50
    LLM_TENSOR_SSM_NORM,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_SSM_BETA_ALPHA,
    LLM_TENSOR_SSM_ALPHA,
    LLM_TENSOR_SSM_BETA, // 55
    LLM_TENSOR_ATTN_Q_A,
    LLM_TENSOR_ATTN_Q_B,
    LLM_TENSOR_ATTN_KV_A_MQA,
    LLM_TENSOR_ATTN_KQ_A_MQA,
    LLM_TENSOR_ATTN_KV_B, // 60
    LLM_TENSOR_ATTN_K_B,
    LLM_TENSOR_ATTN_V_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
    LLM_TENSOR_ATTN_SUB_NORM, // 65
    LLM_TENSOR_FFN_SUB_NORM,
    LLM_TENSOR_DEC_ATTN_NORM,
    LLM_TENSOR_DEC_ATTN_Q,
    LLM_TENSOR_DEC_ATTN_K,
    LLM_TENSOR_DEC_ATTN_V, // 70
    LLM_TENSOR_DEC_ATTN_OUT,
    LLM_TENSOR_DEC_ATTN_REL_B,
    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
    LLM_TENSOR_DEC_CROSS_ATTN_Q,
    LLM_TENSOR_DEC_CROSS_ATTN_K, // 75
    LLM_TENSOR_DEC_CROSS_ATTN_V,
    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
    LLM_TENSOR_DEC_FFN_NORM,
    LLM_TENSOR_DEC_FFN_GATE, // 80
    LLM_TENSOR_DEC_FFN_DOWN,
    LLM_TENSOR_DEC_FFN_UP,
    LLM_TENSOR_DEC_OUTPUT_NORM,
    LLM_TENSOR_ENC_ATTN_NORM,
    LLM_TENSOR_ENC_ATTN_Q, // 85
    LLM_TENSOR_ENC_ATTN_K,
    LLM_TENSOR_ENC_ATTN_V,
    LLM_TENSOR_ENC_ATTN_OUT,
    LLM_TENSOR_ENC_ATTN_REL_B,
    LLM_TENSOR_ENC_FFN_NORM, // 90
    LLM_TENSOR_ENC_FFN_GATE,
    LLM_TENSOR_ENC_FFN_DOWN,
    LLM_TENSOR_ENC_FFN_UP,
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_NEXTN_EH_PROJ, // 95
    LLM_TENSOR_NEXTN_EMBED_TOKENS,
    LLM_TENSOR_NEXTN_ENORM,
    LLM_TENSOR_NEXTN_HNORM,
    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, // 100
    LLM_TENSOR_INDEXER_K_NORM,
    LLM_TENSOR_INDEXER_PROJ,
    LLM_TENSOR_INDEXER_ATTN_K,
    LLM_TENSOR_INDEXER_ATTN_Q_B, // 104

    LLM_TENSOR_UNKNOWN, // 105
};
Expand Down
7 changes: 6 additions & 1 deletion src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1852,7 +1852,12 @@ llm_tensor llm_tensor_type(llm_arch arch, const std::string & tensor_name, int i
return LLM_TENSOR_UNKNOWN;
}
for (auto & entry : it->second) {
auto this_name = ::format(entry.second.c_str(), il);
auto base_name = ::format(entry.second.c_str(), il);
auto this_name = base_name + ".weight";
if (tensor_name.find(this_name) == 0) {
return entry.first;
}
this_name = base_name + ".bias";
if (tensor_name.find(this_name) == 0) {
return entry.first;
}
Expand Down
Loading