Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 43 additions & 10 deletions tools/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ struct cmd_params {
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
std::vector<bool> use_mmap;
std::vector<bool> use_direct_io;
std::vector<bool> embeddings;
std::vector<bool> no_op_offload;
std::vector<bool> no_host;
Expand Down Expand Up @@ -372,6 +373,7 @@ static const cmd_params cmd_params_defaults = {
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { true },
/* use_direct_io */ { true },
/* embeddings */ { false },
/* no_op_offload */ { false },
/* no_host */ { false },
Expand Down Expand Up @@ -449,6 +451,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n",
join(cmd_params_defaults.use_direct_io, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
Expand Down Expand Up @@ -772,6 +776,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-dio" || arg == "--direct-io") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
Expand Down Expand Up @@ -1008,6 +1019,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.use_mmap.empty()) {
params.use_mmap = cmd_params_defaults.use_mmap;
}
if (params.use_direct_io.empty()) {
params.use_direct_io = cmd_params_defaults.use_direct_io;
}
if (params.embeddings.empty()) {
params.embeddings = cmd_params_defaults.embeddings;
}
Expand Down Expand Up @@ -1056,6 +1070,7 @@ struct cmd_params_instance {
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool use_direct_io;
bool embeddings;
bool no_op_offload;
bool no_host;
Expand All @@ -1067,11 +1082,12 @@ struct cmd_params_instance {
if (!devices.empty()) {
mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.no_host = no_host;
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.use_direct_io = use_direct_io;
mparams.no_host = no_host;

if (n_cpu_moe <= 0) {
if (tensor_buft_overrides.empty()) {
Expand Down Expand Up @@ -1115,7 +1131,8 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
devices == other.devices &&
no_host == other.no_host &&
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
Expand Down Expand Up @@ -1153,6 +1170,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & ts : params.tensor_split)
for (const auto & ot : params.tensor_buft_overrides)
for (const auto & mmp : params.use_mmap)
for (const auto & dio : params.use_direct_io)
for (const auto & noh : params.no_host)
for (const auto & embd : params.embeddings)
for (const auto & nopo : params.no_op_offload)
Expand Down Expand Up @@ -1194,6 +1212,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
Expand Down Expand Up @@ -1228,6 +1247,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
Expand Down Expand Up @@ -1262,6 +1282,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .use_direct_io= */ dio,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
Expand Down Expand Up @@ -1301,6 +1322,7 @@ struct test {
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
bool use_direct_io;
bool embeddings;
bool no_op_offload;
bool no_host;
Expand Down Expand Up @@ -1338,6 +1360,7 @@ struct test {
tensor_split = inst.tensor_split;
tensor_buft_overrides = inst.tensor_buft_overrides;
use_mmap = inst.use_mmap;
use_direct_io = inst.use_direct_io;
embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload;
no_host = inst.no_host;
Expand Down Expand Up @@ -1397,9 +1420,9 @@ struct test {
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
"tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
"no_host", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
"tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings",
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
};
return fields;
}
Expand All @@ -1414,7 +1437,7 @@ struct test {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
field == "use_mmap" || field == "embeddings" || field == "no_host") {
field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
Expand Down Expand Up @@ -1487,6 +1510,7 @@ struct test {
tensor_split_str,
tensor_buft_overrides_str,
std::to_string(use_mmap),
std::to_string(use_direct_io),
std::to_string(embeddings),
std::to_string(no_op_offload),
std::to_string(no_host),
Expand Down Expand Up @@ -1672,6 +1696,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") {
return 4;
}
if (field == "use_direct_io") {
return 3;
}
if (field == "test") {
return 15;
}
Expand Down Expand Up @@ -1709,6 +1736,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") {
return "mmap";
}
if (field == "use_direct_io") {
return "dio";
}
if (field == "embeddings") {
return "embd";
}
Expand Down Expand Up @@ -1793,6 +1823,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
fields.emplace_back("use_direct_io");
}
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}
Expand Down
Loading