From 774cf23ee556cac320fb68fd553e78100a8a9855 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 10 Dec 2025 22:13:58 -0600 Subject: [PATCH 01/56] initial commit for branch --- include/llama.h | 9 +++ src/llama-sampling.cpp | 134 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) diff --git a/include/llama.h b/include/llama.h index b52eaacfa7e..7e1e65523b0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,6 +1289,15 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); + /// @details power law sampler, reshapes probability distribution to target specific probability ranges + /// ref: https://github.com/MrJackSpade/llama.cpp + /// ref: [PR] + LLAMA_API struct llama_sampler * llama_sampler_init_power_law( + float target, // target probability (0.0 to 1.0) + float target_range, // adaptive target range (±range from target) + int32_t queue_size, // rolling history window size for adaptation + uint32_t seed); // RNG seed + LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, int32_t n_logit_bias, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 3f4a729bc36..6ef8121d7c1 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2313,6 +2313,140 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa return result; } +// power-law +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master +// ref: [PR] + +struct llama_sampler_power_law { + const float target; + const float target_range; + const int32_t queue_size; + const uint32_t seed; + + std::mt19937 rng; + ring_buffer history; +}; + +static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { + return "power-law"; +} + +static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + + // these don't need to be modified or exposed 
to the user + const float peak_logit_value = 3.0f; + const float tail_heaviness = 3.0f; + + const float min_target = ctx->target - ctx->target_range; + const float max_target = ctx->target + ctx->target_range; + + // compute probabilities to get the "original" values + llama_sampler_softmax_impl(cur_p, false); + + // store original probabilities (needed for history update) + std::vector original_probs; + original_probs.reserve(cur_p->size); + for (size_t i = 0; i < cur_p->size; ++i) { + original_probs.push_back(cur_p->data[i].p); + } + + // calculate adaptive target + float computed_target = ctx->target; + if (ctx->history.size() > 0) { + float sum_excluding_oldest = 0.0f; + size_t sz = ctx->history.size(); + + // sum all except the oldest element + for (size_t i = 0; i < sz - 1; ++i) { + sum_excluding_oldest += ctx->history.rat(i); + } + + float next_value = (ctx->target * ctx->queue_size) - sum_excluding_oldest; + computed_target = std::max(min_target, std::min(next_value, max_target)); + } + + // find closest token (for degenerate width ~ 0 case) + float min_distance = FLT_MAX; + int closest_token_idx = -1; + + for (size_t i = 0; i < cur_p->size; ++i) { + float distance = std::abs(cur_p->data[i].p - computed_target); + if (distance < min_distance) { + min_distance = distance; + closest_token_idx = (int) i; + } + } + + // apply power law transformation + for (size_t i = 0; i < cur_p->size; ++i) { + float p = cur_p->data[i].p; + + float distance = std::abs(p - computed_target); + float normalized_distance = distance / 0.2f; + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + } + + llama_sampler_softmax_impl(cur_p, false); + + // sample from distribution + const int idx = llama_sample_dist(cur_p, ctx->rng); + + // set sampled token + cur_p->selected = idx; + + // update history with ORIGINAL probability + ctx->history.push_back(original_probs[idx]); +} + +static void llama_sampler_power_law_reset(struct 
llama_sampler * smpl) { + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->history = ring_buffer(ctx->queue_size); +} + +static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; + auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); + auto * result_ctx = (llama_sampler_power_law *) result->ctx; + + result_ctx->history = ctx->history; + + return result; +} + +static void llama_sampler_power_law_free(struct llama_sampler * smpl) { + delete (llama_sampler_power_law *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_power_law_i = { + /* .name = */ llama_sampler_power_law_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_power_law_apply, + /* .reset = */ llama_sampler_power_law_reset, + /* .clone = */ llama_sampler_power_law_clone, + /* .free = */ llama_sampler_power_law_free, +}; + +struct llama_sampler * llama_sampler_init_power_law( + float target, + float target_range, + int32_t queue_size, + uint32_t seed +) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_power_law_i, + /* .ctx = */ new llama_sampler_power_law { + /* .target = */ target, + /* .target_range = */ target_range, + /* .queue_size = */ queue_size, + /* .seed = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .history = */ ring_buffer(queue_size), + } + ); +} + // logit-bias struct llama_sampler_logit_bias { From 5ab4ff7e445266f63929617c4f77cb518d24e7ae Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 10 Dec 2025 22:30:14 -0600 Subject: [PATCH 02/56] simplify constants --- src/llama-sampling.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6ef8121d7c1..173f660c732 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2334,10 +2334,6 @@ static const char * 
llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - // these don't need to be modified or exposed to the user - const float peak_logit_value = 3.0f; - const float tail_heaviness = 3.0f; - const float min_target = ctx->target - ctx->target_range; const float max_target = ctx->target + ctx->target_range; @@ -2382,9 +2378,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float distance = std::abs(p - computed_target); - float normalized_distance = distance / 0.2f; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + float normalized_distance = std::abs(p - computed_target) / 0.2f; + cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); } llama_sampler_softmax_impl(cur_p, false); From 88fb0f3f3288724eada8b1212ed6b8bd4552ac33 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 13:47:51 -0600 Subject: [PATCH 03/56] add params to `struct common_params_sampling`, add reference to PR --- common/common.h | 58 ++++++++++++++++++++++-------------------- include/llama.h | 10 ++++---- src/llama-sampling.cpp | 5 ++-- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/common/common.h b/common/common.h index 2fd83f0cf9c..e6d8af4b737 100644 --- a/common/common.h +++ b/common/common.h @@ -116,6 +116,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, + COMMON_SAMPLER_TYPE_POWER_LAW = 12, }; // dimensionality reduction methods, used by cvector-generator @@ -163,33 +164,36 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - 
int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f;// -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = 0.5; // target probability (0.0 to 1.0) + float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) + int32_t power_law_queue_size = 10; // rolling history window size for adaptation + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/include/llama.h 
b/include/llama.h index 7e1e65523b0..3adfdb99939 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1291,12 +1291,12 @@ extern "C" { /// @details power law sampler, reshapes probability distribution to target specific probability ranges /// ref: https://github.com/MrJackSpade/llama.cpp - /// ref: [PR] + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (±range from target) - int32_t queue_size, // rolling history window size for adaptation - uint32_t seed); // RNG seed + float target, // target probability (0.0 to 1.0) + float target_range, // adaptive target range (±range from target) + int32_t queue_size, // rolling history window size for adaptation + uint32_t seed); // RNG seed LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 173f660c732..fb488acffef 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2315,7 +2315,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // power-law // ref: https://github.com/MrJackSpade/llama.cpp/tree/master -// ref: [PR] +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 struct llama_sampler_power_law { const float target; @@ -2404,7 +2404,8 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; - result_ctx->history = ctx->history; + result_ctx->rng = ctx->rng; + result_ctx->history = ctx->history; return result; } From 374bfd43634e2ab2b42957243fa0a8295dd8de99 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 14:22:58 -0600 Subject: 
[PATCH 04/56] explicitly clamp `min_target` and `max_target` to `[0.0, 1.0]` --- src/llama-sampling.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fb488acffef..eacad79448c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2334,8 +2334,9 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - const float min_target = ctx->target - ctx->target_range; - const float max_target = ctx->target + ctx->target_range; + // clamp the target range to [0.0, 1.0] + const float min_target = std::max(ctx->target - ctx->target_range, 0.0f); + const float max_target = std::min(ctx->target + ctx->target_range, 1.0f); // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); From ffe163911be3201f303c40cf18df431ce14e6e71 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 15:16:11 -0600 Subject: [PATCH 05/56] add args, rename `queue_size` -> `window_size` --- common/arg.cpp | 23 +++++++++++++++++++++++ common/common.h | 2 +- include/llama.h | 4 ++-- src/llama-sampling.cpp | 14 +++++++------- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a31dcbc689c..42106333984 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1501,6 +1501,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_sparam()); + add_opt(common_arg( + {"--power-law-target"}, "N", + string_format("Power Law sampler target probability (default: %.2f, 0.0 to 1.0)\n" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + (double)params.sampling.power_law_target), + [](common_params & params, const std::string & value) { + params.sampling.power_law_target = 
std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--power-law-target-range"}, "N", + string_format("Power Law sampler adaptive range +/- from target (default: %.2f, 0.0 = no adaptation)", (double)params.sampling.power_law_target_range), + [](common_params & params, const std::string & value) { + params.sampling.power_law_target_range = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--power-law-window-size"}, "N", + string_format("Power Law sampler rolling window size, in tokens (default: %d)", params.sampling.power_law_window_size), + [](common_params & params, int value) { + params.sampling.power_law_window_size = value; + } + ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), diff --git a/common/common.h b/common/common.h index e6d8af4b737..d4f1229a7e7 100644 --- a/common/common.h +++ b/common/common.h @@ -186,7 +186,7 @@ struct common_params_sampling { int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) float power_law_target = 0.5; // target probability (0.0 to 1.0) float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) - int32_t power_law_queue_size = 10; // rolling history window size for adaptation + int32_t power_law_window_size = 10; // rolling history window size for target adaptation int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy diff --git a/include/llama.h b/include/llama.h index 3adfdb99939..1aace655d01 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1294,8 +1294,8 @@ extern "C" { /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 LLAMA_API struct llama_sampler * llama_sampler_init_power_law( float target, // target probability (0.0 to 1.0) - 
float target_range, // adaptive target range (±range from target) - int32_t queue_size, // rolling history window size for adaptation + float target_range, // adaptive target range (+/- range from target) + int32_t window_size, // rolling history window size for target adaptation uint32_t seed); // RNG seed LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index eacad79448c..e2c229cd9fe 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2320,7 +2320,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { const float target; const float target_range; - const int32_t queue_size; + const int32_t window_size; const uint32_t seed; std::mt19937 rng; @@ -2359,7 +2359,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok sum_excluding_oldest += ctx->history.rat(i); } - float next_value = (ctx->target * ctx->queue_size) - sum_excluding_oldest; + float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; computed_target = std::max(min_target, std::min(next_value, max_target)); } @@ -2397,12 +2397,12 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->queue_size); + ctx->history = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->window_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; result_ctx->rng = ctx->rng; @@ -2427,7 
+2427,7 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, float target_range, - int32_t queue_size, + int32_t window_size, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); @@ -2436,10 +2436,10 @@ struct llama_sampler * llama_sampler_init_power_law( /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, /* .target_range = */ target_range, - /* .queue_size = */ queue_size, + /* .window_size = */ window_size, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .history = */ ring_buffer(queue_size), + /* .history = */ ring_buffer(window_size), } ); } From 4959878a748be461f0bf1e7fecfe93694d5eaba4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:27:14 -0600 Subject: [PATCH 06/56] improved comments --- common/arg.cpp | 2 +- include/llama.h | 15 +++++++++++---- src/llama-sampling.cpp | 11 +++++++++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 42106333984..eac74547680 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1512,7 +1512,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive range +/- from target (default: %.2f, 0.0 = no adaptation)", (double)params.sampling.power_law_target_range), + string_format("Power Law sampler adaptive target range (target±range) (default: %.2f, 0.0 = fixed target)", (double)params.sampling.power_law_target_range), [](common_params & params, const std::string & value) { params.sampling.power_law_target_range = std::stof(value); } diff --git a/include/llama.h b/include/llama.h index 1aace655d01..95df1058cc5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,12 +1289,19 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// @details power law sampler, reshapes probability distribution to 
target specific probability ranges - /// ref: https://github.com/MrJackSpade/llama.cpp - /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 + /// @details power-law sampler - reshapes probability distribution to target specific probability ranges + /// + /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID + /// rather than just transforming logits. therefore it must always be the last sampler in the + /// sampler chain. + /// + /// it is recommended to only perform minimal truncation before this sampler. + /// + /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (+/- range from target) + float target_range, // adaptive target range (target±range) int32_t window_size, // rolling history window size for target adaptation uint32_t seed); // RNG seed diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e2c229cd9fe..0b591d60a88 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2314,8 +2314,15 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa } // power-law -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master -// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +// +// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID +// rather than just transforming logits. therefore it must always be the last sampler in the +// sampler chain. +// +// it is recommended to only perform minimal truncation before this sampler. 
+// +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) struct llama_sampler_power_law { const float target; From f3457a83e653b85074dff573ee723069f7cf1fed Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:36:00 -0600 Subject: [PATCH 07/56] minor --- common/arg.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index eac74547680..18259c72c26 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1503,7 +1503,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("Power Law sampler target probability (default: %.2f, 0.0 to 1.0)\n" + string_format("Power Law sampler target probability (default: %.2f; allowed range 0.0 to 1.0)\n" "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { @@ -1512,7 +1512,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive target range (target±range) (default: %.2f, 0.0 = fixed target)", (double)params.sampling.power_law_target_range), + string_format("Power Law sampler adaptive target range (target±range) (default: %.2f; 0.0 = fixed target)", (double)params.sampling.power_law_target_range), [](common_params & params, const std::string & value) { params.sampling.power_law_target_range = std::stof(value); } From 93169593b8f4571df120f6e8dbf8c21185a589ff Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:46:17 -0600 Subject: [PATCH 08/56] remove old unused code from algorithm --- src/llama-sampling.cpp | 15 +-------------- 1 file changed, 1 
insertion(+), 14 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 0b591d60a88..b61202c6367 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2370,24 +2370,11 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok computed_target = std::max(min_target, std::min(next_value, max_target)); } - // find closest token (for degenerate width ~ 0 case) - float min_distance = FLT_MAX; - int closest_token_idx = -1; - - for (size_t i = 0; i < cur_p->size; ++i) { - float distance = std::abs(cur_p->data[i].p - computed_target); - if (distance < min_distance) { - min_distance = distance; - closest_token_idx = (int) i; - } - } - // apply power law transformation for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - computed_target) / 0.2f; - cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); + cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); } llama_sampler_softmax_impl(cur_p, false); From b3aea5776865d09bda4f35729ee367b70cb47f64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:48:52 -0600 Subject: [PATCH 09/56] minor --- src/llama-sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b61202c6367..06a1eef1485 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2390,8 +2390,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->history = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { From cd7de7c7a8fc30ec45737df428a09e2b80c30289 Mon Sep 17 
00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:23:27 -0600 Subject: [PATCH 10/56] add power law case to `common_sampler_init`, add sampler name mappings --- common/sampling.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 7a6b7be1e0e..07d71533841 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -243,6 +243,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.data())); if (params.mirostat == 0) { + // if this flag is set, we will not need to add `dist` at the end of the sampler chain + bool has_distribution_sampler = false; + for (const auto & cnstr : params.samplers) { switch (cnstr) { case COMMON_SAMPLER_TYPE_DRY: @@ -253,7 +256,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: @@ -283,11 +286,18 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_PENALTIES: llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; + case COMMON_SAMPLER_TYPE_POWER_LAW: + llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_target_range, params.power_law_window_size, params.seed)); + has_distribution_sampler = true; + break; 
default: GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + // only add `dist` to the end of the chain if no other distribution samplers were added + if (!has_distribution_sampler) { + llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + } } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); @@ -586,6 +596,7 @@ std::vector common_sampler_types_from_names(const std::vect { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, + { "power_law", COMMON_SAMPLER_TYPE_POWER_LAW }, }; // since samplers names are written multiple ways @@ -601,6 +612,7 @@ std::vector common_sampler_types_from_names(const std::vect { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { "power-law", COMMON_SAMPLER_TYPE_POWER_LAW }, }; std::vector samplers; From 534cb4fbba8782cef4b40f3a789811d801d72db5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:29:04 -0600 Subject: [PATCH 11/56] clarify behaviour when `window_size = 0` --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 18259c72c26..31f67627f64 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1519,7 +1519,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-window-size"}, "N", - string_format("Power Law sampler rolling window size, in tokens (default: %d)", params.sampling.power_law_window_size), + string_format("Power Law sampler rolling window size, in tokens (default: %d; 0 = fixed target)", 
params.sampling.power_law_window_size), [](common_params & params, int value) { params.sampling.power_law_window_size = value; } From dcada035b4d18702cce3135a052c7c5dea71e478 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:49:47 -0600 Subject: [PATCH 12/56] add missing enums --- common/sampling.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 07d71533841..90f48c5a056 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -564,6 +564,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; + case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w'; default : return '?'; } } @@ -580,6 +581,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; + case COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law"; default : return ""; } } From 2d62bbea9fcdb3cb40b7a437680f3a5c716bebd6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 22:43:10 -0600 Subject: [PATCH 13/56] remove `target_range` param, make `target == 1` no-op, cleanup code --- common/arg.cpp | 13 ++------- common/common.h | 60 +++++++++++++++++++--------------------- include/llama.h | 11 +++++--- src/llama-sampling.cpp | 63 +++++++++++++++++++++++++----------------- 4 files changed, 77 insertions(+), 70 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 31f67627f64..a8ea0caf334 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1503,23 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("Power Law sampler target probability (default: %.2f; allowed range 0.0 to 1.0)\n" - "[(more 
info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + string_format("target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) " + "(%.1f = default)\n[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { params.sampling.power_law_target = std::stof(value); } ).set_sparam()); - add_opt(common_arg( - {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive target range (target±range) (default: %.2f; 0.0 = fixed target)", (double)params.sampling.power_law_target_range), - [](common_params & params, const std::string & value) { - params.sampling.power_law_target_range = std::stof(value); - } - ).set_sparam()); add_opt(common_arg( {"--power-law-window-size"}, "N", - string_format("Power Law sampler rolling window size, in tokens (default: %d; 0 = fixed target)", params.sampling.power_law_window_size), + string_format("rolling window size for target adaptation in Power Law sampling (≤0 = fixed target; %d = default)", params.sampling.power_law_window_size), [](common_params & params, int value) { params.sampling.power_law_window_size = value; } diff --git a/common/common.h b/common/common.h index d4f1229a7e7..ba3d776bdc5 100644 --- a/common/common.h +++ b/common/common.h @@ -164,42 +164,40 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = 0.5; // target probability (0.0 to 1.0) - float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) - int32_t power_law_window_size = 10; // rolling history window size for target adaptation - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f; // -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater 
than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = -1.0f; // target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) + int32_t power_law_window_size = 10; // rolling window size for target adaptation in Power Law sampling (≤0 = fixed target) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to 
track user-specified samplers std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY - std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, diff --git a/include/llama.h b/include/llama.h index 95df1058cc5..ce1308d2bd3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1297,13 +1297,16 @@ extern "C" { /// /// it is recommended to only perform minimal truncation before this sampler. /// + /// @param target target probability (valid range 0.0 to 1.0; <0 = disabled) + /// @param window_size rolling window size for target adaptation (≤0 = fixed target) + /// @param seed RNG seed + /// /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (target±range) - int32_t window_size, // rolling history window size for target adaptation - uint32_t seed); // RNG seed + float target, + int32_t window_size, + uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 06a1eef1485..d5f485f846d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2326,12 +2326,11 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { const float target; - const float target_range; const int32_t window_size; - const uint32_t seed; + const uint32_t seed; std::mt19937 rng; - ring_buffer history; + ring_buffer window; }; static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { @@ -2341,66 +2340,82 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void 
llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - // clamp the target range to [0.0, 1.0] - const float min_target = std::max(ctx->target - ctx->target_range, 0.0f); - const float max_target = std::min(ctx->target + ctx->target_range, 1.0f); + if (ctx->target < 0.0f) { + // no-op: just sample from the distribution as-is + llama_sampler_softmax_impl(cur_p, false); + const int idx = llama_sample_dist(cur_p, ctx->rng); + cur_p->selected = idx; + return; + } + + // fixed power law transform parameters (from original implementation) + const float distribution_width = 0.2f; + const float peak_logit_value = 3.0f; + const float tail_heaviness = 3.0f; // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); - // store original probabilities (needed for history update) + // store original probabilities (used for future target adaptation) std::vector original_probs; original_probs.reserve(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { original_probs.push_back(cur_p->data[i].p); } + // // calculate adaptive target + // + + const float min_target = 0.0f; + const float max_target = 1.0f; + float computed_target = ctx->target; - if (ctx->history.size() > 0) { + if (ctx->window.size() > 0) { float sum_excluding_oldest = 0.0f; - size_t sz = ctx->history.size(); + size_t sz = ctx->window.size(); // sum all except the oldest element for (size_t i = 0; i < sz - 1; ++i) { - sum_excluding_oldest += ctx->history.rat(i); + sum_excluding_oldest += ctx->window.rat(i); } float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; computed_target = std::max(min_target, std::min(next_value, max_target)); } - // apply power law transformation + // + // power law transform + // + for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - computed_target) / 0.2f; - 
cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); + float normalized_distance = std::abs(p - computed_target) / distribution_width; + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); } llama_sampler_softmax_impl(cur_p, false); - // sample from distribution + // sample from the transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); - - // set sampled token cur_p->selected = idx; - // update history with ORIGINAL probability - ctx->history.push_back(original_probs[idx]); + // add the ORIGINAL probability to the rolling window + ctx->window.push_back(original_probs[idx]); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->window = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->window_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->window_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; result_ctx->rng = ctx->rng; - result_ctx->history = ctx->history; + result_ctx->window = ctx->window; return result; } @@ -2420,7 +2435,6 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, - float target_range, int32_t window_size, uint32_t seed ) { @@ -2429,11 +2443,10 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .target_range = */ target_range, /* .window_size = */ window_size, /* .seed = */ seed_cur, /* .rng = */ 
std::mt19937(seed_cur), - /* .history = */ ring_buffer(window_size), + /* .window = */ ring_buffer(window_size), } ); } From 5c78b7927fed36512538539d8ff7518c0d23d8cb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 22:47:36 -0600 Subject: [PATCH 14/56] oops, straggler --- common/sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 90f48c5a056..63a17287dc8 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -287,7 +287,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_target_range, params.power_law_window_size, params.seed)); + llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_window_size, params.seed)); has_distribution_sampler = true; break; default: From 53380c183f225a63ab788cf00e0a0188da073e47 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 22:39:51 -0600 Subject: [PATCH 15/56] add missing parameters in `server-task.cpp` --- tools/server/server-task.cpp | 52 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 360826062b1..c3ac98f13fb 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -182,31 +182,33 @@ task_params server_task::params_from_json_cmpl( params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector()); - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - 
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.seed 
= json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + 
params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); + params.sampling.power_law_window_size = json_value(data, "power_law_window_size", defaults.sampling.power_law_window_size); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); From 94cb883ed9184ac96a838566b0cbbb7918237b64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 23:19:08 -0600 Subject: [PATCH 16/56] copy from author ref: https://gist.github.com/MrJackSpade/9be99c7efbba7b95a41377e123b7b069 --- src/llama-sampling.cpp | 156 +++++++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 31 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d5f485f846d..738fd05caa8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2337,21 +2337,134 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* return "power-law"; } +// Computes the target probability for the current sampling step. +// +// The target determines which token probabilities the power law distribution +// will favor. 
This function implements a dynamic feedback mechanism to maintain +// an average selection probability close to the base target over time. +// +// When the window is empty: +// - Returns the base target value (ctx->target) +// +// When the window has entries: +// - Calculates what the next target should be to keep the weighted average +// of selected token probabilities equal to ctx->target +// - Uses exponential decay weighting: newer values have more influence +// +// Exponential Decay Weighting: +// After inserting the new value, the weights will be: +// new_value: weight = 1 (age 0, newest) +// rat(0): weight = decay (age 1) +// rat(1): weight = decay^2 (age 2) +// ... +// rat(sz-2): weight = decay^(sz-1) +// rat(sz-1): evicted (oldest) +// +// The "effective window size" is approximately 1/(1-decay): +// decay=0.9 → effective window ≈ 10 tokens +// decay=0.95 → effective window ≈ 20 tokens +// decay=1.0 → no decay, equivalent to simple average (original behavior) +// +// Formula derivation: +// We want the weighted average after insertion to equal target: +// +// (new_value * 1 + Σ rat(i) * decay^(i+1)) / total_weight = target +// +// Where total_weight = 1 + decay + decay^2 + ... + decay^(sz-1) +// = (1 - decay^sz) / (1 - decay) [geometric series] +// +// Solving for new_value: +// new_value = target * total_weight - decay * Σ rat(i) * decay^i +// +// The factor of 'decay' on the sum accounts for all existing values +// shifting one position older when the new value is inserted. +// +// The exponential decay helps prevent "fishtailing" - a phenomenon where +// forced high-probability selections (when the model is very confident) +// cause the algorithm to overcorrect with many low-probability selections, +// then swing back the other way. By decaying old values, the influence of +// forced selections fades faster, reducing oscillation amplitude and +// recovery time. 
+// +// Finally, the computed target is clamped to [min_target, max_target] to +// prevent extreme values that could destabilize sampling. +// +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, + float min_target, + float max_target, + float tail_decay) { + float computed_target = ctx->target; + size_t sz = ctx->window.size(); + + if (sz > 0) { + // Check if window is at capacity (oldest element will be evicted on next push) + // Use the window_size parameter from context, not a capacity() method + const bool window_full = (sz == ctx->window_size); + + // Compute weighted sum with exponential decay + // rat(0) = newest in buffer, gets weight 1 + // rat(i) gets weight decay^i + // + // When window is full: exclude oldest element (it will be evicted) + // When window is not full: include all elements (nothing evicted) + float weighted_sum = 0.0f; + float weight = 1.0f; + size_t elements_to_sum = window_full ? (sz - 1) : sz; + + for (size_t i = 0; i < elements_to_sum; ++i) { + weighted_sum += ctx->window.rat(i) * weight; + weight *= tail_decay; + } + + // Compute total weight after new value is inserted + // When full: sz elements remain (oldest evicted, new added) + // When not full: sz + 1 elements (new added, nothing evicted) + size_t final_element_count = window_full ? 
sz : (sz + 1); + + float total_weight; + if (std::abs(tail_decay - 1.0f) < FLT_EPSILON) { + total_weight = (float) final_element_count; + } else { + total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); + } + + // Shift weights to account for new value taking position 0 + // All existing values age by 1, so multiply their weights by decay + float shifted_weighted_sum = weighted_sum * tail_decay; + + // Solve for the new value that achieves target weighted average + float next_value = (ctx->target * total_weight) - shifted_weighted_sum; + + // Clamp to allowed range + computed_target = std::max(min_target, std::min(next_value, max_target)); + } + + return computed_target; +} + static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; if (ctx->target < 0.0f) { + fprintf(stderr, "Target below zero, sampling from distribution\n"); // no-op: just sample from the distribution as-is llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // fixed power law transform parameters (from original implementation) - const float distribution_width = 0.2f; - const float peak_logit_value = 3.0f; - const float tail_heaviness = 3.0f; + // fixed power law transform parameters + const float distribution_width = 0.3f; + const float peak_logit_value = 5.0f; + const float tail_heaviness = 2.0f; + + // target computation parameters + const float min_target = 0.0f; + const float max_target = 1.0f; + const float tail_decay = 0.50f; // Exponential decay factor for history weighting + // Lower = faster response, higher = more stability + // Effective window ≈ 1/(1-decay) ≈ 20 tokens // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); @@ -2363,45 +2476,26 @@ static void 
llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok original_probs.push_back(cur_p->data[i].p); } - // // calculate adaptive target - // - - const float min_target = 0.0f; - const float max_target = 1.0f; - - float computed_target = ctx->target; - if (ctx->window.size() > 0) { - float sum_excluding_oldest = 0.0f; - size_t sz = ctx->window.size(); + float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); - // sum all except the oldest element - for (size_t i = 0; i < sz - 1; ++i) { - sum_excluding_oldest += ctx->window.rat(i); - } - - float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; - computed_target = std::max(min_target, std::min(next_value, max_target)); - } - - // // power law transform - // - for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; + float p = cur_p->data[i].p; float normalized_distance = std::abs(p - computed_target) / distribution_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); } llama_sampler_softmax_impl(cur_p, false); // sample from the transformed distribution - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; // add the ORIGINAL probability to the rolling window - ctx->window.push_back(original_probs[idx]); + float original_p = original_probs[idx]; + + ctx->window.push_back(original_p); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { From 0a19a3fd6c179d0e2761130a86cf945acc838c83 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 23:32:57 -0600 Subject: [PATCH 17/56] remove old debug log, style nit --- src/llama-sampling.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/llama-sampling.cpp 
b/src/llama-sampling.cpp index 738fd05caa8..5871668d96b 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2389,10 +2389,12 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* // Finally, the computed target is clamped to [min_target, max_target] to // prevent extreme values that could destabilize sampling. // -static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, - float min_target, - float max_target, - float tail_decay) { +static float llama_sampler_power_law_compute_target( + const llama_sampler_power_law * ctx, + float min_target, + float max_target, + float tail_decay) { + float computed_target = ctx->target; size_t sz = ctx->window.size(); @@ -2416,6 +2418,10 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la weight *= tail_decay; } + // Shift weights to account for new value taking position 0 + // All existing values age by 1, so multiply their weights by decay + float shifted_weighted_sum = weighted_sum * tail_decay; + // Compute total weight after new value is inserted // When full: sz elements remain (oldest evicted, new added) // When not full: sz + 1 elements (new added, nothing evicted) @@ -2428,10 +2434,6 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); } - // Shift weights to account for new value taking position 0 - // All existing values age by 1, so multiply their weights by decay - float shifted_weighted_sum = weighted_sum * tail_decay; - // Solve for the new value that achieves target weighted average float next_value = (ctx->target * total_weight) - shifted_weighted_sum; @@ -2446,7 +2448,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok auto * ctx = (llama_sampler_power_law *) smpl->ctx; if (ctx->target < 0.0f) { - fprintf(stderr, "Target below zero, sampling 
from distribution\n"); // no-op: just sample from the distribution as-is llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); @@ -2462,9 +2463,9 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // target computation parameters const float min_target = 0.0f; const float max_target = 1.0f; - const float tail_decay = 0.50f; // Exponential decay factor for history weighting - // Lower = faster response, higher = more stability - // Effective window ≈ 1/(1-decay) ≈ 20 tokens + const float tail_decay = 0.50f; // exponential decay factor for history weighting + // lower = faster response, higher = more stability + // effective window ≈ 1/(1-decay) ≈ 20 tokens // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); @@ -2479,7 +2480,10 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // calculate adaptive target float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); + // // power law transform + // + for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; float normalized_distance = std::abs(p - computed_target) / distribution_width; From 824bb3aa6ebc14e5bf9c2bb5d0959841100f10fd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 00:23:15 -0600 Subject: [PATCH 18/56] fix compiler warning, add commented-out logging per token --- src/llama-sampling.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5871668d96b..7686f59148e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2401,7 +2401,7 @@ static float llama_sampler_power_law_compute_target( if (sz > 0) { // Check if window is at capacity (oldest element will be evicted on next push) // Use the window_size parameter from context, not a capacity() method - const bool window_full = (sz == 
ctx->window_size); + const bool window_full = (sz == (size_t)ctx->window_size); // Compute weighted sum with exponential decay // rat(0) = newest in buffer, gets weight 1 @@ -2496,6 +2496,18 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; + // uncomment this to log the target values and history window contents for every token + // + // fprintf(stderr, "power_law: window_size=%zu/%d values=[", + // ctx->window.size(), ctx->window_size); + // for (size_t i = 0; i < ctx->window.size(); ++i) { + // fprintf(stderr, "%.1f", ctx->window.rat(i)); + // if (i < ctx->window.size() - 1) fprintf(stderr, ","); + // } + // fprintf(stderr, "] computed_target=%.4f selected_token=%d orig_prob=%.4f\n", + // computed_target, cur_p->data[idx].id, original_probs[idx]); + // fflush(stderr); + // add the ORIGINAL probability to the rolling window float original_p = original_probs[idx]; From a96ddd743a8badf058a31edf893ce5c660a02eee Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 22:15:03 -0600 Subject: [PATCH 19/56] re-write + change parameters + simplify --- common/common.h | 58 +++++----- include/llama.h | 22 ++-- src/llama-sampling.cpp | 207 +++++++++++------------------------ tools/server/server-task.cpp | 54 ++++----- 4 files changed, 130 insertions(+), 211 deletions(-) diff --git a/common/common.h b/common/common.h index ba3d776bdc5..66a6ca96b3a 100644 --- a/common/common.h +++ b/common/common.h @@ -164,35 +164,35 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = -1.0f; // target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) - int32_t power_law_window_size = 10; // rolling window size for target adaptation in Power Law sampling (≤0 = fixed target) - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f; // -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, 
output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) + float power_law_decay = 0.9f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/include/llama.h b/include/llama.h index ce1308d2bd3..f3867c69886 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,24 +1289,28 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// @details power-law sampler - reshapes probability distribution to target specific probability ranges + /// power-law + /// + /// this sampler implements a power law probability transformation with adaptive + /// target tracking. it reshapes token probability distributions to favor tokens near a + /// configurable target probability, rather than always selecting from the highest probability + /// candidates. it is ideal for creative, unpredictable text generation. /// /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID /// rather than just transforming logits. therefore it must always be the last sampler in the /// sampler chain. /// - /// it is recommended to only perform minimal truncation before this sampler. + /// minimal truncation before this sampler is recommended. /// - /// @param target target probability (valid range 0.0 to 1.0; <0 = disabled) - /// @param window_size rolling window size for target adaptation (≤0 = fixed target) - /// @param seed RNG seed + /// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) + /// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) /// - /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) + /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, - int32_t window_size, - uint32_t seed); + float target, + float decay, + uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7686f59148e..db126a18d50 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2315,133 +2315,62 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // power-law // +// this sampler implements a power law probability transformation with adaptive +// target tracking. it reshapes token probability distributions to favor tokens near a +// configurable target probability, rather than always selecting from the highest probability +// candidates. it is ideal for creative, unpredictable text generation. +// // this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID // rather than just transforming logits. therefore it must always be the last sampler in the // sampler chain. // -// it is recommended to only perform minimal truncation before this sampler. +// minimal truncation before this sampler is recommended. 
// -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) // ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) struct llama_sampler_power_law { - const float target; - const int32_t window_size; - const uint32_t seed; - std::mt19937 rng; - ring_buffer window; + // the desired average probability for selected tokens (0.0 to 1.0) + // higher values favor more probable tokens (more deterministic) + // lower values favor less probable tokens (more creative) + // negative values disable Power Law sampling (sample from distribution as-is) + const float target; + + // controls how quickly history influence fades (0.0 to 0.99) + // lower values = faster adaptation, more reactive to recent tokens + // higher values = slower adaptation, more stable over time + // effective history length ≈ 1/(1-decay) tokens + // examples: decay=0.5 → ~2 tokens, decay=0.9 → ~10, decay=0.95 → ~20 + // internally clamped to <= 0.99 to prevent unbounded accumulation + const float decay; + + const uint32_t seed; + std::mt19937 rng; + + // historical token probabilities weighted by recency + float weighted_sum; + // sum of weights, converges to 1/(1-decay) + float total_weight; }; static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { return "power-law"; } -// Computes the target probability for the current sampling step. -// -// The target determines which token probabilities the power law distribution -// will favor. This function implements a dynamic feedback mechanism to maintain -// an average selection probability close to the base target over time. 
-// -// When the window is empty: -// - Returns the base target value (ctx->target) -// -// When the window has entries: -// - Calculates what the next target should be to keep the weighted average -// of selected token probabilities equal to ctx->target -// - Uses exponential decay weighting: newer values have more influence -// -// Exponential Decay Weighting: -// After inserting the new value, the weights will be: -// new_value: weight = 1 (age 0, newest) -// rat(0): weight = decay (age 1) -// rat(1): weight = decay^2 (age 2) -// ... -// rat(sz-2): weight = decay^(sz-1) -// rat(sz-1): evicted (oldest) -// -// The "effective window size" is approximately 1/(1-decay): -// decay=0.9 → effective window ≈ 10 tokens -// decay=0.95 → effective window ≈ 20 tokens -// decay=1.0 → no decay, equivalent to simple average (original behavior) -// -// Formula derivation: -// We want the weighted average after insertion to equal target: -// -// (new_value * 1 + Σ rat(i) * decay^(i+1)) / total_weight = target -// -// Where total_weight = 1 + decay + decay^2 + ... + decay^(sz-1) -// = (1 - decay^sz) / (1 - decay) [geometric series] -// -// Solving for new_value: -// new_value = target * total_weight - decay * Σ rat(i) * decay^i -// -// The factor of 'decay' on the sum accounts for all existing values -// shifting one position older when the new value is inserted. -// -// The exponential decay helps prevent "fishtailing" - a phenomenon where -// forced high-probability selections (when the model is very confident) -// cause the algorithm to overcorrect with many low-probability selections, -// then swing back the other way. By decaying old values, the influence of -// forced selections fades faster, reducing oscillation amplitude and -// recovery time. -// -// Finally, the computed target is clamped to [min_target, max_target] to -// prevent extreme values that could destabilize sampling. 
-// -static float llama_sampler_power_law_compute_target( - const llama_sampler_power_law * ctx, - float min_target, - float max_target, - float tail_decay) { - - float computed_target = ctx->target; - size_t sz = ctx->window.size(); - - if (sz > 0) { - // Check if window is at capacity (oldest element will be evicted on next push) - // Use the window_size parameter from context, not a capacity() method - const bool window_full = (sz == (size_t)ctx->window_size); - - // Compute weighted sum with exponential decay - // rat(0) = newest in buffer, gets weight 1 - // rat(i) gets weight decay^i - // - // When window is full: exclude oldest element (it will be evicted) - // When window is not full: include all elements (nothing evicted) - float weighted_sum = 0.0f; - float weight = 1.0f; - size_t elements_to_sum = window_full ? (sz - 1) : sz; - - for (size_t i = 0; i < elements_to_sum; ++i) { - weighted_sum += ctx->window.rat(i) * weight; - weight *= tail_decay; - } - - // Shift weights to account for new value taking position 0 - // All existing values age by 1, so multiply their weights by decay - float shifted_weighted_sum = weighted_sum * tail_decay; - - // Compute total weight after new value is inserted - // When full: sz elements remain (oldest evicted, new added) - // When not full: sz + 1 elements (new added, nothing evicted) - size_t final_element_count = window_full ? 
sz : (sz + 1); - - float total_weight; - if (std::abs(tail_decay - 1.0f) < FLT_EPSILON) { - total_weight = (float) final_element_count; - } else { - total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); - } - - // Solve for the new value that achieves target weighted average - float next_value = (ctx->target * total_weight) - shifted_weighted_sum; - - // Clamp to allowed range - computed_target = std::max(min_target, std::min(next_value, max_target)); +// compute the adaptive target probability for the current sampling step +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { + if (ctx->total_weight == 0.0f) { + // if there is no history, just use base target + return ctx->target; } - return computed_target; + // maintain a running weighted sum with exponential decay + float new_total_weight = 1.0f + decay * ctx->total_weight; + float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; + + // clamp to [0.0, 1.0] + return std::max(0.0f, std::min(next_value, 1.0f)); } static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { @@ -2455,30 +2384,25 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok return; } + // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) + const float decay = std::min(ctx->decay, 0.99f); + // fixed power law transform parameters const float distribution_width = 0.3f; const float peak_logit_value = 5.0f; const float tail_heaviness = 2.0f; - // target computation parameters - const float min_target = 0.0f; - const float max_target = 1.0f; - const float tail_decay = 0.50f; // exponential decay factor for history weighting - // lower = faster response, higher = more stability - // effective window ≈ 1/(1-decay) ≈ 20 tokens - - // compute probabilities to get the "original" values + // get the original probabilities 
llama_sampler_softmax_impl(cur_p, false); - // store original probabilities (used for future target adaptation) + // store the original probabilities (needed for history update after selection) std::vector original_probs; original_probs.reserve(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { original_probs.push_back(cur_p->data[i].p); } - // calculate adaptive target - float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); + float computed_target = llama_sampler_power_law_compute_target(ctx, decay); // // power law transform @@ -2492,40 +2416,30 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok llama_sampler_softmax_impl(cur_p, false); - // sample from the transformed distribution + // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // uncomment this to log the target values and history window contents for every token - // - // fprintf(stderr, "power_law: window_size=%zu/%d values=[", - // ctx->window.size(), ctx->window_size); - // for (size_t i = 0; i < ctx->window.size(); ++i) { - // fprintf(stderr, "%.1f", ctx->window.rat(i)); - // if (i < ctx->window.size() - 1) fprintf(stderr, ","); - // } - // fprintf(stderr, "] computed_target=%.4f selected_token=%d orig_prob=%.4f\n", - // computed_target, cur_p->data[idx].id, original_probs[idx]); - // fflush(stderr); - - // add the ORIGINAL probability to the rolling window - float original_p = original_probs[idx]; - - ctx->window.push_back(original_p); + // update running history with the original probability of the selected token + float original_p = original_probs[idx]; + ctx->weighted_sum = original_p + decay * ctx->weighted_sum; + ctx->total_weight = 1.0f + decay * ctx->total_weight; } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->window = ring_buffer(ctx->window_size); + 
auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->weighted_sum = 0.0f; + ctx->total_weight = 0.0f; } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->window_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; - result_ctx->rng = ctx->rng; - result_ctx->window = ctx->window; + result_ctx->rng = ctx->rng; + result_ctx->weighted_sum = ctx->weighted_sum; + result_ctx->total_weight = ctx->total_weight; return result; } @@ -2545,7 +2459,7 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, - int32_t window_size, + float decay, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); @@ -2553,10 +2467,11 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .window_size = */ window_size, + /* .decay = */ decay, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .window = */ ring_buffer(window_size), + /* .weighted_sum = */ 0.0f, + /* .total_weight = */ 0.0f, } ); } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index c3ac98f13fb..6c083e66245 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -182,33 +182,33 @@ task_params server_task::params_from_json_cmpl( params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector()); - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, 
"min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); - 
params.sampling.power_law_window_size = json_value(data, "power_law_window_size", defaults.sampling.power_law_window_size); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", 
defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); + params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); From b8a9626a739541cc6f65cd07ced19b12c364bf48 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 22:17:08 -0600 Subject: [PATCH 20/56] oops forgot args.cpp --- common/arg.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0226a6e644b..919e37b7f8c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1559,18 +1559,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) " - "(%.1f = default)\n[(more 
info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
+        string_format("power law sampler: select tokens near this probability (valid range 0.0 "
+                      "to 1.0; <0 = disabled) (default: %.2f)\n"
+                      "[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)",
+            (double)params.sampling.power_law_target),
         [](common_params & params, const std::string & value) {
             params.sampling.power_law_target = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--power-law-window-size"}, "N",
-        string_format("rolling window size for target adaptation in Power Law sampling (≤0 = fixed target; %d = default)", params.sampling.power_law_window_size),
+        {"--power-law-decay"}, "N",
+        string_format("power law sampler: decay rate for target adaptation over time. lower "
+                      "values -> faster but less stable adaptation. "
+                      "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)",
+            (double)params.sampling.power_law_decay),
-        [](common_params & params, int value) {
-            params.sampling.power_law_window_size = value;
+        [](common_params & params, const std::string & value) {
+            params.sampling.power_law_decay = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
From 965bcc9dc4675432d37340647a6916adbe79f184 Mon Sep 17 00:00:00 2001
From: ddh0 
Date: Sat, 13 Dec 2025 22:19:15 -0600
Subject: [PATCH 21/56] fix leftover `window_size`
---
 common/sampling.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 63a17287dc8..8bfdae3be16 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -287,7 +287,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
             break;
         case COMMON_SAMPLER_TYPE_POWER_LAW:
-
llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); has_distribution_sampler = true; break; default: From d1e5c60442aebfc788e5096eac8d810efea3c1df Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 23:26:03 -0600 Subject: [PATCH 22/56] add missing values to `common_params_sampling::print()` --- common/sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 8bfdae3be16..a8494a679de 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -151,11 +151,11 @@ std::string common_params_sampling::print() const { "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, - mirostat, mirostat_eta, mirostat_tau); + mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay); return std::string(result); } From 9613c481725a0fb39784db5b292cdc3de446156f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 00:36:59 -0600 Subject: [PATCH 23/56] with logging --- src/llama-sampling.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index db126a18d50..ae3e269ea2e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2362,12 +2362,16 @@ static const 
char * llama_sampler_power_law_name(const struct llama_sampler * /* static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { if (ctx->total_weight == 0.0f) { // if there is no history, just use base target + fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", ctx->target); + fflush(stderr); return ctx->target; } // maintain a running weighted sum with exponential decay float new_total_weight = 1.0f + decay * ctx->total_weight; + fprintf(stderr, "power-law: compute_target: new_total_weight = %.3f\n", new_total_weight); fflush(stderr); float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; + fprintf(stderr, "power-law: compute_target: next_value = %.3f\n", next_value); fflush(stderr); // clamp to [0.0, 1.0] return std::max(0.0f, std::min(next_value, 1.0f)); @@ -2378,14 +2382,16 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is + fprintf(stderr, "power-law: no-op!"); fflush(stderr); llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) const float decay = std::min(ctx->decay, 0.99f); + fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); // fixed power law transform parameters const float distribution_width = 0.3f; @@ -2403,15 +2409,20 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } float computed_target = llama_sampler_power_law_compute_target(ctx, decay); + fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); // // power law transform // for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - 
computed_target) / distribution_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + float p = cur_p->data[i].p; + fprintf(stderr, "power-law: transform: p = %.3f\n", p); fflush(stderr); + float normed_distance = std::abs(p - computed_target) / distribution_width; + fprintf(stderr, "power-law: transform: normed_distance = %.3f\n", normed_distance); fflush(stderr); + float new_p = peak_logit_value / (1.0f + std::pow(normed_distance, tail_heaviness)); + fprintf(stderr, "power-law: transform: new_p = %.3f\n", new_p); fflush(stderr); + cur_p->data[i].logit = new_p; } llama_sampler_softmax_impl(cur_p, false); @@ -2419,6 +2430,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; + fprintf(stderr, "power-law: selected token %d\n", idx); fflush(stderr); // update running history with the original probability of the selected token float original_p = original_probs[idx]; From 2a3f579d1ffcd2dffeb60ea21e7a4ceba6d15e22 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 01:55:02 -0600 Subject: [PATCH 24/56] does this fix it? 
--- src/llama-sampling.cpp | 49 ++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ae3e269ea2e..26135a4f826 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2358,23 +2358,20 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* return "power-law"; } -// compute the adaptive target probability for the current sampling step -static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { +// compute the adapted target probability for the current sampling step +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) { + const float base_target = ctx->target; if (ctx->total_weight == 0.0f) { - // if there is no history, just use base target - fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", ctx->target); - fflush(stderr); - return ctx->target; + fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); + return base_target; } + float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight); + fprintf(stderr, "power-law: compute_target: target = %.3f\n", target); - // maintain a running weighted sum with exponential decay - float new_total_weight = 1.0f + decay * ctx->total_weight; - fprintf(stderr, "power-law: compute_target: new_total_weight = %.3f\n", new_total_weight); fflush(stderr); - float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; - fprintf(stderr, "power-law: compute_target: next_value = %.3f\n", next_value); fflush(stderr); - - // clamp to [0.0, 1.0] - return std::max(0.0f, std::min(next_value, 1.0f)); + // clamp result to [0.0, 1.0] + target = std::max(0.0f, std::min(target, 1.0f)); + fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr); + return target; } 
static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { @@ -2393,11 +2390,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok const float decay = std::min(ctx->decay, 0.99f); fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); - // fixed power law transform parameters - const float distribution_width = 0.3f; - const float peak_logit_value = 5.0f; - const float tail_heaviness = 2.0f; - // get the original probabilities llama_sampler_softmax_impl(cur_p, false); @@ -2408,21 +2400,22 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok original_probs.push_back(cur_p->data[i].p); } - float computed_target = llama_sampler_power_law_compute_target(ctx, decay); + float computed_target = llama_sampler_power_law_compute_target(ctx); fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); // // power law transform // + // transformation constants + const float distribution_width = 0.3f; + const float peak_logit_value = 5.0f; + + const float inv_width = 1.0f / distribution_width; + for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; - fprintf(stderr, "power-law: transform: p = %.3f\n", p); fflush(stderr); - float normed_distance = std::abs(p - computed_target) / distribution_width; - fprintf(stderr, "power-law: transform: normed_distance = %.3f\n", normed_distance); fflush(stderr); - float new_p = peak_logit_value / (1.0f + std::pow(normed_distance, tail_heaviness)); - fprintf(stderr, "power-law: transform: new_p = %.3f\n", new_p); fflush(stderr); - cur_p->data[i].logit = new_p; + float dist = (cur_p->data[i].p - computed_target) * inv_width; + cur_p->data[i].logit = peak_logit_value / (1.0f + dist * dist); } llama_sampler_softmax_impl(cur_p, false); @@ -2430,7 +2423,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution 
const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - fprintf(stderr, "power-law: selected token %d\n", idx); fflush(stderr); + fprintf(stderr, "power-law: selected token at index %d\n", idx); fflush(stderr); // update running history with the original probability of the selected token float original_p = original_probs[idx]; From ec54fe5f1445e982e68b6a9c05975de1310719e8 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 02:54:14 -0600 Subject: [PATCH 25/56] no, but does this? --- common/arg.cpp | 10 ++++------ src/llama-sampling.cpp | 3 +++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 919e37b7f8c..e7bb44f8f5d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1569,12 +1569,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-decay"}, "N", - string_format("power law sampler: decay rate for target adaptation over time. lower " - "values -> faster but less stable adaptation. " - "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", - (double)params.sampling.power_law_decay), - [](common_params & params, int value) { - params.sampling.power_law_decay = value; + string_format("decay rate for target adaptation over time. 
lower values -> faster but less stable adaptation.\n" + "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay), + [](common_params & params, const std::string & value) { + params.sampling.power_law_decay = std::stof(value); } ).set_sparam()); add_opt(common_arg( diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 26135a4f826..6beb927a6ca 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2427,8 +2427,11 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // update running history with the original probability of the selected token float original_p = original_probs[idx]; + fprintf(stderr, "power-law: original prob was %.3f\n", original_p); fflush(stderr); ctx->weighted_sum = original_p + decay * ctx->weighted_sum; + fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); fflush(stderr); ctx->total_weight = 1.0f + decay * ctx->total_weight; + fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { From 667b70fdac1054401f6ab278fba99a90bcf5253b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 03:41:28 -0600 Subject: [PATCH 26/56] update default decay --- common/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 66a6ca96b3a..7fe62b4111e 100644 --- a/common/common.h +++ b/common/common.h @@ -185,7 +185,7 @@ struct common_params_sampling { int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.9f; // decay rate for target adaptation over time. 
lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + float power_law_decay = 0.50f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy From 693478066981b41f3e3b7a714c9327310a87dfc2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 16:26:15 -0600 Subject: [PATCH 27/56] optimize --- src/llama-sampling.cpp | 53 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6beb927a6ca..78fe7706b97 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2349,11 +2349,18 @@ struct llama_sampler_power_law { std::mt19937 rng; // historical token probabilities weighted by recency - float weighted_sum; + float weighted_sum; // sum of weights, converges to 1/(1-decay) - float total_weight; + float total_weight; + // used to store original token probabilities (needed for history update after selection) + std::vector original_probs; }; +// transformation constants +static constexpr float DISTRIBUTION_WIDTH = 0.3f; +static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; + static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { return "power-law"; } @@ -2369,7 +2376,7 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la fprintf(stderr, "power-law: compute_target: target = %.3f\n", target); // clamp result to [0.0, 1.0] - target = std::max(0.0f, std::min(target, 1.0f)); + target = std::clamp(target, 0.0f, 1.0f); fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr); return target; } @@ -2379,43 +2386,32 @@ 
static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is - fprintf(stderr, "power-law: no-op!"); fflush(stderr); + fprintf(stderr, "power-law: no-op!"); llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) - const float decay = std::min(ctx->decay, 0.99f); - fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); - // get the original probabilities llama_sampler_softmax_impl(cur_p, false); - // store the original probabilities (needed for history update after selection) - std::vector original_probs; - original_probs.reserve(cur_p->size); + // store the original probabilities + ctx->original_probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { - original_probs.push_back(cur_p->data[i].p); + ctx->original_probs[i] = cur_p->data[i].p; } float computed_target = llama_sampler_power_law_compute_target(ctx); - fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); + fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); // // power law transform // - // transformation constants - const float distribution_width = 0.3f; - const float peak_logit_value = 5.0f; - - const float inv_width = 1.0f / distribution_width; - for (size_t i = 0; i < cur_p->size; ++i) { - float dist = (cur_p->data[i].p - computed_target) * inv_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + dist * dist); + float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; + cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); } llama_sampler_softmax_impl(cur_p, false); @@ -2423,14 +2419,14 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); 
cur_p->selected = idx; - fprintf(stderr, "power-law: selected token at index %d\n", idx); fflush(stderr); + fprintf(stderr, "power-law: selected token at index %d\n", idx); // update running history with the original probability of the selected token - float original_p = original_probs[idx]; - fprintf(stderr, "power-law: original prob was %.3f\n", original_p); fflush(stderr); - ctx->weighted_sum = original_p + decay * ctx->weighted_sum; - fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); fflush(stderr); - ctx->total_weight = 1.0f + decay * ctx->total_weight; + float original_p = ctx->original_probs[idx]; + fprintf(stderr, "power-law: original prob was %.3f\n", original_p); + ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum; + fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr); } @@ -2448,6 +2444,7 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s result_ctx->rng = ctx->rng; result_ctx->weighted_sum = ctx->weighted_sum; result_ctx->total_weight = ctx->total_weight; + result_ctx->original_probs.reserve(ctx->original_probs.capacity()); return result; } @@ -2475,7 +2472,7 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .decay = */ decay, + /* .decay = */ std::min(decay, 0.99f), /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ 0.0f, From f5d08724e75d3f41d4737c333349e03b21baa704 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 21:51:59 -0600 Subject: [PATCH 28/56] fix bad merge my git skills are lacking --- common/sampling.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common/sampling.cpp 
b/common/sampling.cpp index ee58aa50b30..1e26f44a6cb 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -253,8 +253,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co for (const auto & str : params.dry_sequence_breakers) { c_breakers.push_back(str.c_str()); } - - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; @@ -286,7 +284,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); has_distribution_sampler = true; break; default: @@ -295,10 +292,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co } // only add `dist` to the end of the chain if no other distribution samplers were added if (!has_distribution_sampler) { - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + samplers.push_back(llama_sampler_init_dist(params.seed)); } - - samplers.push_back(llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { samplers.push_back(llama_sampler_init_temp(params.temp)); samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); From 493bf301ff861cc1ce52dc86e8204954c98f8f80 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 
Dec 2025 21:55:45 -0600 Subject: [PATCH 29/56] silence `missing initializer for member` --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 78fe7706b97..e044ef5898d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2477,6 +2477,7 @@ struct llama_sampler * llama_sampler_init_power_law( /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ 0.0f, /* .total_weight = */ 0.0f, + /* .original_probs = */ {}, } ); } From 68543257e944acf75f2483619c54638ee46a3901 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:03:17 -0600 Subject: [PATCH 30/56] update default decay to 0.9 --- common/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 7231cbc5b87..4cc909beeb4 100644 --- a/common/common.h +++ b/common/common.h @@ -185,7 +185,7 @@ struct common_params_sampling { int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.50f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy From b5ed673ce92fdc9753679742ef28a218b5df1e68 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:08:36 -0600 Subject: [PATCH 31/56] fix logging --- src/llama-sampling.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e044ef5898d..1c1febee2d5 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2386,17 +2386,15 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is - fprintf(stderr, "power-law: no-op!"); + fprintf(stderr, "power-law: no-op!\n"); fflush(stderr); llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // get the original probabilities + // softmax and store the original probabilities llama_sampler_softmax_impl(cur_p, false); - - // store the original probabilities ctx->original_probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { ctx->original_probs[i] = cur_p->data[i].p; @@ -2409,6 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // power law transform // + fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); From 4e28eb2ffe9d052132f9daa4e5b0d73dec27bb0a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:11:34 -0600 Subject: [PATCH 32/56] format (double) --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 
1c1febee2d5..f2553408373 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2407,7 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // power law transform // - fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", cur_p->size); + fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", (double)cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); From 1c58e9a96a3060e907a60cfae41c837b6f46e2ea Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:32:27 -0600 Subject: [PATCH 33/56] add power law to the new `samplers` vector --- common/sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 1e26f44a6cb..05e44170e40 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -285,6 +285,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co break; case COMMON_SAMPLER_TYPE_POWER_LAW: has_distribution_sampler = true; + samplers.push_back(llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); break; default: GGML_ASSERT(false && "unknown sampler type"); From 4e04bd1ce21f6ec85897e89058866f18d4214b3a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 23:14:51 -0600 Subject: [PATCH 34/56] log sampler init values --- src/llama-sampling.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index f2553408373..cf235b57d45 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2466,16 +2466,19 @@ struct llama_sampler * llama_sampler_init_power_law( float decay, uint32_t seed ) { + const float _decay = std::min(decay, 0.99f); + fprintf(stderr, "power-law: init: target %.3f, decay %.3f\n", (double)target, (double)_decay); + fflush(stderr); auto 
seed_cur = get_rng_seed(seed); return llama_sampler_init( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { - /* .target = */ target, - /* .decay = */ std::min(decay, 0.99f), - /* .seed = */ seed_cur, - /* .rng = */ std::mt19937(seed_cur), - /* .weighted_sum = */ 0.0f, - /* .total_weight = */ 0.0f, + /* .target = */ target, + /* .decay = */ _decay, + /* .seed = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .weighted_sum = */ 0.0f, + /* .total_weight = */ 0.0f, /* .original_probs = */ {}, } ); From 9c50b573f5e316037700d2fd548adc8a81074d6c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 15 Dec 2025 09:25:05 -0600 Subject: [PATCH 35/56] improve logging messages in llama_sampler_power_law --- src/llama-sampling.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index cf235b57d45..dc827fe2199 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2369,15 +2369,15 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) { const float base_target = ctx->target; if (ctx->total_weight == 0.0f) { - fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); + fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); fflush(stderr); return base_target; } float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight); - fprintf(stderr, "power-law: compute_target: target = %.3f\n", target); + fprintf(stderr, "power-law: compute_target: raw target = %.3f\n", target); // clamp result to [0.0, 1.0] target = std::clamp(target, 0.0f, 1.0f); - fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr); + fprintf(stderr, "power-law: compute_target: clamped target = %.3f\n", target); 
fflush(stderr); return target; } @@ -2407,7 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // power law transform // - fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", (double)cur_p->size); + fprintf(stderr, "power-law: transform: cur_p->size = %d\n", (size_t)cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); From 0344068cf112e524eb3fbdbd58c171870b63e56c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 15 Dec 2025 09:35:44 -0600 Subject: [PATCH 36/56] remove extraneous logging --- src/llama-sampling.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index dc827fe2199..7b48e5d970f 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2401,7 +2401,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } float computed_target = llama_sampler_power_law_compute_target(ctx); - fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); // // power law transform From 1c2d2e900d487d70f704441bafe9ac87afd89d6f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 15 Dec 2025 21:02:11 -0600 Subject: [PATCH 37/56] simplify target computation last commit with debug logging! 
--- src/llama-sampling.cpp | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7b48e5d970f..7684c8f38cc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2365,22 +2365,6 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* return "power-law"; } -// compute the adapted target probability for the current sampling step -static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) { - const float base_target = ctx->target; - if (ctx->total_weight == 0.0f) { - fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); fflush(stderr); - return base_target; - } - float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight); - fprintf(stderr, "power-law: compute_target: raw target = %.3f\n", target); - - // clamp result to [0.0, 1.0] - target = std::clamp(target, 0.0f, 1.0f); - fprintf(stderr, "power-law: compute_target: clamped target = %.3f\n", target); fflush(stderr); - return target; -} - static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; @@ -2400,13 +2384,18 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok ctx->original_probs[i] = cur_p->data[i].p; } - float computed_target = llama_sampler_power_law_compute_target(ctx); + // compute the adapted target probability for the current sampling step + float computed_target = std::clamp( + ctx->total_weight == 0.0f ? 
ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight), + 0.0f, 1.0f + ); + fprintf(stderr, "power-law: computed target = %.3f\n", computed_target); // // power law transform // - fprintf(stderr, "power-law: transform: cur_p->size = %d\n", (size_t)cur_p->size); + fprintf(stderr, "power-law: cur_p->size = %d\n", (int)cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); @@ -2421,7 +2410,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // update running history with the original probability of the selected token float original_p = ctx->original_probs[idx]; - fprintf(stderr, "power-law: original prob was %.3f\n", original_p); ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum; fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; From fcb512908630db298337c3ad13361e4493f1fb8b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 15 Dec 2025 21:42:29 -0600 Subject: [PATCH 38/56] remove debug logging, explicitly clamp params at init --- src/llama-sampling.cpp | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7684c8f38cc..77ec141a560 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2370,10 +2370,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is - fprintf(stderr, "power-law: no-op!\n"); fflush(stderr); llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); - cur_p->selected = idx; + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); return; } @@ -2389,13 +2387,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * 
smpl, llama_tok ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight), 0.0f, 1.0f ); - fprintf(stderr, "power-law: computed target = %.3f\n", computed_target); - // // power law transform - // - - fprintf(stderr, "power-law: cur_p->size = %d\n", (int)cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); @@ -2406,14 +2399,10 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - fprintf(stderr, "power-law: selected token at index %d\n", idx); // update running history with the original probability of the selected token - float original_p = ctx->original_probs[idx]; - ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum; - fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); - ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; - fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr); + ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; // history fades over time } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { @@ -2453,15 +2442,12 @@ struct llama_sampler * llama_sampler_init_power_law( float decay, uint32_t seed ) { - const float _decay = std::min(decay, 0.99f); - fprintf(stderr, "power-law: init: target %.3f, decay %.3f\n", (double)target, (double)_decay); - fflush(stderr); auto seed_cur = get_rng_seed(seed); return llama_sampler_init( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { - /* .target = */ target, - /* .decay = */ _decay, + /* .target = */ std::clamp(target, 0.0f, 1.0f), + /* .decay = */ 
std::clamp(decay, 0.0f, 0.99f), /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ 0.0f, From 775299892e7026b7823fb7cbef63fcd78777c031 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 17 Dec 2025 15:06:05 -0600 Subject: [PATCH 39/56] add `use_power_law` flag + logic, minor cleanup --- common/sampling.cpp | 36 +++++++++++++++++++++--------------- include/llama.h | 2 +- src/llama-sampling.cpp | 20 +++++++++----------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 05e44170e40..d571c5ecd43 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -241,8 +241,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co } if (params.mirostat == 0) { - // if this flag is set, we will not need to add `dist` at the end of the sampler chain - bool has_distribution_sampler = false; + + bool use_power_law = false; for (const auto & cnstr : params.samplers) { switch (cnstr) { @@ -253,46 +253,52 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co for (const auto & str : params.dry_sequence_breakers) { c_breakers.push_back(str.c_str()); } - samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: - samplers.push_back(llama_sampler_init_top_k (params.top_k)); + samplers.push_back(llama_sampler_init_top_k(params.top_k)); break; case COMMON_SAMPLER_TYPE_TOP_P: - samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep)); + samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep)); 
break; case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma)); break; case COMMON_SAMPLER_TYPE_MIN_P: - samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep)); + samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TYPICAL_P: - samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep)); + samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TEMPERATURE: - samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - samplers.push_back(llama_sampler_init_infill (vocab)); + samplers.push_back(llama_sampler_init_infill(vocab)); break; case COMMON_SAMPLER_TYPE_PENALTIES: - samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - has_distribution_sampler = true; - samplers.push_back(llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); + // the `power_law` sampler is like `dist` in that it selects a single token, + // so we will add `dist` at the end of the chain by default, unless the user + // specifically included `power_law`. 
we set this flag here so we know to add + // it at the very end. + use_power_law = true; break; default: GGML_ASSERT(false && "unknown sampler type"); } } - // only add `dist` to the end of the chain if no other distribution samplers were added - if (!has_distribution_sampler) { + if (use_power_law) { + // only if user explicitly included power_law sampler + samplers.push_back(llama_sampler_init_power_law(params.power_law_target, params.power_law_decay, params.seed)); + } else { + // default: sample from distribution samplers.push_back(llama_sampler_init_dist(params.seed)); } } else if (params.mirostat == 1) { diff --git a/include/llama.h b/include/llama.h index 3ec3f25c958..f903d34a56b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1309,7 +1309,7 @@ extern "C" { /// this sampler implements a power law probability transformation with adaptive /// target tracking. it reshapes token probability distributions to favor tokens near a /// configurable target probability, rather than always selecting from the highest probability - /// candidates. it is ideal for creative, unpredictable text generation. + /// candidates. /// /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID /// rather than just transforming logits. therefore it must always be the last sampler in the diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 77ec141a560..59393275a58 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2318,7 +2318,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // this sampler implements a power law probability transformation with adaptive // target tracking. it reshapes token probability distributions to favor tokens near a // configurable target probability, rather than always selecting from the highest probability -// candidates. it is ideal for creative, unpredictable text generation. +// candidates. 
// // this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID // rather than just transforming logits. therefore it must always be the last sampler in the @@ -2332,7 +2332,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { // the desired average probability for selected tokens (0.0 to 1.0) - // higher values favor more probable tokens (more deterministic) + // higher values favor more probable tokens (more stable and predictable) // lower values favor less probable tokens (more creative) // negative values disable Power Law sampling (sample from distribution as-is) const float target; @@ -2341,19 +2341,17 @@ struct llama_sampler_power_law { // lower values = faster adaptation, more reactive to recent tokens // higher values = slower adaptation, more stable over time // effective history length ≈ 1/(1-decay) tokens - // examples: decay=0.5 → ~2 tokens, decay=0.9 → ~10, decay=0.95 → ~20 + // example: decay=0.5 --> ~2 tokens; decay=0.9 --> ~10 tokens; decay=0.95 --> ~20 tokens // internally clamped to <= 0.99 to prevent unbounded accumulation const float decay; const uint32_t seed; std::mt19937 rng; - // historical token probabilities weighted by recency - float weighted_sum; - // sum of weights, converges to 1/(1-decay) - float total_weight; - // used to store original token probabilities (needed for history update after selection) - std::vector original_probs; + // member variables + float weighted_sum; // historical token probabilities weighted by recency + float total_weight; // sum of weights, converges to 1/(1-decay) + std::vector original_probs; // used to store original token probabilities }; // transformation constants @@ -2401,8 +2399,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok cur_p->selected = idx; // update running history with the original probability of the selected token - ctx->weighted_sum = 
ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; - ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; // history fades over time + ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; // history fades over time + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { From b95b0884ddd74a46b8aa98c5edf38f194d9515ab Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 27 Dec 2025 02:10:20 -0600 Subject: [PATCH 40/56] update `power-law` -> `adaptive-p` --- common/arg.cpp | 20 +++--- common/common.h | 6 +- common/sampling.cpp | 32 ++++----- include/llama.h | 31 +++++---- src/llama-sampling.cpp | 123 +++++++++++++++-------------------- tools/server/server-task.cpp | 4 +- 6 files changed, 102 insertions(+), 114 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1ef8d705488..87438d8d09c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1597,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--power-law-target"}, "N", - string_format("power law sampler: select tokens near this probability (valid range 0.0 " - "to 1.0; <0 = disabled) (default: %.2f)\n" + {"--adaptive-target"}, "N", + string_format("adaptive-p: select tokens near this probability (valid range 0.0 " + "to 1.0; negative = disabled) (default: %.2f)\n" "[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)", - (double)params.sampling.power_law_target), + (double)params.sampling.adaptive_target), [](common_params & params, const std::string & value) { - params.sampling.power_law_target = std::stof(value); + params.sampling.adaptive_target = std::stof(value); } ).set_sparam()); add_opt(common_arg( - {"--power-law-decay"}, "N", - string_format("decay rate for target adaptation over time. 
lower values -> faster but less stable adaptation.\n" - "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay), + {"--adaptive-decay"}, "N", + string_format("adaptive-p: decay rate for target adaptation over time. lower values " + "are more reactive, higher values are more stable.\n" + "(valid range 0.0 to 0.99) (default: %.2f)", + (double)params.sampling.adaptive_decay), [](common_params & params, const std::string & value) { - params.sampling.power_law_decay = std::stof(value); + params.sampling.adaptive_decay = std::stof(value); } ).set_sparam()); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index 66e738e30ae..2145f4f4c27 100644 --- a/common/common.h +++ b/common/common.h @@ -117,7 +117,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, - COMMON_SAMPLER_TYPE_POWER_LAW = 12, + COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12, }; // dimensionality reduction methods, used by cvector-generator @@ -185,8 +185,8 @@ struct common_params_sampling { float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) + float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy diff --git a/common/sampling.cpp b/common/sampling.cpp index 4c95450a739..140404f12fc 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -150,11 +150,11 @@ std::string common_params_sampling::print() const { "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, - mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay); + mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay); return std::string(result); } @@ -237,7 +237,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (params.mirostat == 0) { - bool use_power_law = false; + bool use_adaptive_p = false; // see below for (const auto & cnstr : params.samplers) { switch (cnstr) { @@ -278,20 +278,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_PENALTIES: 
samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; - case COMMON_SAMPLER_TYPE_POWER_LAW: - // the `power_law` sampler is like `dist` in that it selects a single token, - // so we will add `dist` at the end of the chain by default, unless the user - // specifically included `power_law`. we set this flag here so we know to add - // it at the very end. - use_power_law = true; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: + // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects + // a single token, so we will add `dist` at the end of the chain by default, + // unless the user specifically included `adaptive-p`. we set this flag here + // so we know to add the sampler at the very end. + use_adaptive_p = true; break; default: GGML_ASSERT(false && "unknown sampler type"); } } - if (use_power_law) { - // only if user explicitly included power_law sampler - samplers.push_back(llama_sampler_init_power_law(params.power_law_target, params.power_law_decay, params.seed)); + if (use_adaptive_p) { + // only if user explicitly included adaptive-p sampler + samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed)); } else { // default: sample from distribution samplers.push_back(llama_sampler_init_dist(params.seed)); @@ -581,7 +581,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; - case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w'; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a'; default : return '?'; } } @@ -598,7 +598,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; - case 
COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law"; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p"; default : return ""; } } @@ -615,7 +615,7 @@ std::vector common_sampler_types_from_names(const std::vect { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, - { "power_law", COMMON_SAMPLER_TYPE_POWER_LAW }, + { "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }, }; // since samplers names are written multiple ways @@ -631,7 +631,7 @@ std::vector common_sampler_types_from_names(const std::vect { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, - { "power-law", COMMON_SAMPLER_TYPE_POWER_LAW }, + { "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }, }; std::vector samplers; diff --git a/include/llama.h b/include/llama.h index f903d34a56b..5e8974c94f7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1304,25 +1304,28 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// power-law + /// adaptive-p: select tokens near a configurable target probability over time. /// - /// this sampler implements a power law probability transformation with adaptive - /// target tracking. it reshapes token probability distributions to favor tokens near a - /// configurable target probability, rather than always selecting from the highest probability - /// candidates. + /// the adaptive-p sampler transforms the token probability distribution to favor tokens + /// that fall near a user-configurable probability target. /// - /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID - /// rather than just transforming logits. therefore it must always be the last sampler in the - /// sampler chain. + /// internally, the sampler maintains an exponential moving average of the *ORIGINAL* + /// probabilities of selected tokens at each sampling step. 
it uses this EMA to compute an + /// adapted target probability at each sampling step, thus maintaining the desired target + /// probability over time. /// - /// minimal truncation before this sampler is recommended. + /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last + /// in the sampler chain (like mirostat, dist, greedy). /// - /// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - /// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + /// only mild truncation before this sampler is recommended. we suggest applying min-p + /// before adaptive-p as the only other active sampler in the chain. /// - /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) - /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) - LLAMA_API struct llama_sampler * llama_sampler_init_power_law( + /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) + /// @param seed RNG seed + /// + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 + LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, uint32_t seed); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index a4b03193dd6..5a823ca457b 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2329,61 +2329,39 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa return result; } -// power-law +// adaptive-p sampler state // -// this sampler implements a power law probability transformation with adaptive -// target tracking. 
it reshapes token probability distributions to favor tokens near a -// configurable target probability, rather than always selecting from the highest probability -// candidates. +// maintains an exponential moving average of the *ORIGINAL* probabilities +// of selected tokens, used to compute an adapted target at each sampling step. // -// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID -// rather than just transforming logits. therefore it must always be the last sampler in the -// sampler chain. -// -// minimal truncation before this sampler is recommended. -// -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) -// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) - -struct llama_sampler_power_law { - - // the desired average probability for selected tokens (0.0 to 1.0) - // higher values favor more probable tokens (more stable and predictable) - // lower values favor less probable tokens (more creative) - // negative values disable Power Law sampling (sample from distribution as-is) - const float target; - - // controls how quickly history influence fades (0.0 to 0.99) - // lower values = faster adaptation, more reactive to recent tokens - // higher values = slower adaptation, more stable over time - // effective history length ≈ 1/(1-decay) tokens - // example: decay=0.5 --> ~2 tokens; decay=0.9 --> ~10 tokens; decay=0.95 --> ~20 tokens - // internally clamped to <= 0.99 to prevent unbounded accumulation - const float decay; - - const uint32_t seed; - std::mt19937 rng; - - // member variables - float weighted_sum; // historical token probabilities weighted by recency - float total_weight; // sum of weights, converges to 1/(1-decay) - std::vector original_probs; // used to store original token probabilities +// see llama.h for a full description of the sampler +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 
+struct llama_sampler_adaptive_p { + const float target; // target probability (0.0 - 1.0; negative = disabled) + const float decay; // EMA decay; history ≈ 1/(1-decay) tokens (0.0 - 0.99) + const uint32_t seed; // RNG seed + std::mt19937 rng; // RNG + float weighted_sum; // sum(p_i * decay^i) + float total_weight; // sum(decay^i), converges to 1/(1-decay) + std::vector original_probs; // pre-transform probs, cached for EMA update }; -// transformation constants +// adaptive probability transformation constants static constexpr float DISTRIBUTION_WIDTH = 0.3f; static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float SHARPNESS = 4.0f; static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; -static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { - return "power-law"; +static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) { + return "adaptive-p"; } -static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; if (ctx->target < 0.0f) { - // no-op: just sample from the distribution as-is + // at negative target values, adaptive-p is no-op + // we simply sample from the existing distribution llama_sampler_softmax_impl(cur_p, false); cur_p->selected = llama_sample_dist(cur_p, ctx->rng); return; @@ -2397,38 +2375,43 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } // compute the adapted target probability for the current sampling step - float computed_target = std::clamp( - ctx->total_weight == 0.0f ? 
ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight), + auto target = std::clamp(ctx->target, 0.0f, 1.0f); + float adapted_target = std::clamp( + ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight), 0.0f, 1.0f ); - // power law transform + // adaptive probability transform + // + // quadratic near target for fine differentiation, transitioning to linear decay in the + // tails. unbounded negative logits ensure proper suppression of far-from-target tokens + // after the softmax. + // for (size_t i = 0; i < cur_p->size; ++i) { - float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; - cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); + float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); + cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist); } + // softmax and sample from the transformed distribution llama_sampler_softmax_impl(cur_p, false); - - // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // update running history with the original probability of the selected token - ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; // history fades over time + // update history with the original probability of the selected token + ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; } -static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; ctx->weighted_sum = 0.0f; ctx->total_weight = 0.0f; } -static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result 
= llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed); - auto * result_ctx = (llama_sampler_power_law *) result->ctx; +static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx; + auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); + auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; result_ctx->rng = ctx->rng; result_ctx->weighted_sum = ctx->weighted_sum; @@ -2438,29 +2421,29 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s return result; } -static void llama_sampler_power_law_free(struct llama_sampler * smpl) { - delete (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_adaptive_p *) smpl->ctx; } -static struct llama_sampler_i llama_sampler_power_law_i = { - /* .name = */ llama_sampler_power_law_name, +static struct llama_sampler_i llama_sampler_adaptive_p_i = { + /* .name = */ llama_sampler_adaptive_p_name, /* .accept = */ nullptr, - /* .apply = */ llama_sampler_power_law_apply, - /* .reset = */ llama_sampler_power_law_reset, - /* .clone = */ llama_sampler_power_law_clone, - /* .free = */ llama_sampler_power_law_free, + /* .apply = */ llama_sampler_adaptive_p_apply, + /* .reset = */ llama_sampler_adaptive_p_reset, + /* .clone = */ llama_sampler_adaptive_p_clone, + /* .free = */ llama_sampler_adaptive_p_free, }; -struct llama_sampler * llama_sampler_init_power_law( +struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); return llama_sampler_init( - /* .iface = */ &llama_sampler_power_law_i, - /* .ctx = */ new llama_sampler_power_law { - /* .target = */ std::clamp(target, 0.0f, 1.0f), + /* .iface = */ &llama_sampler_adaptive_p_i, + /* .ctx = */ new llama_sampler_adaptive_p { + /* .target = */ target, /* 
.decay = */ std::clamp(decay, 0.0f, 0.99f), /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 13fa0fdec29..d0b547c235b 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -201,8 +201,8 @@ task_params server_task::params_from_json_cmpl( params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); - params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay); + params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target); + params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay); params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); From e7a892065dfbb8ad2cf9ca43acd408e72bfc0321 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 28 Dec 2025 20:31:26 -0600 Subject: [PATCH 41/56] fix cold start EMA - `ctx->weighted_sum` is now initialized and reset to `target / (1.0f - clamped_decay)` - `ctx->total_weight` is now initialized and reset to `1.0f / (1.0f - clamped_decay)` this fixes a "cold start" problem with the moving average --- src/llama-sampling.cpp | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5a823ca457b..137c865c303 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2338,7 +2338,7 @@ struct 
llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // ref: https://github.com/ggml-org/llama.cpp/pull/17927 struct llama_sampler_adaptive_p { const float target; // target probability (0.0 - 1.0; negative = disabled) - const float decay; // EMA decay; history ≈ 1/(1-decay) tokens (0.0 - 0.99) + const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) const uint32_t seed; // RNG seed std::mt19937 rng; // RNG float weighted_sum; // sum(p_i * decay^i) @@ -2397,15 +2397,18 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // update history with the original probability of the selected token + // update EMA with the original probability of the selected token ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; } static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; - ctx->weighted_sum = 0.0f; - ctx->total_weight = 0.0f; + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + // ctx->target and ctx->decay never change after init, so it's safe to keep them as is. + // original_probs is completely overwritten on every call to _apply. + // so we only need to reset the EMA state. 
+ ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); + ctx->total_weight = 1.0f / (1.0f - ctx->decay); } static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { @@ -2413,10 +2416,11 @@ static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; - result_ctx->rng = ctx->rng; - result_ctx->weighted_sum = ctx->weighted_sum; - result_ctx->total_weight = ctx->total_weight; - result_ctx->original_probs.reserve(ctx->original_probs.capacity()); + // copy everything (target, decay, and seed are already set) + result_ctx->original_probs = ctx->original_probs; + result_ctx->weighted_sum = ctx->weighted_sum; + result_ctx->total_weight = ctx->total_weight; + result_ctx->rng = ctx->rng; return result; } @@ -2440,15 +2444,16 @@ struct llama_sampler * llama_sampler_init_adaptive_p( uint32_t seed ) { auto seed_cur = get_rng_seed(seed); + float clamped_decay = std::clamp(decay, 0.0f, 0.99f); return llama_sampler_init( /* .iface = */ &llama_sampler_adaptive_p_i, /* .ctx = */ new llama_sampler_adaptive_p { /* .target = */ target, - /* .decay = */ std::clamp(decay, 0.0f, 0.99f), + /* .decay = */ clamped_decay, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .weighted_sum = */ 0.0f, - /* .total_weight = */ 0.0f, + /* .weighted_sum = */ target / (1.0f - clamped_decay), + /* .total_weight = */ 1.0f / (1.0f - clamped_decay), /* .original_probs = */ {}, } ); From c6a6f6354b9b1a58af1d5f91102b481ed390d56e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 30 Dec 2025 13:49:06 -0600 Subject: [PATCH 42/56] update `SHARPNESS` constant to `10.0f` --- src/llama-sampling.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b3256d09621..34910e56bf8 100644 --- a/src/llama-sampling.cpp +++ 
b/src/llama-sampling.cpp @@ -2358,10 +2358,10 @@ struct llama_sampler_adaptive_p { }; // adaptive probability transformation constants -static constexpr float DISTRIBUTION_WIDTH = 0.3f; -static constexpr float PEAK_LOGIT_VALUE = 5.0f; -static constexpr float SHARPNESS = 4.0f; -static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; +static constexpr float DISTRIBUTION_WIDTH = 0.3f; +static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float SHARPNESS = 10.0f; +static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) { return "adaptive-p"; From 080749909e7e28a33775de8118d5faec8a9d3dfc Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 30 Dec 2025 15:47:14 -0600 Subject: [PATCH 43/56] minor style fixes no functional changes --- include/llama.h | 1 + src/llama-sampling.cpp | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/llama.h b/include/llama.h index 90f9045ef1e..1de1a5f38d9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1334,6 +1334,7 @@ extern "C" { /// @param seed RNG seed /// /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 + /// LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 34910e56bf8..553faa3c145 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2346,7 +2346,9 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // of selected tokens, used to compute an adapted target at each sampling step. 
// // see llama.h for a full description of the sampler +// // ref: https://github.com/ggml-org/llama.cpp/pull/17927 +// struct llama_sampler_adaptive_p { const float target; // target probability (0.0 - 1.0; negative = disabled) const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) @@ -2414,12 +2416,12 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to } static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; // ctx->target and ctx->decay never change after init, so it's safe to keep them as is. // original_probs is completely overwritten on every call to _apply. // so we only need to reset the EMA state. - ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); - ctx->total_weight = 1.0f / (1.0f - ctx->decay); + ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); + ctx->total_weight = 1.0f / (1.0f - ctx->decay); } static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { From eb854e73d59c8ee2d4fd01d364ccf27e752f603e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 30 Dec 2025 15:54:23 -0600 Subject: [PATCH 44/56] minor style fixes cont. 
--- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4b92d46f28a..946ffbc1827 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1600,7 +1600,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--adaptive-target"}, "N", string_format("adaptive-p: select tokens near this probability (valid range 0.0 " "to 1.0; negative = disabled) (default: %.2f)\n" - "[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)", + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.adaptive_target), [](common_params & params, const std::string & value) { params.sampling.adaptive_target = std::stof(value); From c27df510f9d27c0c6a8a712d2ddfef0685ef6f54 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 4 Jan 2026 14:38:17 -0600 Subject: [PATCH 45/56] update `llama_sampler_adaptive_p_i` for backend sampling (ref: #17004) --- src/llama-sampling.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index aa6959b1142..c32a81111f8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3396,12 +3396,16 @@ static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { } static struct llama_sampler_i llama_sampler_adaptive_p_i = { - /* .name = */ llama_sampler_adaptive_p_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_adaptive_p_apply, - /* .reset = */ llama_sampler_adaptive_p_reset, - /* .clone = */ llama_sampler_adaptive_p_clone, - /* .free = */ llama_sampler_adaptive_p_free, + /* .name = */ llama_sampler_adaptive_p_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_adaptive_p_apply, + /* .reset = */ llama_sampler_adaptive_p_reset, + /* .clone = */ llama_sampler_adaptive_p_clone, + /* .free = */ llama_sampler_adaptive_p_free, + /* .backend_init = */ nullptr, + /* 
.backend_accept = */ nullptr, + /* .backend_apply = */ nullptr, + /* .backend_set_input = */ nullptr, }; struct llama_sampler * llama_sampler_init_adaptive_p( From bef75d908c633b08e8f6d64b8e43879d862e376e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 7 Jan 2026 18:11:08 -0600 Subject: [PATCH 46/56] separate into `apply` + `accept` functions --- src/llama-sampling.cpp | 73 ++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c32a81111f8..250835164fb 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3303,13 +3303,15 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // ref: https://github.com/ggml-org/llama.cpp/pull/17927 // struct llama_sampler_adaptive_p { - const float target; // target probability (0.0 - 1.0; negative = disabled) - const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) - const uint32_t seed; // RNG seed - std::mt19937 rng; // RNG - float weighted_sum; // sum(p_i * decay^i) - float total_weight; // sum(decay^i), converges to 1/(1-decay) - std::vector original_probs; // pre-transform probs, cached for EMA update + const float target; // target probability (0.0 - 1.0; negative = disabled) + const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) + const uint32_t seed; // RNG seed + std::mt19937 rng; // RNG + float weighted_sum; // sum(p_i * decay^i) + float total_weight; // sum(decay^i), converges to 1/(1-decay) + std::vector original_probs; // pre-transform probs, cached for EMA update + llama_token pending_token_id; // token ID of selected token + llama_token pending_token_idx; // index of orig. prob. 
of selected token in original_probs }; // adaptive probability transformation constants @@ -3340,7 +3342,7 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to ctx->original_probs[i] = cur_p->data[i].p; } - // compute the adapted target probability for the current sampling step + // using the EMA, compute the adapted target probability for the current sampling step auto target = std::clamp(ctx->target, 0.0f, 1.0f); float adapted_target = std::clamp( ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight), @@ -3363,18 +3365,33 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // update EMA with the original probability of the selected token - ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; - ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; + // store the selected token ID for acceptance later + ctx->pending_token_id = cur_p->data[idx].id; + ctx->pending_token_idx = idx; +} + +static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + if (ctx->pending_token_id == token) { + GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL); + GGML_ASSERT(ctx->pending_token_idx != -1); + // update EMA with the original probability of the selected token + ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum; + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; + } + ctx->pending_token_id = LLAMA_TOKEN_NULL; + ctx->pending_token_idx = -1; } static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; // ctx->target and ctx->decay never change after init, so it's safe to keep them as is. // original_probs is completely overwritten on every call to _apply. 
- // so we only need to reset the EMA state. - ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); - ctx->total_weight = 1.0f / (1.0f - ctx->decay); + // so we only need to reset the EMA state and pending token. + ctx->weighted_sum = ctx->target / (1.0f - ctx->decay); + ctx->total_weight = 1.0f / (1.0f - ctx->decay); + ctx->pending_token_id = LLAMA_TOKEN_NULL; + ctx->pending_token_idx = -1; } static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { @@ -3382,11 +3399,11 @@ static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_ auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; - // copy everything (target, decay, and seed are already set) - result_ctx->original_probs = ctx->original_probs; - result_ctx->weighted_sum = ctx->weighted_sum; - result_ctx->total_weight = ctx->total_weight; - result_ctx->rng = ctx->rng; + // copy everything (target, decay, seed, and RNG are already set) + result_ctx->weighted_sum = ctx->weighted_sum; + result_ctx->total_weight = ctx->total_weight; + result_ctx->pending_token_id = ctx->pending_token_id; + result_ctx->pending_token_idx = ctx->pending_token_idx; return result; } @@ -3397,7 +3414,7 @@ static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { static struct llama_sampler_i llama_sampler_adaptive_p_i = { /* .name = */ llama_sampler_adaptive_p_name, - /* .accept = */ nullptr, + /* .accept = */ llama_sampler_adaptive_p_accept, /* .apply = */ llama_sampler_adaptive_p_apply, /* .reset = */ llama_sampler_adaptive_p_reset, /* .clone = */ llama_sampler_adaptive_p_clone, @@ -3418,13 +3435,15 @@ struct llama_sampler * llama_sampler_init_adaptive_p( return llama_sampler_init( /* .iface = */ &llama_sampler_adaptive_p_i, /* .ctx = */ new llama_sampler_adaptive_p { - /* .target = */ target, - /* .decay = */ clamped_decay, - /* .seed = */ seed_cur, - /* .rng = */ 
std::mt19937(seed_cur), - /* .weighted_sum = */ target / (1.0f - clamped_decay), - /* .total_weight = */ 1.0f / (1.0f - clamped_decay), - /* .original_probs = */ {}, + /* .target = */ target, + /* .decay = */ clamped_decay, + /* .seed = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .weighted_sum = */ target / (1.0f - clamped_decay), + /* .total_weight = */ 1.0f / (1.0f - clamped_decay), + /* .original_probs = */ {}, + /* .pending_token_id = */ LLAMA_TOKEN_NULL, + /* .pending_token_idx = */ -1 } ); } From e99a4a66c347c0aba4d87f4cac6a8d24735fe6d9 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 8 Jan 2026 20:52:48 -0600 Subject: [PATCH 47/56] `pending_token_idx`: switch from `llama_token` to `int32` functionally identical (`llama.h` has `typedef int32_t llama_token;`), but its more correct now --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 250835164fb..73bd5d1880f 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3311,7 +3311,7 @@ struct llama_sampler_adaptive_p { float total_weight; // sum(decay^i), converges to 1/(1-decay) std::vector original_probs; // pre-transform probs, cached for EMA update llama_token pending_token_id; // token ID of selected token - llama_token pending_token_idx; // index of orig. prob. of selected token in original_probs + int32_t pending_token_idx; // index of orig. prob. 
of selected token in original_probs }; // adaptive probability transformation constants From 3aa23f3faab3d651311d122c42401a4d1ed03a8d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 12 Jan 2026 09:11:41 -0600 Subject: [PATCH 48/56] don't transform logits <= -1e9f --- src/llama-sampling.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 444a1a1deff..ef10bcab67c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3356,6 +3356,12 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to // after the softmax. // for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit <= -1e9f) { + // don't transform logits with very large negative values + // (as set by e.g. min-p and top-p when using backend sampling) + // the value `-1e9f` is copied from `llama_sampler_min_p_backend_apply` + continue; + } float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist); } From d21c87eaab11637942c27d2aec2c864ffcb0a3df Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 13 Jan 2026 11:17:22 -0600 Subject: [PATCH 49/56] fix masking in backend top-p, min-p --- src/llama-sampling.cpp | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ef10bcab67c..8cd149a7449 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply( mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32)); mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]); - // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes: - // top_p_bias = (mask * 1e9f) - 1e9f. - // So entries in the mask that we want to discard will become -1e9f, and - // others will be 0 (meaning that will not effect the logits). 
- const float large_val = 1e9f; - struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val); + // Apply -INFINITY bias for masked-out tokens + // log(1) = 0 (keep), log(0) = -INF (discard) + struct ggml_tensor * top_p_bias = ggml_log(ctx, mask); ggml_set_name(top_p_bias, "top_p_bias"); data->logits = ggml_add(ctx, sorted_logits, top_p_bias); @@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply( struct ggml_tensor * mask = ggml_step(ctx, sub); ggml_set_name(mask, "min_p_mask"); - // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes: - // min_p_bias = (mask * 1e9f) - 1e9f. - // So entries in the mask that we want to discard will become -1e9f, and - // others will be 0 (meaning that will not effect the logits). - const float large_val = 1e9f; - struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val); + // Apply -INFINITY bias for masked-out tokens + // log(1) = 0 (keep), log(0) = -INF (discard) + struct ggml_tensor * min_p_bias = ggml_log(ctx, mask); ggml_set_name(min_p_bias, "min_p_bias"); - // Add the min_p bias to the logits. data->logits = ggml_add(ctx, data->logits, min_p_bias); ggml_set_name(data->logits, "min_p_logits"); @@ -3356,10 +3349,9 @@ static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_to // after the softmax. // for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit <= -1e9f) { - // don't transform logits with very large negative values - // (as set by e.g. min-p and top-p when using backend sampling) - // the value `-1e9f` is copied from `llama_sampler_min_p_backend_apply` + if (cur_p->data[i].logit == -INFINITY) { + // don't transform logits that are -INFINITY + // (as masked out by e.g. 
min-p and top-p when using backend sampling) continue; } float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); From 33c635e0c591f3a1cb5e7bb7a450f87827cb75c6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 06:22:17 -0600 Subject: [PATCH 50/56] address review comments --- src/llama-sampling.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8cd149a7449..bc6d59d72bd 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3298,8 +3298,9 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_adaptive_p { const float target; // target probability (0.0 - 1.0; negative = disabled) const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) - const uint32_t seed; // RNG seed - std::mt19937 rng; // RNG + const uint32_t seed; // original RND seed + uint32_t seed_cur; // actual RND seed + std::mt19937 rng; // RNG state float weighted_sum; // sum(p_i * decay^i) float total_weight; // sum(decay^i), converges to 1/(1-decay) std::vector original_probs; // pre-transform probs, cached for EMA update @@ -3320,16 +3321,16 @@ static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * / static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; + llama_sampler_softmax_impl(cur_p, false); + if (ctx->target < 0.0f) { // at negative target values, adaptive-p is no-op // we simply sample from the existing distribution - llama_sampler_softmax_impl(cur_p, false); cur_p->selected = llama_sample_dist(cur_p, ctx->rng); return; } - // softmax and store the original probabilities - llama_sampler_softmax_impl(cur_p, false); + // store the original probabilities ctx->original_probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { ctx->original_probs[i] = cur_p->data[i].p; @@ 
-3435,7 +3436,8 @@ struct llama_sampler * llama_sampler_init_adaptive_p( /* .ctx = */ new llama_sampler_adaptive_p { /* .target = */ target, /* .decay = */ clamped_decay, - /* .seed = */ seed_cur, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ target / (1.0f - clamped_decay), /* .total_weight = */ 1.0f / (1.0f - clamped_decay), From 4b06e0830a700f860a90373fda40312d4014af3d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 06:41:43 -0600 Subject: [PATCH 51/56] typo in comments `RND` -> `RNG` --- src/llama-sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index bc6d59d72bd..2b65e855637 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3298,8 +3298,8 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_adaptive_p { const float target; // target probability (0.0 - 1.0; negative = disabled) const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99) - const uint32_t seed; // original RND seed - uint32_t seed_cur; // actual RND seed + const uint32_t seed; // original RNG seed + uint32_t seed_cur; // actual RNG seed std::mt19937 rng; // RNG state float weighted_sum; // sum(p_i * decay^i) float total_weight; // sum(decay^i), converges to 1/(1-decay) From 42af39d62f1b440abd224cdeb99eddb225d16918 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 11:24:20 -0600 Subject: [PATCH 52/56] add docs --- tools/cli/README.md | 2 ++ tools/completion/README.md | 11 +++++++++++ tools/server/README.md | 2 ++ 3 files changed, 15 insertions(+) diff --git a/tools/cli/README.md b/tools/cli/README.md index 9e9f9bead13..3b6f0708ed0 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -113,6 +113,8 @@ | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | +| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 9eea90eed06..a865b297e46 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -436,6 +436,17 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e Example usage: `--min-p 0.05` +### Adaptive-P Sampling + +- `--adaptive-target N`: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) +- `--adaptive-decay N`: EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) + +Adaptive-P: Select tokens near a configurable target probability over time. + +The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average of the *ORIGINAL* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended. It is suggested to apply min-p before adaptive-p as the only other active sampler. 
+ +For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927) + ### Locally Typical Sampling - `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). diff --git a/tools/server/README.md b/tools/server/README.md index b1a0583d1cb..9fe89387680 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -130,6 +130,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | +| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | From 81af54c622a45bf74e75d127a105c608c197ec3d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 11:28:35 -0600 Subject: [PATCH 53/56] add recommended values in completion docs --- tools/completion/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/completion/README.md b/tools/completion/README.md index a865b297e46..a16be3f684a 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -445,6 +445,8 @@ Adaptive-P: Select tokens near a configurable target probability over time. The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average of the *ORIGINAL* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended. It is suggested to apply min-p before adaptive-p as the only other active sampler. 
+Recommended starting values: `--adaptive-target 0.55 --adaptive-decay 0.9` + For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927) ### Locally Typical Sampling From 40fd48f51987a11369758c255574d066fee997af Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 12:30:43 -0600 Subject: [PATCH 54/56] address PR feedback --- src/llama-sampling.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 2b65e855637..9fb78d89c50 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3391,6 +3391,8 @@ static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { ctx->total_weight = 1.0f / (1.0f - ctx->decay); ctx->pending_token_id = LLAMA_TOKEN_NULL; ctx->pending_token_idx = -1; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { From b6041b10db6c93b59ed117cf8391e604eebc6584 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 14 Jan 2026 21:45:53 -0600 Subject: [PATCH 55/56] remove trailing whitespace (for CI `editorconfig`) --- src/llama-sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 9fb78d89c50..5dde513065b 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -3391,8 +3391,8 @@ static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { ctx->total_weight = 1.0f / (1.0f - ctx->decay); ctx->pending_token_id = LLAMA_TOKEN_NULL; ctx->pending_token_idx = -1; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { From d7e3b8668d66842b76e76ec0d626f8ee3fe8c918 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 15 Jan 2026 
10:56:26 -0600 Subject: [PATCH 56/56] add to adaptive-p to `common_sampler_types_from_chars` --- common/sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index d2160a002cb..11a1d483980 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -726,6 +726,7 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P), COMMON_SAMPLER_TYPE_ADAPTIVE_P }, }; std::vector samplers;