Skip to content

Commit e651371

Browse files
committed
Llama-quantize: Add force-requant feature to partial-requant (ikawrakow#1313 follow-up)
Preliminary steps:
- Add a --force-requant / -frq argument to force regeneration of split files whose tensor ggml_types differ from the specified quantization type.
- Add -prq as a shortened form of the --partial-requant argument.
- When combined with --partial-requant / -prq: existing splits whose tensor types match are skipped, while splits with mismatched tensor types are deleted and regenerated.
1 parent 46e9d8e commit e651371

3 files changed

Lines changed: 9 additions & 4 deletions

File tree

examples/quantize/quantize.cpp

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
151151
//
152152
[[noreturn]]
153153
static void usage(const char * executable) {
154-
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
154+
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--force-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
155155
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
156156
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
157157
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -175,7 +175,8 @@ static void usage(const char * executable) {
175175
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
176176
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
177177
printf(" --keep-split: will generate quantized model in the same shards as input\n");
178-
printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
178+
printf(" --partial-requant, -prq: quantize only missing split files in the split quantized .gguf destination directory\n");
179+
printf(" --force-requant, -frq: force requantization of split files whose tensor types differ from the specified quantization type\n");
179180
printf(" --override-kv KEY=TYPE:VALUE\n");
180181
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
181182
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -351,7 +352,7 @@ int main(int argc, char ** argv) {
351352

352353
bool hide_imatrix = false;
353354

354-
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
355+
for (; arg_idx < argc && argv[arg_idx][0] == '-'; arg_idx++) {
355356
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
356357
params.quantize_output_tensor = false;
357358
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
@@ -467,8 +468,10 @@ int main(int argc, char ** argv) {
467468
}
468469
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
469470
params.keep_split = true;
470-
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
471+
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0 || strcmp(argv[arg_idx], "-prq") == 0) {
471472
params.partial_requant = true;
473+
} else if (strcmp(argv[arg_idx], "--force-requant") == 0 || strcmp(argv[arg_idx], "-frq") == 0) {
474+
params.force_requant = true;
472475
} else {
473476
usage(argv[0]);
474477
}

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -505,6 +505,7 @@ extern "C" {
505505
bool only_repack; // Only repack tensors
506506
bool dry_run; //
507507
bool partial_requant; // quantize only missing split files in the split quantized .gguf destination directory
508+
bool force_requant; // force requantization of split files whose tensor types differ from the specified quantization type
508509
void * imatrix; // pointer to importance matrix data
509510
void * kv_overrides; // pointer to vector containing overrides
510511
void * custom_quants; // pointer to vector containing custom quantization rules

src/llama.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4675,6 +4675,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
46754675
/*.only_repack =*/ false,
46764676
/*.dry_run =*/ false,
46774677
/*.partial_requant =*/ false,
4678+
/*.force_requant =*/ false,
46784679
/*.imatrix =*/ nullptr,
46794680
/*.kv_overrides =*/ nullptr,
46804681
/*.custom_quants =*/ nullptr,

0 commit comments

Comments (0)