Skip to content

Commit e651371

Browse files
committed
Llama-quantize: Add force-requant feature to partial-requant (ikawrakow#1313 follow-up)
Preliminary steps:
- Add a --force-requant / -frq argument to force regeneration of split files whose tensor ggml_types differ from the specified quantization type.
- Add -prq as a shortened form of the --partial-requant argument.
- When combined with --partial-requant / -prq: existing splits whose tensor types match are skipped, while splits with mismatched tensor types are deleted and regenerated.
1 parent 46e9d8e commit e651371

3 files changed

Lines changed: 9 additions & 4 deletions

File tree

examples/quantize/quantize.cpp

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
151151
//
152152
[[noreturn]]
153153
static void usage(const char * executable) {
154-
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
154+
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--force-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
155155
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
156156
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
157157
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -175,7 +175,8 @@ static void usage(const char * executable) {
175175
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
176176
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
177177
printf(" --keep-split: will generate quantized model in the same shards as input\n");
178-
printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
178+
printf(" --partial-requant, -prq: quantize only missing split files in the split quantized .gguf destination directory\n");
179+
printf(" --force-requant, -frq: force requantization of split files whose tensor types differ from the specified quantization type\n");
179180
printf(" --override-kv KEY=TYPE:VALUE\n");
180181
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
181182
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -351,7 +352,7 @@ int main(int argc, char ** argv) {
351352

352353
bool hide_imatrix = false;
353354

354-
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
355+
for (; arg_idx < argc && argv[arg_idx][0] == '-'; arg_idx++) {
355356
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
356357
params.quantize_output_tensor = false;
357358
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
@@ -467,8 +468,10 @@ int main(int argc, char ** argv) {
467468
}
468469
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
469470
params.keep_split = true;
470-
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
471+
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0 || strcmp(argv[arg_idx], "-prq") == 0) {
471472
params.partial_requant = true;
473+
} else if (strcmp(argv[arg_idx], "--force-requant") == 0 || strcmp(argv[arg_idx], "-frq") == 0) {
474+
params.force_requant = true;
472475
} else {
473476
usage(argv[0]);
474477
}

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -505,6 +505,7 @@ extern "C" {
505505
bool only_repack; // Only repack tensors
506506
bool dry_run; //
507507
bool partial_requant; // quantize only missing split files in the split quantized .gguf destination directory
508+
bool force_requant; // force requantization of split files whose tensor types differ from the specified quantization type
508509
void * imatrix; // pointer to importance matrix data
509510
void * kv_overrides; // pointer to vector containing overrides
510511
void * custom_quants; // pointer to vector containing custom quantization rules

src/llama.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4675,6 +4675,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
46754675
/*.only_repack =*/ false,
46764676
/*.dry_run =*/ false,
46774677
/*.partial_requant =*/ false,
4678+
/*.force_requant =*/ false,
46784679
/*.imatrix =*/ nullptr,
46794680
/*.kv_overrides =*/ nullptr,
46804681
/*.custom_quants =*/ nullptr,

0 commit comments

Comments (0)