Commit bd7cc42

Backend cleanup (#6025)

1 parent 6a1682a

23 files changed: 57 additions & 442 deletions

README.md

Lines changed: 11 additions & 16 deletions
```diff
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat.
-* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 * Dropdown menu for quickly switching between different models.
 * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).
@@ -208,12 +208,12 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--
 [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS]
 [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE]
 [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN]
-[--triton] [--no_inject_fused_attention] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--model_type MODEL_TYPE]
-[--groupsize GROUPSIZE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch] [--hqq-backend HQQ_BACKEND] [--deepspeed]
-[--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen]
-[--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE]
-[--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui]
-[--multimodal-pipeline MULTIMODAL_PIPELINE]
+[--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--no_inject_fused_attention]
+[--hqq-backend HQQ_BACKEND] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
+[--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
+[--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT]
+[--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]]
+[--checkpoint CHECKPOINT] [--monkey-patch]
 
 Text generation web UI
 
@@ -237,7 +237,7 @@ Basic settings:
 
 Model loader:
 --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2,
-AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.
+AutoGPTQ, AutoAWQ.
 
 Transformers/Accelerate:
 --cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow.
@@ -293,21 +293,16 @@ ExLlamaV2:
 
 AutoGPTQ:
 --triton Use triton.
---no_inject_fused_attention Disable the use of fused attention, which will use less VRAM at the cost of slower inference.
 --no_inject_fused_mlp Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.
 --no_use_cuda_fp16 This can make models faster on some systems.
 --desc_act For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.
 --disable_exllama Disable ExLlama kernel, which can improve inference speed on some systems.
 --disable_exllamav2 Disable ExLlamav2 kernel.
-
-GPTQ-for-LLaMa:
 --wbits WBITS Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.
---model_type MODEL_TYPE Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.
 --groupsize GROUPSIZE Group size.
---pre_layer PRE_LAYER [PRE_LAYER ...] The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated
-by spaces, eg --pre_layer 30 60.
---checkpoint CHECKPOINT The path to the quantized checkpoint file. If not specified, it will be automatically detected.
---monkey-patch Apply the monkey patch for using LoRAs with quantized models.
+
+AutoAWQ:
+--no_inject_fused_attention Disable the use of fused attention, which will use less VRAM at the cost of slower inference.
 
 HQQ:
 --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
```
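To summarize the flag shuffle in the usage string above: `--wbits` and `--groupsize` now sit under the AutoGPTQ group, `--no_inject_fused_attention` survives only for AutoAWQ, and the GPTQ-for-LLaMa-only flags (`--model_type`, `--pre_layer`, `--checkpoint`, `--monkey-patch`) are gone. A minimal argparse sketch of the new grouping (this is an illustration, not the project's actual `modules/shared.py`):

```python
import argparse

parser = argparse.ArgumentParser(description='Text generation web UI')

# AutoGPTQ group: absorbs --wbits and --groupsize from the deleted loader.
autogptq = parser.add_argument_group('AutoGPTQ')
autogptq.add_argument('--triton', action='store_true', help='Use triton.')
autogptq.add_argument('--no_inject_fused_mlp', action='store_true',
                      help='Triton mode only: disable fused MLP.')
autogptq.add_argument('--wbits', type=int, default=0,
                      help='Load a pre-quantized model with the given precision in bits.')
autogptq.add_argument('--groupsize', type=int, default=-1, help='Group size.')

# AutoAWQ group: the one place --no_inject_fused_attention still applies.
autoawq = parser.add_argument_group('AutoAWQ')
autoawq.add_argument('--no_inject_fused_attention', action='store_true',
                     help='Disable fused attention (less VRAM, slower inference).')

args = parser.parse_args(['--wbits', '4', '--groupsize', '128'])
print(args.wbits, args.groupsize, args.no_inject_fused_attention)
```

`add_argument_group` only affects how `--help` is organized; the parsed namespace is flat, which is why the regrouping needs no changes at call sites.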

docs/04 - Model Tab.md

Lines changed: 0 additions & 8 deletions
```diff
@@ -64,14 +64,6 @@ Loads: GPTQ models.
 * **no_use_cuda_fp16**: On some systems, the performance can be very bad with this unset. Can usually be ignored.
 * **desc_act**: For ancient models without proper metadata, sets the model "act-order" parameter manually. Can usually be ignored.
 
-### GPTQ-for-LLaMa
-
-Loads: GPTQ models.
-
-Ancient loader, the first one to implement 4-bit quantization. It works on older GPUs for which ExLlamaV2 and AutoGPTQ do not work, and it doesn't work with "act-order", so you should use it with simple 4-bit-128g models.
-
-* **pre_layer**: Used for CPU offloading. The higher the number, the more layers will be sent to the GPU. GPTQ-for-LLaMa CPU offloading was faster than the one implemented in AutoGPTQ the last time I checked.
-
 ### llama.cpp
 
 Loads: GGUF models. Note: GGML models have been deprecated and do not work anymore.
```

docs/08 - Additional Tips.md

Lines changed: 0 additions & 22 deletions
```diff
@@ -13,28 +13,6 @@ Source: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/1126
 
 This file will be automatically detected the next time you start the web UI.
 
-## Using LoRAs with GPTQ-for-LLaMa
-
-This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit
-
-To use it:
-
-Install alpaca_lora_4bit using pip
-
-```
-git clone https://github.com/johnsmith0031/alpaca_lora_4bit.git
-cd alpaca_lora_4bit
-git fetch origin winglian-setup_pip
-git checkout winglian-setup_pip
-pip install .
-```
-
-Start the UI with the --monkey-patch flag:
-
-```
-python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
-```
-
 ## DeepSpeed
 
 `DeepSpeed ZeRO-3` is an alternative offloading strategy for full-precision (16-bit) transformers models.
```

docs/What Works.md

Lines changed: 2 additions & 6 deletions
```diff
@@ -2,15 +2,13 @@
 
 | Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation |
 |----------------|----------------|-------------------------|----------------|----------------------|-----------------------|
-| Transformers | ✅ | ✅\*\*\* | ✅\* | ✅ | ✅ |
+| Transformers | ✅ | ✅\*\* | ✅\* | ✅ | ✅ |
 | llama.cpp | ❌ | ❌ | ❌ | ❌ | use llamacpp_HF |
 | llamacpp_HF | ❌ | ❌ | ❌ | ❌ | ✅ |
 | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
 | ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF |
 | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
 | AutoAWQ | ? | ❌ | ? | ? | ✅ |
-| GPTQ-for-LLaMa | ✅\*\* | ❌\*\*\* | ✅ | ✅ | ✅ |
-| QuIP# | ? | ? | ? | ? | ✅ |
 | HQQ | ? | ? | ? | ? | ✅ |
 
 ❌ = not implemented
@@ -19,6 +17,4 @@
 
 \* Training LoRAs with GPTQ models also works with the Transformers loader. Make sure to check "auto-devices" and "disable_exllama" before loading the model.
 
-\*\* Requires the monkey-patch. The instructions can be found [here](https://github.com/oobabooga/text-generation-webui/wiki/08-%E2%80%90-Additional-Tips#using-loras-with-gptq-for-llama).
-
-\*\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases.
+\*\* Multi-LoRA in PEFT is tricky and the current implementation does not work reliably in all cases.
```

modules/AutoGPTQ_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ def load_quantized(model_name):
         'model_basename': pt_path.stem,
         'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
         'use_triton': shared.args.triton,
-        'inject_fused_attention': not shared.args.no_inject_fused_attention,
+        'inject_fused_attention': False,
         'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
         'use_safetensors': use_safetensors,
         'trust_remote_code': shared.args.trust_remote_code,
```
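The one-line change above hard-codes fused attention off for AutoGPTQ instead of deriving it from the removed `--no_inject_fused_attention` flag. A simplified sketch of the resulting keyword dict (`SimpleNamespace` stands in for `shared.args`, and only the fields relevant to this hunk are shown; the real loader passes these on to AutoGPTQ's `from_quantized`):

```python
from types import SimpleNamespace

# Stand-in for shared.args with assumed default values.
args = SimpleNamespace(triton=False, no_inject_fused_mlp=False, trust_remote_code=False)

params = {
    'use_triton': args.triton,
    'inject_fused_attention': False,  # hard-coded by this commit; no CLI flag anymore
    'inject_fused_mlp': not args.no_inject_fused_mlp,
    'trust_remote_code': args.trust_remote_code,
}
print(params)
```

Note the asymmetry after the change: fused MLP is still user-toggleable, while fused attention is always disabled for this loader.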

modules/GPTQ_loader.py

Lines changed: 0 additions & 171 deletions
This file was deleted.

modules/loaders.py

Lines changed: 0 additions & 34 deletions
```diff
@@ -105,7 +105,6 @@
     ],
     'AutoGPTQ': [
         'triton',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'wbits',
@@ -131,21 +130,6 @@
         'trust_remote_code',
         'no_use_fast',
     ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'trust_remote_code',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
-    'QuIP#': [
-        'trust_remote_code',
-        'no_use_fast',
-        'no_flash_attn',
-        'quipsharp_info',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -205,9 +189,7 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'GPTQ-for-LLaMa': transformers_samplers(),
     'AutoAWQ': transformers_samplers(),
-    'QuIP#': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
@@ -339,15 +321,6 @@ def transformers_samplers():
     },
 }
 
-loaders_model_types = {
-    'GPTQ-for-LLaMa': [
-        "None",
-        "llama",
-        "opt",
-        "gptj"
-    ],
-}
-
 
 @functools.cache
 def list_all_samplers():
@@ -375,13 +348,6 @@ def blacklist_samplers(loader, dynamic_temperature):
     return output
 
 
-def get_model_types(loader):
-    if loader in loaders_model_types:
-        return loaders_model_types[loader]
-
-    return ["None"]
-
-
 def get_gpu_memory_keys():
     return [k for k in shared.gradio if k.startswith('gpu_memory')]
```
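With GPTQ-for-LLaMa gone, `loaders_model_types` lost its only entry, so `get_model_types()` (deleted above) had nothing left to look up and every call would have returned `["None"]`. The removed helper was just a dict lookup with a default, as this reproduction of the old code shows:

```python
# The helper removed from modules/loaders.py, reproduced for reference.
loaders_model_types = {
    'GPTQ-for-LLaMa': ["None", "llama", "opt", "gptj"],
}

def get_model_types(loader):
    if loader in loaders_model_types:
        return loaders_model_types[loader]
    return ["None"]

# Equivalent one-liner; once the dict's last entry was deleted, the default
# branch was the only one left, so the whole helper could be dropped.
def get_model_types_short(loader):
    return loaders_model_types.get(loader, ["None"])
```

Callers that previously branched on the returned model types can now assume the `["None"]` default unconditionally, which is what lets the commit delete the UI's model-type dropdown plumbing along with the loader.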
