Skip to content

Commit 9ac5287

Browse files
authored
Merge pull request #5996 from oobabooga/dev
Merge dev branch
2 parents 8f12fb0 + 7a728a3 commit 9ac5287

20 files changed

Lines changed: 138 additions & 109 deletions

README.md

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ List of command-line flags
256256
| Flag | Description |
257257
|-------------|-------------|
258258
| `--tensorcores` | Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only. |
259+
| `--flash-attn` | Use flash-attention. |
259260
| `--n_ctx N_CTX` | Size of the prompt context. |
260261
| `--threads` | Number of threads to use. |
261262
| `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. |
@@ -425,9 +426,3 @@ If you would like to contribute to the project, check out the [Contributing guid
425426
## Acknowledgment
426427

427428
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.
428-
429-
## GitHub Sponsors
430-
431-
The following is a list of top-tier sponsors for this project here on GitHub:
432-
433-
* Be the first one! Visit https://github.com/sponsors/oobabooga/.

download-model.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -190,17 +190,17 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
190190
headers = {}
191191
mode = 'wb'
192192

193-
if output_path.exists() and not start_from_scratch:
194-
# Resume download
195-
r = session.get(url, stream=True, timeout=20)
196-
total_size = int(r.headers.get('content-length', 0))
197-
if output_path.stat().st_size >= total_size:
198-
return
193+
try:
194+
if output_path.exists() and not start_from_scratch:
195+
# Resume download
196+
r = session.get(url, stream=True, timeout=20)
197+
total_size = int(r.headers.get('content-length', 0))
198+
if output_path.stat().st_size >= total_size:
199+
return
199200

200-
headers = {'Range': f'bytes={output_path.stat().st_size}-'}
201-
mode = 'ab'
201+
headers = {'Range': f'bytes={output_path.stat().st_size}-'}
202+
mode = 'ab'
202203

203-
try:
204204
with session.get(url, stream=True, headers=headers, timeout=30) as r:
205205
r.raise_for_status() # If status is not 2xx, raise an error
206206
total_size = int(r.headers.get('content-length', 0))
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
instruction_template: |-
2+
{%- set ns = namespace(found=false) -%}
3+
{%- for message in messages -%}
4+
{%- if message['role'] == 'system' -%}
5+
{%- set ns.found = true -%}
6+
{%- endif -%}
7+
{%- endfor -%}
8+
{%- if not ns.found -%}
9+
{{- '' -}}
10+
{%- endif %}
11+
{%- for message in messages %}
12+
{%- if message['role'] == 'system' -%}
13+
{{- 'System:' + message['content'] + '\n\n' -}}
14+
{%- else -%}
15+
{%- if message['role'] == 'user' -%}
16+
{{-'User: ' + message['content'] + '\n\n'-}}
17+
{%- else -%}
18+
{{-'Assistant: ' + message['content'] + '\n\n' -}}
19+
{%- endif -%}
20+
{%- endif -%}
21+
{%- endfor -%}
22+
{%- if add_generation_prompt -%}
23+
{{-'Assistant:'-}}
24+
{%- endif -%}
25+

js/main.js

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -144,22 +144,21 @@ targetElement.addEventListener("scroll", function() {
144144

145145
// Create a MutationObserver instance
146146
const observer = new MutationObserver(function(mutations) {
147-
mutations.forEach(function(mutation) {
148-
updateCssProperties();
149-
150-
const firstChild = targetElement.children[0];
151-
if (firstChild.classList.contains("generating")) {
152-
typing.parentNode.classList.add("visible-dots");
153-
document.getElementById("stop").style.display = "flex";
154-
document.getElementById("Generate").style.display = "none";
155-
} else {
156-
typing.parentNode.classList.remove("visible-dots");
157-
document.getElementById("stop").style.display = "none";
158-
document.getElementById("Generate").style.display = "flex";
159-
}
147+
updateCssProperties();
148+
149+
const firstChild = targetElement.children[0];
150+
if (firstChild.classList.contains("generating")) {
151+
typing.parentNode.classList.add("visible-dots");
152+
document.getElementById("stop").style.display = "flex";
153+
document.getElementById("Generate").style.display = "none";
154+
} else {
155+
typing.parentNode.classList.remove("visible-dots");
156+
document.getElementById("stop").style.display = "none";
157+
document.getElementById("Generate").style.display = "flex";
158+
}
160159

161-
doSyntaxHighlighting();
162-
});
160+
161+
doSyntaxHighlighting();
163162

164163
if(!isScrolled) {
165164
targetElement.scrollTop = targetElement.scrollHeight;
@@ -215,6 +214,9 @@ function doSyntaxHighlighting() {
215214
indexes.forEach((index) => {
216215
const element = elements[index];
217216

217+
// Tag this element to prevent it from being highlighted twice
218+
element.setAttribute("data-highlighted", "true");
219+
218220
// Perform syntax highlighting
219221
const codeBlocks = element.querySelectorAll("pre code");
220222

@@ -231,8 +233,6 @@ function doSyntaxHighlighting() {
231233
],
232234
});
233235

234-
// Tag this element to indicate it has been syntax highlighted
235-
element.setAttribute("data-highlighted", "true");
236236
});
237237

238238
observer.observe(targetElement, config);

models/config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,5 @@
204204
instruction_template: 'ChatML'
205205
.*airoboros-3_1-yi-34b-200k:
206206
instruction_template: 'Llama-v2'
207+
.*chatqa:
208+
instruction_template: 'NVIDIA-ChatQA'

modules/llamacpp_hf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
217217
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
218218
'logits_all': shared.args.logits_all,
219219
'offload_kqv': not shared.args.no_offload_kqv,
220-
'split_mode': 1 if not shared.args.row_split else 2
220+
'split_mode': 1 if not shared.args.row_split else 2,
221+
'flash_attn': shared.args.flash_attn
221222
}
222223

223224
Llama = llama_cpp_lib().Llama

modules/llamacpp_model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ def from_pretrained(self, path):
9696
'tensor_split': tensor_split_list,
9797
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
9898
'offload_kqv': not shared.args.no_offload_kqv,
99-
'split_mode': 1 if not shared.args.row_split else 2
99+
'split_mode': 1 if not shared.args.row_split else 2,
100+
'flash_attn': shared.args.flash_attn
100101
}
101102

102103
result.model = Llama(**params)

modules/loaders.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
'no_offload_kqv',
4747
'row_split',
4848
'tensorcores',
49+
'flash-attn',
4950
'streaming_llm',
5051
'attention_sink_size',
5152
],
@@ -71,6 +72,7 @@
7172
'no_offload_kqv',
7273
'row_split',
7374
'tensorcores',
75+
'flash-attn',
7476
'streaming_llm',
7577
'attention_sink_size',
7678
'llamacpp_HF_info',

modules/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ def load_model(model_name, loader=None):
107107
elif loader in ['llama.cpp', 'llamacpp_HF']:
108108
shared.settings['truncation_length'] = shared.args.n_ctx
109109

110+
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
110111
logger.info(f"LOADER: \"{loader}\"")
111112
logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
112113
logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
113-
logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
114114
return model, tokenizer
115115

116116

modules/shared.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@
114114

115115
# llama.cpp
116116
group = parser.add_argument_group('llama.cpp')
117+
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
117118
group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
118119
group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
119120
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')

0 commit comments

Comments (0)