gante · mostafaelhoushi · Oct 28, 2024 · Oct 28, 2024 · Nov 4, 2024 · Nov 19, 2024
diff --git a/experiments/faster_generation/benchmark_code_python.py b/experiments/faster_generation/benchmark_code_python.py
@@ -10,14 +10,14 @@
 INPUT_LEN = 256
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
     ds = load_dataset("bigcode/the-stack", data_dir="data/python", split="train", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = next(ds_iterator)["content"]
@@ -27,8 +27,9 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
         generate_kwargs = {
             "do_sample": False,
             "temperature": temperature,
-            "max_length": GEN_LEN,
+            "max_new_tokens": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True

diff --git a/experiments/faster_generation/benchmark_decoder_open.py b/experiments/faster_generation/benchmark_decoder_open.py
@@ -10,14 +10,14 @@
 GEN_LEN = 128
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
     ds = load_dataset("allenai/c4", "en", split="validation", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = next(ds_iterator)["text"]
@@ -29,6 +29,7 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
             "temperature": temperature,
             "max_length": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True

diff --git a/experiments/faster_generation/benchmark_decoder_summ.py b/experiments/faster_generation/benchmark_decoder_summ.py
@@ -9,14 +9,14 @@
 GEN_LEN = 128
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
     ds = load_dataset("cnn_dailymail", "3.0.0", split="validation", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = "Article: " + next(ds_iterator)["article"] + " Summary:"
@@ -26,8 +26,9 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
         generate_kwargs = {
             "do_sample": False,
             "temperature": temperature,
-            "max_length": GEN_LEN,
+            "max_new_tokens": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True

diff --git a/experiments/faster_generation/benchmark_flant5_summ.py b/experiments/faster_generation/benchmark_flant5_summ.py
@@ -9,14 +9,14 @@
 GEN_LEN = 128
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
     ds = load_dataset("cnn_dailymail", "3.0.0", split="validation", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = "Summarize: " + next(ds_iterator)["article"]
@@ -28,6 +28,7 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
             "temperature": temperature,
             "max_length": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True

diff --git a/experiments/faster_generation/benchmark_whisper.py b/experiments/faster_generation/benchmark_whisper.py
@@ -8,15 +8,15 @@
 TORCH_DEVICE = 0
 
 
-def run_prediction_loop(model, processor, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, processor, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
 
     ds = load_dataset("librispeech_asr", "clean", split="validation")
     speech_samples = ds.select(range(num_samples))[:num_samples]["audio"]
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
 
     for i in pbar:
@@ -32,6 +32,7 @@ def run_prediction_loop(model, processor, num_samples, temperature=None, assista
             "do_sample": False,
             "temperature": temperature,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True

diff --git a/experiments/faster_generation/utils.py b/experiments/faster_generation/utils.py
@@ -24,6 +24,7 @@ def get_parsed_args():
     )
     parser.add_argument('model', type=str, help='The HF repo of the *main* model to be used')
     parser.add_argument('--aux-model', type=str, default=None, help='The HF repo of the *assistant* model to be used')
+    parser.add_argument('--aux-early-exit', type=int, default=None, help='The layer of *assistant* early exit to be used')
     parser.add_argument('--dtype', type=str, default=None, help='The data type to be used in BOTH models')
     parser.add_argument(
         '--temperature', type=float, help='The temperature value for sampling. If not set, greedy decoding is used.'
@@ -60,7 +61,7 @@ def run_model(args, processor_cls, model_cls, run_prediction_loop):
     tokenizer = processor_cls.from_pretrained(args.model)
 
     if args.max_gpu_memory is None:  # fails if it doesn't fit in a GPU
-        max_memory = {0: "100GiB", "cpu": "0GiB"}
+        max_memory = None
     else:
         max_memory = {}
         for i in range(len(args.max_gpu_memory)):
@@ -84,15 +85,20 @@ def run_model(args, processor_cls, model_cls, run_prediction_loop):
 
 
 def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop):
+    assert args.aux_model is not None or args.aux_early_exit is not None
     tokenizer = processor_cls.from_pretrained(args.model)
 
-    assistant_model = model_cls.from_pretrained(args.aux_model)
-    assistant_model = assistant_model.to(device=TORCH_DEVICE, dtype=args.dtype)
-    if assistant_model.generation_config.pad_token_id is None:
-        assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id
+    if args.aux_model:
+        assistant_model = model_cls.from_pretrained(args.aux_model)
+        assistant_model = assistant_model.to(device=TORCH_DEVICE, dtype=args.dtype)
+        if assistant_model.generation_config.pad_token_id is None:
+            assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id
+    else:
+        assistant_model = None
+    assistant_early_exit = args.aux_early_exit
 
     if args.max_gpu_memory is None:  # fails if it doesn't fit in a GPU
-        max_memory = {0: "100GiB", "cpu": "0GiB"}
+        max_memory = None
     else:
         max_memory = {}
         for i in range(len(args.max_gpu_memory)):
@@ -112,12 +118,16 @@ def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
     # If the tokenizer of the two models are different, pass `assistant_tokenizer` to trigger UAG
-    has_same_tokenizer = (
-        model.config.vocab_size == assistant_model.config.vocab_size
-        and model.config.pad_token_id == assistant_model.config.pad_token_id
-        and model.config.eos_token_id == assistant_model.config.eos_token_id
-        and model.config.bos_token_id == assistant_model.config.bos_token_id
-    )
+    if assistant_model:
+        has_same_tokenizer = (
+            model.config.vocab_size == assistant_model.config.vocab_size
+            and model.config.pad_token_id == assistant_model.config.pad_token_id
+            and model.config.eos_token_id == assistant_model.config.eos_token_id
+            and model.config.bos_token_id == assistant_model.config.bos_token_id
+        )
+    else:
+        has_same_tokenizer = True
+
     if has_same_tokenizer:
         assistant_tokenizer = None
     else:
@@ -129,6 +139,7 @@ def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop
         num_samples=args.num_samples,
         temperature=args.temperature,
         assistant_model=assistant_model,
+        assistant_early_exit=assistant_early_exit,
         assistant_tokenizer=assistant_tokenizer
     )
     return new_outputs