diff --git a/experiments/faster_generation/benchmark_code_python.py b/experiments/faster_generation/benchmark_code_python.py
index decdbc7..fca75ca 100644
--- a/experiments/faster_generation/benchmark_code_python.py
+++ b/experiments/faster_generation/benchmark_code_python.py
@@ -10,14 +10,14 @@ INPUT_LEN = 256
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
 
     ds = load_dataset("bigcode/the-stack", data_dir="data/python", split="train", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = next(ds_iterator)["content"]
@@ -27,8 +27,9 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
         generate_kwargs = {
             "do_sample": False,
             "temperature": temperature,
-            "max_length": GEN_LEN,
+            "max_new_tokens": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True
diff --git a/experiments/faster_generation/benchmark_decoder_open.py b/experiments/faster_generation/benchmark_decoder_open.py
index 678c5fd..1f0dd3f 100644
--- a/experiments/faster_generation/benchmark_decoder_open.py
+++ b/experiments/faster_generation/benchmark_decoder_open.py
@@ -10,14 +10,14 @@ GEN_LEN = 128
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
 
     ds = load_dataset("allenai/c4", "en", split="validation", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
+    desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model"
     pbar = tqdm(range(num_samples), desc)
     for i in pbar:
         next_data = next(ds_iterator)["text"]
@@ -29,6 +29,7 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista
             "temperature": temperature,
             "max_length": GEN_LEN,
             "assistant_model": assistant_model,
+            "assistant_early_exit": assistant_early_exit,
         }
         if temperature is not None:
             generate_kwargs["do_sample"] = True
diff --git a/experiments/faster_generation/benchmark_decoder_summ.py b/experiments/faster_generation/benchmark_decoder_summ.py
index ffc852d..6e7a3e1 100644
--- a/experiments/faster_generation/benchmark_decoder_summ.py
+++ b/experiments/faster_generation/benchmark_decoder_summ.py
@@ -9,14 +9,14 @@ GEN_LEN = 128
 
 
-def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None):
+def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None):
     outputs = []
     gen_time = []
     num_tokens = []
 
     ds = load_dataset("cnn_dailymail", "3.0.0", split="validation", streaming=True)
     ds_iterator = iter(ds.take(num_samples))
 
-    desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model"
model" + desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model" pbar = tqdm(range(num_samples), desc) for i in pbar: next_data = "Article: " + next(ds_iterator)["article"] + " Summary:" @@ -26,8 +26,9 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista generate_kwargs = { "do_sample": False, "temperature": temperature, - "max_length": GEN_LEN, + "max_new_tokens": GEN_LEN, "assistant_model": assistant_model, + "assistant_early_exit": assistant_early_exit, } if temperature is not None: generate_kwargs["do_sample"] = True diff --git a/experiments/faster_generation/benchmark_flant5_summ.py b/experiments/faster_generation/benchmark_flant5_summ.py index 3fecd41..ed9edb2 100644 --- a/experiments/faster_generation/benchmark_flant5_summ.py +++ b/experiments/faster_generation/benchmark_flant5_summ.py @@ -9,14 +9,14 @@ GEN_LEN = 128 -def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None): +def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None): outputs = [] gen_time = [] num_tokens = [] ds = load_dataset("cnn_dailymail", "3.0.0", split="validation", streaming=True) ds_iterator = iter(ds.take(num_samples)) - desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model" + desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model" pbar = tqdm(range(num_samples), desc) for i in pbar: next_data = "Summarize: " + next(ds_iterator)["article"] @@ -28,6 +28,7 @@ def run_prediction_loop(model, tokenizer, num_samples, temperature=None, assista "temperature": temperature, "max_length": GEN_LEN, "assistant_model": assistant_model, + "assistant_early_exit": assistant_early_exit, } if temperature is not None: generate_kwargs["do_sample"] = True diff --git a/experiments/faster_generation/benchmark_whisper.py b/experiments/faster_generation/benchmark_whisper.py index a960f95..6528209 100644 --- a/experiments/faster_generation/benchmark_whisper.py +++ b/experiments/faster_generation/benchmark_whisper.py @@ -8,7 +8,7 @@ TORCH_DEVICE = 0 -def run_prediction_loop(model, processor, num_samples, temperature=None, assistant_model=None, assistant_tokenizer=None): +def run_prediction_loop(model, processor, num_samples, temperature=None, assistant_model=None, assistant_early_exit=None, assistant_tokenizer=None): outputs = [] gen_time = [] num_tokens = [] @@ -16,7 +16,7 @@ def run_prediction_loop(model, processor, num_samples, temperature=None, assista ds = load_dataset("librispeech_asr", "clean", split="validation") speech_samples = ds.select(range(num_samples))[:num_samples]["audio"] - desc = "ORIGINAL model" if assistant_model is None else f"ASSISTED model" + desc = "ORIGINAL model" if assistant_model is None and assistant_early_exit is None else f"ASSISTED model" pbar = tqdm(range(num_samples), desc) for i in pbar: @@ -32,6 +32,7 @@ def run_prediction_loop(model, processor, num_samples, temperature=None, assista "do_sample": False, "temperature": temperature, "assistant_model": assistant_model, + "assistant_early_exit"=assistant_early_exit, } if temperature is not None: generate_kwargs["do_sample"] = True diff --git a/experiments/faster_generation/utils.py b/experiments/faster_generation/utils.py index 8a9c11f..3bb5f90 100644 --- a/experiments/faster_generation/utils.py +++ b/experiments/faster_generation/utils.py @@ -24,6 
@@ -24,6 +24,7 @@ def get_parsed_args():
     )
     parser.add_argument('model', type=str, help='The HF repo of the *main* model to be used')
     parser.add_argument('--aux-model', type=str, default=None, help='The HF repo of the *assistant* model to be used')
+    parser.add_argument('--aux-early-exit', type=int, default=None, help='The layer of *assistant* early exit to be used')
    parser.add_argument('--dtype', type=str, default=None, help='The data type to be used in BOTH models')
     parser.add_argument(
         '--temperature', type=float, help='The temperature value for sampling. If not set, greedy decoding is used.'
@@ -60,7 +61,7 @@ def run_model(args, processor_cls, model_cls, run_prediction_loop):
     tokenizer = processor_cls.from_pretrained(args.model)
     if args.max_gpu_memory is None:
         # fails if it doesn't fit in a GPU
-        max_memory = {0: "100GiB", "cpu": "0GiB"}
+        max_memory = None
     else:
         max_memory = {}
         for i in range(len(args.max_gpu_memory)):
@@ -84,15 +85,20 @@ def run_model(args, processor_cls, model_cls, run_prediction_loop):
 
 
 def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop):
+    assert args.aux_model is not None or args.aux_early_exit is not None
     tokenizer = processor_cls.from_pretrained(args.model)
-    assistant_model = model_cls.from_pretrained(args.aux_model)
-    assistant_model = assistant_model.to(device=TORCH_DEVICE, dtype=args.dtype)
-    if assistant_model.generation_config.pad_token_id is None:
-        assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id
+    if args.aux_model:
+        assistant_model = model_cls.from_pretrained(args.aux_model)
+        assistant_model = assistant_model.to(device=TORCH_DEVICE, dtype=args.dtype)
+        if assistant_model.generation_config.pad_token_id is None:
+            assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id
+    else:
+        assistant_model = None
+    assistant_early_exit = args.aux_early_exit
 
     if args.max_gpu_memory is None:
         # fails if it doesn't fit in a GPU
-        max_memory = {0: "100GiB", "cpu": "0GiB"}
+        max_memory = None
     else:
         max_memory = {}
         for i in range(len(args.max_gpu_memory)):
@@ -112,12 +118,16 @@ def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop
         model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
     # If the tokenizer of the two models are different, pass `assistant_tokenizer` to trigger UAG
-    has_same_tokenizer = (
-        model.config.vocab_size == assistant_model.config.vocab_size
-        and model.config.pad_token_id == assistant_model.config.pad_token_id
-        and model.config.eos_token_id == assistant_model.config.eos_token_id
-        and model.config.bos_token_id == assistant_model.config.bos_token_id
-    )
+    if assistant_model:
+        has_same_tokenizer = (
+            model.config.vocab_size == assistant_model.config.vocab_size
+            and model.config.pad_token_id == assistant_model.config.pad_token_id
+            and model.config.eos_token_id == assistant_model.config.eos_token_id
+            and model.config.bos_token_id == assistant_model.config.bos_token_id
+        )
+    else:
+        has_same_tokenizer = True
+
     if has_same_tokenizer:
         assistant_tokenizer = None
     else:
@@ -129,6 +139,7 @@ def run_model_with_assistant(args, processor_cls, model_cls, run_prediction_loop
         num_samples=args.num_samples,
         temperature=args.temperature,
         assistant_model=assistant_model,
+        assistant_early_exit=assistant_early_exit,
         assistant_tokenizer=assistant_tokenizer
     )
    return new_outputs
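
For context, the new `--aux-early-exit` flag is passed straight through to `generate()`'s `assistant_early_exit` argument, so the benchmarks can exercise early-exit self-speculative decoding without loading a second checkpoint. Below is a minimal standalone sketch of that path (not part of the patch); the checkpoint name and exit layer are illustrative assumptions, and any model trained with early-exit losses (e.g. a LayerSkip checkpoint) is expected to work best:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; swap in whichever early-exit-capable model you benchmark.
checkpoint = "facebook/layerskip-llama2-7B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)

# assistant_early_exit=4 drafts tokens with the first 4 layers of the same model
# and verifies them with the full forward pass, so no separate assistant model is needed.
outputs = model.generate(**inputs, max_new_tokens=64, assistant_early_exit=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

With this patch applied, the same path is reachable from the CLI, e.g. `python experiments/faster_generation/benchmark_code_python.py facebook/layerskip-llama2-7B --aux-early-exit 4`.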