diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py
deleted file mode 100644
index 2faabcb031f7..000000000000
--- a/tests/lora/test_ultravox.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import shutil
-from os import path
-from tempfile import TemporaryDirectory
-
-import pytest
-import torch
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file, save_file
-from transformers import AutoTokenizer
-
-from vllm.lora.request import LoRARequest
-
-from ..models.utils import check_outputs_equal
-
-ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
-LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
-
-VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
-
-PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
-
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-def llama3_1_8b_chess_lora_path():
-    return snapshot_download(
-        repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
-
-
-# can't use llama lora adapter without module name transformation
-# because ultravox nest language model
-def transform_module_names_for_ultravox(state_dict):
-    transformed_state_dict = {}
-    for key, value in state_dict.items():
-        new_key = key.replace("base_model.model",
-                              "base_model.model.language_model")
-        transformed_state_dict[new_key] = value
-    return transformed_state_dict
-
-
-def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
-    tensor_file = "adapter_model.safetensors"
-    state_dict = load_file(path.join(source_repo, tensor_file))
-    transformed_state_dict = transform_module_names_for_ultravox(state_dict)
-
-    save_file(transformed_state_dict, path.join(target_path, tensor_file))
-
-    config_file = "adapter_config.json"
-    shutil.copyfile(path.join(source_repo, config_file),
-                    path.join(target_path, config_file))
-    return target_path
-
-
-def _get_prompt(audio_count, question, placeholder, model_name) -> str:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    placeholder = f"{placeholder}\n" * audio_count
-
-    return tokenizer.apply_chat_template([{
-        'role': 'user',
-        'content': f"{placeholder}{question}"
-    }],
-                                         tokenize=False,
-                                         add_generation_prompt=True)
-
-
-def test_ultravox_lora(vllm_runner):
-    """
-    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
-    """
-    # Workaround to prevent device mismatch in Whisper.
-    # Can be removed when it is fixed upstream in transformer
-    # https://github.com/huggingface/transformers/pull/35866
-    torch.set_default_device("cpu")
-
-    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
-    with TemporaryDirectory() as temp_ultravox_lora_dir:
-        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
-            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
-        with vllm_runner(
-                ULTRAVOX_MODEL_NAME,
-                enforce_eager=True,
-                max_num_seqs=2,
-                enable_lora=True,
-                max_loras=1,
-                max_lora_rank=128,
-                dtype="bfloat16",
-                max_model_len=1024,
-        ) as vllm_model:
-            ultravox_outputs: list[tuple[
-                list[int], str]] = vllm_model.generate_greedy(
-                    [
-                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
-                                    ULTRAVOX_MODEL_NAME)
-                    ],
-                    256,
-                    lora_request=LoRARequest(str(1), 1,
-                                             llama3_1_8b_ultravox_chess_lora),
-                )
-
-    # run llama with and without lora to compare outputs with above
-    with vllm_runner(
-            LLMA_MODEL_NAME,
-            enforce_eager=True,
-            max_num_seqs=2,
-            enable_lora=True,
-            max_loras=1,
-            max_lora_rank=128,
-            dtype="bfloat16",
-            max_model_len=1024,
-    ) as vllm_model:
-        llama_outputs: list[tuple[list[int], str]] = (
-            vllm_model.generate_greedy(
-                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
-                256,
-                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
-            ))
-
-    check_outputs_equal(
-        outputs_0_lst=ultravox_outputs,
-        outputs_1_lst=llama_outputs,
-        name_0="ultravox",
-        name_1="llama",
-    )