diff --git a/docs/models/samples/dummy_llama_2.py b/docs/models/samples/dummy_llama_2.py
new file mode 100644
index 000000000..7304ff41c
--- /dev/null
+++ b/docs/models/samples/dummy_llama_2.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+This example demonstrates how to use the mgleize/fairseq2-dummy-Llama-3.2-1B model with vLLM.
+The model weights are randomly initialized, so the model is intended for testing purposes only.
+Note:
+    This model is not yet available on ModelScope, so if ModelScope is enabled in your environment, e.g.:
+    ```bash
+    export VLLM_USE_MODELSCOPE=True
+    ```
+    set it to False so that vLLM downloads the model weights directly from Hugging Face.
+
+
+Requirements:
+- vLLM: v0.11.0 or higher
+- vLLM-metax: v0.11.0 or higher
+- MACA SDK: 3.2.x.x or higher
+"""
+
+import argparse
+import os
+import sys
+from typing import List
+import torchvision
+torchvision.disable_beta_transforms_warning()  # silence the torchvision beta-transforms warning
+
+from vllm import LLM, SamplingParams
+
+DEFAULT_PROMPTS: List[str] = [
+    "Describe vLLM's core advantages in one paragraph.",
+    "Give me 3 engineering recommendations to improve large language model throughput, and briefly explain why.",
+    "Explain what tensor parallelism is and when to use it.",
+    "Translate this sentence into Chinese: We are using vLLM to perform offline inference validation testing.",
+    "Write a Python snippet (no more than 5 lines) showing how to use vLLM to call a local model to generate text.",
+]
+
+def load_prompts(args) -> List[str]:
+    prompts: List[str] = []
+    if args.prompt:
+        prompts.extend([p.strip() for p in args.prompt if p.strip()])
+    if args.prompt_file:
+        path = args.prompt_file
+        if os.path.isfile(path):
+            with open(path, "r", encoding="utf-8") as f:
+                for line in f:
+                    p = line.strip()
+                    if p:
+                        prompts.append(p)
+        else:
+            print(f"[WARN] Prompt file does not exist: {path}")
+    if not prompts:
+        prompts = DEFAULT_PROMPTS.copy()
+    return prompts
+
+def main():
+    # Command-line arguments so the script is configurable without editing the code
+    parser = argparse.ArgumentParser(description="Minimal offline inference with vLLM (batch prompts, simplified)")
+    parser.add_argument("--model", default="mgleize/fairseq2-dummy-Llama-3.2-1B",
+                        help="Model name or local path. Defaults to a small demo model; replace with e.g. meta-llama/Llama-3.2-1B-Instruct")
+    parser.add_argument("-p", "--prompt", action="append",
+                        help="Prompt text; repeat the flag to provide multiple prompts")
+    parser.add_argument("--prompt-file",
+                        help="Load prompts from a file, one per line")
+    parser.add_argument("--max-tokens", type=int, default=128,
+                        help="Maximum number of generated tokens")
+    parser.add_argument("--temperature", type=float, default=0.2,
+                        help="Sampling temperature")
+    args = parser.parse_args()
+
+    # Inference configuration
+    sampling = SamplingParams(
+        temperature=args.temperature,
+        max_tokens=args.max_tokens,
+    )
+    prompts = load_prompts(args)
+    print(f"[INFO] Loading model: {args.model}")
+    llm = LLM(
+        model=args.model,  # HF repo ID or local path
+        dtype="auto",
+        trust_remote_code=True,  # allow custom model code from a remote repo
+        tensor_parallel_size=1,  # adjust to the number of available GPUs
+        gpu_memory_utilization=0.90,
+    )
+
+    print(f"[INFO] Batch generation, total {len(prompts)} prompts")
+    outputs = llm.generate(prompts, sampling)
+
+    print("\n===== OUTPUT =====")
+    for i, (inp, out) in enumerate(zip(prompts, outputs), 1):
+        text = out.outputs[0].text.strip() if out.outputs else ""
+        print(f"\n--- #{i} ---")
+        print(f"Prompt:\n<< {inp}")
+        print(f"Output:\n>> {text}")
+
+    success = any(o.outputs and (o.outputs[0].text.strip() != "") for o in outputs)
+    sys.exit(0 if success else 2)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index f2974db78..06cbebde5 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -45,6 +45,7 @@ Here the plugin would list all the **tested** model on Maca.
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
 | `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
 
 ## List of Multimodal Language Models
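
For reference, a minimal way to exercise the new sample from the command line might look like the sketch below. It assumes the script is run from the repository root; the prompt strings are illustrative, and only the flags defined by the script's argparse setup are used:

```bash
# Run the sample with the default dummy model and two custom prompts.
python docs/models/samples/dummy_llama_2.py \
    -p "Describe vLLM's core advantages in one paragraph." \
    -p "Explain what tensor parallelism is and when to use it." \
    --max-tokens 64 --temperature 0.2

# The script exits with 0 if at least one prompt produced non-empty output
# and 2 otherwise, so it can double as a simple smoke test.
echo $?
```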