98 changes: 98 additions & 0 deletions docs/models/samples/dummy_llama_2.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
This example demonstrates how to use the mgleize/fairseq2-dummy-Llama-3.2-1B model with vLLM.
The model weights are randomly initialized; the model is intended for testing purposes only.

Note:
This model is not yet supported by ModelScope, so if ModelScope is enabled in your codebase, e.g.:
```bash
export VLLM_USE_MODELSCOPE=True
```
set it to False so that vLLM downloads the model weights directly from Hugging Face.

Requirements:
- vLLM: v0.11.0 or higher
- vLLM-metax: v0.11.0 or higher
- MACA SDK: 3.2.x.x or higher
"""

import argparse
import os
import sys
from typing import List
import torchvision
torchvision.disable_beta_transforms_warning()  # suppress torchvision's beta-transforms warning on import
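
# Note: the dummy checkpoint used here is not available on ModelScope. If
# VLLM_USE_MODELSCOPE was exported in your shell, one optional sketch is to
# override it here, before vllm is imported, so the weights are fetched from
# Hugging Face instead (uncomment if needed):
# os.environ["VLLM_USE_MODELSCOPE"] = "False"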

from vllm import LLM, SamplingParams

DEFAULT_PROMPTS: List[str] = [
"Describe vLLM's core advantages in one paragraph.",
"Give me 3 engineering recommendations to improve large language model throughput, and briefly explain why.",
"Explain what tensor parallelism is and when to use it.",
"Translate this sentence into Chinese: We are using vLLM to perform offline inference validation testing.",
"Write a Python snippet (no more than 5 lines) showing how to use vLLM to call a local model to generate text.",
]

def load_prompts(args) -> List[str]:
prompts: List[str] = []
if args.prompt:
prompts.extend([p.strip() for p in args.prompt if p.strip()])
if args.prompt_file:
path = args.prompt_file
if os.path.isfile(path):
with open(path, "r", encoding="utf-8") as f:
for line in f:
p = line.strip()
if p:
prompts.append(p)
else:
print(f"[WARN] Prompt file does not exist: {path}")
if not prompts:
prompts = DEFAULT_PROMPTS.copy()
return prompts

def main():
    # Command-line arguments so the script can be configured from the CLI
parser = argparse.ArgumentParser(description="Minimal offline inference with vLLM (batch prompts, simplified)")
parser.add_argument("--model", default="mgleize/fairseq2-dummy-Llama-3.2-1B",
help="Model name or local directory. If not provided, a small demo model is used. You can replace with e.g. meta-llama/Llama-3.2-1B-Instruct")
parser.add_argument("-p", "--prompt", action="append",
help="Repeatable; provide multiple prompts by specifying multiple times")
parser.add_argument("--prompt-file",
help="Load prompts from a file, one per line")
parser.add_argument("--max-tokens", type=int, default=128,
help="Maximum number of generated tokens")
parser.add_argument("--temperature", type=float, default=0.2,
help="Sampling temperature")
args = parser.parse_args()

    # Sampling configuration
sampling = SamplingParams(
temperature=args.temperature,
max_tokens=args.max_tokens,
)
prompts = load_prompts(args)
print(f"[INFO] Loading model: {args.model}")
llm = LLM(
model=args.model, # online or local
dtype="auto",
trust_remote_code=True, # allow custom model code from online repo
        tensor_parallel_size=1,          # adjust to the number of GPUs available
gpu_memory_utilization=0.90,
)

print(f"[INFO] Batch generation, total {len(prompts)} prompts")
outputs = llm.generate(prompts, sampling)

print("\n===== OUTPUT =====")
for i, (inp, out) in enumerate(zip(prompts, outputs), 1):
text = out.outputs[0].text.strip() if out.outputs else ""
print(f"\n--- #{i} ---")
print(f"Prompt:\n<< {inp}")
print(f"Output:\n>> {text}")

    # Exit 0 if at least one prompt produced non-empty output, otherwise exit 2
    success = any(o.outputs and (o.outputs[0].text.strip() != "") for o in outputs)
    sys.exit(0 if success else 2)

if __name__ == "__main__":
main()
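
# Example invocations (a sketch; adjust the flags to your environment;
# prompts.txt is a hypothetical file with one prompt per line):
#   python dummy_llama_2.py
#   python dummy_llama_2.py --max-tokens 64 --temperature 0.0
#   python dummy_llama_2.py --model meta-llama/Llama-3.2-1B-Instruct -p "Hello" -p "World"
#   python dummy_llama_2.py --prompt-file prompts.txt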
1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -45,6 +45,7 @@ Here the plugin would list all the **tested** models on Maca.
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |

## List of Multimodal Language Models
