58 changes: 58 additions & 0 deletions docs/models/samples/Olmo-3-1025-7B.py
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""
Requirements:
- vLLM: v0.11.0 or higher
- vLLM-metax: v0.11.0 or higher
- MACA SDK: 3.2.x.x or higher
"""

import argparse
import sys
# torchvision is imported only to silence its beta-transforms warning,
# which can otherwise surface when vLLM pulls in vision dependencies.
import torchvision
torchvision.disable_beta_transforms_warning()

from vllm import LLM, SamplingParams

PROMPTS = [
"Describe vLLM's core advantages in one paragraph.",
"Give me 3 engineering recommendations to improve large language model throughput, and briefly explain why.",
"Explain what tensor parallelism is and when to use it.",
"Translate this sentence into Chinese: We are using vLLM to perform offline inference validation testing.",
"Write a Python snippet (no more than 5 lines) showing how to use vLLM to call a local model to generate text.",
]

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="allenai/Olmo-3-1025-7B", help="Model name or path.")
    parser.add_argument("--max-tokens", type=int, default=128, help="Maximum number of tokens to generate per prompt.")
    parser.add_argument("--temperature", type=float, default=0.2, help="Sampling temperature.")
    args = parser.parse_args()

    sampling = SamplingParams(
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        n=1,
    )

    print(f"[INFO] Loading model: {args.model}")
    llm = LLM(
        model=args.model,
        dtype="auto",
        # WARNING: `trust_remote_code=True` is a security risk, as it allows arbitrary code execution from the model's repository. Only use it with models you trust.
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.90,
    )

    print(f"[INFO] Generating {len(PROMPTS)} completions")
    outputs = llm.generate(PROMPTS, sampling)

    for i, (prompt, out) in enumerate(zip(PROMPTS, outputs), 1):
        text = out.outputs[0].text.strip() if out.outputs else ""
        print(f"\n[{i}]:\nUser: <<< {prompt}\nAssistant: >>> {text}")

    # Exit non-zero when no prompt produced any text, so CI can flag the run.
    success = any(o.outputs and o.outputs[0].text.strip() for o in outputs)
    sys.exit(0 if success else 2)


if __name__ == "__main__":
    main()
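
The sample pins `tensor_parallel_size=1`; on a multi-GPU MACA host the same `LLM` constructor can shard the model across devices. A minimal sketch, with an illustrative device count:

```python
from vllm import LLM

llm = LLM(
    model="allenai/Olmo-3-1025-7B",
    dtype="auto",
    trust_remote_code=True,     # same security caveat as in the sample above
    tensor_parallel_size=4,     # illustrative; must not exceed the visible GPU count
    gpu_memory_utilization=0.90,
)
```

To run the sample itself: `python docs/models/samples/Olmo-3-1025-7B.py --max-tokens 256`; a non-zero exit status means no prompt produced any text.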
1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -45,6 +45,7 @@ Here the plugin lists all the **tested** models on Maca.
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Olmo3ForCausalLM` | OLMo3 | TBA, `allenai/Olmo-3-1025-7B`, etc. | ✅︎ | ✅︎ |
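
Whether a particular build actually registers the architecture can be verified against vLLM's model registry; a minimal sketch, assuming a working vLLM install:

```python
from vllm import ModelRegistry

# Prints True when the Olmo3 architecture is registered on this install.
print("Olmo3ForCausalLM" in ModelRegistry.get_supported_archs())
```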

## List of Multimodal Language Models
