model-rerun.py (forked from ggml-org/llama.cpp)
import subprocess
import re
import sys
# Benchmark llama-cli with progressively larger prompts and summarize timings.
# Example:
#   python3 model-rerun.py /proj/rel/sw/ggml/models/Tiny-Llama-v0.3-FP32-1.1B-F32.gguf
# Check for the model path argument
if len(sys.argv) < 2:
    print("Usage: python3 model-rerun.py /full/path/to/model.gguf")
    sys.exit(1)
model_path = sys.argv[1]
# Base paragraph; repeated below to scale the prompt size
base_prompt = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Symbol used by Apple and Google on some devices to denote an Ethernet connection. "
    "Ethernet is a family of wired computer networking technologies used in LAN, MAN, and WAN. "
    "It was introduced in 1980 and standardized in 1983 as IEEE 802.3. "
    "Over time, Ethernet has replaced technologies like Token Ring and ARCNET. "
    "The original 10BASE5 Ethernet used a thick coaxial cable. "
    "Question: What is Ethernet? Helpful Answer: An Ethernet network is a type of computer network. "
    "Next topic: California. California, often called the 'Golden State', is the most populous U.S. state. "
    "It stretches along the Pacific Ocean and features diverse geography from beaches to mountains. "
)
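# The prompt grows linearly with the multiplier: the paragraph above is
# roughly 0.8 KB of text, so the largest (5x) prompt is about 4 KB.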
# Prompt size multipliers
multipliers = [1, 2, 3, 4, 5]
results = []

# Sweep all multipliers; to rerun only the largest prompt, iterate
# multipliers[4:] with start=5 instead.
for i, multiplier in enumerate(multipliers, start=1):
    prompt = base_prompt * multiplier
    prompt_length = len(prompt)
    print(f"\n🔄 Run {i}: Testing with prompt size {multiplier}x, actual size = {prompt_length} characters")
    command = [
        "./build-posix/bin/llama-cli",
        "-p", prompt,
        "-m", model_path,
        "--device", "none",         # disable GPU/accelerator devices
        "-c", "12288",              # context window large enough for the 5x prompt
        "--temp", "0.0",            # greedy, deterministic sampling
        "--n-predict", "5",         # generate only 5 tokens; prompt eval dominates
        "--repeat-penalty", "1.5",
        "-b", "1024",               # batch size
        "--top-k", "50",
        "--top-p", "0.9",
        "--repeat-last-n", "5",
        "--no-warmup",              # skip the warmup run so timings reflect a cold start
    ]
print("🚀 Executing llama-cli...")
try:
output = subprocess.check_output(command, stderr=subprocess.STDOUT, text=True)
print("✅ Execution complete.")
except subprocess.CalledProcessError as e:
output = e.output
print("⚠️ Execution failed, capturing output.")
print("🔍 Parsing performance metrics...")
    load_time = re.search(r"load time\s*=\s*([\d.]+) ms", output)
    prompt_eval_time = re.search(r"prompt eval time\s*=\s*([\d.]+) ms", output)
    # Negative lookbehind so this matches "eval time" but not "prompt eval time",
    # which appears earlier in the output and would otherwise win the search.
    eval_time = re.search(r"(?<!prompt )eval time\s*=\s*([\d.]+) ms", output)
    results.append({
        "Run": i,
        "Prompt Size": f"{multiplier}x",
        "Load Time (ms)": float(load_time.group(1)) if load_time else "N/A",
        "Prompt Eval Time (ms)": float(prompt_eval_time.group(1)) if prompt_eval_time else "N/A",
        "Eval Time (ms)": float(eval_time.group(1)) if eval_time else "N/A",
    })
    print("📦 Metrics captured.")
# Final summary
print("\n📊 Benchmark Summary:")
print(f"{'Run':<5} {'Prompt Size':<12} {'Load Time (ms)':<18} {'Prompt Eval Time (ms)':<24} {'Eval Time (ms)':<18}")
for result in results:
    print(f"{result['Run']:<5} {result['Prompt Size']:<12} {str(result['Load Time (ms)']):<18} {str(result['Prompt Eval Time (ms)']):<24} {str(result['Eval Time (ms)']):<18}")