opendatahub-io · vaibhavjainwiz · Aug 27, 2024 · Aug 5, 2024 · Aug 5, 2024 · Aug 5, 2024
diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@@ -9,3 +9,4 @@ tasks:
     value: 0.664
 limit: 1000
 num_fewshot: 5
+trust_remote_code: True
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.409
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.406
+    value: 0.416
 limit: 1000
 num_fewshot: 5
diff --git a/...val-harness/configs/Minitron-4B-Base.yaml → ...harness/configs/Minitron-4B-Base-FP8.yaml b/...val-harness/configs/Minitron-4B-Base.yaml → ...harness/configs/Minitron-4B-Base-FP8.yaml
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
-model_name: "nvidia/Minitron-4B-Base"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.252
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.252
+    value: 0.236
 limit: 1000
 num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -4,7 +4,7 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -14,7 +14,7 @@
 import numpy
 import yaml
 
-RTOL = 0.02
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,9 +23,12 @@
 
 
 def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+
     model_args = f"pretrained={eval_config['model_name']}," \
                  f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true"
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
@@ -34,17 +34,18 @@ See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
 
 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label.
+- Every commit of a PR that has both the `perf-benchmarks` label and the `ready` label.
 
 Nightly benchmark will be triggered when:
-- Every commit for those PRs with `nightly-benchmarks` label.
+- Every commit of a PR that has both the `perf-benchmarks` label and the `nightly-benchmarks` label.
 
 
 
 
 ## Performance benchmark details
 
-See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+
+See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 
 #### Latency test
@@ -68,7 +69,7 @@ Here is an example of one test inside `latency-tests.json`:
 
 In this example:
 -  The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
--  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+-  The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -21,7 +21,7 @@ steps:
           containers:
           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
             command:
-            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
             resources:
               limits:
                 nvidia.com/gpu: 8

diff --git a/.../nightly-benchmarks/tests/descriptions.md → ...ks/performance-benchmarks-descriptions.md b/.../nightly-benchmarks/tests/descriptions.md → ...ks/performance-benchmarks-descriptions.md
@@ -1,47 +1,42 @@
 
 ## Latency tests
 
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3.1 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
-### Latency benchmarking results
 
 {latency_tests_markdown_table}
 
-## Throughput tests
 
-This test suite aims to test vllm's throughput.
+## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3.1 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
-### Throughput benchmarking results
 
 {throughput_tests_markdown_table}
 
-## Serving tests
 
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3.1 70B, mixtral 8x7B.
+- A speculative decoding test is also included for llama-3.1 70B, at QPS 2.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
-### Serving benchmarking results
 
 {serving_tests_markdown_table}
 
+
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format. 

diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -174,8 +174,8 @@ def results_to_json(latency, throughput, serving):
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
 
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
+        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+                                "performance-benchmarks-descriptions.md")
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,

diff --git a/...ightly-benchmarks/run-benchmarks-suite.sh → ...rks/scripts/run-performance-benchmarks.sh b/...ightly-benchmarks/run-benchmarks-suite.sh → ...rks/scripts/run-performance-benchmarks.sh
@@ -37,9 +37,9 @@ check_hf_token() {
 ensure_sharegpt_downloaded() {
   local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
   if [ ! -f "$FILE" ]; then
-      wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
   else
-      echo "$FILE already exists."
+    echo "$FILE already exists."
   fi
 }
 
@@ -68,35 +68,38 @@ wait_for_server() {
     done' && return 0 || return 1
 }
 
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-      echo "No GPU processes found."
+kill_processes_launched_by_current_bash() {
+  # Kill all python processes launched from current bash script
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -n "$processes" ]; then
+    echo "Killing the following processes matching '$1':"
+    echo "$processes"
+    echo "$processes" | xargs kill -9
   else
-      for pid in $pids; do
-          kill -9 "$pid"
-          echo "Killed process with PID: $pid"
-      done
-
-      echo "All GPU processes have been killed."
+    echo "No processes found matching '$1'."
   fi
+}
+
+kill_gpu_processes() {
 
-  # waiting for GPU processes to be fully killed
-  # loop while nvidia-smi returns any processes
-  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pkill -f pt_main_thread
+  # this line doesn't work now
+  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  pkill -f python3
+  pkill -f /usr/bin/python3
+
+
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
     sleep 1
-    echo "Waiting for GPU processes to be killed"
   done
 
   # remove vllm config file
   rm -rf ~/.config/vllm
 
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
 
 upload_to_buildkite() {
@@ -114,7 +117,7 @@ upload_to_buildkite() {
   fi
 
   # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 
@@ -166,7 +169,7 @@ run_latency_tests() {
         latency_command: $latency,
         gpu_type: $gpu
       }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$latency_command"
@@ -176,7 +179,6 @@ run_latency_tests() {
   done
 }
 
-
 run_throughput_tests() {
   # run throughput tests using `benchmark_throughput.py`
   # $1: a json file specifying throughput test cases
@@ -224,7 +226,7 @@ run_throughput_tests() {
         throughput_command: $command,
         gpu_type: $gpu
       }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 
     # run the benchmark
     eval "$throughput_command"
@@ -256,7 +258,6 @@ run_serving_tests() {
       continue
     fi
 
-
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.server_parameters')
     client_params=$(echo "$params" | jq -r '.client_parameters')
@@ -334,7 +335,7 @@ run_serving_tests() {
           client_command: $client,
           gpu_type: $gpu
         }')
-      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
     done
 
@@ -351,6 +352,7 @@ main() {
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
 
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@@ -369,7 +371,6 @@ main() {
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 
-
   # postprocess benchmarking results
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
     {
         "test_name": "latency_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "num-iters-warmup": 5,

diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -3,15 +3,15 @@
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,15 +22,15 @@
         "test_name": "serving_llama70B_tp4_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -60,7 +60,7 @@
         "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
         "qps_list": [2],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "disable_log_requests": "", 
             "tensor_parallel_size": 4,
             "swap_space": 16, 
@@ -70,7 +70,7 @@
             "use_v2_block_manager": ""
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "throughput_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
     {
         "test_name": "throughput_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",