vllm-project · simon-mo · Feb 21, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
@@ -108,8 +108,7 @@ A code example can be found here: <gh-file:examples/offline_inference/classifica
 ### `LLM.score`
 
 The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
-It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html).
-These types of models serve as rerankers between candidate query-document pairs in RAG systems.
+It is designed for embedding models and cross-encoder models. For embedding models, the score is the cosine similarity between the embeddings of the two sentences, while [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems.
 
 :::{note}
 vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
@@ -49,7 +49,7 @@ In addition, we have the following custom APIs:
 - [Pooling API](#pooling-api) (`/pooling`)
   - Applicable to all [pooling models](../models/pooling_models.md).
 - [Score API](#score-api) (`/score`)
-  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
+  - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`).
 - [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
   - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
   - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
@@ -320,10 +320,10 @@ Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
 ### Score API
 
-Our Score API applies a cross-encoder model to predict scores for sentence pairs.
+Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model, the score corresponds to the cosine similarity between each embedding pair.
 Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
 
-You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
+You can find the documentation for cross-encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
 Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
 
@@ -483,11 +483,11 @@ The following extra parameters are supported:
 
 ### Re-rank API
 
-Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and
+Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevance scores between a single query and
 each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on
 a scale of 0 to 1.
 
-You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
+You can find the documentation for cross-encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
 
 The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the
 `score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank`

@@ -8,17 +8,17 @@
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "BAAI/bge-reranker-base"
+DTYPE = "half"
 
 
 @pytest.fixture(scope="module")
 def server():
-    args = ["--enforce-eager", "--max-model-len", "100"]
+    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
     query = "What is the capital of France?"
@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
     assert rerank.results[1].relevance_score <= 0.01
 
 
-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_top_n(server: RemoteOpenAIServer, model_name: str):
     query = "What is the capital of France?"
@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
     assert rerank.results[1].relevance_score <= 0.01
 
 
-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
 
@@ -86,4 +84,4 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
     assert rerank_response.status_code == 400
     # Assert just a small fragments of the response
     assert "Please reduce the length of the input." in \
-        rerank_response.text
+        rerank_response.text
@@ -1,123 +1,185 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import math
+from typing import Any
+
 import pytest
 import requests
+import torch.nn.functional as F
+from torch import tensor
 
 from vllm.entrypoints.openai.protocol import ScoreResponse
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "BAAI/bge-reranker-v2-m3"
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = ["--enforce-eager", "--max-model-len", "100"]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+MODELS = [
+    {
+        "name": "BAAI/bge-reranker-v2-m3",
+        "is_cross_encoder": True
+    },
+    {
+        "name": "BAAI/bge-base-en-v1.5",
+        "is_cross_encoder": False
+    },
+]
+DTYPE = "half"
+
+
+def run_transformers(hf_model, model, text_pairs):
+    if model["is_cross_encoder"]:
+        return hf_model.predict(text_pairs).tolist()
+    else:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        return [
+            F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
+            for pair in hf_embeddings
+        ]
+
+
+@pytest.fixture(scope="class", params=MODELS)
+def model(request):
+    yield request.param
+
+
+@pytest.fixture(scope="class")
+def server(model: dict[str, Any]):
+    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
+
+    with RemoteOpenAIServer(model["name"], args) as remote_server:
         yield remote_server
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
-    text_1 = "What is the capital of France?"
-    text_2 = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-    ]
-
-    score_response = requests.post(server.url_for("score"),
-                                   json={
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
-                                   })
-    score_response.raise_for_status()
-    score = ScoreResponse.model_validate(score_response.json())
-
-    assert score.id is not None
-    assert score.data is not None
-    assert len(score.data) == 2
-    assert score.data[0].score <= 0.01
-    assert score.data[1].score >= 0.9
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
-    text_1 = [
-        "What is the capital of the United States?",
-        "What is the capital of France?"
-    ]
-    text_2 = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-    ]
-
-    score_response = requests.post(server.url_for("score"),
-                                   json={
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
-                                   })
-    score_response.raise_for_status()
-    score = ScoreResponse.model_validate(score_response.json())
-
-    assert score.id is not None
-    assert score.data is not None
-    assert len(score.data) == 2
-    assert score.data[0].score <= 0.01
-    assert score.data[1].score >= 0.9
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
-    text_1 = "What is the capital of France?"
-    text_2 = "The capital of France is Paris."
-
-    score_response = requests.post(server.url_for("score"),
-                                   json={
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
-                                   })
-    score_response.raise_for_status()
-    score = ScoreResponse.model_validate(score_response.json())
-
-    assert score.id is not None
-    assert score.data is not None
-    assert len(score.data) == 1
-    assert score.data[0].score >= 0.9
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
-
-    text_1 = "What is the capital of France?" * 20
-    text_2 = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-    ]
-
-    score_response = requests.post(server.url_for("score"),
-                                   json={
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
-                                   })
-    assert score_response.status_code == 400
-    # Assert just a small fragments of the response
-    assert "Please reduce the length of the input." in \
-        score_response.text
-
-    # Test truncation
-    score_response = requests.post(server.url_for("score"),
-                                   json={
-                                       "model": model_name,
-                                       "text_1": text_1,
-                                       "text_2": text_2,
-                                       "truncate_prompt_tokens": 101
-                                   })
-    assert score_response.status_code == 400
-    assert "Please, select a smaller truncation size." in \
-        score_response.text
+@pytest.fixture(scope="class")
+def runner(model: dict[str, Any], hf_runner):
+    kwargs = {
+        "dtype": DTYPE,
+        "is_cross_encoder" if model["is_cross_encoder"]\
+              else "is_sentence_transformer": True
+    }
+
+    with hf_runner(model["name"], **kwargs) as hf_model:
+        yield hf_model
+
+
+class TestModel:
+
+    def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
+                                    model: dict[str, Any], runner):
+        text_1 = "What is the capital of France?"
+        text_2 = [
+            "The capital of Brazil is Brasilia.",
+            "The capital of France is Paris."
+        ]
+
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                       })
+        score_response.raise_for_status()
+        score = ScoreResponse.model_validate(score_response.json())
+
+        assert score.id is not None
+        assert score.data is not None
+        assert len(score.data) == 2
+
+        vllm_outputs = [d.score for d in score.data]
+
+        text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
+        hf_outputs = run_transformers(runner, model, text_pairs)
+
+        for i in range(len(vllm_outputs)):
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+
+    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
+                                     model: dict[str, Any], runner):
+        text_1 = [
+            "What is the capital of the United States?",
+            "What is the capital of France?"
+        ]
+        text_2 = [
+            "The capital of Brazil is Brasilia.",
+            "The capital of France is Paris."
+        ]
+
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                       })
+        score_response.raise_for_status()
+        score = ScoreResponse.model_validate(score_response.json())
+
+        assert score.id is not None
+        assert score.data is not None
+        assert len(score.data) == 2
+
+        vllm_outputs = [d.score for d in score.data]
+
+        text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
+        hf_outputs = run_transformers(runner, model, text_pairs)
+
+        for i in range(len(vllm_outputs)):
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+
+    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
+                                   model: dict[str, Any], runner):
+        text_1 = "What is the capital of France?"
+        text_2 = "The capital of France is Paris."
+
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                       })
+        score_response.raise_for_status()
+        score = ScoreResponse.model_validate(score_response.json())
+
+        assert score.id is not None
+        assert score.data is not None
+        assert len(score.data) == 1
+
+        vllm_outputs = [d.score for d in score.data]
+
+        text_pairs = [[text_1, text_2]]
+        hf_outputs = run_transformers(runner, model, text_pairs)
+
+        for i in range(len(vllm_outputs)):
+            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+
+    def test_score_max_model_len(self, server: RemoteOpenAIServer,
+                                 model: dict[str, Any]):
+
+        text_1 = "What is the capital of France?" * 20
+        text_2 = [
+            "The capital of Brazil is Brasilia.",
+            "The capital of France is Paris."
+        ]
+
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                       })
+        assert score_response.status_code == 400
+        # Assert just a small fragment of the response
+        assert "Please reduce the length of the input." in \
+            score_response.text
+
+        # Test truncation
+        score_response = requests.post(server.url_for("score"),
+                                       json={
+                                           "model": model["name"],
+                                           "text_1": text_1,
+                                           "text_2": text_2,
+                                           "truncate_prompt_tokens": 101
+                                       })
+        assert score_response.status_code == 400
+        assert "Please, select a smaller truncation size." in \
+            score_response.text