diff --git a/config/catalog_allow.exs b/config/catalog_allow.exs
index a26698e8..66dc4a4e 100644
--- a/config/catalog_allow.exs
+++ b/config/catalog_allow.exs
@@ -137,6 +137,8 @@ zai_coder_models = :all
 # Cerebras - All models
 cerebras_models = :all
 
+vllm_models = []
+
 config :req_llm, :catalog,
   allow: %{
     anthropic: anthropic_models,
@@ -149,7 +151,8 @@ config :req_llm, :catalog,
     google_vertex_anthropic: google_vertex_anthropic_models,
     zai: zai_models,
     zai_coder: zai_coder_models,
-    cerebras: cerebras_models
+    cerebras: cerebras_models,
+    vllm: vllm_models
   },
   overrides: [],
   custom: []
diff --git a/config/test.exs b/config/test.exs
index 4f3b1077..ca965ba9 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -15,6 +15,7 @@ config :req_llm, :catalog,
   openrouter: :all,
   amazon_bedrock: :all,
   google_vertex_anthropic: :all,
+  vllm: :all,
   zai: :all,
   zai_coder: :all,
   cerebras: :all
diff --git a/lib/req_llm/catalog.ex b/lib/req_llm/catalog.ex
index 0ecdefce..a576c01d 100644
--- a/lib/req_llm/catalog.ex
+++ b/lib/req_llm/catalog.ex
@@ -346,7 +346,7 @@ defmodule ReqLLM.Catalog do
 
       allowed_spec?(:anthropic, "claude-3-5-sonnet")
       # => true (if anthropic: :all in catalog)
-
+
      allowed_spec?(:openai, "gpt-4o-mini")
       # => true (if matches pattern)
   """
diff --git a/lib/req_llm/model.ex b/lib/req_llm/model.ex
index 7e90d067..1d618bc9 100644
--- a/lib/req_llm/model.ex
+++ b/lib/req_llm/model.ex
@@ -51,6 +51,7 @@ defmodule ReqLLM.Model do
     field(:modalities, %{input: [modality()], output: [modality()]} | nil)
     field(:capabilities, capabilities() | nil)
     field(:cost, cost() | nil)
+    field(:base_url, String.t(), enforce: false)
     field(:_metadata, map() | nil)
   end
 
@@ -72,6 +73,7 @@ defmodule ReqLLM.Model do
   - `:capabilities` - Model capabilities like `:reasoning`, `:tool_call`, `:temperature`, `:attachment`
   - `:cost` - Pricing information with `:input` and `:output` cost per 1K tokens
     Optional `:cached_input` cost per 1K tokens (defaults to `:input` rate if not specified)
+  - `:base_url` - Model-specific base URL. Overrides the provider's base_url value when set.
   - `:_metadata` - Additional provider-specific metadata
 
   ## Examples
@@ -97,6 +99,7 @@ defmodule ReqLLM.Model do
       modalities: Keyword.get(opts, :modalities),
       capabilities: Keyword.get(opts, :capabilities),
       cost: Keyword.get(opts, :cost),
+      base_url: Keyword.get(opts, :base_url),
       _metadata: Keyword.get(opts, :_metadata)
     }
   end
diff --git a/lib/req_llm/provider/generated/valid_providers.ex b/lib/req_llm/provider/generated/valid_providers.ex
index a9755c68..e2302452 100644
--- a/lib/req_llm/provider/generated/valid_providers.ex
+++ b/lib/req_llm/provider/generated/valid_providers.ex
@@ -55,6 +55,7 @@ defmodule ReqLLM.Provider.Generated.ValidProviders do
     :v0,
     :venice,
     :vercel,
+    :vllm,
     :vultr,
     :wandb,
     :xai,
diff --git a/lib/req_llm/provider/options.ex b/lib/req_llm/provider/options.ex
index 8ea9faf3..8086f13b 100644
--- a/lib/req_llm/provider/options.ex
+++ b/lib/req_llm/provider/options.ex
@@ -422,7 +422,8 @@ defmodule ReqLLM.Provider.Options do
   """
   @spec effective_base_url(module(), ReqLLM.Model.t(), keyword()) :: String.t()
   def effective_base_url(provider_mod, %ReqLLM.Model{} = model, opts) do
-    opts[:base_url] ||
+    model.base_url ||
+      opts[:base_url] ||
       base_url_from_application_config(model.provider) ||
       base_url_from_provider_metadata(model.provider) ||
       provider_mod.default_base_url()
@@ -766,7 +767,8 @@ defmodule ReqLLM.Provider.Options do
 
   defp inject_base_url_from_registry(opts, model, provider_mod) do
     Keyword.put_new_lazy(opts, :base_url, fn ->
-      base_url_from_application_config(model.provider) ||
+      model.base_url ||
+        base_url_from_application_config(model.provider) ||
         base_url_from_provider_metadata(model.provider) ||
         provider_mod.default_base_url()
     end)
diff --git a/lib/req_llm/provider/registry.ex b/lib/req_llm/provider/registry.ex
index f6103f96..088af6f0 100644
--- a/lib/req_llm/provider/registry.ex
+++ b/lib/req_llm/provider/registry.ex
@@ -220,12 +220,15 @@ defmodule ReqLLM.Provider.Registry do
       cost =
         get_in(model_metadata, ["cost"]) |> ReqLLM.Metadata.map_string_keys_to_atoms()
 
+      base_url = get_in(model_metadata, ["base_url"])
+
       enhanced_model =
         ReqLLM.Model.new(provider_id, model_name,
           limit: limit,
           modalities: modalities,
           capabilities: capabilities,
-          cost: cost
+          cost: cost,
+          base_url: base_url
         )
 
       # Add raw metadata for backward compatibility and additional fields
@@ -519,7 +522,7 @@ defmodule ReqLLM.Provider.Registry do
           "models" => %{"claude-3-sonnet" => %{"id" => "claude-3-sonnet", ...}}
         }
       }
-
+
       ReqLLM.Provider.Registry.initialize(catalog)
       #=> :ok
 
diff --git a/lib/req_llm/providers/vllm.ex b/lib/req_llm/providers/vllm.ex
new file mode 100644
index 00000000..30171a9b
--- /dev/null
+++ b/lib/req_llm/providers/vllm.ex
@@ -0,0 +1,22 @@
+defmodule ReqLLM.Providers.VLLM do
+  @moduledoc """
+  vLLM – a fully OpenAI-compatible Chat Completions API.
+
+  An OPENAI_API_KEY is required, but its value can be arbitrary when starting the vLLM service.
+
+  ## Configuration
+
+      # Add to .env file (automatically loaded)
+      OPENAI_API_KEY=some_value...
+  """
+
+  @behaviour ReqLLM.Provider
+
+  use ReqLLM.Provider.DSL,
+    id: :vllm,
+    # Required to have a value, but generally overridden per model or via config.
+ base_url: "http://localhost:8005/v1", + metadata: "priv/models_dev/vllm.json", + default_env_key: "OPENAI_API_KEY", + provider_schema: [] +end diff --git a/priv/models_dev/.catalog_manifest.json b/priv/models_dev/.catalog_manifest.json index 6760469c..d751cf88 100644 --- a/priv/models_dev/.catalog_manifest.json +++ b/priv/models_dev/.catalog_manifest.json @@ -56,6 +56,7 @@ "priv/models_dev/v0.json", "priv/models_dev/venice.json", "priv/models_dev/vercel.json", + "priv/models_dev/vllm.json", "priv/models_dev/vultr.json", "priv/models_dev/wandb.json", "priv/models_dev/xai.json", diff --git a/priv/models_dev/vllm.json b/priv/models_dev/vllm.json new file mode 100644 index 00000000..ca788509 --- /dev/null +++ b/priv/models_dev/vllm.json @@ -0,0 +1,186 @@ +{ + "models": [ + { + "attachment": false, + "cost": { + "input": 2.0e-5, + "output": 0.0 + }, + "dimensions": { + "default": 1536, + "max": 1536, + "min": 1 + }, + "id": "test-only-text-embedding", + "knowledge": "2024-01", + "last_updated": "2024-01-25", + "limit": { + "context": 8191, + "output": 0 + }, + "modalities": { + "input": [ + "text" + ], + "output": [ + "embedding" + ] + }, + "name": "Test Only to test text embedding metadata", + "open_weights": false, + "provider": "vllm", + "provider_model_id": "test-only-text-embedding", + "reasoning": false, + "release_date": "2024-01-25", + "temperature": false, + "tool_call": false, + "type": "embedding", + "base_url": "http://localhost:8004/v1" + }, + { + "api": "chat", + "attachment": true, + "cost": { + "cache_read": 1.25, + "input": 2.5, + "output": 10 + }, + "id": "test-only-chat", + "knowledge": "2023-09", + "last_updated": "2024-08-06", + "limit": { + "context": 128000, + "output": 16384 + }, + "modalities": { + "input": [ + "text", + "image" + ], + "output": [ + "text" + ] + }, + "name": "Test Only Model to check chat metadata", + "open_weights": false, + "provider": "vllm", + "provider_model_id": "test-only-chat", + "reasoning": false, + "release_date": "2024-05-13", + "supports_strict_tools": true, + "temperature": true, + "tool_call": true, + "base_url": "http://localhost:8006/v1" + }, + { + "api": "responses", + "attachment": true, + "cost": { + "cache_read": 0.01, + "input": 0.05, + "output": 0.4 + }, + "id": "test-only-responses", + "knowledge": "2024-05-30", + "last_updated": "2025-08-07", + "limit": { + "context": 400000, + "output": 128000 + }, + "modalities": { + "input": [ + "text", + "image" + ], + "output": [ + "text" + ] + }, + "name": "Test Only Model to check responses metadata", + "open_weights": false, + "provider": "vllm", + "provider_model_id": "test-only-responses", + "reasoning": true, + "release_date": "2025-08-07", + "supports_json_schema_response_format": true, + "temperature": false, + "tool_call": true, + "base_url": "http://localhost:8001/v1" + }, + { + "api": "chat", + "attachment": false, + "cost": { + "cache_read": 1.25, + "input": 0.5, + "output": 1.5 + }, + "id": "test-only-max-completions", + "knowledge": "2021-09-01", + "last_updated": "2023-11-06", + "limit": { + "context": 16385, + "output": 4096 + }, + "modalities": { + "input": [ + "text" + ], + "output": [ + "text" + ] + }, + "name": "Test Only Model to check max completions metadata", + "open_weights": false, + "provider": "vllm", + "provider_model_id": "test-only-max-completions", + "reasoning": false, + "release_date": "2023-03-01", + "temperature": true, + "tool_call": false, + "base_url": "http://localhost:8002/v1" + }, + { + "api": "chat", + "attachment": true, + "cost": { + 
"cache_read": 1.25, + "input": 2.5, + "output": 10 + }, + "id": "test-only-chat-no-model-base-url", + "knowledge": "2023-09", + "last_updated": "2024-08-06", + "limit": { + "context": 128000, + "output": 16384 + }, + "modalities": { + "input": [ + "text" + ], + "output": [ + "text" + ] + }, + "name": "Test Only Model to check that provider base_url is used when not model base url", + "open_weights": false, + "provider": "vllm", + "provider_model_id": "test-only-chat-no-model-base-url", + "reasoning": false, + "release_date": "2024-05-13", + "supports_strict_tools": true, + "temperature": true, + "tool_call": true + } + ], + "provider": { + "base_url": "http://localhost:8005/v1", + "doc": "Local AI model provider", + "env": [ + "OPENAI_API_KEY" + ], + "id": "vllm", + "name": "vLLM" + } +} \ No newline at end of file diff --git a/priv/models_local/vllm_exclude.json b/priv/models_local/vllm_exclude.json new file mode 100644 index 00000000..39d683b9 --- /dev/null +++ b/priv/models_local/vllm_exclude.json @@ -0,0 +1,13 @@ +{ + "provider": { + "id": "vllm" + }, + "exclude": [ + "test-only-text-embedding", + "test-only-chat", + "test-only-text-embedding", + "test-only-responses", + "test-only-max-completions", + "test-only-chat-no-model-base-url" + ] +} diff --git a/test/providers/vllm_test.exs b/test/providers/vllm_test.exs new file mode 100644 index 00000000..08123709 --- /dev/null +++ b/test/providers/vllm_test.exs @@ -0,0 +1,778 @@ +defmodule ReqLLM.Providers.VLLMTest do + @moduledoc """ + Provider-level tests for VLLM implementation. + + Tests the provider contract directly without going through Generation layer. + Focus: prepare_request -> attach -> request -> decode pipeline. + """ + + use ReqLLM.ProviderCase, provider: ReqLLM.Providers.VLLM + + alias ReqLLM.Context + alias ReqLLM.Providers.VLLM + + describe "provider contract" do + test "provider identity and configuration" do + assert is_atom(VLLM.provider_id()) + assert is_binary(VLLM.default_base_url()) + assert String.starts_with?(VLLM.default_base_url(), "http") + end + + test "provider schema separation from core options" do + schema_keys = VLLM.provider_schema().schema |> Keyword.keys() + core_keys = ReqLLM.Provider.Options.generation_schema().schema |> Keyword.keys() + + # Provider-specific keys should not overlap with core generation keys + overlap = MapSet.intersection(MapSet.new(schema_keys), MapSet.new(core_keys)) + + assert MapSet.size(overlap) == 0, + "Schema overlap detected: #{inspect(MapSet.to_list(overlap))}" + end + + test "supported options include core generation keys" do + supported = VLLM.supported_provider_options() + core_keys = ReqLLM.Provider.Options.all_generation_keys() + + # All core keys should be supported (except meta-keys like :provider_options) + core_without_meta = Enum.reject(core_keys, &(&1 == :provider_options)) + missing = core_without_meta -- supported + assert missing == [], "Missing core generation keys: #{inspect(missing)}" + end + + test "provider_extended_generation_schema includes both base and provider options" do + extended_schema = VLLM.provider_extended_generation_schema() + extended_keys = extended_schema.schema |> Keyword.keys() + + # Should include all core generation keys + core_keys = ReqLLM.Provider.Options.all_generation_keys() + core_without_meta = Enum.reject(core_keys, &(&1 == :provider_options)) + + for core_key <- core_without_meta do + assert core_key in extended_keys, + "Extended schema missing core key: #{core_key}" + end + + # Should include provider-specific keys + 
provider_keys = VLLM.provider_schema().schema |> Keyword.keys() + + for provider_key <- provider_keys do + assert provider_key in extended_keys, + "Extended schema missing provider key: #{provider_key}" + end + end + end + + describe "request preparation & pipeline wiring" do + test "prepare_request creates configured chat request" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + opts = [temperature: 0.7, max_tokens: 100] + + {:ok, request} = VLLM.prepare_request(:chat, model, context, opts) + + assert %Req.Request{} = request + assert request.url.path == "/chat/completions" + assert request.method == :post + end + + test "prepare_request creates configured embedding request" do + model = ReqLLM.Model.from!("vllm:test-only-text-embedding") + text = "Hello, world!" + opts = [provider_options: []] + + {:ok, request} = VLLM.prepare_request(:embedding, model, text, opts) + + assert %Req.Request{} = request + assert request.url.path == "/embeddings" + assert request.method == :post + end + + test "prepare_request configures authentication and pipeline for chat" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + prompt = "Hello, world!" + opts = [temperature: 0.5, max_tokens: 50] + + {:ok, request} = VLLM.prepare_request(:chat, model, prompt, opts) + + # Verify core options + assert request.options[:model] == model.model + assert request.options[:temperature] == 0.5 + assert request.options[:max_tokens] == 50 + assert String.starts_with?(List.first(request.headers["authorization"]), "Bearer test-key-") + + # Verify pipeline steps + request_steps = Keyword.keys(request.request_steps) + response_steps = Keyword.keys(request.response_steps) + + assert :llm_encode_body in request_steps + assert :llm_decode_response in response_steps + end + + test "prepare_request configures authentication and pipeline for embedding" do + model = ReqLLM.Model.from!("vllm:test-only-text-embedding") + text = "Hello, world!" + opts = [] + + {:ok, request} = VLLM.prepare_request(:embedding, model, text, opts) + + # Verify embedding-specific options + assert request.options[:model] == model.model + assert request.options[:operation] == :embedding + assert request.options[:text] == "Hello, world!" 
+ + # Verify authentication + assert String.starts_with?(List.first(request.headers["authorization"]), "Bearer test-key-") + end + + test "error handling for invalid configurations" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + # Unsupported operation + {:error, error} = VLLM.prepare_request(:unsupported, model, context, []) + assert %ReqLLM.Error.Invalid.Parameter{} = error + + # Provider mismatch + wrong_model = ReqLLM.Model.from!("groq:llama-3.1-8b-instant") + + assert_raise ReqLLM.Error.Invalid.Provider, fn -> + Req.new() |> VLLM.attach(wrong_model, []) + end + end + end + + describe "body encoding & context translation" do + test "encode_body for chat without tools" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + # Create a mock request with the expected structure + mock_request = %Req.Request{ + options: [ + context: context, + model: model.model, + stream: false + ] + } + + # Test the encode_body function directly + updated_request = VLLM.encode_body(mock_request) + + assert is_binary(updated_request.body) + decoded = Jason.decode!(updated_request.body) + + assert decoded["model"] == "test-only-chat" + assert is_list(decoded["messages"]) + assert length(decoded["messages"]) == 2 + assert decoded["stream"] == false + refute Map.has_key?(decoded, "tools") + + [system_msg, user_msg] = decoded["messages"] + assert system_msg["role"] == "system" + assert user_msg["role"] == "user" + end + + test "encode_body for chat with tools but no tool_choice" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + tool = + ReqLLM.Tool.new!( + name: "test_tool", + description: "A test tool", + parameter_schema: [ + name: [type: :string, required: true, doc: "A name parameter"] + ], + callback: fn _ -> {:ok, "result"} end + ) + + mock_request = %Req.Request{ + options: [ + context: context, + model: model.model, + stream: false, + tools: [tool] + ] + } + + updated_request = VLLM.encode_body(mock_request) + decoded = Jason.decode!(updated_request.body) + + assert is_list(decoded["tools"]) + assert length(decoded["tools"]) == 1 + refute Map.has_key?(decoded, "tool_choice") + + [encoded_tool] = decoded["tools"] + assert encoded_tool["function"]["name"] == "test_tool" + end + + test "encode_body for chat with tools and tool_choice" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + tool = + ReqLLM.Tool.new!( + name: "specific_tool", + description: "A specific tool", + parameter_schema: [ + value: [type: :string, required: true, doc: "A value parameter"] + ], + callback: fn _ -> {:ok, "result"} end + ) + + tool_choice = %{type: "function", function: %{name: "specific_tool"}} + + mock_request = %Req.Request{ + options: [ + context: context, + model: model.model, + stream: false, + tools: [tool], + tool_choice: tool_choice + ] + } + + updated_request = VLLM.encode_body(mock_request) + decoded = Jason.decode!(updated_request.body) + + assert is_list(decoded["tools"]) + + assert decoded["tool_choice"] == %{ + "type" => "function", + "function" => %{"name" => "specific_tool"} + } + end + + test "encode_body for regular models uses max_tokens" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + mock_request = %Req.Request{ + options: [ + context: context, + model: model.model, + stream: false, + max_tokens: 1500, + temperature: 0.7 + ] + } + + updated_request = VLLM.encode_body(mock_request) + decoded = 
Jason.decode!(updated_request.body) + + assert decoded["model"] == "test-only-chat" + assert decoded["max_tokens"] == 1500 + assert decoded["temperature"] == 0.7 + refute Map.has_key?(decoded, "max_completion_tokens") + end + + test "encode_body for embedding operation" do + model = ReqLLM.Model.from!("vllm:test-only-text-embedding") + text = "Hello, world!" + + mock_request = %Req.Request{ + options: [ + operation: :embedding, + model: model.model, + text: text + ] + } + + updated_request = VLLM.encode_body(mock_request) + decoded = Jason.decode!(updated_request.body) + + assert decoded["model"] == "test-only-text-embedding" + assert decoded["input"] == "Hello, world!" + end + end + + describe "response decoding" do + test "decode_response for chat handles non-streaming responses" do + # Create a mock non-streaming response body + mock_response_body = %{ + "id" => "test-only-chat", + "object" => "chat.completion", + "created" => 1_677_652_288, + "model" => "test-only-chat", + "choices" => [ + %{ + "index" => 0, + "message" => %{ + "role" => "assistant", + "content" => "Hello! How can I help you today?" + }, + "logprobs" => nil, + "finish_reason" => "stop" + } + ], + "usage" => %{ + "prompt_tokens" => 13, + "completion_tokens" => 7, + "total_tokens" => 20 + } + } + + mock_resp = %Req.Response{ + status: 200, + body: mock_response_body + } + + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + mock_req = %Req.Request{ + options: [context: context, stream: false, model: model.model] + } + + # Test decode_response directly + {req, resp} = VLLM.decode_response({mock_req, mock_resp}) + + assert req == mock_req + assert %ReqLLM.Response{} = resp.body + + response = resp.body + assert is_binary(response.id) + assert response.model == model.model + assert response.stream? == false + + # Verify message normalization + assert response.message.role == :assistant + text = ReqLLM.Response.text(response) + assert is_binary(text) + assert String.length(text) > 0 + assert response.finish_reason in [:stop, :length] + + # Verify usage normalization + assert is_integer(response.usage.input_tokens) + assert is_integer(response.usage.output_tokens) + assert is_integer(response.usage.total_tokens) + + # Verify context advancement (original + assistant) + assert length(response.context.messages) == 3 + assert List.last(response.context.messages).role == :assistant + end + + test "decode_response for chat handles streaming responses" do + # Create mock streaming chunks + stream_chunks = [ + %{"choices" => [%{"delta" => %{"content" => "Hello"}}]}, + %{"choices" => [%{"delta" => %{"content" => " world"}}]}, + %{"choices" => [%{"finish_reason" => "stop"}]} + ] + + # Create a mock stream for real-time streaming + mock_real_time_stream = Stream.map(stream_chunks, & &1) + + # Create a mock Req response + mock_resp = %Req.Response{ + status: 200, + body: nil + } + + # Create a mock request with context, model, and real-time stream + context = context_fixture() + model = "test-only-chat" + + mock_req = %Req.Request{ + options: [context: context, stream: true, model: model], + private: %{real_time_stream: mock_real_time_stream} + } + + # Test decode_response directly + {req, resp} = VLLM.decode_response({mock_req, mock_resp}) + + assert req == mock_req + assert %ReqLLM.Response{} = resp.body + + response = resp.body + assert response.stream? 
== true + assert is_struct(response.stream, Stream) + assert response.model == model + + # Verify context is preserved (original messages only in streaming) + assert length(response.context.messages) == 2 + + # Verify stream structure and processing + assert response.usage == %{ + input_tokens: 0, + output_tokens: 0, + total_tokens: 0, + cached_tokens: 0, + reasoning_tokens: 0 + } + + assert response.finish_reason == nil + # http_task removed after fix for issue #42 (no duplicate request execution) + assert response.provider_meta == %{} + end + + test "decode_response for embedding returns raw body" do + # Create a mock embedding response body + mock_response_body = %{ + "object" => "list", + "data" => [ + %{ + "object" => "embedding", + "embedding" => [0.1, 0.2, 0.3], + "index" => 0 + } + ], + "model" => "test-only-text-embedding", + "usage" => %{ + "prompt_tokens" => 5, + "total_tokens" => 5 + } + } + + mock_resp = %Req.Response{ + status: 200, + body: mock_response_body + } + + mock_req = %Req.Request{ + options: [operation: :embedding, model: "test-only-text-embedding"] + } + + # Test decode_response for embeddings + {req, resp} = VLLM.decode_response({mock_req, mock_resp}) + + assert req == mock_req + # For embeddings, body should be the raw parsed JSON + assert resp.body == mock_response_body + end + + test "decode_response handles API errors with non-200 status" do + # Create error response + error_body = %{ + "error" => %{ + "message" => "Invalid API key provided", + "type" => "invalid_request_error", + "code" => "invalid_api_key" + } + } + + mock_resp = %Req.Response{ + status: 401, + body: error_body + } + + context = context_fixture() + + mock_req = %Req.Request{ + options: [context: context, model: "test-only-chat"] + } + + # Test decode_response error handling (now delegated to ChatAPI) + {req, error} = VLLM.decode_response({mock_req, mock_resp}) + + assert req == mock_req + assert %ReqLLM.Error.API.Response{} = error + assert error.status == 401 + assert error.reason == "Test-only-chat API error" + assert error.response_body == error_body + end + end + + describe "usage extraction" do + test "extract_usage with valid usage data" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + + body_with_usage = %{ + "usage" => %{ + "prompt_tokens" => 15, + "completion_tokens" => 25, + "total_tokens" => 40 + } + } + + {:ok, usage} = VLLM.extract_usage(body_with_usage, model) + assert usage["prompt_tokens"] == 15 + assert usage["completion_tokens"] == 25 + assert usage["total_tokens"] == 40 + end + + test "extract_usage with missing usage data" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + body_without_usage = %{"choices" => []} + + {:error, :no_usage_found} = VLLM.extract_usage(body_without_usage, model) + end + + test "extract_usage with invalid body type" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + + {:error, :invalid_body} = VLLM.extract_usage("invalid", model) + {:error, :invalid_body} = VLLM.extract_usage(nil, model) + {:error, :invalid_body} = VLLM.extract_usage(123, model) + end + end + + describe "embedding support" do + test "prepare_request for embedding with all options" do + model = ReqLLM.Model.from!("vllm:test-only-text-embedding") + text = "Sample text for embedding" + opts = [user: "test-user"] + + {:ok, request} = VLLM.prepare_request(:embedding, model, text, opts) + + assert request.options[:operation] == :embedding + assert request.options[:text] == text + assert request.options[:user] == "test-user" + end + + test "encode_body for embedding 
with optional parameters" do + model = ReqLLM.Model.from!("vllm:test-only-text-embedding") + + mock_request = %Req.Request{ + options: [ + operation: :embedding, + model: model.model, + text: "Test embedding text", + provider_options: [dimensions: 512, encoding_format: "base64"], + user: "test-user-123" + ] + } + + updated_request = VLLM.encode_body(mock_request) + decoded = Jason.decode!(updated_request.body) + + assert decoded["model"] == "test-only-text-embedding" + assert decoded["input"] == "Test embedding text" + assert decoded["dimensions"] == 512 + assert decoded["encoding_format"] == "base64" + assert decoded["user"] == "test-user-123" + end + end + + describe "error handling & robustness" do + test "context validation" do + # Multiple system messages should fail + invalid_context = + Context.new([ + Context.system("System 1"), + Context.system("System 2"), + Context.user("Hello") + ]) + + assert_raise ReqLLM.Error.Validation.Error, + ~r/should have at most one system message/, + fn -> + Context.validate!(invalid_context) + end + end + + test "prepare_request rejects unsupported operations" do + model = ReqLLM.Model.from!("vllm:test-only-chat") + context = context_fixture() + + {:error, error} = VLLM.prepare_request(:unsupported, model, context, []) + assert %ReqLLM.Error.Invalid.Parameter{} = error + + assert error.parameter =~ + "operation: :unsupported not supported by ReqLLM.Providers.VLLM. Supported operations: [:chat, :object, :embedding]" + end + + test "attach rejects invalid model provider" do + wrong_model = ReqLLM.Model.from!("groq:llama-3.1-8b-instant") + + assert_raise ReqLLM.Error.Invalid.Provider, fn -> + Req.new() |> VLLM.attach(wrong_model, []) + end + end + end + + describe "ResponsesAPI json_schema support" do + test "ResponsesAPI encode_text_format transforms response_format to flattened text.format" do + schema = [ + name: [type: :string, required: true], + title: [type: :string, required: true] + ] + + json_schema = ReqLLM.Schema.to_json(schema) + + # Enforce strict schema requirements + json_schema = + json_schema + |> Map.put("required", Map.keys(json_schema["properties"])) + |> Map.put("additionalProperties", false) + + response_format = %{ + type: "json_schema", + json_schema: %{ + name: "output_schema", + strict: true, + schema: json_schema + } + } + + # Test the encode_text_format function + text_format = ReqLLM.Providers.OpenAI.ResponsesAPI.encode_text_format(response_format) + + # ResponsesAPI expects flattened structure: text.format.{name, strict, schema} + # not text.format.json_schema.{name, strict, schema} + assert text_format["format"]["type"] == "json_schema" + assert text_format["format"]["name"] == "output_schema" + assert text_format["format"]["strict"] == true + assert text_format["format"]["schema"] != nil + refute Map.has_key?(text_format["format"], "json_schema") + end + + test "ResponsesAPI includes text parameter in request body with name at format level" do + model = ReqLLM.Model.from!("vllm:test-only-responses") + + schema = [ + name: [type: :string, required: true], + title: [type: :string, required: true] + ] + + json_schema = ReqLLM.Schema.to_json(schema) + + # Enforce strict schema requirements + json_schema = + json_schema + |> Map.put("required", Map.keys(json_schema["properties"])) + |> Map.put("additionalProperties", false) + + response_format = %{ + type: "json_schema", + json_schema: %{ + name: "output_schema", + strict: true, + schema: json_schema + } + } + + context = %ReqLLM.Context{ + messages: [ + %ReqLLM.Message{ + 
role: :user, + content: [%ReqLLM.Message.ContentPart{type: :text, text: "Generate a person"}] + } + ] + } + + opts = [ + provider_options: [response_format: response_format], + context: context, + model: model.model + ] + + # Create a mock request + request = %Req.Request{ + url: URI.parse("https://api.openai.com/v1/responses"), + method: :post, + options: opts + } + + # Test encode_body + encoded_request = ReqLLM.Providers.OpenAI.ResponsesAPI.encode_body(request) + body = Jason.decode!(encoded_request.body) + + # Verify text parameter exists with correct structure + # VLLM ResponsesAPI expects name at text.format.name level, not text.format.json_schema.name + assert Map.has_key?(body, "text") + assert body["text"]["format"]["type"] == "json_schema" + assert body["text"]["format"]["name"] == "output_schema" + assert body["text"]["format"]["strict"] == true + assert body["text"]["format"]["schema"] != nil + assert body["text"]["format"]["schema"]["type"] == "object" + assert Map.has_key?(body["text"]["format"]["schema"], "properties") + end + + test "ResponsesAPI decode_response extracts and validates object from json_schema response" do + model = ReqLLM.Model.from!("vllm:test-only-responses") + + schema = [ + name: [type: :string, required: true] + ] + + {:ok, compiled_schema} = ReqLLM.Schema.compile(schema) + + # Mock a ResponsesAPI response with JSON in output_text + mock_response_body = %{ + "id" => "resp_test123", + "model" => "gpt-5-nano-2025-08-07", + "object" => "response", + "status" => "completed", + "output" => [], + "output_text" => ~s({"name":"Mara Ellington"}), + "usage" => %{ + "input_tokens" => 31, + "output_tokens" => 594, + "reasoning_tokens" => 576 + } + } + + mock_resp = %Req.Response{ + status: 200, + body: mock_response_body + } + + context = %ReqLLM.Context{ + messages: [ + %ReqLLM.Message{ + role: :user, + content: [%ReqLLM.Message.ContentPart{type: :text, text: "Generate a person"}] + } + ] + } + + mock_req = %Req.Request{ + options: [ + context: context, + model: model.model, + operation: :object, + compiled_schema: compiled_schema + ] + } + + # Test decode_response + {req, resp} = ReqLLM.Providers.OpenAI.ResponsesAPI.decode_response({mock_req, mock_resp}) + + assert req == mock_req + assert %ReqLLM.Response{} = resp.body + + response = resp.body + + # The object field should be populated with the parsed and validated JSON + assert response.object != nil + assert response.object["name"] == "Mara Ellington" + + # The message should still contain the original JSON text + text = ReqLLM.Response.text(response) + assert text == ~s({"name":"Mara Ellington"}) + end + end + + describe "model base_url overrides the provider base_url" do + test "model with a base_url is used by the request base_url" do + {:ok, provider} = ReqLLM.Provider.Registry.get_provider(:vllm) + {:ok, model} = ReqLLM.Provider.Registry.get_model(:vllm, "test-only-chat") + {:ok, provider_metadata} = ReqLLM.Provider.Registry.get_provider_metadata(:vllm) + {:ok, request} = provider.prepare_request(:chat, model, "Hello!", temperature: 0.7) + assert request.options.base_url == "http://localhost:8006/v1" + assert request.options.base_url != provider_metadata["base_url"] + assert request.options.base_url == model.base_url + end + + test "another model with a base_url is used by the request base_url" do + {:ok, provider} = ReqLLM.Provider.Registry.get_provider(:vllm) + {:ok, model} = ReqLLM.Provider.Registry.get_model(:vllm, "test-only-responses") + {:ok, provider_metadata} = 
ReqLLM.Provider.Registry.get_provider_metadata(:vllm) + {:ok, request} = provider.prepare_request(:chat, model, "Hello!", temperature: 0.7) + assert request.options.base_url == "http://localhost:8001/v1" + assert request.options.base_url != provider_metadata["base_url"] + assert request.options.base_url == model.base_url + end + + test "request base_url for model with no base_url uses the provider base_url" do + {:ok, provider} = ReqLLM.Provider.Registry.get_provider(:vllm) + {:ok, model} = ReqLLM.Provider.Registry.get_model(:vllm, "test-only-chat-no-model-base-url") + {:ok, provider_metadata} = ReqLLM.Provider.Registry.get_provider_metadata(:vllm) + {:ok, request} = provider.prepare_request(:chat, model, "Hello!", temperature: 0.7) + assert request.options.base_url == "http://localhost:8005/v1" + assert request.options.base_url == provider_metadata["base_url"] + assert request.options.base_url != model.base_url + end + end +end
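
Reviewer note: a minimal sketch of the base_url resolution this change introduces, reusing the test-only model ids from priv/models_dev/vllm.json. It assumes the catalog has been loaded and mirrors the precedence in ReqLLM.Provider.Options.effective_base_url/3: model.base_url, then opts[:base_url], then application config, then provider metadata, then the provider default.

    # Model metadata may carry its own base_url, which wins over the provider value.
    {:ok, model} = ReqLLM.Provider.Registry.get_model(:vllm, "test-only-chat")
    model.base_url
    #=> "http://localhost:8006/v1"

    # A model without a base_url entry falls back to the provider-level value.
    {:ok, fallback} = ReqLLM.Provider.Registry.get_model(:vllm, "test-only-chat-no-model-base-url")
    fallback.base_url
    #=> nil (effective_base_url/3 then resolves to "http://localhost:8005/v1")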