
Commit 8437354

Superjomn authored and dominicshanshan committed
[https://nvbugs/5427043][fix] cherrypick: request length exceeds max_num_tokens (NVIDIA#7718)
Signed-off-by: Superjomn <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent ea419b8 commit 8437354

File tree

1 file changed: +12 -0 lines changed


tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 12 additions & 0 deletions
@@ -925,3 +925,15 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
                                        return_generation_logits,
                                        streaming=True,
                                        backend="pytorch")
+class TestLlmError:
+
+    def test_max_num_token_check(self):
+        """ LLM should raise an error when the prompt length exceeds the valid range. """
+        llm = LLM(llama_model_path,
+                  kv_cache_config=global_kvcache_config,
+                  max_num_tokens=100)
+
+        with pytest.raises(ValueError,
+                           match="should not exceed max_num_tokens"):
+            ids = [random.randint(10, 100) for _ in range(101)]
+            llm.generate([ids])
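
The new test exercises the validation referenced in the commit title: a request whose tokenized prompt is longer than max_num_tokens should be rejected with a ValueError up front rather than failing later inside the engine. The sketch below illustrates that kind of guard; check_prompt_length is a hypothetical helper written for illustration, not the actual TensorRT-LLM code path.

# Minimal sketch of a prompt-length guard, assuming a hypothetical helper
# name (check_prompt_length); the real check lives inside the LLM API.
def check_prompt_length(prompt_token_ids, max_num_tokens):
    if max_num_tokens is not None and len(prompt_token_ids) > max_num_tokens:
        raise ValueError(
            f"The prompt length ({len(prompt_token_ids)}) should not exceed "
            f"max_num_tokens ({max_num_tokens}).")

# With max_num_tokens=100, a 101-token prompt triggers the error, matching the
# pytest.raises(ValueError, match="should not exceed max_num_tokens") assertion
# in the test above.
check_prompt_length(list(range(101)), max_num_tokens=100)  # raises ValueError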
