diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py
index f8fec8d3a9..6a32023ecb 100644
--- a/comps/cores/proto/api_protocol.py
+++ b/comps/cores/proto/api_protocol.py
@@ -195,6 +195,7 @@ class ChatCompletionRequest(BaseModel):
     # top_p: Optional[float] = None  # Priority use openai
     typical_p: Optional[float] = None
     # repetition_penalty: Optional[float] = None
+    timeout: Optional[int] = None
 
     # doc: begin-chat-completion-extra-params
     echo: Optional[bool] = Field(
diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md
index 77e0969d2a..287208c977 100644
--- a/comps/llms/src/doc-summarization/README.md
+++ b/comps/llms/src/doc-summarization/README.md
@@ -133,6 +133,8 @@ curl http://${your_ip}:9000/v1/docsum \
 
 "summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
 
+With long contexts, the request may get canceled because its generation takes longer than the default `timeout` value (120s for TGI). Increase it as needed.
+
 **summary_type=stuff**
 
 In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context.
@@ -157,7 +159,7 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.ma
 ```bash
 curl http://${your_ip}:9000/v1/docsum \
   -X POST \
-  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
+  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}' \
   -H 'Content-Type: application/json'
 ```
 
@@ -170,6 +172,6 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * inpu
 ```bash
 curl http://${your_ip}:9000/v1/docsum \
   -X POST \
-  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
+  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/comps/llms/src/doc-summarization/integrations/tgi.py b/comps/llms/src/doc-summarization/integrations/tgi.py
index 902334bc03..f62813c260 100644
--- a/comps/llms/src/doc-summarization/integrations/tgi.py
+++ b/comps/llms/src/doc-summarization/integrations/tgi.py
@@ -70,6 +70,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
             temperature=input.temperature if input.temperature else 0.01,
             repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
             streaming=input.stream,
+            timeout=input.timeout if input.timeout is not None else 120,
             server_kwargs=server_kwargs,
             task="text-generation",
         )
diff --git a/comps/llms/src/doc-summarization/integrations/vllm.py b/comps/llms/src/doc-summarization/integrations/vllm.py
index c292f850e5..a2126c4d52 100644
--- a/comps/llms/src/doc-summarization/integrations/vllm.py
+++ b/comps/llms/src/doc-summarization/integrations/vllm.py
@@ -63,6 +63,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
             top_p=input.top_p if input.top_p else 0.95,
             streaming=input.stream,
             temperature=input.temperature if input.temperature else 0.01,
+            request_timeout=float(input.timeout) if input.timeout is not None else None,
         )
 
         result = await self.generate(input, self.client)
diff --git a/comps/llms/src/faq-generation/integrations/tgi.py b/comps/llms/src/faq-generation/integrations/tgi.py
index edfa64bcb2..887f1666c1 100644
--- a/comps/llms/src/faq-generation/integrations/tgi.py
+++ b/comps/llms/src/faq-generation/integrations/tgi.py
@@ -67,6 +67,7 @@ async def invoke(self, input: ChatCompletionRequest):
             temperature=input.temperature if input.temperature else 0.01,
             repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
             streaming=input.stream,
+            timeout=input.timeout if input.timeout is not None else 120,
             server_kwargs=server_kwargs,
         )
         result = await self.generate(input, self.client)
diff --git a/comps/llms/src/faq-generation/integrations/vllm.py b/comps/llms/src/faq-generation/integrations/vllm.py
index bf891ea7ff..a812b2b5fa 100644
--- a/comps/llms/src/faq-generation/integrations/vllm.py
+++ b/comps/llms/src/faq-generation/integrations/vllm.py
@@ -60,6 +60,7 @@ async def invoke(self, input: ChatCompletionRequest):
             top_p=input.top_p if input.top_p else 0.95,
             streaming=input.stream,
             temperature=input.temperature if input.temperature else 0.01,
+            request_timeout=float(input.timeout) if input.timeout is not None else None,
         )
 
         result = await self.generate(input, self.client)
diff --git a/tests/llms/test_llms_doc-summarization_tgi.sh b/tests/llms/test_llms_doc-summarization_tgi.sh
index 16e2018548..18f9b0da86 100644
--- a/tests/llms/test_llms_doc-summarization_tgi.sh
+++ b/tests/llms/test_llms_doc-summarization_tgi.sh
@@ -125,7 +125,7 @@ function validate_microservices() {
         'text' \
         "docsum-tgi" \
         "docsum-tgi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'
 
     echo "Validate refine mode..."
     validate_services \
@@ -133,7 +133,7 @@ function validate_microservices() {
         'text' \
         "docsum-tgi" \
         "docsum-tgi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }
 
 function stop_docker() {
diff --git a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
index b8c97f5b60..bc6cd03b0f 100644
--- a/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
+++ b/tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
@@ -126,7 +126,7 @@ function validate_microservices() {
         'text' \
         "docsum-tgi-gaudi" \
         "docsum-tgi-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'
 
     echo "Validate refine mode..."
     validate_services \
@@ -134,7 +134,7 @@ function validate_microservices() {
         'text' \
         "docsum-tgi-gaudi" \
         "docsum-tgi-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }
 
 function stop_docker() {
diff --git a/tests/llms/test_llms_doc-summarization_vllm.sh b/tests/llms/test_llms_doc-summarization_vllm.sh
index 42e79aa1e9..88cbef9c92 100644
--- a/tests/llms/test_llms_doc-summarization_vllm.sh
+++ b/tests/llms/test_llms_doc-summarization_vllm.sh
@@ -140,7 +140,7 @@ function validate_microservices() {
         'text' \
         "docsum-vllm" \
         "docsum-vllm" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'
 
     echo "Validate refine mode..."
     validate_services \
@@ -148,7 +148,7 @@ function validate_microservices() {
         'text' \
         "docsum-vllm" \
         "docsum-vllm" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }
 
 function stop_docker() {
diff --git a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
index a6096bd309..31bc13d693 100644
--- a/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
+++ b/tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
@@ -139,7 +139,7 @@ function validate_microservices() {
         'text' \
         "docsum-vllm-gaudi" \
         "docsum-vllm-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'
 
     echo "Validate refine mode..."
     validate_services \
@@ -147,7 +147,7 @@ function validate_microservices() {
         'text' \
         "docsum-vllm-gaudi" \
         "docsum-vllm-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }
 
 function stop_docker() {