Commit d6ec04d

XinyaoWa authored and pre-commit-ci[bot] committed
Add timeout param for DocSum and FaqGen to deal with long context (opea-project#1329)
* Add timeout param for DocSum and FaqGen to deal with long context

  Make timeout param configurable; solves issue opea-project/GenAIExamples#1481

  Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

---------

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Chingis Yundunov <c.yundunov@datamonsters.com>
1 parent 523eca1 commit d6ec04d

File tree

10 files changed, +17 -10 lines changed

comps/cores/proto/api_protocol.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -195,6 +195,7 @@ class ChatCompletionRequest(BaseModel):
     # top_p: Optional[float] = None # Priority use openai
     typical_p: Optional[float] = None
     # repetition_penalty: Optional[float] = None
+    timeout: Optional[int] = None

     # doc: begin-chat-completion-extra-params
     echo: Optional[bool] = Field(
```
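The new field follows the existing optional-parameter pattern on `ChatCompletionRequest`: omitted means `None`, so existing clients are unaffected and each backend supplies its own fallback. A standalone sketch of that shape (a toy stand-in, not the actual Pydantic class):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestSketch:
    """Toy stand-in for ChatCompletionRequest's optional timeout field."""
    messages: str
    timeout: Optional[int] = None  # seconds; None means "use the backend default"

# Omitting the field leaves it None, so older request bodies still validate.
print(RequestSketch(messages="hi").timeout)               # None
print(RequestSketch(messages="hi", timeout=200).timeout)  # 200
```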

comps/llms/src/doc-summarization/README.md

Lines changed: 4 additions & 2 deletions
````diff
@@ -133,6 +133,8 @@ curl http://${your_ip}:9000/v1/docsum \

 "summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.

+With long contexts, request may get canceled due to its generation taking longer than the default `timeout` value (120s for TGI). Increase it as needed.
+
 **summary_type=stuff**

 In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context.
@@ -157,7 +159,7 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.ma
 ```bash
 curl http://${your_ip}:9000/v1/docsum \
   -X POST \
-  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
+  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}' \
   -H 'Content-Type: application/json'
 ```
@@ -170,6 +172,6 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * inpu
 ```bash
 curl http://${your_ip}:9000/v1/docsum \
   -X POST \
-  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
+  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}' \
   -H 'Content-Type: application/json'
 ```
````
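The curl calls documented above can equally be issued from Python. A minimal sketch using only the standard library (the endpoint URL and the `docsum_payload` helper are illustrative, not part of the repo):

```python
import json
from urllib import request

def docsum_payload(text: str, timeout_s: int = 200) -> dict:
    """Build a DocSum request body with an explicit generation timeout."""
    return {
        "messages": text,
        "max_tokens": 32,
        "language": "en",
        "summary_type": "refine",
        "chunk_size": 2000,
        "timeout": timeout_s,  # forwarded to the serving backend
    }

def post_docsum(url: str, payload: dict) -> bytes:
    """POST the payload as JSON; requires a running DocSum service."""
    req = request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        return resp.read()

# Example (needs a live service):
# post_docsum("http://localhost:9000/v1/docsum", docsum_payload("long document text..."))
```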

comps/llms/src/doc-summarization/integrations/tgi.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -70,6 +70,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
             temperature=input.temperature if input.temperature else 0.01,
             repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
             streaming=input.stream,
+            timeout=input.timeout if input.timeout is not None else 120,
             server_kwargs=server_kwargs,
             task="text-generation",
         )
```

comps/llms/src/doc-summarization/integrations/vllm.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -63,6 +63,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
             top_p=input.top_p if input.top_p else 0.95,
             streaming=input.stream,
             temperature=input.temperature if input.temperature else 0.01,
+            request_timeout=float(input.timeout) if input.timeout is not None else None,
         )
         result = await self.generate(input, self.client)
```
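The TGI and vLLM integrations above resolve the optional field the same way, differing only in the fallback: the TGI path substitutes 120 seconds, while the vLLM path passes `None` through (deferring to the client library's default). A standalone sketch of the two resolutions, mirroring the one-line conditionals in the diffs:

```python
from typing import Optional

def tgi_timeout(timeout: Optional[int]) -> int:
    # Mirrors: timeout=input.timeout if input.timeout is not None else 120
    return timeout if timeout is not None else 120

def vllm_request_timeout(timeout: Optional[int]) -> Optional[float]:
    # Mirrors: request_timeout=float(input.timeout) if input.timeout is not None else None
    return float(timeout) if timeout is not None else None

print(tgi_timeout(None), tgi_timeout(200))                     # 120 200
print(vllm_request_timeout(None), vllm_request_timeout(200))   # None 200.0
```

Note that `timeout=0` is falsy but not `None`, which is why both sites test `is not None` rather than truthiness.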

comps/llms/src/faq-generation/integrations/tgi.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -67,6 +67,7 @@ async def invoke(self, input: ChatCompletionRequest):
             temperature=input.temperature if input.temperature else 0.01,
             repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
             streaming=input.stream,
+            timeout=input.timeout if input.timeout is not None else 120,
             server_kwargs=server_kwargs,
         )
         result = await self.generate(input, self.client)
```

comps/llms/src/faq-generation/integrations/vllm.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -60,6 +60,7 @@ async def invoke(self, input: ChatCompletionRequest):
             top_p=input.top_p if input.top_p else 0.95,
             streaming=input.stream,
             temperature=input.temperature if input.temperature else 0.01,
+            request_timeout=float(input.timeout) if input.timeout is not None else None,
         )
         result = await self.generate(input, self.client)
```

tests/llms/test_llms_doc-summarization_tgi.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -125,15 +125,15 @@ function validate_microservices() {
         'text' \
         "docsum-tgi" \
         "docsum-tgi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

     echo "Validate refine mode..."
     validate_services \
         "$URL" \
         'text' \
         "docsum-tgi" \
         "docsum-tgi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }

 function stop_docker() {
```

tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -126,15 +126,15 @@ function validate_microservices() {
         'text' \
         "docsum-tgi-gaudi" \
         "docsum-tgi-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

     echo "Validate refine mode..."
     validate_services \
         "$URL" \
         'text' \
         "docsum-tgi-gaudi" \
         "docsum-tgi-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }

 function stop_docker() {
```

tests/llms/test_llms_doc-summarization_vllm.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -140,15 +140,15 @@ function validate_microservices() {
         'text' \
         "docsum-vllm" \
         "docsum-vllm" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

     echo "Validate refine mode..."
     validate_services \
         "$URL" \
         'text' \
         "docsum-vllm" \
         "docsum-vllm" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }

 function stop_docker() {
```

tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -139,15 +139,15 @@ function validate_microservices() {
         'text' \
         "docsum-vllm-gaudi" \
         "docsum-vllm-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

     echo "Validate refine mode..."
     validate_services \
         "$URL" \
         'text' \
         "docsum-vllm-gaudi" \
         "docsum-vllm-gaudi" \
-        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
+        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
 }

 function stop_docker() {
```
