Add seq2seq streaming integration test (deepjavalibrary#724)

rohithkrn · web-flow · commit 737bb94209f2 · 2023-05-16T16:40:19.000-07:00
diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
@@ -166,6 +166,17 @@ jobs:
           python3 llm/client.py huggingface bigscience/bloom-3b
           rm -rf docker_env
           docker rm -f $(docker ps -aq)
+      - name: Test streaming t5-large
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          echo -en "CUDA_VISIBLE_DEVICES=1" > docker_env
+          python3 llm/prepare.py huggingface t5-large
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
+          serve
+          python3 llm/client.py huggingface t5-large
+          rm -rf docker_env
+          docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
@@ -106,6 +106,13 @@ def compute_model_name_hash(model_name):
         "worker": 1,
         "stream_output": True,
     },
+    "t5-large": {
+        "max_memory_per_gpu": [5.0],
+        "batch_size": [1],
+        "seq_length": [32],
+        "worker": 1,
+        "stream_output": True,
+    },
     "no-code/nomic-ai/gpt4all-j": {
         "max_memory_per_gpu": [10.0, 12.0],
         "batch_size": [1, 4],
@@ -456,7 +463,10 @@ def test_handler(model, model_spec):
                             model_name=spec.get("model_name", "test"))
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
-            req = {"inputs": batch_generation(batch_size)}
+            if "t5" in model:
+                req = {"inputs": t5_batch_generation(batch_size)}
+            else:
+                req = {"inputs": batch_generation(batch_size)}
             params = {"max_new_tokens": seq_length}
             req["parameters"] = params
             logging.info(f"req {req}")
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -114,6 +114,12 @@
         "option.enable_streaming": True,
         "gpu.maxWorkers": 1,
     },
+    "t5-large": {
+        "option.model_id": "t5-large",
+        "option.tensor_parallel_degree": 1,
+        "option.device_map": "auto",
+        "option.enable_streaming": True,
+    },
 }
 
 ds_handler_list = {