3 changes: 3 additions & 0 deletions .github/workflows/scripts/docker_images_build_push.sh
@@ -57,6 +57,9 @@ case ${micro_service} in
"llms/summarization/tgi")
IMAGE_NAME="opea/llm-docsum-tgi"
;;
"llms/faq-generation/tgi")
IMAGE_NAME="opea/llm-faqgen-tgi"
;;
"web_retrievers/langchain/chroma")
IMAGE_NAME="opea/web-retriever-chroma"
;;
1 change: 1 addition & 0 deletions comps/__init__.py
@@ -37,6 +37,7 @@
    TranslationGateway,
    SearchQnAGateway,
    AudioQnAGateway,
    FaqGenGateway,
)

# Telemetry
1 change: 1 addition & 0 deletions comps/cores/mega/constants.py
@@ -43,6 +43,7 @@ class MegaServiceEndpoint(Enum):
    DOC_SUMMARY = "/v1/docsum"
    SEARCH_QNA = "/v1/searchqna"
    TRANSLATION = "/v1/translation"
    FAQ_GEN = "/v1/faqgen"
    # Follow OPENAI
    EMBEDDINGS = "/v1/embeddings"
    TTS = "/v1/audio/speech"
44 changes: 44 additions & 0 deletions comps/cores/mega/gateway.py
@@ -400,3 +400,47 @@ async def handle_request(self, request: Request):
            )
        )
        return ChatCompletionResponse(model="searchqna", choices=choices, usage=usage)


class FaqGenGateway(Gateway):
    def __init__(self, megaservice, host="0.0.0.0", port=8888):
        super().__init__(
            megaservice, host, port, str(MegaServiceEndpoint.FAQ_GEN), ChatCompletionRequest, ChatCompletionResponse
        )

    async def handle_request(self, request: Request):
        data = await request.json()
        stream_opt = data.get("stream", True)
        chat_request = ChatCompletionRequest.parse_obj(data)
        prompt = self._handle_message(chat_request.messages)
        parameters = LLMParams(
            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
            top_k=chat_request.top_k if chat_request.top_k else 10,
            top_p=chat_request.top_p if chat_request.top_p else 0.95,
            temperature=chat_request.temperature if chat_request.temperature else 0.01,
            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
            streaming=stream_opt,
        )
        result_dict, runtime_graph = await self.megaservice.schedule(
            initial_inputs={"query": prompt}, llm_parameters=parameters
        )
        for node, response in result_dict.items():
            # Here it is assumed that the last microservice in the megaservice is the LLM.
            if (
                isinstance(response, StreamingResponse)
                and node == list(self.megaservice.services.keys())[-1]
                and self.megaservice.services[node].service_type == ServiceType.LLM
            ):
                return response
        last_node = runtime_graph.all_leaves()[-1]
        response = result_dict[last_node]["text"]
        choices = []
        usage = UsageInfo()
        choices.append(
            ChatCompletionResponseChoice(
                index=0,
                message=ChatMessage(role="assistant", content=response),
                finish_reason="stop",
            )
        )
        return ChatCompletionResponse(model="faqgen", choices=choices, usage=usage)
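For context, this gateway accepts OpenAI-style `ChatCompletionRequest` payloads and maps them onto `LLMParams` as shown above. A minimal request sketch, assuming a megaservice exposing this gateway is already running on its default host and port (`0.0.0.0:8888`); the message text and parameter values are illustrative:

```bash
curl http://localhost:8888/v1/faqgen \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"messages": [{"role": "user", "content": "Text Generation Inference (TGI) is a toolkit for serving LLMs."}], "max_tokens": 128, "stream": false}'
```

With `stream` set to `false`, the gateway collects the LLM output from the last node and returns a `ChatCompletionResponse` with `model` set to `faqgen`.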
27 changes: 27 additions & 0 deletions comps/llms/faq-generation/tgi/Dockerfile
@@ -0,0 +1,27 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM langchain/langchain:latest

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/tgi/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/faq-generation/tgi

ENTRYPOINT ["python", "llm.py"]
69 changes: 69 additions & 0 deletions comps/llms/faq-generation/tgi/README.md
@@ -0,0 +1,69 @@
# TGI FAQGen LLM Microservice

This microservice interacts with the TGI LLM server to generate FAQs from input text. [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.

# 🚀1. Start Microservice with Docker

When you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file automatically starts a TGI service alongside it.

## 1.1 Setup Environment Variables

To start the TGI and LLM services, you need to set up the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
```

## 1.2 Build Docker Image

```bash
cd ../../../../
docker build -t opea/llm-faqgen-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/Dockerfile .
```

To start a docker container, you have two options:

- A. Run Docker with CLI
- B. Run Docker with Docker Compose

You can choose one as needed.

## 1.3 Run Docker with CLI (Option A)

```bash
docker run -d -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID}
```

```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:latest
```
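
Before starting the FAQGen microservice container, you can optionally confirm that TGI is ready to serve requests. A quick sanity check against TGI's standard `/generate` route, assuming the port mapping from the first command above (host port 8008); the prompt is illustrative:

```bash
curl http://${your_ip}:8008/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}'
```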

## 1.4 Run Docker with Docker Compose (Option B)

```bash
cd comps/llms/faq-generation/tgi
docker compose -f docker_compose_llm.yaml up -d
```

# 🚀2. Consume LLM Service

## 2.1 Check Service Status

```bash
curl http://${your_ip}:9000/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```

## 2.2 Consume FAQGen LLM Service

```bash
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
```
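
The service streams its output when `streaming` is enabled in the request, and the generation parameters accepted by `llm.py` (such as `max_new_tokens`) can be passed alongside the query. A sketch of a streaming request, with illustrative parameter values:

```bash
curl http://${your_ip}:9000/v1/faqgen \
    -X POST \
    -d '{"query":"Deep learning is a subset of machine learning that uses multi-layer neural networks to learn representations from data.", "max_new_tokens": 128, "streaming": true}' \
    -H 'Content-Type: application/json'
```

When `streaming` is true, the response is delivered as server-sent events (`data: ...` lines terminated by `data: [DONE]`), as implemented in the streaming branch of `llm.py`.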
2 changes: 2 additions & 0 deletions comps/llms/faq-generation/tgi/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
35 changes: 35 additions & 0 deletions comps/llms/faq-generation/tgi/docker_compose_llm.yaml
@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tgi_service:
    image: ghcr.io/huggingface/text-generation-inference:1.4
    container_name: tgi-service
    ports:
      - "8008:80"
    volumes:
      - "./data:/data"
    environment:
      HF_TOKEN: ${HF_TOKEN}
    shm_size: 1g
    command: --model-id ${LLM_MODEL_ID}
  llm:
    image: opea/llm-faqgen-tgi:latest
    container_name: llm-faqgen-server
    ports:
      - "9000:9000"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
81 changes: 81 additions & 0 deletions comps/llms/faq-generation/tgi/llm.py
@@ -0,0 +1,81 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from langsmith import traceable

from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice


@traceable(run_type="tool")
def post_process_text(text: str):
    if text == " ":
        return "data: @#$\n\n"
    if text == "\n":
        return "data: <br/>\n\n"
    if text.isspace():
        return None
    new_text = text.replace(" ", "@#$")
    return f"data: {new_text}\n\n"


@register_microservice(
    name="opea_service@llm_faqgen",
    service_type=ServiceType.LLM,
    endpoint="/v1/faqgen",
    host="0.0.0.0",
    port=9000,
)
@traceable(run_type="llm")
def llm_generate(input: LLMParamsDoc):
    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
    llm = HuggingFaceEndpoint(
        endpoint_url=llm_endpoint,
        max_new_tokens=input.max_new_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        typical_p=input.typical_p,
        temperature=input.temperature,
        repetition_penalty=input.repetition_penalty,
        streaming=input.streaming,
    )
    templ = """Create concise FAQs (frequently asked questions and answers) for the following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
    PROMPT = PromptTemplate.from_template(templ)
    llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)

    if input.streaming:
        # Split text
        text_splitter = CharacterTextSplitter()

        texts = text_splitter.split_text(input.query)
        # Create multiple documents
        docs = [Document(page_content=t) for t in texts]

        async def stream_generator():
            from langserve.serialization import WellKnownLCSerializer

            _serializer = WellKnownLCSerializer()
            async for chunk in llm_chain.astream_log(docs):
                data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        # The summarize chain expects documents as input and returns its result under "output_text".
        docs = [Document(page_content=input.query)]
        response = llm_chain.invoke({"input_documents": docs})
        response = response["output_text"].split("</s>")[0].split("\n")[0]
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    opea_microservices["opea_service@llm_faqgen"].start()
12 changes: 12 additions & 0 deletions comps/llms/faq-generation/tgi/requirements.txt
@@ -0,0 +1,12 @@
docarray[full]
fastapi
huggingface_hub
langchain==0.1.16
langserve
langsmith
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
91 changes: 91 additions & 0 deletions tests/test_llms_faq-generation_tgi.sh
@@ -0,0 +1,91 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe

WORKPATH=$(dirname "$PWD")
ip_address=$(hostname -I | awk '{print $1}')
LOG_PATH="$WORKPATH/tests"

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache -t opea/llm-faqgen-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/Dockerfile .
}

function start_service() {
    tgi_endpoint_port=5014
    export your_hf_llm_model="Intel/neural-chat-7b-v3-3"
    # Remember to set HF_TOKEN before invoking this test!
    export HF_TOKEN=${HF_TOKEN}
    docker run -d --name="test-comps-llm-tgi-endpoint" -p $tgi_endpoint_port:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${your_hf_llm_model}
    export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint_port}"

    tei_service_port=5015
    docker run -d --name="test-comps-llm-tgi-server" -p ${tei_service_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:comps

    # check whether tgi is fully ready
    n=0
    until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
        docker logs test-comps-llm-tgi-endpoint > ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log
        n=$((n+1))
        if grep -q Connected ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log; then
            break
        fi
        sleep 5s
    done
    sleep 5s
}

function validate_microservice() {
    tei_service_port=5015
    http_proxy="" curl http://${ip_address}:${tei_service_port}/v1/faqgen \
        -X POST \
        -d '{"query":"Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data."}' \
        -H 'Content-Type: application/json'
    docker logs test-comps-llm-tgi-endpoint
    docker logs test-comps-llm-tgi-server

    cd $LOG_PATH
    tei_service_port=5015
    URL="http://${ip_address}:$tei_service_port/v1/faqgen"
    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL")
    if [ "$HTTP_STATUS" -eq 200 ]; then
        echo "[ llm - faqgen ] HTTP status is 200. Checking content..."
        local CONTENT=$(curl -s -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/llm_faqgen.log)

        # The response is a GeneratedDoc, so a successful reply contains a "text" field.
        if echo "$CONTENT" | grep -q "text"; then
            echo "[ llm - faqgen ] Content is as expected."
            docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
        else
            echo "[ llm - faqgen ] Content does not match the expected result: $CONTENT"
            docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
            exit 1
        fi
    else
        echo "[ llm - faqgen ] HTTP status is not 200. Received status was $HTTP_STATUS"
        docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {

    stop_docker

    build_docker_images
    start_service

    validate_microservice

    stop_docker
    echo y | docker system prune

}

main
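
For local verification, the test script can be run directly. A sketch, assuming Docker is available and a valid `HF_TOKEN` is exported; since the script treats the parent of the working directory as the repository root, it is run from the `tests/` directory:

```bash
export HF_TOKEN=${your_hf_api_token}
cd tests
bash test_llms_faq-generation_tgi.sh
```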