3 changes: 3 additions & 0 deletions .github/workflows/scripts/docker_images_build_push.sh
@@ -57,6 +57,9 @@ case ${micro_service} in
"llms/summarization/tgi")
IMAGE_NAME="opea/llm-docsum-tgi"
;;
"llms/faq-generation/tgi")
IMAGE_NAME="opea/llm-faqgen-tgi"
;;
"web_retrievers/langchain/chroma")
IMAGE_NAME="opea/web-retriever-chroma"
;;
1 change: 1 addition & 0 deletions comps/__init__.py
@@ -37,6 +37,7 @@
    TranslationGateway,
    SearchQnAGateway,
    AudioQnAGateway,
    FaqGenGateway,
)

# Telemetry
1 change: 1 addition & 0 deletions comps/cores/mega/constants.py
@@ -43,6 +43,7 @@ class MegaServiceEndpoint(Enum):
    DOC_SUMMARY = "/v1/docsum"
    SEARCH_QNA = "/v1/searchqna"
    TRANSLATION = "/v1/translation"
    FAQ_GEN = "/v1/faqgen"
    # Follow OPENAI
    EMBEDDINGS = "/v1/embeddings"
    TTS = "/v1/audio/speech"
44 changes: 44 additions & 0 deletions comps/cores/mega/gateway.py
@@ -400,3 +400,47 @@ async def handle_request(self, request: Request):
            )
        )
        return ChatCompletionResponse(model="searchqna", choices=choices, usage=usage)


class FaqGenGateway(Gateway):
    def __init__(self, megaservice, host="0.0.0.0", port=8888):
        super().__init__(
            megaservice, host, port, str(MegaServiceEndpoint.FAQ_GEN), ChatCompletionRequest, ChatCompletionResponse
        )

    async def handle_request(self, request: Request):
        data = await request.json()
        stream_opt = data.get("stream", True)
        chat_request = ChatCompletionRequest.parse_obj(data)
        prompt = self._handle_message(chat_request.messages)
        parameters = LLMParams(
            max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
            top_k=chat_request.top_k if chat_request.top_k else 10,
            top_p=chat_request.top_p if chat_request.top_p else 0.95,
            temperature=chat_request.temperature if chat_request.temperature else 0.01,
            repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03,
            streaming=stream_opt,
        )
        result_dict, runtime_graph = await self.megaservice.schedule(
            initial_inputs={"query": prompt}, llm_parameters=parameters
        )
        for node, response in result_dict.items():
            # Here it is assumed that the last microservice in the megaservice is the LLM.
            if (
                isinstance(response, StreamingResponse)
                and node == list(self.megaservice.services.keys())[-1]
                and self.megaservice.services[node].service_type == ServiceType.LLM
            ):
                return response
        last_node = runtime_graph.all_leaves()[-1]
        response = result_dict[last_node]["text"]
        choices = []
        usage = UsageInfo()
        choices.append(
            ChatCompletionResponseChoice(
                index=0,
                message=ChatMessage(role="assistant", content=response),
                finish_reason="stop",
            )
        )
        return ChatCompletionResponse(model="faqgen", choices=choices, usage=usage)
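For context, this gateway accepts OpenAI-style `ChatCompletionRequest` payloads and maps them onto `LLMParams` as shown above. A minimal request sketch, assuming a megaservice exposing this gateway is already running on its default host and port (`0.0.0.0:8888`); the message text and parameter values are illustrative:

```bash
curl http://localhost:8888/v1/faqgen \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"messages": [{"role": "user", "content": "Text Generation Inference (TGI) is a toolkit for serving LLMs."}], "max_tokens": 128, "stream": false}'
```

With `stream` set to `false`, the gateway collects the LLM output from the last node and returns a `ChatCompletionResponse` with `model` set to `faqgen`.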
27 changes: 27 additions & 0 deletions comps/llms/faq-generation/tgi/Dockerfile
@@ -0,0 +1,27 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM langchain/langchain:latest

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/tgi/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/faq-generation/tgi

ENTRYPOINT ["python", "llm.py"]
69 changes: 69 additions & 0 deletions comps/llms/faq-generation/tgi/README.md
@@ -0,0 +1,69 @@
# TGI FAQGen LLM Microservice

This microservice interacts with the TGI LLM server to generate FAQs from input text. [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.

# 🚀1. Start Microservice with Docker

When you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file automatically starts a TGI service alongside it.

## 1.1 Setup Environment Variables

To start the TGI and LLM services, you need to set up the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
```

## 1.2 Build Docker Image

```bash
cd ../../../../
docker build -t opea/llm-faqgen-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/Dockerfile .
```

To start a docker container, you have two options:

- A. Run Docker with CLI
- B. Run Docker with Docker Compose

You can choose one as needed.

## 1.3 Run Docker with CLI (Option A)

```bash
docker run -d -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID}
```

```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:latest
```
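
Before starting the FAQGen microservice container, you can optionally confirm that TGI is ready to serve requests. A quick sanity check against TGI's standard `/generate` route, assuming the port mapping from the first command above (host port 8008); the prompt is illustrative:

```bash
curl http://${your_ip}:8008/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}'
```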

## 1.4 Run Docker with Docker Compose (Option B)

```bash
cd comps/llms/faq-generation/tgi
docker compose -f docker_compose_llm.yaml up -d
```

# 🚀2. Consume LLM Service

## 2.1 Check Service Status

```bash
curl http://${your_ip}:9000/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```

## 2.2 Consume FAQGen LLM Service

```bash
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
```
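
The service streams its output when `streaming` is enabled in the request, and the generation parameters accepted by `llm.py` (such as `max_new_tokens`) can be passed alongside the query. A sketch of a streaming request, with illustrative parameter values:

```bash
curl http://${your_ip}:9000/v1/faqgen \
    -X POST \
    -d '{"query":"Deep learning is a subset of machine learning that uses multi-layer neural networks to learn representations from data.", "max_new_tokens": 128, "streaming": true}' \
    -H 'Content-Type: application/json'
```

When `streaming` is true, the response is delivered as server-sent events (`data: ...` lines terminated by `data: [DONE]`), as implemented in the streaming branch of `llm.py`.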
2 changes: 2 additions & 0 deletions comps/llms/faq-generation/tgi/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
35 changes: 35 additions & 0 deletions comps/llms/faq-generation/tgi/docker_compose_llm.yaml
@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tgi_service:
    image: ghcr.io/huggingface/text-generation-inference:1.4
    container_name: tgi-service
    ports:
      - "8008:80"
    volumes:
      - "./data:/data"
    environment:
      HF_TOKEN: ${HF_TOKEN}
    shm_size: 1g
    command: --model-id ${LLM_MODEL_ID}
  llm:
    image: opea/llm-faqgen-tgi:latest
    container_name: llm-faqgen-server
    ports:
      - "9000:9000"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
81 changes: 81 additions & 0 deletions comps/llms/faq-generation/tgi/llm.py
@@ -0,0 +1,81 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from langsmith import traceable

from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice


@traceable(run_type="tool")
def post_process_text(text: str):
    if text == " ":
        return "data: @#$\n\n"
    if text == "\n":
        return "data: <br/>\n\n"
    if text.isspace():
        return None
    new_text = text.replace(" ", "@#$")
    return f"data: {new_text}\n\n"


@register_microservice(
    name="opea_service@llm_faqgen",
    service_type=ServiceType.LLM,
    endpoint="/v1/faqgen",
    host="0.0.0.0",
    port=9000,
)
@traceable(run_type="llm")
def llm_generate(input: LLMParamsDoc):
    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
    llm = HuggingFaceEndpoint(
        endpoint_url=llm_endpoint,
        max_new_tokens=input.max_new_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        typical_p=input.typical_p,
        temperature=input.temperature,
        repetition_penalty=input.repetition_penalty,
        streaming=input.streaming,
    )
    templ = """Create concise FAQs (frequently asked questions and answers) for the following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
    PROMPT = PromptTemplate.from_template(templ)
    llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)

    if input.streaming:
        # Split text
        text_splitter = CharacterTextSplitter()

        texts = text_splitter.split_text(input.query)
        # Create multiple documents
        docs = [Document(page_content=t) for t in texts]

        async def stream_generator():
            from langserve.serialization import WellKnownLCSerializer

            _serializer = WellKnownLCSerializer()
            async for chunk in llm_chain.astream_log(docs):
                data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        # The summarize chain expects documents as input and returns its result under "output_text".
        docs = [Document(page_content=input.query)]
        response = llm_chain.invoke({"input_documents": docs})
        response = response["output_text"].split("</s>")[0].split("\n")[0]
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    opea_microservices["opea_service@llm_faqgen"].start()
12 changes: 12 additions & 0 deletions comps/llms/faq-generation/tgi/requirements.txt
@@ -0,0 +1,12 @@
docarray[full]
fastapi
huggingface_hub
langchain==0.1.16
langserve
langsmith
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
91 changes: 91 additions & 0 deletions tests/test_llms_faq-generation_tgi.sh
@@ -0,0 +1,91 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe

WORKPATH=$(dirname "$PWD")
ip_address=$(hostname -I | awk '{print $1}')
LOG_PATH="$WORKPATH/tests"

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache -t opea/llm-faqgen-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/Dockerfile .
}

function start_service() {
    tgi_endpoint_port=5014
    export your_hf_llm_model="Intel/neural-chat-7b-v3-3"
    # Remember to set HF_TOKEN before invoking this test!
    export HF_TOKEN=${HF_TOKEN}
    docker run -d --name="test-comps-llm-tgi-endpoint" -p $tgi_endpoint_port:80 -v ./data:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${your_hf_llm_model}
    export TGI_LLM_ENDPOINT="http://${ip_address}:${tgi_endpoint_port}"

    tei_service_port=5015
    docker run -d --name="test-comps-llm-tgi-server" -p ${tei_service_port}:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:comps

    # check whether tgi is fully ready
    n=0
    until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
        docker logs test-comps-llm-tgi-endpoint > ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log
        n=$((n+1))
        if grep -q Connected ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log; then
            break
        fi
        sleep 5s
    done
    sleep 5s
}

function validate_microservice() {
    tei_service_port=5015
    http_proxy="" curl http://${ip_address}:${tei_service_port}/v1/faqgen \
        -X POST \
        -d '{"query":"Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data."}' \
        -H 'Content-Type: application/json'
    docker logs test-comps-llm-tgi-endpoint
    docker logs test-comps-llm-tgi-server

    cd $LOG_PATH
    tei_service_port=5015
    URL="http://${ip_address}:$tei_service_port/v1/faqgen"
    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL")
    if [ "$HTTP_STATUS" -eq 200 ]; then
        echo "[ llm - faqgen ] HTTP status is 200. Checking content..."
        local CONTENT=$(curl -s -X POST -d '{"query": "What is Deep Learning?"}' -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/llm_faqgen.log)

        # The response is a GeneratedDoc, so a successful reply contains a "text" field.
        if echo "$CONTENT" | grep -q "text"; then
            echo "[ llm - faqgen ] Content is as expected."
            docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
        else
            echo "[ llm - faqgen ] Content does not match the expected result: $CONTENT"
            docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
            exit 1
        fi
    else
        echo "[ llm - faqgen ] HTTP status is not 200. Received status was $HTTP_STATUS"
        docker logs test-comps-llm-tgi-server >> ${LOG_PATH}/llm_faqgen.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {

    stop_docker

    build_docker_images
    start_service

    validate_microservice

    stop_docker
    echo y | docker system prune

}

main
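
For local verification, the test script can be run directly. A sketch, assuming Docker is available and a valid `HF_TOKEN` is exported; since the script treats the parent of the working directory as the repository root, it is run from the `tests/` directory:

```bash
export HF_TOKEN=${your_hf_api_token}
cd tests
bash test_llms_faq-generation_tgi.sh
```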