30 changes: 11 additions & 19 deletions comps/llms/text-generation/vllm/langchain/README.md
@@ -223,29 +223,21 @@ User can set the following model parameters according to needs:
- - streaming(true/false): return text response in streaming mode or non-streaming mode

  ```bash
- # 1. Non-streaming mode
+ # stream mode
  curl http://${your_ip}:9000/v1/chat/completions \
-   -X POST \
-   -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
-   -H 'Content-Type: application/json'
+   -X POST \
+   -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17}' \
+   -H 'Content-Type: application/json'

- # 2. Streaming mode
  curl http://${your_ip}:9000/v1/chat/completions \
-   -X POST \
-   -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-   -H 'Content-Type: application/json'
+   -X POST \
+   -d '{"model": "${model_name}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+   -H 'Content-Type: application/json'

- # 3. Custom chat template with streaming mode
+ #Non-stream mode
  curl http://${your_ip}:9000/v1/chat/completions \
-   -X POST \
-   -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \
-   -H 'Content-Type: application/json'
+   -X POST \
+   -d '{"model": "${model_name}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
+   -H 'Content-Type: application/json'
-
- 4. # Chat with SearchedDoc (Retrieval context)
- curl http://${your_ip}:9000/v1/chat/completions \
-   -X POST \
-   -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \
-   -H 'Content-Type: application/json'
  ```

  For parameters, can refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html)
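
As a rough illustration of how these parameters map onto that LangChain wrapper, a minimal sketch is given below; the base URL, model name, and parameter values are placeholders (assumptions), not values taken from this repository:

```python
# Minimal sketch: calling a vLLM OpenAI-compatible endpoint through LangChain's VLLMOpenAI.
# The base URL and model name are assumptions -- substitute the values from your deployment.
from langchain_community.llms import VLLMOpenAI

llm = VLLMOpenAI(
    openai_api_key="EMPTY",                       # vLLM does not validate the API key
    openai_api_base="http://localhost:8008/v1",   # assumed vLLM server endpoint
    model_name="Intel/neural-chat-7b-v3-3",       # assumed model; must match the served model
    max_tokens=17,
    temperature=0.7,
    top_p=1,
)

print(llm.invoke("What is Deep Learning?"))
```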
93 changes: 93 additions & 0 deletions comps/llms/text-generation/vllm/langchain/llm.py
@@ -7,6 +7,7 @@
from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langchain_core.prompts import PromptTemplate
from openai import OpenAI
from template import ChatTemplate

from comps import (
@@ -194,6 +195,98 @@ async def stream_generator():
logger.info(response)

return GeneratedDoc(text=response, prompt=input.query)
else:
if logflag:
logger.info("[ ChatCompletionRequest ] input in opea format")
client = OpenAI(
api_key="EMPTY",
base_url=llm_endpoint + "/v1",
)

if isinstance(input.messages, str):
prompt = input.messages
if prompt_template:
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=input.messages, context="\n".join(input.documents))
elif input_variables == ["question"]:
prompt = prompt_template.format(question=input.messages)
else:
logger.info(
f"[ ChatCompletionRequest ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
else:
if input.documents:
# use rag default template
prompt = ChatTemplate.generate_rag_prompt(input.messages, input.documents, input.model)

chat_completion = client.completions.create(
model=model_name,
prompt=prompt,
echo=input.echo,
frequency_penalty=input.frequency_penalty,
max_tokens=input.max_tokens,
n=input.n,
presence_penalty=input.presence_penalty,
seed=input.seed,
stop=input.stop,
stream=input.stream,
suffix=input.suffix,
temperature=input.temperature,
top_p=input.top_p,
user=input.user,
)
else:
if input.messages[0]["role"] == "system":
if "{context}" in input.messages[0]["content"]:
if input.documents is None or input.documents == []:
input.messages[0]["content"].format(context="")
else:
input.messages[0]["content"].format(context="\n".join(input.documents))
else:
if prompt_template:
system_prompt = prompt_template
if input_variables == ["context"]:
system_prompt = prompt_template.format(context="\n".join(input.documents))
else:
logger.info(
f"[ ChatCompletionRequest ] {prompt_template} not used, only support 1 input variables ['context']"
)

input.messages.insert(0, {"role": "system", "content": system_prompt})

chat_completion = client.chat.completions.create(
model=model_name,
messages=input.messages,
frequency_penalty=input.frequency_penalty,
max_tokens=input.max_tokens,
n=input.n,
presence_penalty=input.presence_penalty,
response_format=input.response_format,
seed=input.seed,
stop=input.stop,
stream=input.stream,
stream_options=input.stream_options,
temperature=input.temperature,
top_p=input.top_p,
user=input.user,
)

if input.stream:

def stream_generator():
for c in chat_completion:
if logflag:
logger.info(c)
chunk = c.model_dump_json()
if chunk not in ["<|im_end|>", "<|endoftext|>"]:
yield f"data: {chunk}\n\n"
yield "data: [DONE]\n\n"

return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
if logflag:
logger.info(chat_completion)
return chat_completion


if __name__ == "__main__":
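The streaming branch added above returns the OpenAI client's chunks as server-sent events (`data: <chunk JSON>`, terminated by `data: [DONE]`). A minimal client-side sketch for consuming that stream is shown below; the host, port, and model name are assumptions, and it presumes the chat path (a `messages` list), so each chunk carries `choices[0].delta`:

```python
# Minimal sketch: consuming the SSE stream emitted by the microservice's stream_generator().
# Host, port, and model name are assumptions -- adjust them to your deployment.
import json

import requests

payload = {
    "model": "Intel/neural-chat-7b-v3-3",
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 17,
    "stream": True,
}
with requests.post("http://localhost:9000/v1/chat/completions", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)  # one ChatCompletionChunk serialized by model_dump_json()
        if chunk.get("choices"):
            print(chunk["choices"][0]["delta"].get("content") or "", end="", flush=True)
print()
```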
20 changes: 0 additions & 20 deletions comps/llms/text-generation/vllm/langchain/query.sh

This file was deleted.

@@ -44,6 +44,7 @@ function start_service() {
  -p $port_number:80 \
  -e HABANA_VISIBLE_DEVICES=all \
  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+ -e VLLM_SKIP_WARMUP=true \
  --cap-add=sys_nice \
  --ipc=host \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
@@ -62,7 +63,7 @@

  # check whether vllm ray is fully ready
  n=0
- until [[ "$n" -ge 160 ]] || [[ $ready == true ]]; do
+ until [[ "$n" -ge 70 ]] || [[ $ready == true ]]; do
    docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
    n=$((n+1))
    if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
@@ -90,9 +91,23 @@ function validate_microservice() {
  docker logs test-comps-vllm-microservice
  exit 1
  fi
+
+ result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
+   -X POST \
+   -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17, "stream":false}' \
+   -H 'Content-Type: application/json')
+ if [[ $result == *"content"* ]]; then
+   echo "Result correct."
+ else
+   echo "Result wrong. Received was $result"
+   docker logs test-comps-vllm-service
+   docker logs test-comps-vllm-microservice
+   exit 1
+ fi
+
  result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \
    -X POST \
-   -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \
+   -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \
    -H 'Content-Type: application/json')
  if [[ $result == *"text"* ]]; then
    echo "Result correct."