Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 21 additions & 14 deletions docs/backend/native_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
Expand Down Expand Up @@ -105,9 +105,9 @@
"response = requests.get(url)\n",
"response_json = response.json()\n",
"print_highlight(response_json)\n",
"assert response_json[\"model_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n",
"assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
"assert response_json[\"is_generation\"] is True\n",
"assert response_json[\"tokenizer_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n",
"assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
"assert response_json.keys() == {\"model_path\", \"is_generation\", \"tokenizer_path\"}"
]
},
Expand Down Expand Up @@ -213,7 +213,7 @@
"# successful update with same architecture and size\n",
"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n",
"data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"print_highlight(response.text)\n",
Expand All @@ -230,19 +230,28 @@
"# failed update with different parameter size or wrong name\n",
"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n",
"data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"response_json = response.json()\n",
"print_highlight(response_json)\n",
"assert response_json[\"success\"] is False\n",
"assert response_json[\"message\"] == (\n",
" \"Failed to get weights iterator: \"\n",
" \"meta-llama/Llama-3.2-1B-wrong\"\n",
" \"qwen/qwen2.5-0.5b-instruct-wrong\"\n",
" \" (repository not found).\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -259,11 +268,9 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)\n",
"\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
Expand All @@ -280,7 +287,7 @@
"# successful encode for embedding model\n",
"\n",
"url = f\"http://localhost:{port}/encode\"\n",
"data = {\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"text\": \"Once upon a time\"}\n",
"data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"response_json = response.json()\n",
Expand Down Expand Up @@ -318,7 +325,7 @@
"\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
Expand Down Expand Up @@ -383,7 +390,7 @@
"outputs": [],
"source": [
"expert_record_server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
Expand Down Expand Up @@ -449,7 +456,7 @@
"source": [
"tokenizer_free_server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --skip-tokenizer-init\n",
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --skip-tokenizer-init\n",
"\"\"\"\n",
")\n",
"\n",
Expand All @@ -464,7 +471,7 @@
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"qwen/qwen2.5-0.5b-instruct\")\n",
"\n",
"input_text = \"What is the capital of France?\"\n",
"\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/backend/offline_engine_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
" nest_asyncio.apply()\n",
"\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
"llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")"
]
},
{
Expand Down
20 changes: 10 additions & 10 deletions docs/backend/openai_api_completions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --mem-fraction-static 0.8\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
Expand Down Expand Up @@ -75,7 +75,7 @@
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
Expand Down Expand Up @@ -104,7 +104,7 @@
"outputs": [],
"source": [
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
Expand Down Expand Up @@ -143,7 +143,7 @@
"outputs": [],
"source": [
"stream = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
" stream=True,\n",
")\n",
Expand All @@ -169,7 +169,7 @@
"outputs": [],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" prompt=\"List 3 countries and their capitals.\",\n",
" temperature=0,\n",
" max_tokens=64,\n",
Expand Down Expand Up @@ -198,7 +198,7 @@
"outputs": [],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" prompt=\"Write a short story about a space explorer.\",\n",
" temperature=0.7, # Moderate temperature for creative writing\n",
" max_tokens=150, # Longer response for a story\n",
Expand Down Expand Up @@ -257,7 +257,7 @@
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
" \"messages\": [\n",
" {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n",
" ],\n",
Expand All @@ -269,7 +269,7 @@
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
" \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n",
" \"max_tokens\": 50,\n",
" },\n",
Expand Down Expand Up @@ -362,7 +362,7 @@
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"system\",\n",
Expand Down Expand Up @@ -439,7 +439,7 @@
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"system\",\n",
Expand Down
12 changes: 6 additions & 6 deletions docs/backend/openai_api_embeddings.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
Expand All @@ -66,7 +66,7 @@
"text = \"Once upon a time\"\n",
"\n",
"curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
"\n",
"text_embedding = json.loads(subprocess.check_output(curl_text, shell=True))[\"data\"][0][\n",
" \"embedding\"\n",
Expand Down Expand Up @@ -94,7 +94,7 @@
"\n",
"response = requests.post(\n",
" f\"http://localhost:{port}/v1/embeddings\",\n",
" json={\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": text},\n",
" json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n",
")\n",
"\n",
"text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",
Expand All @@ -121,7 +121,7 @@
"\n",
"# Text embedding example\n",
"response = client.embeddings.create(\n",
" model=\"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
" model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n",
" input=text,\n",
")\n",
"\n",
Expand Down Expand Up @@ -150,11 +150,11 @@
"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-7B-instruct\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n",
"input_ids = tokenizer.encode(text)\n",
"\n",
"curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
" -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
"\n",
"input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
" 0\n",
Expand Down
14 changes: 7 additions & 7 deletions docs/backend/openai_api_vision.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"\n",
"Launch the server in your terminal and wait for it to initialize.\n",
"\n",
"**Remember to add** `--chat-template llama_3_vision` **to specify the [vision chat template](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template), otherwise, the server will only support text (images won’t be passed in), which can lead to degraded performance.**\n",
"**Remember to add** `--chat-template` **for example** `--chat-template=qwen2-vl` **to specify the [vision chat template](https://docs.sglang.ai/backend/openai_api_vision.html#Chat-Template), otherwise, the server will only support text (images won’t be passed in), which can lead to degraded performance.**\n",
"\n",
"We need to specify `--chat-template` for vision language models because the chat template provided in Hugging Face tokenizer only supports text."
]
Expand All @@ -51,8 +51,8 @@
"\n",
"vision_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
" --chat-template=llama_3_vision\n",
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct \\\n",
" --chat-template=qwen2-vl\n",
"\"\"\"\n",
")\n",
"\n",
Expand All @@ -79,7 +79,7 @@
"curl_command = f\"\"\"\n",
"curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
" -d '{{\n",
" \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
" \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
" \"messages\": [\n",
" {{\n",
" \"role\": \"user\",\n",
Expand Down Expand Up @@ -127,7 +127,7 @@
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
"\n",
"data = {\n",
" \"model\": \"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
" \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
Expand Down Expand Up @@ -167,7 +167,7 @@
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
" model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
Expand Down Expand Up @@ -211,7 +211,7 @@
"client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\n",
" model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
Expand Down
12 changes: 6 additions & 6 deletions docs/backend/send_request.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@
"\n",
"# This is equivalent to running the following command in your terminal\n",
"\n",
"# python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\n",
"# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
" --host 0.0.0.0\n",
"\"\"\"\n",
")\n",
Expand All @@ -65,7 +65,7 @@
"curl_command = f\"\"\"\n",
"curl -s http://localhost:{port}/v1/chat/completions \\\n",
" -H \"Content-Type: application/json\" \\\n",
" -d '{{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
" -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
"\"\"\"\n",
"\n",
"response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
Expand All @@ -90,7 +90,7 @@
"url = f\"http://localhost:{port}/v1/chat/completions\"\n",
"\n",
"data = {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
" \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
"}\n",
"\n",
Expand All @@ -116,7 +116,7 @@
"client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
Expand Down Expand Up @@ -145,7 +145,7 @@
"\n",
"# Use stream=True for streaming responses\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" model=\"qwen/qwen2.5-0.5b-instruct\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
Expand Down
Loading