diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5037957..fa85317 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ concurrency: # see https://github.com/ggerganov/llama.cpp/issues/7743#issuecomment-2148342691, # https://github.com/ggerganov/llama.cpp/issues/7719#issuecomment-2147631216. jobs: - + darwin: strategy: fail-fast: false @@ -77,7 +77,7 @@ jobs: -DGGML_OPENMP=off \ -DGGML_RPC=on cmake --build ${{ github.workspace }}/build --target llama-box --config Release -- -j $(sysctl -n hw.physicalcpu) - + echo "===== RESULT =====" ls -alh ${{ github.workspace }}/build/bin/ if [ -f ${{ github.workspace }}/build/bin/llama-box ]; then @@ -86,7 +86,7 @@ jobs: else exit 1 fi - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/llama-box-darwin-${{ matrix.arch }}-${{ matrix.instruction }}.zip ${{ github.workspace }}/build/bin/llama-box @@ -143,7 +143,7 @@ jobs: else exit 1 fi - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/dl-llama-box-darwin-${{ matrix.arch }}-cpu.zip ${{ github.workspace }}/dl-build/bin/* @@ -200,7 +200,7 @@ jobs: else exit 1 fi - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/dl-llama-box-darwin-${{ matrix.arch }}-metal.zip ${{ github.workspace }}/dl-build/bin/* @@ -615,11 +615,11 @@ jobs: fail-fast: false matrix: # see https://hub.docker.com/r/mthreads/musa/tags?page_size=&ordering=&name=ubuntu22.04. - # rc4.0 ==> rc4.0.1, Ubuntu 22.04. + # rc4.2 ==> rc4.2.0, Ubuntu 22.04. 
include: - arch: 'amd64' - version: 'rc4.0' - distro_container_image: 'gpustack/devel-mthreads-musa:rc4.0.1-ubuntu22.04-v2' + version: 'rc4.2' + distro_container_image: 'gpustack/devel-mthreads-musa:rc4.2.0-ubuntu22.04-v2' musa_arch: '21;22;31' runs-on: ${{ matrix.arch == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }} steps: @@ -1203,7 +1203,7 @@ jobs: EOF chmod +x /tmp/entrypoint.sh cat /tmp/entrypoint.sh - + docker run \ --rm \ --privileged \ @@ -1217,7 +1217,7 @@ jobs: --volume /tmp/entrypoint.sh:/entrypoint.sh \ --entrypoint /entrypoint.sh \ ${{ matrix.distro_container_image }} - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/dl-llama-box-linux-${{ matrix.arch }}-cann-${{ matrix.version }}${{ contains(matrix.distro_container_image, '310p') && '-310p' || '' }}.zip ${{ github.workspace }}/dl-build/bin/* @@ -1373,7 +1373,7 @@ jobs: --volume /tmp/entrypoint.sh:/entrypoint.sh \ --entrypoint /entrypoint.sh \ ${{ matrix.distro_container_image }} - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/dl-llama-box-linux-${{ matrix.arch }}-oneapi-${{ matrix.version }}.zip ${{ github.workspace }}/dl-build/bin/* @@ -1525,7 +1525,7 @@ jobs: --volume /tmp/entrypoint.sh:/entrypoint.sh \ --entrypoint /entrypoint.sh \ ${{ matrix.distro_container_image }} - + echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out zip -j ${{ github.workspace }}/out/dl-llama-box-linux-${{ matrix.arch }}-vulkan-${{ matrix.version }}.zip ${{ github.workspace }}/dl-build/bin/* @@ -1632,7 +1632,7 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + if (Test-Path -PathType Leaf -Path "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat") { cmd /c 'call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'amd64' && 'amd64' || 'amd64_arm64' }} && set' | 
ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` @@ -1642,7 +1642,7 @@ jobs: if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build env: @@ -1651,7 +1651,7 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "===== BUILD =====" Get-ChildItem Env: -ErrorAction Ignore | Format-Table -Property Name, Value -ErrorAction Ignore cmake -G "Ninja" -S ${{ github.workspace }} -B ${{ github.workspace }}\build -DCMAKE_BUILD_TYPE=Release ` @@ -1672,7 +1672,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace }}\build\bin\llama-box.exe" -DestinationPath "${{ github.workspace }}\out\llama-box-windows-${{ matrix.arch }}-${{ matrix.instruction }}.zip" @@ -1716,7 +1716,7 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + if (Test-Path -PathType Leaf -Path "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat") { cmd /c 'call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'amd64' && 'amd64' || 'amd64_arm64' }} && set' | ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` @@ -1726,7 +1726,7 @@ jobs: if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build DL env: @@ -1752,7 +1752,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null 
Compress-Archive -Path "${{ github.workspace }}\dl-build\bin\*" -DestinationPath "${{ github.workspace }}\out\dl-llama-box-windows-${{ matrix.arch }}-cpu.zip" @@ -1807,27 +1807,27 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] download NVIDIA CUDA SDK" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\.toolbox" -ErrorAction Ignore | Out-Null curl.exe --retry 5 --retry-delay 5 ` --output "${{ github.workspace }}\.toolbox\installer.exe" ` --url "${{ matrix.distro_binary_installer }}" - + # https://docs.nvidia.com/cuda/archive/12.4.0/cuda-installation-guide-microsoft-windows/index.html Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] install NVIDIA CUDA SDK" Start-Process "${{ github.workspace }}\.toolbox\installer.exe" -NoNewWindow -Wait ` -ArgumentList '-s','cudart_${{ matrix.version }}','nvcc_${{ matrix.version }}','cublas_${{ matrix.version }}','cublas_dev_${{ matrix.version }}','thrust_${{ matrix.version }}' - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] verify NVIDIA CUDA SDK" & 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\nvcc.exe' --version - + $cudaPath = "$(Resolve-Path -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\nvcc.exe' | Split-Path | Split-Path)" - $cudaVersion=($cudaPath | Split-Path -Leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + $cudaVersion=($cudaPath | Split-Path -Leaf ) -replace 'v(\d+).(\d+)', '$1_$2' "CUDA_PATH=${cudaPath}" | Out-File -FilePath $env:GITHUB_ENV -Append "CUDA_PATH_V${cudaVersion}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Append "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVersion}" | Out-File -FilePath $env:GITHUB_ENV -Append - + if (Test-Path -PathType Leaf -Path "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat") { cmd /c 'call "C:\Program Files\Microsoft Visual 
Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'amd64' && 'amd64' || 'amd64_arm64' }} && set' | ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` @@ -1837,7 +1837,7 @@ jobs: if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build DL env: @@ -1847,9 +1847,9 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "CUDA_PATH=${env:CUDA_PATH}" - + Write-Host "===== BUILD =====" Get-ChildItem Env: -ErrorAction Ignore | Format-Table -Property Name, Value -ErrorAction Ignore cmake -G "Ninja" -S ${{ github.workspace }} -B ${{ github.workspace }}\dl-build -DCMAKE_BUILD_TYPE=Release ` @@ -1870,7 +1870,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace }}\dl-build\bin\*" -DestinationPath "${{ github.workspace }}\out\dl-llama-box-windows-${{ matrix.arch }}-cuda-${{ matrix.version }}.zip" @@ -1903,7 +1903,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace }}\build\bin\llama-box.exe" -DestinationPath "${{ github.workspace }}\out\llama-box-windows-${{ matrix.arch }}-cuda-${{ matrix.version }}.zip" @@ -1964,20 +1964,20 @@ jobs: curl.exe --retry 5 --retry-delay 5 ` --output "${{ github.workspace }}\.toolbox\installer.exe" ` --url "${{ matrix.distro_binary_installer }}" - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] install AMD ROCm HIP SDK" Start-Process "${{ github.workspace }}\.toolbox\installer.exe" -NoNewWindow -Wait ` -ArgumentList '-install' Write-Host "I 
[$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] verify AMD ROCm HIP SDK" & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] clone AMD ROCm rocWMMA source" git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 $hipPath = "$(Resolve-Path -Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Split-Path | Split-Path)" "HIP_PATH=${hipPath}" | Out-File -FilePath $env:GITHUB_ENV -Append - + if (Test-Path -PathType Leaf -Path "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat") { cmd /c 'call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'amd64' && 'amd64' || 'amd64_arm64' }} && set' | ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` @@ -1987,7 +1987,7 @@ jobs: if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build DL env: @@ -1997,9 +1997,9 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "HIP_PATH=${env:HIP_PATH}" - + Write-Host "===== BUILD =====" $env:CMAKE_PREFIX_PATH = "${env:HIP_PATH}" Get-ChildItem Env: -ErrorAction Ignore | Format-Table -Property Name, Value -ErrorAction Ignore @@ -2024,7 +2024,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace }}\dl-build\bin\*" -DestinationPath "${{ github.workspace }}\out\dl-llama-box-windows-${{ matrix.arch }}-hip-${{ matrix.version }}.zip" @@ -2061,7 +2061,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | 
Out-Null Compress-Archive -Path "${{ github.workspace }}\build\bin\llama-box.exe" -DestinationPath "${{ github.workspace }}\out\llama-box-windows-${{ matrix.arch }}-hip-${{ matrix.version }}.zip" @@ -2114,7 +2114,7 @@ jobs: curl.exe --retry 5 --retry-delay 5 ` --output "${{ github.workspace }}\.toolbox\installer.exe" ` --url "${{ matrix.distro_binary_installer }}" - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] install Intel oneAPI SDK" Start-Process "${{ github.workspace }}\.toolbox\installer.exe" -NoNewWindow -Wait ` -ArgumentList '-s','--action=install','--components=intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel','--eula=accept','-p=NEED_VS2017_INTEGRATION=0','-p=NEED_VS2019_INTEGRATION=0','-p=NEED_VS2022_INTEGRATION=0' @@ -2126,11 +2126,11 @@ jobs: "ONEAPI_PATH=${oneapiPath}" | Out-File -FilePath $env:GITHUB_ENV -Append $oneapiRoot = "$(Split-Path -Path $oneapiPath)" "ONEAPI_ROOT=${oneapiRoot}" | Out-File -FilePath $env:GITHUB_ENV -Append - + cmd /c "call `"${oneapiRoot}\setvars.bat`" && set" | ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build DL env: @@ -2139,10 +2139,10 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "ONEAPI_PATH=${env:ONEAPI_PATH}" Write-Host "ONEAPI_ROOT=${env:ONEAPI_ROOT}" - + Write-Host "===== BUILD =====" Get-ChildItem Env: -ErrorAction Ignore | Format-Table -Property Name, Value -ErrorAction Ignore cmake -G "Ninja" -S ${{ github.workspace }} -B ${{ github.workspace }}\dl-build -DCMAKE_BUILD_TYPE=Release ` @@ -2167,7 +2167,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace 
}}\dl-build\bin\*" -DestinationPath "${{ github.workspace }}\out\dl-llama-box-windows-${{ matrix.arch }}-oneapi-${{ matrix.version }}.zip" @@ -2202,7 +2202,7 @@ jobs: exit 1 } sccache -s - + Write-Host "===== PACKAGE =====" New-Item -Force -ItemType Directory -Path "${{ github.workspace }}\out" -ErrorAction Ignore | Out-Null Compress-Archive -Path "${{ github.workspace }}\build\bin\llama-box.exe" -DestinationPath "${{ github.workspace }}\out\llama-box-windows-${{ matrix.arch }}-oneapi-${{ matrix.version }}.zip" @@ -2255,7 +2255,7 @@ jobs: curl.exe --retry 5 --retry-delay 5 ` --output "${{ github.workspace }}\.toolbox\installer.exe" ` --url "${{ matrix.distro_binary_installer }}" - + Write-Host "I [$((Get-Date).ToString("yyyy-mm-dd HH:mm:ss"))] install LunarG Vulkan SDK" Start-Process "${{ github.workspace }}\.toolbox\installer.exe" -NoNewWindow -Wait ` -ArgumentList '--accept-licenses','--default-answer','--confirm-command install' @@ -2263,7 +2263,7 @@ jobs: $vulkanPath = "$(Resolve-Path -Path 'C:\VulkanSDK\*\bin' | Split-Path)" "VULKAN_SDK=${vulkanPath}" | Out-File -FilePath $env:GITHUB_ENV -Append "${vulkanPath}\bin" | Out-File -FilePath $env:GITHUB_PATH -Append - + if (Test-Path -PathType Leaf -Path "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat") { cmd /c 'call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'amd64' && 'amd64' || 'amd64_arm64' }} && set' | ForEach-Object { ` if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` @@ -2273,7 +2273,7 @@ jobs: if ($_ -Match '^(.*?)=(.*)$') { $_ | Out-File -FilePath $env:GITHUB_ENV -Append } ` } } - + "OPENSSL_ROOT_DIR=C:\Program Files\OpenSSL" | Out-File -FilePath $env:GITHUB_ENV -Append - name: Build DL env: @@ -2282,9 +2282,9 @@ jobs: run: | $ErrorActionPreference = "Stop" $ProgressPreference = 'SilentlyContinue' - + Write-Host "VULKAN_SDK=${env:VULKAN_SDK}" - + Write-Host 
"===== BUILD =====" $env:CMAKE_PREFIX_PATH = "${env:VULKAN_SDK}" Get-ChildItem Env: -ErrorAction Ignore | Format-Table -Property Name, Value -ErrorAction Ignore diff --git a/README.md b/README.md index 21cd0ff..667b6a7 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,15 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp). ## Agenda -- [Features](#features) -- [Supports](#supports) -- [Examples](#examples) -- [Usage](#usage) -- [Server API](#server-api) -- [Tools](#tools) +- [LLaMA Box (V2)](#llama-box-v2) + - [Agenda](#agenda) + - [Features](#features) + - [Supports](#supports) + - [Examples](#examples) + - [Usage](#usage) + - [Server API](#server-api) + - [Tools](#tools) + - [License](#license) ## Features @@ -71,10 +74,10 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp). ```shell $ # Assume that there are 1 remote RPC server and 3 available GPUs, launch box as below. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m --rpc remote-ip:remote-port --tensor-split 1,2,3 - $ # Same as --tensor-split 1,2,3,0. - $ # The remote RPC server will handle 1/6 of the model, the 1st GPU will handle 1/3 of the model, and the 2nd GPU will handle 1/2 of the model. + $ # Same as --tensor-split 1,2,3,0. + $ # The remote RPC server will handle 1/6 of the model, the 1st GPU will handle 1/3 of the model, and the 2nd GPU will handle 1/2 of the model. $ # Nothing to do with the 3rd GPU. - + $ # Assume that there are 1 remote RPC servers and 3 available GPUs, launch box as below. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m --rpc remote-ip:remote-port --tensor-split 0,0,1,1 $ # The 2nd GPU will handle 1/2 of the model, and the 3rd GPU will handle 1/2 of the model. @@ -87,7 +90,7 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp). $ # Same as --tensor-split 1,1,1,0. 
$ # The remote RPC server will handle text encoder part, the 1st GPU will handle VAE part, and the 2nd GPU will handle diffusion part. $ # Nothing to do with the 3rd GPU. - + $ # Assume that there are 1 remote RPC server and 3 available GPUs, launch box as below. $ llama-box -np 4 --host 0.0.0.0 -m --rpc remote-ip:remote-port --tensor-split 0,0,1,1 $ # Then 2nd GPU will handle text encoder and VAE parts, and the 3rd GPU will handle diffusion part. @@ -97,7 +100,7 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp). ```shell $ # Launch box. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m - + $ # Inject X-Request-ID: trace-id to track the request. $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-ID: trace-id" -d '{"model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}' $ # View logs @@ -106,7 +109,7 @@ and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp). ```shell $ # Launch box with -tps -1. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m --tokens-per-second -1 - + $ # For level 1 users, inject X-Request-Tokens-Per-Second: 10 to limit the number of tokens per second to 10. $ curl --silent --no-buffer http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -H "X-Request-Tokens-Per-Second: 10" -d '{"stream": true, "model": "demo", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}' @@ -132,7 +135,7 @@ LLaMA Box supports the following platforms. | **Huawei Ascend CANN
8.1.rc1 (8.1.rc1.beta1)** | `linux/amd64`Ubuntu 20.04
`linux/arm64`Ubuntu 20.04 | `Ascend 910b`, `Ascend 310p`,
see [Ascend Document](https://www.hiascend.com/en/document). | | **Huawei Ascend CANN
8.0 (8.0.0.beta1)** | `linux/amd64`Ubuntu 20.04
`linux/arm64`Ubuntu 20.04 | `Ascend 910b`, `Ascend 310p`,
see [Ascend Document](https://www.hiascend.com/en/document). | | **HYGON DTK(DCU Toolkit)
25.04 (25.04)** | `linux/amd64`Ubuntu 22.04
| `Z100`, `K100`, `Z100L`,
see [DTK Community](https://developer.sourcefind.cn/?s=Note). | -| **Moore Threads MUSA
rc4.0 (rc4.0.1)** | `linux/amd64`Ubuntu 22.04
| `MTT S4000`, `MTT S80`,
see [Moor Threads Website](https://en.mthreads.com). | +| **Moore Threads MUSA
rc4.2 (rc4.2.0)** | `linux/amd64`Ubuntu 22.04
| `MTT S4000`, `MTT S80`,
see [Moore Threads Website](https://en.mthreads.com). | | **Apple Metal 3** | `darwin/amd64`macOS 13.7<br/>
`darwin/arm64`macOS 14.7 | Support [Apple Metal](https://developer.apple.com/metal/),
see [Metal Framework](https://developer.apple.com/documentation/metal?language=objc). | | _AVX2_ | `darwin/amd64`macOS 13.7
`linux/amd64`CentOS 7
`windows/amd64`Windows Server 2022 | CPUs support AVX2,
see [Wikipedia](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2). | | _Advanced SIMD (NEON)_ | `linux/arm64`Ubuntu 18.04
`windows/arm64`Windows Server 2022 | CPUs support
Advanced SIMD (NEON),
see [Wikipedia](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)). | @@ -164,7 +167,7 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 4 sessions(allowing 4 parallel chat users), with a max of 8192 tokens per session. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ~/.cache/lm-studio/models/unsloth/Qwen3-8B-GGUF/Qwen3-8B-Q8_0.gguf - + $ # Call with curl, $ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{"model": "qwen3", "messages": [{"role":"user", "content":"Introduce Beijing in 50 words."}]}' @@ -180,24 +183,24 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 4 session(allowing 4 parallel chat users), with a max of 8192 tokens per session. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ~/.cache/lm-studio/models/ggml-org/Qwen2.5-VL-7B-Instruct-GGUF/Qwen2.5-VL-7B-Instruct-Q8_0.gguf --mmproj ~/.cache/lm-studio/models/ggml-org/Qwen2.5-VL-7B-Instruct-GGUF/mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf - + $ # Chat with image base64. $ IMAGE_URL="$(echo "data:image/jpeg;base64,$(curl https://raw.githubusercontent.com/haotian-liu/LLaVA/main/llava/serve/examples/extreme_ironing.jpg --output - | base64)")"; \ echo "{\"model\": \"qwen2.5-vl\", \"temperature\": 0.1, \"messages\": [{\"role\":\"system\", \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}]}, {\"role\":\"user\", \"content\": [{\"type\": \"image_url\", \"image_url\": {\"url\": \"$IMAGE_URL\"}}, {\"type\": \"text\", \"text\": \"What is unusual about this image?\"}]}]}" > /tmp/data.json - + $ # Call with curl, $ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d @/tmp/data.json $ # or use the chat.sh tool. $ ./llama-box/tools/chat.sh @/tmp/data.json - + $ # Chat with image url. 
$ IMAGE_URL="https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"; \ echo "{\"model\": \"qwen2.5-vl\", \"temperature\": 0.1, \"messages\": [{\"role\":\"user\", \"content\": [{\"type\":\"text\",\"text\":\"What is in this image?\"}, {\"type\": \"image_url\", \"image_url\": {\"url\": \"$IMAGE_URL\"}}]}]}" > /tmp/data.json - + $ # Call with curl, $ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d @/tmp/data.json - + $ # or use the chat.sh tool. $ ./llama-box/tools/chat.sh @/tmp/data.json ``` @@ -210,11 +213,11 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 4 session(allowing 4 parallel chat users), with a max of 8192 tokens per session. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ~/.cache/lm-studio/models/ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF/Llama-3.2-1B-Instruct-Q8_0.gguf --mmproj ~/.cache/lm-studio/models/ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF/mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf - + $ # Chat with audio base64. $ AUDIO_DATA="$(curl https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3 --output - | base64)"; \ echo "{\"model\": \"ultravox\", \"temperature\": 0.1, \"messages\": [{\"role\":\"system\", \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}]}, {\"role\":\"user\", \"content\": [{\"type\": \"input_audio\", \"input_audio\": { \"format\": \"mp3\", \"data\": \"$AUDIO_DATA\"}}, {\"type\": \"text\", \"text\": \"How many times has roger appeared?\"}]}]}" > /tmp/data.json - + $ # Call with curl, $ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d @/tmp/data.json @@ -229,10 +232,10 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 4 session(allowing 4 parallel chat users), with a max of 8192 tokens per session. 
$ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ~/.cache/lm-studio/models/Qwen/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf - + $ # Call with curl, $ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{"model": "qwen2.5", "messages": [{"role":"user","content":"What is the weather like in Paris today?"}], "tools": [{"type":"function","function":{"name":"get_weather","parameters":{"type":"object","properties":{"location":{"type":"string"}},"required":["location"]}}}]}' - + $ # or use the chat.sh tool. $ TOOLS_WITH=true ./llama-box/tools/chat.sh "What is the weather like in Paris today?" ``` @@ -246,10 +249,10 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 1 session(allowing 1 parallel chat user). $ llama-box -np 1 --host 0.0.0.0 -m ~/.cache/lm-studio/models/gpustack/stable-diffusion-v3.5-medium-GGUF/stable-diffusion-v3-5-medium-FP16.gguf --images - + $ # Call with curl, $ curl http://localhost:8080/v1/images/generations -H "Content-Type: application/json" -d '{"model": "sd3-medium", "prompt": "A lovely cat"}' - + $ # or use the image_generate.sh tool. $ ./llama-box/tools/image_generate.sh "A lovely cat" ``` @@ -262,11 +265,11 @@ LLaMA Box supports the following platforms. ```shell $ # Provide 1 session(allowing 1 parallel chat user). $ llama-box -np 1 --host 0.0.0.0 -m ~/.cache/lm-studio/models/gpustack/FLUX.1-Fill-dev-GGUF/FLUX.1-Fill-dev-Q8_0.gguf --images - + $ # Call with curl, $ curl https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png --output /tmp/input.png $ curl https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png --output /tmp/mask.png - + $ # or use the image_edit.sh tool. $ IMAGE=/tmp/input.png MASK=/tmp/mask.png ./llama-box/tools/image_edit.sh "a tiger sitting on a park bench" ``` @@ -312,7 +315,7 @@ LLaMA Box supports the following platforms. 
```shell $ # Provide 4 session(allowing 4 parallel chat users), with a max of 8192 tokens per session. $ llama-box -c 8192 -np 4 --host 0.0.0.0 -m ~/.cache/lm-studio/models/gpustack/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en-FP16.gguf --rerank - + $ # Call with curl. $ curl http://localhost:8080/v1/rerank -H "Content-Type: application/json" -d '{"model":"jina-reranker-v1-tiny-en","query":"Organic skincare products for sensitive skin","top_n":3,"documents":["Eco-friendly kitchenware for modern homes","Biodegradable cleaning supplies for eco-conscious consumers","Organic cotton baby clothes for sensitive skin","Natural organic skincare range for sensitive skin","Tech gadgets for smart homes: 2024 edition","Sustainable gardening tools and compost solutions","Sensitive skin-friendly facial cleansers and toners","Organic food wraps and storage solutions","All-natural pet food for dogs with allergies","oga mats made from recycled materials"]}' ``` @@ -370,7 +373,7 @@ general: --system-info Print system info and exit --list-devices Print list of available devices and exit --list-buffer-types Print list of available buffer types and exit - -v, --verbose, --log-verbose + -v, --verbose, --log-verbose Set verbosity level to infinity (i.e. 
log all messages, useful for debugging) -lv, --verbosity, --log-verbosity V Set the verbosity threshold, messages with a higher verbosity will be ignored @@ -387,7 +390,7 @@ server: -m, --model FILE Model path (default: models/7B/ggml-model-f16.gguf) -a, --alias NAME Model name alias --lora FILE Apply LoRA adapter (implies --no-mmap) - --lora-scaled FILE SCALE + --lora-scaled FILE SCALE Apply LoRA adapter with user defined scaling S (implies --no-mmap) --lora-init-without-apply Load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) @@ -412,7 +415,7 @@ server: server/completion: - -dev, --device + -dev, --device A comma-separated list of devices to use for offloading (none = don't offload) Use --list-devices to see a list of available devices -sm, --split-mode SPLIT_MODE How to split the model across multiple GPUs, one of: @@ -442,7 +445,7 @@ server/completion: -C, --cpu-mask M Set CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") -Cr, --cpu-range lo-hi Range of CPUs for affinity. Complements --cpu-mask --cpu-strict <0|1> Use strict CPU placement (default: 0) - + --prio N Set process/thread priority (default: 0), one of: - 0-normal - 1-medium @@ -452,7 +455,7 @@ server/completion: -tb, --threads-batch N Number of threads to use during batch and prompt processing (default: same as --threads) -Cb, --cpu-mask-batch M Set CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) -Crb, --cpu-range-batch lo-hi Ranges of CPUs for affinity. 
Complements --cpu-mask-batch - --cpu-strict-batch <0|1> + --cpu-strict-batch <0|1> Use strict CPU placement (default: same as --cpu-strict) --prio-batch N Set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority) --poll-batch <0...100> Use polling to wait for work (default: same as --poll @@ -483,7 +486,7 @@ server/completion: --dry-base N Set DRY sampling base value (default: 1.75) --dry-allowed-length N Set allowed length for DRY sampling (default: 2) --dry-penalty-last-n N Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) - --dry-sequence-breaker N + --dry-sequence-breaker N Add sequence breaker for DRY sampling, clearing out default breakers (\n;:;";*) in the process; use "none" to not use any sequence breakers --dynatemp-range N Dynamic temperature range (default: 0.0, 0.0 = disabled) --dynatemp-exp N Dynamic temperature exponent (default: 1.0) @@ -585,14 +588,14 @@ server/images: Path to the CLIP Large (clip-l) text encoder, or use --model included --image-clip-g-model PATH Path to the CLIP Generic (clip-g) text encoder, or use --model included - --image-t5xxl-model PATH + --image-t5xxl-model PATH Path to the Text-to-Text Transfer Transformer (t5xxl) text encoder, or use --model included --image-no-vae-model-offload Disable vae(taesd) model offload --image-vae-model PATH Path to Variational AutoEncoder (vae), or use --model included --image-vae-tiling Indicate to process vae decoder in tiles to reduce memory usage (default: disabled) --image-no-vae-tiling Disable vae decoder in tiles - --image-taesd-model PATH + --image-taesd-model PATH Path to Tiny AutoEncoder For StableDiffusion (taesd), or use --model included --image-upscale-model PATH Path to the upscale model, or use --model included @@ -671,7 +674,7 @@ The available endpoints for the LLaMA Box server mode are: "add_special": false, "with_pieces": false } - + RESPONSE : (application/json) CASE 1: without pieces { @@ -694,7 +697,7 @@ The 
available endpoints for the LLaMA Box server mode are: { "tokens": [123, ...] } - + RESPONSE : (application/json) { "content": "..." @@ -709,8 +712,8 @@ The available endpoints for the LLaMA Box server mode are: RESPONSE : (application/json) [ { - "id": 0, - "path": "...", + "id": 0, + "path": "...", "init_scale": 1.0 // initial scale, may not be the same as the one used currently }, ... @@ -764,7 +767,7 @@ The available endpoints for the LLaMA Box server mode are: "preview_faster": true // enable preview mode (deprecated) } } - + RESPONSE : (text/event-stream) data: {"created":1731916353,"data":[{"index":0,"object":"image.chunk","progress":10.0}], ...} ... @@ -803,7 +806,7 @@ The available endpoints for the LLaMA Box server mode are: ... ] } - + RESPONSE : (text/event-stream) data: {"created":1731916353,"data":[{"index":0,"object":"image.chunk","progress":10.0}], ...} ... @@ -833,7 +836,7 @@ The available endpoints for the LLaMA Box server mode are: stream_options_chunk_size=4096 // split the final image b64_json into chunks with the given size, default 4k stream_options_preview=true // enable preview mode stream_options_preview_faster=true // enable preview mode (deprecated) - + RESPONSE : (text/event-stream) CASE 1: correct input image data: {"created":1731916353,"data":[{"index":0,"object":"image.chunk","progress":10.0}], ...}