diff --git a/blueprints/llamacpp/Dockerfile-llamacpp b/blueprints/llamacpp/Dockerfile-llamacpp
new file mode 100644
--- /dev/null
+++ b/blueprints/llamacpp/Dockerfile-llamacpp
@@ -0,0 +1,70 @@
+# syntax=docker/dockerfile:1
+# Build llama.cpp's llama-server with Intel oneAPI compilers and oneMKL BLAS.
+# Pin the base tag (never :latest) so rebuilds are reproducible; override with --build-arg.
+ARG ONEAPI_TAG=2024.2.1-0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:${ONEAPI_TAG}
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG LLAMA_CPP_REPO=https://github.com/ggml-org/llama.cpp.git
+# NOTE(review): "master" moves — pin a release tag for repeatable builds.
+ARG LLAMA_CPP_REF=master
+
+# Login shell (-l) keeps the base image's profile hooks; pipefail stops a
+# pipeline from masking an upstream failure (hadolint DL4006).
+SHELL ["/bin/bash", "-l", "-o", "pipefail", "-c"]
+
+# Build dependencies, alphabetized for diffability; apt lists are removed in
+# the same layer so they never persist in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    git \
+    libssl-dev \
+    ninja-build \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /opt
+
+# Shallow clone of just the requested ref.
+RUN git clone --depth 1 --branch "${LLAMA_CPP_REF}" "${LLAMA_CPP_REPO}" llama.cpp
+
+WORKDIR /opt/llama.cpp
+
+# Build with Intel oneAPI compilers + oneMKL BLAS.
+# NOTE(review): GGML_NATIVE=ON tunes for the *build* host's CPU; the image can
+# fail with SIGILL on older CPUs. Use -DGGML_NATIVE=OFF for a portable image.
+RUN source /opt/intel/oneapi/setvars.sh --force && \
+    cmake -S . -B build -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_C_COMPILER=icx \
+    -DCMAKE_CXX_COMPILER=icpx \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=Intel10_64lp \
+    -DGGML_NATIVE=ON \
+    -DLLAMA_CURL=ON \
+    -DLLAMA_OPENSSL=ON && \
+    cmake --build build -j"$(nproc)"
+
+# Generic wrapper: load the oneAPI env, then exec the real command so it runs
+# as PID 1 and receives signals from `docker stop`.
+RUN printf '#!/usr/bin/env bash\nset -euo pipefail\nsource /opt/intel/oneapi/setvars.sh --force\nexec "$@"\n' > /usr/local/bin/llama-env && \
+    chmod +x /usr/local/bin/llama-env
+
+ENV PATH="/opt/llama.cpp/build/bin:${PATH}"
+ENV OCL_ICD_FILENAMES=""
+WORKDIR /workspace
+
+# Documentation only: llama-server listens on 8080 by default.
+EXPOSE 8080
+
+# Generous start-period: the first run may download a multi-GB GGUF from HF
+# before the server starts answering.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
+    CMD curl -fsS http://localhost:8080/health || exit 1
+
+# Extra `docker run` args land in "$@" (the trailing "--" becomes $0). A shell
+# entrypoint is kept deliberately so stdout/stderr can be teed to /logs;
+# `set +u` because setvars.sh references unset variables.
+ENTRYPOINT ["/bin/bash", "-c", "set +u; source /opt/intel/oneapi/setvars.sh --force; mkdir -p /logs; /opt/llama.cpp/build/bin/llama-server --host 0.0.0.0 -c 8192 -t 6 \"$@\" 2>&1 | tee /logs/llama-server.log", "--"]
diff --git a/blueprints/llamacpp/README.md b/blueprints/llamacpp/README.md
new file mode 100644
--- /dev/null
+++ b/blueprints/llamacpp/README.md
@@ -0,0 +1,50 @@
+# llama.cpp — Intel oneAPI with VMware 
Foundation + +Runs [llama.cpp](https://github.com/ggml-org/llama.cpp) `llama-server` built with Intel oneAPI compilers and oneMKL BLAS for optimized CPU inference. + +## Build & Run + +**1. Build the image** (output logged to `$HOME/logs/docker-build.log`): + +```bash +mkdir -p $HOME/logs +sudo docker build -f Dockerfile-llamacpp -t llamacpp-intel:latest . \ + 2>&1 | tee $HOME/logs/docker-build.log +``` + +**2. Stop any existing container, then start a fresh one:** + +```bash +sudo docker stop llamacpp 2>/dev/null || true + +sudo docker run --rm -d \ + --ipc=host --net=host \ + -v $HOME/models:/root/.cache/huggingface \ + -v $HOME/logs:/logs \ + --workdir /workspace \ + --name llamacpp \ + llamacpp-intel:latest \ + --hf-repo win10/DeepSeek-Coder-V2-Lite-Instruct-Q8_0-GGUF \ + --hf-file deepseek-coder-v2-lite-instruct-q8_0.gguf +``` + +- `--hf-repo` / `--hf-file` — HuggingFace model to serve (swap to use a different model) +- Models are cached in `$HOME/models` — downloaded once, reused on subsequent runs +- Build logs: `$HOME/logs/docker-build.log` +- Server logs: `$HOME/logs/llama-server.log` +- Server listens on `http://localhost:8080` + +## Test + +```bash +curl http://localhost:8080/v1/completions \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"model": "win10/DeepSeek-Coder-V2-Lite-Instruct-Q8_0-GGUF", "prompt": "What is Deep Learning?", "max_tokens": 25, "temperature": 0}' +``` + +## Stop + +```bash +sudo docker stop llamacpp +```