## blueprints/llamacpp/Dockerfile-llamacpp

```dockerfile
FROM intel/oneapi-basekit:latest

ARG DEBIAN_FRONTEND=noninteractive
ARG LLAMA_CPP_REPO=https://github.com/ggml-org/llama.cpp.git
ARG LLAMA_CPP_REF=master

SHELL ["/bin/bash", "-lc"]

# Base packages needed for building llama.cpp
# (libcurl headers are required when configuring with -DLLAMA_CURL=ON)
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        cmake \
        ninja-build \
        build-essential \
        pkg-config \
        ca-certificates \
        curl \
        libcurl4-openssl-dev \
        libssl-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /opt

# Clone source at the requested ref
RUN git clone --depth 1 --branch "${LLAMA_CPP_REF}" "${LLAMA_CPP_REPO}" llama.cpp

WORKDIR /opt/llama.cpp

# Build with Intel oneAPI compilers + oneMKL BLAS
RUN source /opt/intel/oneapi/setvars.sh --force && \
    cmake -S . -B build -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER=icx \
        -DCMAKE_CXX_COMPILER=icpx \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=Intel10_64lp \
        -DGGML_NATIVE=ON \
        -DLLAMA_CURL=ON \
        -DLLAMA_OPENSSL=ON && \
    cmake --build build -j"$(nproc)"

# Startup wrapper that sources the oneAPI environment before exec'ing its arguments
RUN printf '#!/usr/bin/env bash\nset -euo pipefail\nsource /opt/intel/oneapi/setvars.sh --force\nexec "$@"\n' > /usr/local/bin/llama-env && \
    chmod +x /usr/local/bin/llama-env

ENV PATH="/opt/llama.cpp/build/bin:${PATH}"
ENV OCL_ICD_FILENAMES=""
WORKDIR /workspace

# Extra `docker run` arguments are appended after the trailing "--" and reach
# llama-server through "$@" inside the bash -c script.
ENTRYPOINT ["/bin/bash", "-c", "set +u; source /opt/intel/oneapi/setvars.sh --force; mkdir -p /logs; /opt/llama.cpp/build/bin/llama-server --host 0.0.0.0 -c 8192 -t 6 \"$@\" 2>&1 | tee /logs/llama-server.log", "--"]
```
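The ENTRYPOINT depends on how `bash -c` assigns positional parameters: the first word after the script (here the literal `--`) becomes `$0`, and everything after it lands in `"$@"`, which is how flags passed to `docker run` reach `llama-server`. A minimal sketch of that forwarding pattern, runnable outside Docker (the demo flags are placeholders):

```shell
# The token after the -c script fills $0; the remaining words become "$@".
# printf repeats its format once per forwarded argument.
bash -c 'printf "arg: %s\n" "$@"' -- --hf-repo demo/repo --port 8080
```

Because `$0` absorbs the `--`, none of the real arguments are swallowed, so the container can be invoked with an arbitrary flag list.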
## blueprints/llamacpp/README.md
# llama.cpp — Intel oneAPI with VMware Foundation

Runs [llama.cpp](https://github.com/ggml-org/llama.cpp) `llama-server` built with Intel oneAPI compilers and oneMKL BLAS for optimized CPU inference.

## Build & Run

**1. Build the image** (output logged to `$HOME/logs/docker-build.log`):

```bash
mkdir -p $HOME/logs
sudo docker build -f Dockerfile-llamacpp -t llamacpp-intel:latest . \
2>&1 | tee $HOME/logs/docker-build.log
```

**2. Stop any existing container, then start a fresh one:**

```bash
sudo docker stop llamacpp 2>/dev/null || true

sudo docker run --rm -d \
--ipc=host --net=host \
-v $HOME/models:/root/.cache/huggingface \
-v $HOME/logs:/logs \
--workdir /workspace \
--name llamacpp \
llamacpp-intel:latest \
--hf-repo win10/DeepSeek-Coder-V2-Lite-Instruct-Q8_0-GGUF \
--hf-file deepseek-coder-v2-lite-instruct-q8_0.gguf
```

- `--hf-repo` / `--hf-file` — HuggingFace model to serve (swap to use a different model)
- Models are cached in `$HOME/models` — downloaded once, reused on subsequent runs
- Build logs: `$HOME/logs/docker-build.log`
- Server logs: `$HOME/logs/llama-server.log`
- Server listens on `http://localhost:8080`

## Test

```bash
curl http://localhost:8080/v1/completions \
-X POST \
-H "Content-Type: application/json" \
-d '{"model": "win10/DeepSeek-Coder-V2-Lite-Instruct-Q8_0-GGUF", "prompt": "What is Deep Learning?", "max_tokens": 25, "temperature": 0}'
```
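The same request can be issued from Python using only the standard library. A minimal sketch, assuming the server started above is listening on `localhost:8080`; the `build_payload` and `complete` helper names are ours, not part of llama.cpp:

```python
import json
from urllib import request


def build_payload(prompt, model, max_tokens=25, temperature=0):
    """Assemble the JSON body for the OpenAI-compatible /v1/completions route."""
    return {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }


def complete(prompt, model, base_url="http://localhost:8080"):
    """POST the payload and return the generated text of the first choice."""
    body = json.dumps(build_payload(prompt, model)).encode()
    req = request.Request(
        f"{base_url}/v1/completions",
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with request.urlopen(req) as resp:
        return json.load(resp)["choices"][0]["text"]
```

For example, `complete("What is Deep Learning?", "win10/DeepSeek-Coder-V2-Lite-Instruct-Q8_0-GGUF")` mirrors the curl call above.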

## Stop

```bash
sudo docker stop llamacpp
```