{"pull_number":"20644","title":"ggml-cuda: Add NVFP4 dp4a kernel","body":"This PR adds the initial plumbing for basic CUDA support for NVFP4: it includes a single NVFP4xQ8_1 dp4a kernel. MMA and Blackwell kernels are not included here; they are left for a separate PR.\r\n\r\n`vec_dot_mma` is linked to the dp4a kernel, so dp4a still runs even when BLACKWELL_MMA_AVAILABLE is set.\r\nThere is a branch for NVFP4 in `mmq_write_back_mma` that converts results back to the dp4a layout; it will be removed once the MMA kernel is wired in.\r\n\r\nThe kernel was tuned to extract as much performance as possible from dp4a. Comparisons below.\r\n**CPU vs DP4A**\r\n\r\n| Model | CPU (pp64) | DP4A (pp64) | Speedup | CPU (tg16) | DP4A (tg16) | Speedup |\r\n|---|---:|---:|---:|---:|---:|---:|\r\n| Qwen3.5-0.8B | 59.63 | 3557.08 | **59.65x** | 32.33 | 329.60 | **10.19x** |\r\n| Qwen3.5-27B | 1.27 | 594.14 | **467.83x** | 1.08 | 52.65 | **48.75x** |\r\n\r\n**pp512 / tg128**\r\n\r\n| Model | pp512 t/s | tg128 t/s |\r\n|---|---:|---:|\r\n| Qwen3-4B | 9031.22 | 271.69 |\r\n| Qwen3-8B | 4909.27 | 175.93 |\r\n| Qwen3.5-27B | 1482.89 | 63.47 |\r\n| Qwen3.5-0.8B | 25596.92 | 388.12 |\r\n| Qwen3.5-0.8B-Q4_K_M | 38339.57 | 521.69 |\r\n\r\nAI assistance was used in refactoring and writing some of this code. Each line has been scrutinized, and the result was hand-edited to be as neat and minimal as possible. `test-backend-ops` passes; CPU<>GPU parity was tested with a separate tool and is exact across multiple tile sizes, and KLD/PPL was verified on several models and is as expected (tested on GPU only; CPU is too slow).","pull_head_sha":"1d9aa514d5aca8f7670636fd6af8791e6810e99c","loci_pr_branch":"loci/pr-20644-nvfp4-dp4a","short_merge_base":"49bfdde","loci_main_branch":"loci/main-49bfdde","use_loci_base":0}