Merged
300 commits
6ec265d
Fix handling of Metal fused attn head dims (#1234)
EricLBuehler Mar 24, 2025
11bcd69
Tweak default for paged attn builder
EricLBuehler Mar 24, 2025
8069f90
Support paged attn for vision model rust api (#1235)
EricLBuehler Mar 24, 2025
5a8f36c
[Breaking] Support setting HF cache path (#1237)
EricLBuehler Mar 26, 2025
b903faf
Support tool calling for DeepSeek models (#1239)
EricLBuehler Mar 26, 2025
01ee538
Server image processing refactor and fixes (#1244)
EricLBuehler Mar 27, 2025
0442d5b
Optimized CUDA RoPE kernels (#1247)
EricLBuehler Mar 27, 2025
d7cb787
Typo fix (add_speial_tokens to add_special_tokens) (#1246)
edwko Mar 27, 2025
8b656a9
Fixes for UQFF + distributed layers (#1250)
EricLBuehler Mar 29, 2025
2c9da34
Automatic agentic search integration (`web_search_options`) (#1243)
EricLBuehler Mar 29, 2025
aecf0fb
Format kernels (#1251)
EricLBuehler Mar 29, 2025
de8c675
Update readme
EricLBuehler Mar 29, 2025
4207edb
Update readme
EricLBuehler Mar 29, 2025
e8ff90d
Remove test
EricLBuehler Mar 29, 2025
944ae5e
Add quantize guards for uqff deserialize (#1252)
EricLBuehler Mar 29, 2025
dc861ee
Refactor cuBLASlt-related code (#1253)
EricLBuehler Mar 30, 2025
b7e17f4
Add convert_to_gptq script
EricLBuehler Mar 30, 2025
efbef3a
Update deps, bump pyo3 version (#1259)
EricLBuehler Apr 3, 2025
4c8fbf2
Faster cuda FP8 performance (#1257)
EricLBuehler Apr 3, 2025
e43d602
Rust 1.86 clippy (#1260)
EricLBuehler Apr 3, 2025
ecf23b0
Refactor engine arch (#1262)
EricLBuehler Apr 4, 2025
b286f3e
Revamped LoRA support - removing the Ordering system! (#1263)
EricLBuehler Apr 4, 2025
07dafc0
Fast Metal-specific quantization method: AFQ (#1264)
EricLBuehler Apr 5, 2025
d0e45ce
Support prequantized models from MLX (#1265)
EricLBuehler Apr 5, 2025
fac37b3
Automatic ISQ to select fastest & most accurate method (#1266)
EricLBuehler Apr 5, 2025
0fd8e40
Improved usage metrics (#1267)
EricLBuehler Apr 5, 2025
7be249d
Fix cuda
EricLBuehler Apr 5, 2025
3953f9f
Bump tokio from 1.44.1 to 1.44.2 (#1270)
dependabot[bot] Apr 8, 2025
f3a73c3
Gather MM ops in mistralrs-quant (#1272)
EricLBuehler Apr 8, 2025
4317618
Improve performance of deepseek models
guoqingbao Apr 10, 2025
b267630
Typo fix
guoqingbao Apr 10, 2025
1897d2a
BincountOp not used
guoqingbao Apr 10, 2025
72c58a2
Implement Llama 4! (#1268)
EricLBuehler Apr 13, 2025
a1f1523
Remove superflous logging
EricLBuehler Apr 13, 2025
63387e2
Fixes for Llama 4 UQFF loading (#1275)
EricLBuehler Apr 13, 2025
fd2456c
Support sharding for UQFF (#1276)
EricLBuehler Apr 13, 2025
cece61c
Fix base64
EricLBuehler Apr 13, 2025
9f88ed3
Fix bug for group-topk (group_limited_greedy) in deepseek models (#1278)
guoqingbao Apr 14, 2025
5ed0e7e
Support the DeepCoder model (#1279)
EricLBuehler Apr 14, 2025
98ce4ff
Add faq for metal not found
EricLBuehler Apr 14, 2025
e11c264
Improved PagedAttn scheduling accuracy (#1282)
EricLBuehler Apr 16, 2025
864faaf
Fix cuda build for copy_blocks
EricLBuehler Apr 17, 2025
3cd1bb1
Fixes for scheduling image seqs with pagedattn (#1283)
EricLBuehler Apr 18, 2025
dd3cdfa
update to llguidance 0.7.16 (#1284)
mmoskal Apr 18, 2025
5a6fd03
Update dependencies (#1286)
EricLBuehler Apr 18, 2025
dfbb183
Much faster image inputs processing (#1289)
EricLBuehler Apr 20, 2025
630b65c
Add more SDPA head dims for much faster SigLIP (#1290)
EricLBuehler Apr 20, 2025
26afcc3
Show throughput in interactive mode (#1291)
EricLBuehler Apr 20, 2025
7882cba
Accurate prompt t/s for usage
EricLBuehler Apr 20, 2025
d7d1209
Unify bitwise operations (#1288)
EricLBuehler Apr 25, 2025
a4aa0b5
Multimodal prefix caching support! (#1298)
EricLBuehler Apr 28, 2025
010f6c8
Interactive mode improvements (#1299)
EricLBuehler Apr 28, 2025
e4a8222
Add the Qwen 3 and Qwen 3 MoE models! (#1285)
EricLBuehler Apr 29, 2025
c933fde
Fix dead link
EricLBuehler Apr 29, 2025
c40c680
Remove interactive mode max_len
EricLBuehler Apr 29, 2025
0a1d36e
Update QWEN3.md
EricLBuehler Apr 29, 2025
c3aa58e
Hotfix for vision mode clear
EricLBuehler Apr 29, 2025
d4617c3
Revamped and streaming web search support (#1301)
EricLBuehler Apr 29, 2025
2f12595
Handle vision messages or different tool call prefixes (#1302)
EricLBuehler Apr 29, 2025
2025fbe
Fix cuda
EricLBuehler Apr 30, 2025
d2e6c03
Tune web search budget
EricLBuehler Apr 30, 2025
6ce88b3
Simplify prefix cacher (#1305)
EricLBuehler Apr 30, 2025
15cc7a8
Use rustyline to handle non-ascii in interactive mode (#1306)
beeender May 1, 2025
a63da3c
Add more tools for automatic search (#1307)
EricLBuehler May 1, 2025
af6f04c
Fix CPU hogging in interactive mode (#1309)
beeender May 2, 2025
8f1f1ec
Add Metal precompilation support (#1311)
EricLBuehler May 3, 2025
7bff8bb
Reduce thrashing of Metal autorelease (#1313)
EricLBuehler May 3, 2025
b17455c
make `AdapterPaths` and `LoraAdapterPaths` public (#1314)
Slowki May 6, 2025
ca5794d
Refactor KV cache manager (#1315)
EricLBuehler May 6, 2025
0e650dc
Add `Audio` and `Speech` model categories (#1317)
Slowki May 7, 2025
7a794da
Remove has_conv2d from vision model API (#1318)
EricLBuehler May 7, 2025
99ea36c
Unified/automatic flash attention enabler (#1319)
EricLBuehler May 7, 2025
ebd50e3
Fix cublaslt 4d mask (#1320)
EricLBuehler May 8, 2025
11b2718
Keep caches on gpu
EricLBuehler May 8, 2025
e1672b7
Qwen VL models fixes (#1322)
EricLBuehler May 9, 2025
0b540ea
Fixes for all vision models (#1323)
EricLBuehler May 9, 2025
0521cd5
Improved+faster LRU prefix cacher (#1321)
EricLBuehler May 11, 2025
4243e84
Inplace ISQ support and default to mmap (#1277)
EricLBuehler May 13, 2025
a8ab6c8
Remove debug print
EricLBuehler May 13, 2025
f1bf0f8
Remove debug print
EricLBuehler May 13, 2025
34c50a4
Remove debug print
EricLBuehler May 13, 2025
e76a71c
Fix typos (#1329)
omahs May 13, 2025
9a228b8
Fix Idefics 3 arch chat templating (#1330)
EricLBuehler May 13, 2025
becbdd6
Remove two space from PR comment (#1331)
szepeviktor May 14, 2025
380da23
Add automatic vision loader type (#1332)
EricLBuehler May 14, 2025
cb2231c
Add the Dia 1.6b TTS model! (#1304)
EricLBuehler May 16, 2025
641f166
update `llguidance` to `0.7.20` (#1334)
Slowki May 16, 2025
dc87009
Add model category <> messages check (#1335)
EricLBuehler May 17, 2025
fa39bf3
Add element-wise normalization check (#1340)
EricLBuehler May 17, 2025
566e171
Fix streaming example print statement (#1339)
EricLBuehler May 17, 2025
4fea0f5
Fix normalization formula in comment (#1338)
EricLBuehler May 17, 2025
5d5b622
Fix image_to_pixels to handle non-RGB images (#1337)
EricLBuehler May 17, 2025
43eff96
Fix typo in expect messages (#1342)
EricLBuehler May 17, 2025
c116ce4
Don't use mmap on cuda (#1336)
EricLBuehler May 17, 2025
ec43205
Support AWQ format models (#1350)
guoqingbao May 19, 2025
a9a4c99
Fix uqff dummy layer ISQ application (#1351)
EricLBuehler May 19, 2025
c351ee6
Disable immediate isq if write_uqff (#1352)
EricLBuehler May 19, 2025
e23a25e
Fixes for UQFF loading on CUDA, ISQ pack factor (#1354)
EricLBuehler May 20, 2025
6c0b453
Refactor Option references for model paths (#1347)
EricLBuehler May 20, 2025
30c9bb8
Add a script for server benchmarking (#1355)
EricLBuehler May 21, 2025
64a0b75
Optimized Metal qmv_fast path (#1356)
EricLBuehler May 21, 2025
60579ef
Compile with lto
EricLBuehler May 21, 2025
c89aa3c
Tweak profiles
EricLBuehler May 21, 2025
a97de2b
New, fast sampler for Metal! (#1327)
EricLBuehler May 21, 2025
87a3eb6
Remove warning
EricLBuehler May 21, 2025
a63a8a3
Fix chat port
EricLBuehler May 21, 2025
504401f
Fix metal parallel sampling (#1357)
EricLBuehler May 22, 2025
cb939a8
Add immediate isq predicates for qwen3 (#1358)
EricLBuehler May 22, 2025
287c870
Fix gemma3 logging
EricLBuehler May 23, 2025
3681a9f
Regressions fixes (#1359)
EricLBuehler May 23, 2025
44e4535
Revamped and smaller readme (#1360)
EricLBuehler May 23, 2025
0e02779
Add a web chat app! (#1362)
EricLBuehler May 23, 2025
e5bedab
Add chat history support to web chat app (#1363)
EricLBuehler May 23, 2025
33e25d6
Refactor web chat, fix multichat image restore (#1364)
EricLBuehler May 24, 2025
8d0aca7
Fix repeated immediate isq init (#1365)
EricLBuehler May 24, 2025
2c9deae
Add gif
EricLBuehler May 24, 2025
7e36413
Tweak initial gif
EricLBuehler May 24, 2025
7f8ad73
Include vision tower tensors in Mistral3 UQFF (#1366)
EricLBuehler May 24, 2025
84eb046
Fix mistral 3 uqff resitdual tensors for vision
EricLBuehler May 24, 2025
49c512e
Rolling shard creation for uqff files (#1367)
EricLBuehler May 24, 2025
910bf12
Fix occasional unstability during isq of afq (#1368)
EricLBuehler May 24, 2025
85fe519
Fix web chat installation
EricLBuehler May 24, 2025
8194b41
Support web chat file uploading (#1370)
EricLBuehler May 25, 2025
8e7a30f
Add speech generation support to the web chat! (#1373)
EricLBuehler May 25, 2025
9387241
Prefix caching for PagedAttention! (#1369)
EricLBuehler May 26, 2025
58df07e
Metal PagedAttention accuracy improvements (#1374)
EricLBuehler May 26, 2025
9b30e5b
Format metal paged attention
EricLBuehler May 26, 2025
4571f4a
Handle images in paged attn scheduler (#1375)
EricLBuehler May 26, 2025
2b56c10
Include schemas needed for chatcompletions endpoint (#1353)
matthewhaynesonline May 26, 2025
0006787
Fix constraints with metal sampler
EricLBuehler May 26, 2025
30a08e3
Revert #1375
EricLBuehler May 26, 2025
45ccd26
Fix case where prefix cacher returns no toks (#1377)
EricLBuehler May 27, 2025
6c63203
Fix AFQ UQFF serialization
EricLBuehler May 27, 2025
50b805c
Faster UQFF serialization (#1379)
EricLBuehler May 27, 2025
bdb5e8b
Improve gemma3 auto loader names
EricLBuehler May 27, 2025
29a30cc
UQFF creation for AFQ on CPU support (#1380)
EricLBuehler May 27, 2025
6e811f2
Improved device for afq quantize
EricLBuehler May 27, 2025
9d72f7d
Improved dtype handling for cpu afq (de)quantize
EricLBuehler May 28, 2025
ec9ee69
Improved generate_uqff_card
EricLBuehler May 28, 2025
f690f9c
Add fused CPU attention kernel! (#1382)
EricLBuehler May 29, 2025
15648b9
Refactor attention backends (#1384)
EricLBuehler May 29, 2025
032a567
Set macOS thread affinity for CPU attn (#1385)
EricLBuehler May 29, 2025
0069792
Use lazylock
EricLBuehler May 29, 2025
ecbe897
Format
EricLBuehler May 29, 2025
68b9986
Fix metal warn build
EricLBuehler May 29, 2025
abb6185
Faster Qwen 3 MoE support on Metal (#1387)
EricLBuehler May 30, 2025
b95c8ec
Fix PagedAttention block leaks (#1388)
EricLBuehler May 30, 2025
722e46e
Fix double free in block engine
EricLBuehler May 30, 2025
4a23791
Do not apply ISQ if loading a prequantized model
EricLBuehler May 30, 2025
5cebfee
Fix cuda build again (#1389)
EricLBuehler May 30, 2025
7064b83
Update dockerfiles
EricLBuehler May 30, 2025
542a500
Bump version to 0.6.0 (#1390)
EricLBuehler May 30, 2025
698a943
Fix routing for static handler in web chat
EricLBuehler May 30, 2025
4f255b5
Fewer .contiguous calls for qwen3 moe (#1391)
EricLBuehler May 30, 2025
ddcaca1
Allow speech models to accept batched inputs (#1393)
EricLBuehler May 31, 2025
ed198b2
Ring distributed backend for heterogeneous TP (#1238)
EricLBuehler May 31, 2025
5fc5a15
Add deepseek tool calling chat template
EricLBuehler Jun 1, 2025
b4a0a2f
Add auto loader for vision/text detection! (#1402)
EricLBuehler Jun 2, 2025
9478d2c
Create Mistral.rs Server Core Lib: `mistralrs-server-core` (#1346)
matthewhaynesonline Jun 3, 2025
31b0e8b
Support linear rope for llama3 (#1408)
EricLBuehler Jun 3, 2025
4d60531
Hotfix for loading
EricLBuehler Jun 3, 2025
190ad20
Fix vllama4 uqff loading (#1409)
EricLBuehler Jun 3, 2025
b0af3ad
Gracefully handle receiver disconnects (#1410)
EricLBuehler Jun 3, 2025
7c819c8
Fix Qwen3 MoE device mapping irregularities (#1411)
EricLBuehler Jun 3, 2025
3281d67
Fix interactive mode URL parsing (#1412)
EricLBuehler Jun 3, 2025
1037ac3
Refactor auto device map (#1413)
EricLBuehler Jun 3, 2025
f93bec1
Enable runtime sampling tweaks in interactive mode (#1414)
EricLBuehler Jun 3, 2025
32126d3
Send streaming tokens every time
EricLBuehler Jun 3, 2025
26fc5c9
Gumbel sampling for fast sampler (#1416)
EricLBuehler Jun 3, 2025
201d6be
Improved handling for initialize_logging
EricLBuehler Jun 3, 2025
d92bfca
Improved CPU flash attention accuracy & performance (#1417)
EricLBuehler Jun 3, 2025
4e156ad
Provide chat_templates to container users (#1419)
sempervictus Jun 3, 2025
6547156
Faster cpu flash attn (#1418)
EricLBuehler Jun 3, 2025
57d6e12
Web search improvements (bm25, web chat) (#1420)
EricLBuehler Jun 3, 2025
ecb6907
Propely handle consecutive searches (#1421)
EricLBuehler Jun 4, 2025
d6c227e
Update docs (#1422)
matthewhaynesonline Jun 4, 2025
c53d346
Better tool call detection logic (#1424)
EricLBuehler Jun 4, 2025
39673eb
Add web search hook callbacks (#1426)
EricLBuehler Jun 4, 2025
9989719
Fix CUDA context switching, bind thread on CudaStorage drop (#1428)
EricLBuehler Jun 4, 2025
8d13759
conditionally build flash attention inputs (#1429)
EricLBuehler Jun 4, 2025
328dea1
Add AGENTS.md (#1430)
EricLBuehler Jun 4, 2025
8612b92
Support Qwen3 GGUF model (#1432)
guoqingbao Jun 5, 2025
d37db05
Improved paged attn prefix caching (#1434)
EricLBuehler Jun 5, 2025
77436b8
Clippy
EricLBuehler Jun 5, 2025
cfd1e89
Temporary fix for qwen3 gguf tokenizer (#1433)
guoqingbao Jun 5, 2025
d345011
Add tool callback support (#1427)
EricLBuehler Jun 6, 2025
3d1b29b
Centralize crate dependencies (#1438)
EricLBuehler Jun 6, 2025
15b7228
Fix bug in tokenizer created with gguf metadata (#1440)
guoqingbao Jun 6, 2025
072cda3
Update deps (#1441)
EricLBuehler Jun 6, 2025
c21c505
Doc fixes (#1442)
EricLBuehler Jun 6, 2025
9597fd4
Mention uqff_maker
EricLBuehler Jun 6, 2025
2532852
Downgrade rustyline 16.0.0 -> 15.0.0 (#1444)
EricLBuehler Jun 6, 2025
2cb0a3e
Add max_completion_tokens alias for server (#1451)
EricLBuehler Jun 8, 2025
fff2665
Audio input support (Phi 4 multimodal) (#1448)
EricLBuehler Jun 9, 2025
ea1d5d6
Fix offline cache issue for gguf models (#1452)
guoqingbao Jun 9, 2025
988280f
Add MCP server endpoints (#1453)
EricLBuehler Jun 10, 2025
8c05fd0
Tweak temperature bounds, args
EricLBuehler Jun 10, 2025
feefd40
MCP documentation pass (#1455)
EricLBuehler Jun 10, 2025
a7bb740
Improve readme header
EricLBuehler Jun 10, 2025
d7384b8
Improve readme header
EricLBuehler Jun 10, 2025
f1ad6ae
Integrate an MCP client (#1456)
EricLBuehler Jun 10, 2025
8a53c71
Update generate_wheels
EricLBuehler Jun 10, 2025
9cabf9f
Update generate_wheels
EricLBuehler Jun 10, 2025
3410183
Update generate_wheels
EricLBuehler Jun 10, 2025
a5c4eda
Fix Dockerfile.cuda-all
EricLBuehler Jun 10, 2025
39e0ff5
Improve automatic tool call (#1460)
EricLBuehler Jun 11, 2025
30859cd
chore: `Dockerfile.cuda-all` configurable threads (#1458)
polarathene Jun 11, 2025
fa647ab
chore: `Dockerfile.cuda-all` - Merge `RUN` for `apt-get install` (#1459)
polarathene Jun 11, 2025
e5f0f04
Add fallback definition for isnan (#1463)
EricLBuehler Jun 11, 2025
f3b1afa
chore: `Dockerfile` - Drop runtime rayon thread ENV (#1465)
polarathene Jun 12, 2025
37a55f9
Remove duplicate calls for api_dir_list (#1474)
guoqingbao Jun 18, 2025
d5e80a8
Fix transient pyo3 dep (#1478)
EricLBuehler Jun 18, 2025
e2c0822
Fix objc dep with non macos (#1480)
EricLBuehler Jun 18, 2025
4608202
Fix phi 3/4 + nccl issue (#1481)
EricLBuehler Jun 18, 2025
badbe10
Fix phi3.5 moe (#1482)
EricLBuehler Jun 19, 2025
210061f
Support GLM4 model! (#1437)
guoqingbao Jun 19, 2025
408c888
Refactor distributed backend (#1484)
EricLBuehler Jun 19, 2025
c8ab1f1
Cap metal paged attn kv allocation (#1485)
EricLBuehler Jun 19, 2025
0feb38c
Better paged attn metal cap (#1486)
EricLBuehler Jun 19, 2025
f13db3b
Server core: consolidate and unify route handlers and API surface (#1…
matthewhaynesonline Jun 19, 2025
1901d2d
Support qwen3 gguf (#1488)
EricLBuehler Jun 19, 2025
bc5581a
Make bos/eos token IDs optional (#1493)
EricLBuehler Jun 19, 2025
d7577dd
Remove python deps from CUDA dockerfiles (#1487)
EricLBuehler Jun 20, 2025
2aa89c3
Handle noncontiguous v in naive_sdpa (#1499)
EricLBuehler Jun 21, 2025
f38567a
Server Core: refactor Paged Attention configuration (#1500)
matthewhaynesonline Jun 21, 2025
a1220c6
Use StorageModePrivate for Metal PA kv cache (#1506)
EricLBuehler Jun 22, 2025
c9d0a0e
Fix OpenAI stream: emit field in tool-call deltas for schema complian…
Sbargaoui Jun 23, 2025
1ad6488
FP8 KV-cache quantization for PagedAttention (#1400)
EricLBuehler Jun 23, 2025
d8bbbe9
Validate model name in OpenAI API (#1509)
EricLBuehler Jun 23, 2025
aa4b218
Updated examples for #1509
EricLBuehler Jun 23, 2025
c25d1db
Fix mcp import in doc string (#1510)
GaetanLepage Jun 24, 2025
272f7ee
Add multi-model support! (#1512)
EricLBuehler Jun 24, 2025
620117e
Add stars label to readme (#1513)
EricLBuehler Jun 25, 2025
4ae689b
Add CLAUDE.md
EricLBuehler Jun 25, 2025
e46bf86
Handle base_model.model case in lora (#1514)
EricLBuehler Jun 25, 2025
9e33c8f
Add thread_local! for engine-specific const/static (#1517)
EricLBuehler Jun 25, 2025
5fbf607
Fix MCP doc test (#1511)
GaetanLepage Jun 25, 2025
359f99c
Allow disabling metal precompilation (#1518)
EricLBuehler Jun 26, 2025
30d1cce
Rust 1.88 clippy (#1522)
EricLBuehler Jun 26, 2025
ea3f517
Fix cuda warnings (#1526)
EricLBuehler Jun 28, 2025
d025ebd
Avoid panic decoding tokens on error (#1527)
EricLBuehler Jun 29, 2025
d38a7e1
Split Marlin and Paged Attention kernels for faster build (#1525)
guoqingbao Jul 2, 2025
8a4faf3
chore: update llguidance (#1535)
ammar-elsabe Jul 4, 2025
85dcfbe
Add the SmolLM3 model! (#1501)
EricLBuehler Jul 8, 2025
70c7f86
Add full Gemma 3n support! (#1519)
EricLBuehler Jul 9, 2025
60a530c
Fix sequence length check (#1546)
EricLBuehler Jul 10, 2025
d256806
update candle version (#1545)
AlpineVibrations Jul 10, 2025
103eb3f
add ios target to metal deps (#1548)
rubiktubik Jul 10, 2025
5915c4c
Merge branch 'master' into jeadie/25-07-08/updaste
Jeadie Jul 11, 2025
a940796
Merge remote-tracking branch 'origin/master' into jeadie/25-07-08/upd…
Jeadie Jul 11, 2025
17 changes: 11 additions & 6 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
[target.x86_64-unknown-linux-gnu]
[build]
rustflags = ["-C", "target-cpu=native"]

[target.aarch64-apple-darwin]
[build]
rustflags = ["-C", "target-cpu=native"]
rustflags = [
"-C", "target-cpu=native",
"-C", "target-feature=+aes,+sha2,+fp16",
]

[target.x86_64-apple-darwin]
rustflags = [
"-C", "target-cpu=native",
"-C", "target-feature=-avx,-avx2",
]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128"]

[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]
4 changes: 1 addition & 3 deletions .github/workflows/analysis.yaml
@@ -30,9 +30,7 @@ jobs:
const codeReport = `
<details>
<summary>${uniqueIdentifier}</summary>
<pre>
${tokeiOutput}
</pre>
<pre>${tokeiOutput}</pre>
</details>
`;

18 changes: 8 additions & 10 deletions .github/workflows/build_cuda_all.yaml
@@ -1,6 +1,6 @@
name: deploy_cuda_docker
name: build_cuda_all

# gh workflow run deploy_cuda_docker
# gh workflow run build_cuda_all
# This also runs on release deploy
on:
workflow_dispatch:
@@ -16,7 +16,7 @@ jobs:
matrix:
compute_capability: [75, 80, 86, 89, 90]
fail-fast: false
runs-on: [ubuntu-latest]
runs-on: ubuntu-latest

permissions:
contents: write
@@ -59,14 +59,14 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
ghcr.io/${{ github.repository_owner }}/$(basename ${{ github.repository }})
ghcr.io/${{ github.repository }}
flavor: |
latest=false
tags: |
type=semver,pattern=cuda-${{matrix.compute_capability}}-{{version}}
type=semver,pattern=cuda-${{matrix.compute_capability}}-{{major}}.{{minor}}
type=raw,value=cuda-${{matrix.compute_capability}}-sha-${{ steps.slug.outputs.short_sha }}
type=raw,value=cuda-${{matrix.compute_capability}}-sha-${{ github.sha }}
type=semver,pattern=cuda-${{ matrix.compute_capability }}-{{ version }}
type=semver,pattern=cuda-${{ matrix.compute_capability }}-{{ major }}.{{ minor }}
type=raw,value=cuda-${{ matrix.compute_capability }}-sha-${{ steps.slug.outputs.short_sha }}
type=raw,value=cuda-${{ matrix.compute_capability }}-sha-${{ github.sha }}
- name: Build and push Docker image
id: build-and-push-cuda
uses: docker/build-push-action@v6
@@ -80,5 +80,3 @@ jobs:
build-args: |
CUDA_COMPUTE_CAP=${{matrix.compute_capability}}
cache-from: type=local,src=/tmp/.buildx-cache


2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@
*.a
.DS_Store
.idea
mistral.rs/
mistralrs-web-chat/cache
10 changes: 8 additions & 2 deletions .typos.toml
@@ -9,12 +9,18 @@ extend-ignore-identifiers-re = [
"_thw",
"thr",
"nd",
"uneeded"
"uneeded",
"tese",
"seperable",
"Seperable",
"setp",
"cna",
]

[files]
extend-exclude = [
"mistralrs-pyo3/pdoc/*",
"examples/server/phi3_duckduckgo_mistral.rs.ipynb",
"calibration_data/*"
"calibration_data/*",
"mistralrs-web-chat/static*"
]
147 changes: 147 additions & 0 deletions AGENTS.md
@@ -0,0 +1,147 @@
<!-- AGENTS.md: Guidance for AI agents to navigate, build, test, and contribute to this repository -->
# AGENTS

This file provides instructions for AI agents to understand the layout of the `mistral.rs` repository, run builds/tests, and follow project conventions.

## Repository Structure

- `/mistralrs/` : Main Rust crate (text & multimodal inference API)
- `/mistralrs-core/` : Core inference logic and tensor operations (text models)
- `/mistralrs-vision/` : Vision inference support (image-based inputs & vision-enabled models)
- `/mistralrs-quant/` : Quantization support (ISQ, GGUF, GPTQ, AWQ, FP8, HQQ, etc.)
- `/mistralrs-paged-attn/`: PagedAttention implementation
- `/mistralrs-pyo3/` : Python bindings (PyO3)
- `/mistralrs-server/` : CLI & OpenAI-compatible HTTP server (subcommands: run/vision-plain, diffusion, speech)
- `/mistralrs-server-core/`: Shared server core logic
- `/mistralrs-web-chat/` : Web chat application (static assets & backend integration)
- `/mistralrs-bench/` : Benchmarking tools
- `/docs/` : Markdown documentation for models, features, and guides
- `/examples/` : Usage examples (Rust, Python, server samples, notebooks)
- `/chat_templates/` : Chat formatting templates (JSON/Jinja)
- `/scripts/` : Utility scripts (e.g., AWQ conversion)

## Feature Organization

Mistral.rs supports multiple model types and advanced features via dedicated crates and CLI subcommands:

- **Text Inference**
- Crate: `mistralrs-core` (low-level ops), `mistralrs` (API wrapper)
- CLI: `run` / `plain` subcommand in `mistralrs-server`
- Docs: `docs/SAMPLING.md`, `docs/TOOL_CALLING.md`
- **Vision Models**
- Crate: `mistralrs-vision`
- CLI: `vision-plain` subcommand
- Docs: `docs/VISION_MODELS.md`, `docs/IMAGEGEN_MODELS.md`, `docs/IMATRIX.md`
- **Diffusion Models**
- CLI: `diffusion` subcommand
- Docs: `docs/FLUX.md`
- **Speech Models**
- CLI: `speech` subcommand
- Docs: `docs/DIA.md`
- **Quantization & ISQ**
- Crate: `mistralrs-quant`
- Docs: `docs/QUANTS.md`, `docs/ISQ.md`
- Conversion Script: `scripts/convert_awq_marlin.py`
- **Paged Attention**
- Crate: `mistralrs-paged-attn`
- Docs: `docs/PAGED_ATTENTION.md`
- **Adapters & LoRA/X-LoRA**
- Docs: `docs/ADAPTER_MODELS.md`, `docs/LORA_XLORA.md`
- **Mixture of Experts (AnyMoE)**
- Docs: `docs/ANYMOE.md`

## Building

1. Install Rust via rustup (Rust 2021 edition).
2. Choose optional features (e.g., `cuda`, `flash-attn`, `cudnn`, `metal`, `mkl`, `accelerate`).
3. Build the entire workspace:
```bash
cargo build --workspace --release --features "<features>"
```
4. Or build/install only the server binary:
```bash
cargo build --release --package mistralrs-server --features "<features>"
cargo install --path mistralrs-server --features "<features>"
```

## Models

When integrating a new model, make sure it respects all of the `VarBuilder` `.pp` calls. In Candle, a `VarBuilder` maintains an internal path vector that acts like a "current working directory" for model weights; every call to `pp("sub")` (an alias for `push_prefix`) clones the builder and appends `sub`, so successive calls accumulate a dotted prefix such as `transformer.h.0` while leaving the original builder untouched. When you eventually call `get(...)`, Candle joins that prefix with the tensor name (`prefix + "." + name`) and looks it up in the checkpoint backend, producing keys that exactly match the dot-separated names emitted by PyTorch's `state_dict`/`named_parameters`, which means PyTorch-trained weights can be loaded without any renaming. This lets you recreate the PyTorch module tree in Rust by "walking" it: e.g. `vb.pp("word_embeddings")` grabs `word_embeddings.*`, while a chain like `vb.pp("encoder").pp("layers").pp(i.to_string())` targets keys such as `encoder.layers.0.*`, exactly as shown in community tutorials porting Transformers models to Candle. As one maintainer put it, the prefix system lets you "cd" around the parameter hierarchy, giving a lightweight namespace mechanism that keeps Candle fully compatible with PyTorch naming conventions while remaining ergonomic to use.

You should also look for a model.safetensors.index.json file for the model at hand to verify correct structure.
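The prefix mechanism described above can be sketched in a few lines. This is a minimal stand-in, not the real `candle-nn` API: `PrefixPath` here plays the role of `VarBuilder`'s internal path vector, and `key` shows the dotted lookup key that `get` would construct.

```rust
// Minimal sketch (NOT the real candle-nn API) of how VarBuilder's `pp`
// prefixing works: each call clones the path and appends one segment,
// and a lookup joins the accumulated prefix with the tensor name.
#[derive(Clone)]
struct PrefixPath {
    path: Vec<String>,
}

impl PrefixPath {
    fn root() -> Self {
        Self { path: Vec::new() }
    }

    /// Like `VarBuilder::pp`: returns a new builder with `sub` appended,
    /// leaving `self` untouched.
    fn pp(&self, sub: impl ToString) -> Self {
        let mut path = self.path.clone();
        path.push(sub.to_string());
        Self { path }
    }

    /// The dotted key a `get(name)` lookup would use against the checkpoint.
    fn key(&self, name: &str) -> String {
        if self.path.is_empty() {
            name.to_string()
        } else {
            format!("{}.{}", self.path.join("."), name)
        }
    }
}

fn main() {
    let vb = PrefixPath::root();
    // Walking the module tree mirrors PyTorch's state_dict naming:
    let layer = vb.pp("encoder").pp("layers").pp(0);
    println!("{}", layer.key("weight")); // encoder.layers.0.weight
    // The original builder is untouched by the chained `pp` calls:
    println!("{}", vb.pp("word_embeddings").key("weight")); // word_embeddings.weight
}
```

Because `pp` clones rather than mutates, a model constructor can hand sub-builders to each submodule without worrying about shared state.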

## Testing

- Core test suite (requires HF token for some tests):
```bash
export HF_TOKEN=<your_token> # or TESTS_HF_TOKEN for CI parity
cargo test -p mistralrs-core -p mistralrs-quant -p mistralrs-vision
```
- Run all tests across workspace (may skip some crates without tests):
```bash
cargo test --workspace
```

You should *always* run `cargo check` (alias `cargo c`) before returning, to make sure the code compiles. If it does not compile, keep making edits until it does.

Avoid returning TODOs.

## Formatting & Linting

- Format all Rust code:
```bash
cargo fmt --all
make fmt # also formats Python/CUDA/C++ files via ruff, clang-format
```
- Lint with Clippy:
```bash
cargo clippy --workspace --tests --examples -- -D warnings
```

## Documentation

- Generate Rust docs for all crates:
```bash
cargo doc --workspace
```
- Preview at `target/doc/` or publish to GitHub Pages as configured.
- Refer to `/docs/` for in-depth markdown guides (e.g., DEVICE_MAPPING.md, TOOL_CALLING.md).

## Examples

- Rust examples: `mistralrs/examples/`
- Python examples: `examples/python/`
- Server samples: `examples/server/`
- Run Python scripts:
```bash
python3 examples/python/<script>.py
```
- Run server/CLI:
```bash
./target/release/mistralrs-server -i <mode> -m <model> [options]
```

## CI Parity

The CI pipeline is defined in `.github/workflows/ci.yml` and includes:
- `cargo check` for all targets
- `cargo test` on core crates
- `cargo fmt -- --check`
- `cargo clippy -D warnings`
- `cargo doc`
- Typos check (`crate-ci/typos`)

## Contribution Conventions

- Follow Rust 2021 idioms, keep code minimal and focused.
- Update `/docs/` and examples when adding features or breaking changes.
- Add tests and examples for new functionality.
- Commit messages should be clear and follow conventional style where possible.
```
feat(crate): describe new feature
fix(crate): describe bug fix
docs: update docs for ...
```

---
*This AGENTS.md file is intended solely to improve AI-driven assistance and does not affect runtime behavior.*
122 changes: 122 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,122 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

mistral.rs is a blazing-fast LLM inference engine written in Rust. It supports text, vision, image generation, and speech models with multiple APIs (Rust, Python, OpenAI HTTP, MCP).

## Essential Commands

### Building
```bash
# Basic release build
cargo build --release

# With CUDA support (Linux)
cargo build --release --features "cuda flash-attn cudnn"

# With Metal support (macOS)
cargo build --release --features metal

# Install server binary
cargo install --path mistralrs-server --features <features>
```

### Testing & Quality
```bash
# Run core tests
cargo test -p mistralrs-core -p mistralrs-quant -p mistralrs-vision

# Format code (uses rustfmt, ruff, clang-format)
make fmt

# Check formatting
cargo fmt --all -- --check

# Run clippy
cargo clippy --workspace --tests --examples -- -D warnings
```

### Running Models
```bash
# Run interactive mode with plain model
cargo run --release --features <features> -- -i plain -m <model_id> -a <arch>

# Run with GGUF quantized model
cargo run --release --features <features> -- -i gguf -f <file> -t <tokenizer>

# Run server
cargo run --release --features <features> -- --port 1234 <model_args>
```

## Models

When integrating a new model, make sure it respects all of the `VarBuilder` `.pp` calls. In Candle, a `VarBuilder` maintains an internal path vector that acts like a "current working directory" for model weights; every call to `pp("sub")` (an alias for `push_prefix`) clones the builder and appends `sub`, so successive calls accumulate a dotted prefix such as `transformer.h.0` while leaving the original builder untouched. When you eventually call `get(...)`, Candle joins that prefix with the tensor name (`prefix + "." + name`) and looks it up in the checkpoint backend, producing keys that exactly match the dot-separated names emitted by PyTorch's `state_dict`/`named_parameters`, which means PyTorch-trained weights can be loaded without any renaming. This lets you recreate the PyTorch module tree in Rust by "walking" it: e.g. `vb.pp("word_embeddings")` grabs `word_embeddings.*`, while a chain like `vb.pp("encoder").pp("layers").pp(i.to_string())` targets keys such as `encoder.layers.0.*`, exactly as shown in community tutorials porting Transformers models to Candle. As one maintainer put it, the prefix system lets you "cd" around the parameter hierarchy, giving a lightweight namespace mechanism that keeps Candle fully compatible with PyTorch naming conventions while remaining ergonomic to use.

You should also look for a model.safetensors.index.json file for the model at hand to verify correct structure.

## Architecture Overview

### Workspace Structure
- `mistralrs-core/` - Core inference engine, model implementations, pipelines
- `mistralrs-server/` - CLI binary entry point
- `mistralrs-server-core/` - HTTP server routing, OpenAI API implementation
- `mistralrs-pyo3/` - Python bindings (PyO3)
- `mistralrs/` - High-level Rust API
- `mistralrs-vision/` - Vision model support
- `mistralrs-quant/` - Quantization implementations (ISQ, GGUF, GPTQ, etc.)
- `mistralrs-paged-attn/` - PagedAttention implementation
- `mistralrs-audio/` - Audio processing
- `mistralrs-mcp/` - Model Context Protocol client
- `mistralrs-bench/` - Benchmarking tools

### Key Design Patterns

1. **Pipeline Architecture**: All models implement the `Pipeline` trait in `mistralrs-core/src/pipeline/mod.rs`. Different model types (Plain, GGUF, GGML, Vision) have their own pipeline implementations.

2. **Model Loading**: Models are loaded through `Loader` traits that handle different formats and quantizations. See `mistralrs-core/src/loader.rs`.

3. **Request Handling**: The server uses message passing with `MistralRs` struct managing a background thread pool. Requests flow through `mistralrs-core/src/engine/mod.rs`.

4. **Device Management**: Automatic and manual device mapping for multi-GPU setups handled in `mistralrs-core/src/device_map.rs`.
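The request-handling pattern above (a background engine thread fed by a channel, with a per-request reply channel) can be sketched with `std::sync::mpsc`. The `Request` type and the "inference" body here are hypothetical placeholders; the real types live in `mistralrs-core/src/engine/mod.rs` and are considerably richer.

```rust
use std::sync::mpsc;
use std::thread;

// Hypothetical request type: a prompt plus a channel to send the result on.
struct Request {
    prompt: String,
    respond_to: mpsc::Sender<String>,
}

fn main() {
    let (tx, rx) = mpsc::channel::<Request>();

    // Background "engine" thread: drain requests, run (fake) inference,
    // and reply on each request's own channel.
    let engine = thread::spawn(move || {
        for req in rx {
            let output = format!("completion for: {}", req.prompt);
            let _ = req.respond_to.send(output);
        }
    });

    // Caller side: send a request and block on the reply.
    let (resp_tx, resp_rx) = mpsc::channel();
    tx.send(Request {
        prompt: "hello".into(),
        respond_to: resp_tx,
    })
    .unwrap();
    println!("{}", resp_rx.recv().unwrap());

    drop(tx); // closing the last sender ends the engine's receive loop
    engine.join().unwrap();
}
```

The per-request reply channel is what lets many callers share one engine without the engine knowing who they are; dropping the request sender is the natural shutdown signal.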

### Adding New Features

When adding new model architectures:
1. Implement the model in `mistralrs-core/src/models/`
2. Add pipeline support in `mistralrs-core/src/pipeline/`
3. Update model detection in `mistralrs-core/src/pipeline/normal.rs`
4. Add architecture enum variant in `mistralrs-core/src/lib.rs`
5. Update CLI args in `mistralrs-server/src/main.rs`

When adding new quantization methods:
1. Implement in `mistralrs-quant/src/`
2. Add to quantization loading logic in pipelines
3. Update documentation in `docs/QUANTIZATION.md`

### Important Files to Know

- `mistralrs-core/src/engine/mod.rs` - Main engine orchestration
- `mistralrs-core/src/pipeline/mod.rs` - Pipeline trait and common logic
- `mistralrs-server-core/src/routes.rs` - HTTP API endpoints
- `mistralrs-pyo3/src/lib.rs` - Python API entry point
- `mistralrs/examples/` - Usage examples for Rust API

### Testing Approach

You should *always* run `cargo check` (alias `cargo c`) before returning, to make sure the code compiles. If it does not compile, keep making edits until it does.

Avoid returning TODOs.

- Unit tests are colocated with source files
- Integration tests in `tests/` directories
- Use `cargo test -p <crate>` to test specific components
- Python tests require building and installing the package first

### Common Pitfalls

1. **Feature Flags**: Many features are gated behind Cargo features. Always check what features are needed for your use case.
2. **Device Indices**: CUDA device selection uses 0-based indexing
3. **Chat Templates**: Models may need specific chat templates - check `chat_templates/` directory
4. **Quantization**: Different quantization methods have different hardware requirements