chore: clean residual Ollama references (#527)

JacobPEvans-personal · web-flow · commit d05f2c9773b1 · 2026-03-26T10:36:39.000-04:00
Replace Ollama references with MLX (vllm-mlx) across agent routing tables, permissions, scripts, and docs. Add deprecation notice to historical multi-model orchestration plan. Closes #526 (claude)
diff --git a/agentsmd/agents/planner.md b/agentsmd/agents/planner.md
@@ -15,8 +15,7 @@ Architecture and design specialist for system planning and task breakdown.
 | Mode | Model | Reasoning |
 | ---- | ----- | --------- |
 | Cloud | Claude Opus 4.6 | Extended thinking for complex architecture |
-| Local (MLX) | mlx-community/Qwen3-235B-A22B-4bit | Strong reasoning for offline planning (port 11436) |
-| Local (Ollama) | qwen3-next | Fallback when MLX unavailable (port 11434) |
+| Local (MLX) | mlx-community/Qwen3-235B-A22B-4bit | Strong reasoning for offline planning (port 11434) |
 
 ## Capabilities
 
@@ -101,7 +100,6 @@ The planner agent often works with:
 
 When `AI_ORCHESTRATION_LOCAL_ONLY=true`:
 
-- Try MLX first: mlx-community/Qwen3-235B-A22B-4bit (port 11436)
-- Fall back to Ollama: qwen3-next (port 11434)
+- Use MLX: mlx-community/Qwen3-235B-A22B-4bit (port 11434)
 - All planning done locally
 - No cloud API calls
diff --git a/agentsmd/agents/researcher.md b/agentsmd/agents/researcher.md
@@ -19,8 +19,7 @@ actual research work to specialized models (Gemini 3 Pro or local models) via PA
 | Mode | Model | Use Case |
 | ---- | ----- | -------- |
 | Cloud | Gemini 3 Pro | Large context analysis, web research |
-| Local (MLX) | mlx-community/Qwen3-235B-A22B-4bit | Offline research, private data (port 11436) |
-| Local (Ollama) | qwen3-next | Fallback when MLX unavailable (port 11434) |
+| Local (MLX) | mlx-community/Qwen3-235B-A22B-4bit | Offline research, private data (port 11434) |
 
 ## Capabilities
 
@@ -53,10 +52,8 @@ pal clink "Research question here"
 
 When `AI_ORCHESTRATION_LOCAL_ONLY=true` or `--local` flag is passed:
 
-- Try MLX first: mlx-community/Qwen3-235B-A22B-4bit (port 11436)
-- Fall back to Ollama: qwen3-next (port 11434)
+- Use MLX: mlx-community/Qwen3-235B-A22B-4bit (port 11434)
 - No cloud API calls
-- OLLAMA_HOST environment variable is respected
 
 ## Output Format
 
diff --git a/agentsmd/agents/reviewer.md b/agentsmd/agents/reviewer.md
@@ -17,11 +17,11 @@ then synthesizes findings into a unified review.
 
 ## Models Used
 
-| Role | Cloud Model | Local (MLX preferred) | Local (Ollama fallback) |
-| ---- | ----------- | --------------------- | ----------------------- |
-| Primary | Gemini 3 Pro | mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit | deepseek-r1 |
-| Secondary | Claude Opus 4.6 | mlx-community/Qwen3-235B-A22B-4bit | qwen3-next |
-| Synthesis | Claude Sonnet 4.6 | mlx-community/Qwen3.5-27B-4bit | qwen3-next |
+| Role | Cloud Model | Local (MLX) |
+| ---- | ----------- | ----------- |
+| Primary | Gemini 3 Pro | mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit |
+| Secondary | Claude Opus 4.6 | mlx-community/Qwen3-235B-A22B-4bit |
+| Synthesis | Claude Sonnet 4.6 | mlx-community/Qwen3.5-27B-4bit |
 
 ## Review Process
 
@@ -71,9 +71,8 @@ Good patterns, well-written code, improvements over previous state.
 
 When `AI_ORCHESTRATION_LOCAL_ONLY=true`:
 
-- Try MLX first: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit (port 11436)
-- Fall back to Ollama: deepseek-r1 for primary analysis (port 11434)
-- Cross-validation: MLX Qwen3-235B or Ollama qwen3-next
+- Use MLX: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit for primary analysis (port 11434)
+- Cross-validation: mlx-community/Qwen3-235B-A22B-4bit
 - No cloud API calls
 
 ## Severity Guidelines
diff --git a/agentsmd/permissions/STRATEGY.md b/agentsmd/permissions/STRATEGY.md
@@ -116,7 +116,7 @@ agentsmd/permissions/
 │   ├── rust.json              # Rust toolchain: cargo (rustc/rustup covered in core.json)
 │   ├── network.json           # Network utilities: ping, dig, host, netstat, lsof, pgrep
 │   ├── system.json            # System utilities: ln, readlink, htop, launchctl, plutil, etc.
-│   └── tools.json             # Dev tools: rbenv, goenv, redis-cli, ollama, shellcheck, etc.
+│   └── tools.json             # Dev tools: rbenv, goenv, redis-cli, shellcheck, etc.
 │
 ├── ask/
 │   ├── git.json               # Git: merge, reset, rebase, cherry-pick, restore, rm, gc/prune, commit --amend, push --force, clean
diff --git a/agentsmd/permissions/allow/tools.json b/agentsmd/permissions/allow/tools.json
@@ -43,7 +43,6 @@
     "orbctl info",
     "orbctl config get",
     "orbctl version",
-    "ollama list",
     "shellcheck",
     "check-jsonschema",
     "claude doctor",
diff --git a/agentsmd/rules/infra/pre-integration-checklist.md b/agentsmd/rules/infra/pre-integration-checklist.md
@@ -1,6 +1,6 @@
 # Pre-Integration Checklist for New Inference Backends
 
-Complete every item before merging a new inference backend (MLX, Ollama, vLLM, etc.).
+Complete every item before merging a new inference backend (MLX via vllm-mlx, vLLM, etc.).
 Based on the MLX arc retrospective where 5 of 14 PRs were reactive fixes that this checklist
 would have prevented.
 
@@ -9,7 +9,7 @@ would have prevented.
 - [ ] Document peak RAM usage for the largest model you plan to serve
 - [ ] Document sustained (idle-loaded) RAM usage with a model resident
 - [ ] Verify total system RAM can handle the backend plus normal workload (browser, IDE, Claude Code)
-- [ ] Set an explicit memory ceiling in the LaunchAgent or service config (e.g., `mlx_max_memory`, `OLLAMA_MAX_VRAM`)
+- [ ] Set an explicit memory ceiling in the LaunchAgent or service config (e.g., `mlx_max_memory`, `VLLM_MAX_MEMORY`)
 - [ ] Confirm OOM behavior: does the process get killed, crash gracefully, or hang?
 - [ ] Test with the largest model on the lowest-spec target machine
 
@@ -49,13 +49,13 @@ would have prevented.
 
 - [ ] List every new environment variable the backend introduces
 - [ ] Check for naming conflicts with existing variables (`env | grep -i <prefix>`)
-- [ ] Follow the existing naming convention (e.g., `OLLAMA_HOST`, `MLX_*`)
+- [ ] Follow the existing naming convention (e.g., `MLX_*`, `VLLM_*`)
 - [ ] Document which variables are required vs. optional, with defaults
 - [ ] Verify variables are set in the correct scope (LaunchAgent plist, shell profile, or Nix config)
 
 ## LaunchAgent / Service Management
 
-- [ ] Define startup order: does this service depend on another (e.g., network, Ollama)?
+- [ ] Define startup order: does this service depend on another (e.g., network, vllm-mlx)?
 - [ ] Add a health check endpoint or command (e.g., `curl http://localhost:<port>/v1/models`)
 - [ ] Set `KeepAlive` or restart policy so the service recovers from crashes
 - [ ] Set `ThrottleInterval` to prevent restart loops from consuming resources
diff --git a/docs/projects/multi-model-orchestration/PLAN.md b/docs/projects/multi-model-orchestration/PLAN.md
@@ -1,5 +1,9 @@
 # Multi-Model AI Orchestration System
 
+> **Note (2026-03-25)**: This plan was written when Ollama was part of the stack. Ollama has been fully removed
+> and replaced by MLX (vllm-mlx) on port 11434. Model references using Ollama-style tags (e.g., `qwen3-coder:30b`)
+> should be read as their Hugging Face equivalents (e.g., `mlx-community/Qwen3-Coder-30B-A3B-Instruct`).
+>
 > **ARCHIVED PLAN**: This document contains historical planning notes.
 > The Python scripts shown are **deprecated design artifacts** — do NOT use as templates.
 > All orchestration is handled via PAL MCP tools and direct CLI invocations.
@@ -792,4 +796,4 @@ ln -sf ~/git/ai-assistant-instructions/feature/multi-model-orchestration/agentsm
 - [PAL MCP Server](https://github.com/BeehiveInnovations/pal-mcp-server)
 - [Anthropic Skills](https://github.com/anthropics/skills)
 - [Claude Code Plugins](https://www.anthropic.com/news/claude-code-plugins)
-- [LLM Rankings Dec 2025](https://vertu.com/lifestyle/top-8-ai-models-ranked-gemini-3-chatgpt-5-1-grok-4-claude-4-5-more/)
+- LLM Rankings Dec 2025 (source link no longer available)
diff --git a/scripts/select-model.sh b/scripts/select-model.sh
@@ -7,7 +7,7 @@
 # Usage: select-model.sh [options]
 #   --task-type=<research|coding|review|decision|default>
 #   --cost-sensitive (flag - prefer free local models)
-#   --private (flag - sensitive data, must use local Ollama)
+#   --private (flag - sensitive data, must use local MLX)
 #   --large-context (flag - need 1M+ context window)
 #   --analyze-complexity=<prompt|filepath> (optional complexity analysis)
 #
@@ -132,9 +132,10 @@ select_model() {
 
   # Step 1: Is the data sensitive or confidential?
   if [[ "$private" == "true" ]]; then
-    echo "Model: ollama"
-    echo "Selected: deepseek-r1:70b (local reasoning) or qwen3-next:80b (local general)"
-    echo "Command: ollama run deepseek-r1:70b"
+    echo "Model: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit"
+    echo "Selected: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit (local reasoning) or mlx-community/Qwen3-235B-A22B-4bit (local general)"
+    # Use PAL MCP chat tool: pal chat --model mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit "<prompt>"
+    echo "Command: pal chat --model mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit"
     echo "Rationale: Private/sensitive data must stay local. Never use cloud APIs."
     return 0
   fi
@@ -143,32 +144,37 @@ select_model() {
   if [[ "$cost_sensitive" == "true" ]]; then
     case "$task_type" in
       coding)
-        echo "Model: qwen3-coder:30b"
-        echo "Command: ollama run qwen3-coder:30b"
+        echo "Model: mlx-community/Qwen3-Coder-30B-A3B-Instruct"
+        # Use PAL MCP chat tool: pal chat --model mlx-community/Qwen3-Coder-30B-A3B-Instruct "<prompt>"
+        echo "Command: pal chat --model mlx-community/Qwen3-Coder-30B-A3B-Instruct"
         echo "Rationale: Cost-sensitive coding task - using free local specialized model"
         return 0
         ;;
       review)
-        echo "Model: deepseek-r1:70b"
-        echo "Command: ollama run deepseek-r1:70b"
+        echo "Model: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit"
+        # Use PAL MCP chat tool: pal chat --model mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit "<prompt>"
+        echo "Command: pal chat --model mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit"
         echo "Rationale: Cost-sensitive code review - using free local reasoning model"
         return 0
         ;;
       research)
-        echo "Model: qwen3-next:80b"
-        echo "Command: ollama run qwen3-next:80b"
+        echo "Model: mlx-community/Qwen3-235B-A22B-4bit"
+        # Use PAL MCP chat tool: pal chat --model mlx-community/Qwen3-235B-A22B-4bit "<prompt>"
+        echo "Command: pal chat --model mlx-community/Qwen3-235B-A22B-4bit"
         echo "Rationale: Cost-sensitive research/analysis - using free local general model"
         return 0
         ;;
       decision)
-        echo "Model: deepseek-r1:70b + qwen3-next:80b"
-        echo "Command: bash -c 'echo \"Model 1 (DeepSeek R1):\" && ollama run deepseek-r1:70b && echo -e \"\\nModel 2 (Qwen):\" && ollama run qwen3-next:80b'"
+        echo "Model: mlx-community/DeepSeek-R1-Distill-Llama-70B-4bit + mlx-community/Qwen3-235B-A22B-4bit"
+        # Use PAL MCP clink tool for parallel multi-model: pal clink "<prompt>"
+        echo "Command: pal clink"
         echo "Rationale: Cost-sensitive critical decision - using best-reasoning + general local models"
         return 0
         ;;
       default)
-        echo "Model: qwen3-next:80b"
-        echo "Command: ollama run qwen3-next:80b"
+        echo "Model: mlx-community/Qwen3-235B-A22B-4bit"
+        # Use PAL MCP chat tool: pal chat --model mlx-community/Qwen3-235B-A22B-4bit "<prompt>"
+        echo "Command: pal chat --model mlx-community/Qwen3-235B-A22B-4bit"
         echo "Rationale: Cost-sensitive generic task - using free local model"
         return 0
         ;;
@@ -231,9 +237,11 @@ select_model() {
   fi
 
   # Default: Start local, fall back to cloud
-  echo "Model: ollama-with-fallback"
-  echo "Selected: qwen3-next:80b (local) → gemini-3-pro (cloud fallback)"
-  echo "Command: ollama run qwen3-next:80b || gemini chat --model gemini-3-pro"
+  echo "Model: mlx-with-fallback"
+  echo "Selected: mlx-community/Qwen3-235B-A22B-4bit (local) → gemini-3-pro (cloud fallback)"
+  # Use PAL MCP chat tool with local model first: pal chat --model mlx-community/Qwen3-235B-A22B-4bit "<prompt>"
+  # Fall back to cloud: pal chat --model gemini-3-pro "<prompt>"
+  echo "Command: pal chat --model mlx-community/Qwen3-235B-A22B-4bit"
   echo "Rationale: Default/general task - try local first for cost/privacy, fall back to cloud if needed"
   return 0
 }