Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions lib/sanbase/application/mcp.ex
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
defmodule Sanbase.Application.Mcp do
import Sanbase.ApplicationUtils

def init() do
:ok
end

def children() do
registry = {Sanbase.MCP.Registry.Pg, []}

children = [
# Join the libcluster Postgres topology so MCP pods form a BEAM cluster.
# This enables distributed :pg session lookup across pods.
start_in(
{Cluster.Supervisor,
[
Application.get_env(:libcluster, :topologies),
[name: Sanbase.ClusterSupervisor]
]},
[:dev, :prod]
),
Comment on lines 11 to +21
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check how start_in/2 is implemented and how children are processed
ast-grep --pattern 'def start_in($_, $_) do
  $$$
end'

# Also check how children list is consumed - is there filtering?
rg -n "children" lib/sanbase/application/mcp.ex -A 10

Repository: santiment/sanbase2

Length of output: 1005


🏁 Script executed:

# Check if there's any filtering logic in the children list construction
cat -n lib/sanbase/application/mcp.ex | head -50

# Check if the children list is modified before being returned
rg -n "Supervisor\|nil" lib/sanbase/application/mcp.ex

# Look for any List.flatten or similar filtering operations
rg -n "flatten\|compact\|filter" lib/sanbase/application/mcp.ex

Repository: santiment/sanbase2

Length of output: 1587


🏁 Script executed:

# Check how this MCP module is used in the main Application
rg -n "Sanbase.Application.Mcp" lib/ --type elixir -B 2 -A 2

# Check if there's a supervision tree setup or filtering
rg -n "children.*Mcp\|Mcp.*children" lib/ --type elixir -B 2 -A 2

# Check Elixir version requirement
grep -i "elixir" mix.exs | head -5

Repository: santiment/sanbase2

Length of output: 2561


🏁 Script executed:

# Check the context where mcp_children is used
sed -n '130,150p' lib/sanbase/application/application.ex

Repository: santiment/sanbase2

Length of output: 835


Remove nil values from the children list before passing to the supervisor.

The start_in/2 function returns nil when the environment doesn't match (e.g., in :test). The mcp_children list will then contain nil as its first element. Although Enum.uniq() is called on line 142 of lib/sanbase/application/application.ex, it only removes duplicate entries—not nil values. This will cause the supervisor to fail with an invalid child spec error in test environments.

Filter out nil entries from mcp_children in lib/sanbase/application/mcp.ex before returning, or filter the concatenated children list in lib/sanbase/application/application.ex using Enum.reject(&is_nil/1) before passing to the supervisor.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@lib/sanbase/application/mcp.ex` around lines 11 - 21, The children list built
in mcp_children includes nil entries because start_in/2 returns nil in
non-matching envs; update the mcp_children construction in
lib/sanbase/application/mcp.ex to filter out nils (e.g., pipe the list through
Enum.reject(&is_nil/1) or Enum.filter(& &1)) before returning it so the
supervisor never receives nil child specs (alternatively, add
Enum.reject(&is_nil/1) to the concatenated children list in
lib/sanbase/application/application.ex before the Enum.uniq() call on line
~142).


# Start :pg scope for distributed session registry.
# Must be started before the MCP servers that use it.
%{
id: Sanbase.MCP.Registry.Pg,
start: {:pg, :start_link, [Sanbase.MCP.Registry.Pg.scope()]}
},

# MCP server for metrics access
{Sanbase.MCP.Server, [transport: {:streamable_http, start: true}]},
{Sanbase.MCP.Server, [transport: {:streamable_http, start: true}, registry: registry]},
# Dev MCP server for search docs
{Sanbase.MCP.DevServer, [transport: {:streamable_http, start: true}]}
{Sanbase.MCP.DevServer, [transport: {:streamable_http, start: true}, registry: registry]}
]

opts = [
Expand Down
64 changes: 60 additions & 4 deletions lib/sanbase/mcp/http_plugs.ex
Original file line number Diff line number Diff line change
Expand Up @@ -91,24 +91,54 @@ defmodule Sanbase.MCP.StreamableHTTPPlug do
@moduledoc "Wrapper plug to expose Sanbase.MCP.Server via forward"
@behaviour Plug

import Plug.Conn, only: [get_req_header: 2, put_req_header: 3]
import Plug.Conn,
only: [get_req_header: 2, put_req_header: 3, put_resp_content_type: 2, send_resp: 3]

@server Sanbase.MCP.Server

@impl Plug
def init(opts), do: opts

@impl Plug
def call(%Plug.Conn{method: "DELETE"} = conn, _opts) do
# Handle DELETE ourselves to support cross-pod session termination.
# Anubis uses DynamicSupervisor.terminate_child which only works on the
# local node. We stop the session process directly, which works across
# distributed Erlang nodes.
handle_delete(conn)
end

def call(conn, _opts) do
conn = normalize_post_accept_header(conn)

Anubis.Server.Transport.StreamableHTTP.Plug.call(
conn,
Anubis.Server.Transport.StreamableHTTP.Plug.init(
server: Sanbase.MCP.Server,
server: @server,
request_timeout: 150_000
)
)
end

defp handle_delete(conn) do
registry_name = Anubis.Server.Registry.registry_name(@server)

case get_req_header(conn, "mcp-session-id") do
[session_id] when session_id != "" ->
case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
{:ok, pid} -> GenServer.stop(pid, :normal)
_ -> :ok
end
Comment on lines +128 to +131
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

GenServer.stop/2 on a remote PID may raise or timeout.

When the session PID is on another node, GenServer.stop(pid, :normal) performs a synchronous call with default 5-second timeout. If the remote node is unreachable or the process is already dead, this could raise an exception or hang. Consider wrapping in a try/catch or using GenServer.stop/3 with an explicit timeout.

🛡️ Proposed defensive handling
       [session_id] when session_id != "" ->
         case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
-          {:ok, pid} -> GenServer.stop(pid, :normal)
+          {:ok, pid} ->
+            try do
+              GenServer.stop(pid, :normal, 5_000)
+            catch
+              :exit, _ -> :ok
+            end
           _ -> :ok
         end
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
{:ok, pid} -> GenServer.stop(pid, :normal)
_ -> :ok
end
case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
{:ok, pid} ->
try do
GenServer.stop(pid, :normal, 5_000)
catch
:exit, _ -> :ok
end
_ -> :ok
end
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@lib/sanbase/mcp/http_plugs.ex` around lines 128 - 131, The GenServer.stop
call in the Sanbase.MCP.Registry.Pg.lookup_session branch can block or raise for
remote PIDs; change the call to a defensive version by using GenServer.stop(pid,
:normal, <small_timeout_ms>) and wrap it in a try/rescue (or try/catch) to catch
exits/timeouts so it won’t crash the caller; update the code around
Sanbase.MCP.Registry.Pg.lookup_session and GenServer.stop to log or ignore
errors on stop and return :ok on failure.


conn |> put_resp_content_type("application/json") |> send_resp(200, "{}")

_ ->
conn
|> put_resp_content_type("application/json")
|> send_resp(400, ~s({"error":"Session ID required"}))
end
end
Comment on lines +123 to +140
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing halt() after sending response.

After calling send_resp/3, the connection should be halted to prevent further plug processing. This is standard Plug practice and prevents potential issues with downstream plugs.

🐛 Proposed fix
+  import Plug.Conn,
+    only: [get_req_header: 2, put_req_header: 3, put_resp_content_type: 2, send_resp: 3, halt: 1]

   defp handle_delete(conn) do
     registry_name = Anubis.Server.Registry.registry_name(`@server`)

     case get_req_header(conn, "mcp-session-id") do
       [session_id] when session_id != "" ->
         case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
           {:ok, pid} -> GenServer.stop(pid, :normal)
           _ -> :ok
         end

-        conn |> put_resp_content_type("application/json") |> send_resp(200, "{}")
+        conn |> put_resp_content_type("application/json") |> send_resp(200, "{}") |> halt()

       _ ->
         conn
         |> put_resp_content_type("application/json")
         |> send_resp(400, ~s({"error":"Session ID required"}))
+        |> halt()
     end
   end
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@lib/sanbase/mcp/http_plugs.ex` around lines 123 - 140, The handle_delete/1
function sends responses but doesn't halt the Plug connection; update both
branches that call send_resp in handle_delete to immediately halt the conn
afterwards (e.g. thread the conn through send_resp then |> halt()), ensuring the
returned value is the halted conn so downstream plugs won't run; modify the code
paths around Sanbase.MCP.Registry.Pg.lookup_session/2 and the error branch to
pipe the conn into send_resp and then |> halt().


# When both JSON and SSE are advertised on POST, Anubis can choose SSE
# response mode and keep the response stream open, causing client timeouts.
# Force JSON responses for POST requests; SSE remains available via GET.
Expand All @@ -134,24 +164,50 @@ defmodule Sanbase.MCP.StreamableHTTPDevPlug do
@moduledoc "Wrapper plug to expose Sanbase.MCP.DevServer via forward"
@behaviour Plug

import Plug.Conn, only: [get_req_header: 2, put_req_header: 3]
import Plug.Conn,
only: [get_req_header: 2, put_req_header: 3, put_resp_content_type: 2, send_resp: 3]

@server Sanbase.MCP.DevServer

@impl Plug
def init(opts), do: opts

@impl Plug
def call(%Plug.Conn{method: "DELETE"} = conn, _opts) do
handle_delete(conn)
end

def call(conn, _opts) do
conn = normalize_post_accept_header(conn)

Anubis.Server.Transport.StreamableHTTP.Plug.call(
conn,
Anubis.Server.Transport.StreamableHTTP.Plug.init(
server: Sanbase.MCP.DevServer,
server: @server,
request_timeout: 150_000
)
)
end

defp handle_delete(conn) do
registry_name = Anubis.Server.Registry.registry_name(@server)

case get_req_header(conn, "mcp-session-id") do
[session_id] when session_id != "" ->
case Sanbase.MCP.Registry.Pg.lookup_session(registry_name, session_id) do
{:ok, pid} -> GenServer.stop(pid, :normal)
_ -> :ok
end

conn |> put_resp_content_type("application/json") |> send_resp(200, "{}")

_ ->
conn
|> put_resp_content_type("application/json")
|> send_resp(400, ~s({"error":"Session ID required"}))
end
end
Comment on lines +192 to +209
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Same issues apply: missing halt() and unprotected GenServer.stop/2.

The StreamableHTTPDevPlug.handle_delete/1 has the same issues as the main plug - missing halt() after send_resp/3 and potential exception from GenServer.stop/2 on remote nodes. Apply the same fixes here.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@lib/sanbase/mcp/http_plugs.ex` around lines 192 - 209, In handle_delete,
protect the GenServer.stop call and ensure the Plug pipeline is halted: when
lookup_session returns {:ok, pid} wrap GenServer.stop(pid, :normal) in a safe
rescue/catch (e.g., try do GenServer.stop(pid, :normal) rescue _ -> :ok end) to
avoid exceptions from remote nodes, and after each send_resp(200, "{}") and
send_resp(400, ...) call invoke halt() on the conn (e.g., conn |> send_resp(...)
|> halt()) so the request processing stops; reference the function handle_delete
and the GenServer.stop(pid, :normal) call to locate the changes.


defp normalize_post_accept_header(%Plug.Conn{method: "POST"} = conn) do
case get_req_header(conn, "accept") do
[accept | _] ->
Expand Down
49 changes: 49 additions & 0 deletions lib/sanbase/mcp/registry/pg.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
defmodule Sanbase.MCP.Registry.Pg do
@moduledoc """
Distributed MCP session registry backed by Erlang's `:pg` process groups.

Uses `:pg` to register session PIDs in named groups that are automatically
synchronized across all connected BEAM nodes (via libcluster). Any pod can
look up a session PID regardless of which pod owns it, and the caller can
then make a remote `GenServer.call` to that PID through distributed Erlang.

The `:pg` scope (`:sanbase_mcp_sessions`) must be started in the application
supervision tree before any MCP servers — see `Sanbase.Application.Mcp`.

Automatic cleanup: `:pg` removes a PID from all its groups when the process
exits, so no explicit cleanup is needed on session timeout or pod shutdown.
"""

@behaviour Anubis.Server.Registry

@scope :sanbase_mcp_sessions

@doc "Returns the `:pg` scope atom used by this registry."
def scope, do: @scope

# The :pg scope is started in Sanbase.Application.Mcp, not here.
@impl Anubis.Server.Registry
def child_spec(_opts), do: :ignore

@impl Anubis.Server.Registry
def register_session(name, session_id, pid) do
:pg.join(@scope, {name, session_id}, pid)
end

@impl Anubis.Server.Registry
def lookup_session(name, session_id) do
case :pg.get_members(@scope, {name, session_id}) do
[pid | _] -> {:ok, pid}
[] -> {:error, :not_found}
end
end

@impl Anubis.Server.Registry
def unregister_session(name, session_id) do
for pid <- :pg.get_members(@scope, {name, session_id}) do
:pg.leave(@scope, {name, session_id}, pid)
end

:ok
end
end
4 changes: 2 additions & 2 deletions priv/repo/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
-- PostgreSQL database dump
--

\restrict WLWFJIMyr6OGcjQSPVRM7eSTiskufSffxDak0MXfVniqkIalIzbaf4NlIF1MNCt
\restrict fVWjxPqOQhhTh64NMTXrcqQCI7U6qv4xbceeiEaEJo0Ayskt4vhOLvbamkpugla

-- Dumped from database version 15.15 (Homebrew)
-- Dumped by pg_dump version 15.15 (Homebrew)
Expand Down Expand Up @@ -11333,7 +11333,7 @@ ALTER TABLE ONLY public.webinar_registrations
-- PostgreSQL database dump complete
--

\unrestrict WLWFJIMyr6OGcjQSPVRM7eSTiskufSffxDak0MXfVniqkIalIzbaf4NlIF1MNCt
\unrestrict fVWjxPqOQhhTh64NMTXrcqQCI7U6qv4xbceeiEaEJo0Ayskt4vhOLvbamkpugla

INSERT INTO public."schema_migrations" (version) VALUES (20171008200815);
INSERT INTO public."schema_migrations" (version) VALUES (20171008203355);
Expand Down
4 changes: 2 additions & 2 deletions test/sanbase/mcp/mcp_auth_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ defmodule SanbaseWeb.Graphql.MCPAuthTest do

assert Sanbase.MCP.Client |> Process.whereis() |> Process.alive?() == true

registry_name = Anubis.Server.Registry.registry_name(Sanbase.MCP.Server)
assert registry_name |> Process.whereis() |> Process.alive?() == true
# Verify the :pg scope for the distributed session registry is running
assert :pg.which_groups(Sanbase.MCP.Registry.Pg.scope()) |> is_list()

assert {:ok,
%Anubis.MCP.Response{
Expand Down
78 changes: 78 additions & 0 deletions test/sanbase/mcp/registry_pg_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
defmodule Sanbase.MCP.Registry.PgTest do
use ExUnit.Case, async: true

alias Sanbase.MCP.Registry.Pg

@registry_name :"test.registry.#{System.unique_integer([:positive])}"

setup do
# Start a dedicated :pg scope for test isolation
scope = :"test_pg_#{System.unique_integer([:positive])}"

start_supervised!(%{
id: {:pg, scope},
start: {:pg, :start_link, [scope]}
})

# Temporarily override the scope for testing by using the real scope
# Since the module uses a compile-time constant, we test via the real scope
# that is started in the application supervision tree.
# For unit tests, we test the behaviour contract directly.
{:ok, scope: scope}
end
Comment on lines +6 to +22
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Test setup creates isolated scope but Pg module uses hardcoded scope.

The setup creates a unique :pg scope (line 10), but Sanbase.MCP.Registry.Pg uses a compile-time constant @scope :sanbase_mcp_sessions. The tests will actually use the production scope, not the test-isolated one. This means:

  1. Tests require the production :pg scope to be started (likely via application supervision tree)
  2. The scope variable returned from setup is never used
  3. Tests with async: true may interfere with each other or other tests using the same scope

Consider either:

  • Making the scope configurable in Pg module for test isolation
  • Removing the unused test setup scope and documenting that tests require the app's :pg scope
  • Using async: false if test isolation cannot be achieved
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@test/sanbase/mcp/registry_pg_test.exs` around lines 6 - 22, The test starts a
unique :pg scope (setup returns scope) but Sanbase.MCP.Registry.Pg uses a
compile-time `@scope` :sanbase_mcp_sessions so tests still hit the production
scope; fix by making the Pg module's scope configurable and using the test
scope: change the module to read the scope from a runtime source (e.g.
Application.get_env(:sanbase, :mcp_pg_scope, :sanbase_mcp_sessions)) or add an
API to pass a scope into the public functions (e.g. Registry.Pg.lookup/2,
Registry.Pg.register/2) and update tests to call those functions with the
setup-provided scope variable; alternatively, if you prefer not to change Pg,
remove the unused setup that starts a per-test scope and mark the test module
async: false and document the dependency on the app supervision tree.


describe "child_spec/1" do
test "returns :ignore since :pg scope is started externally" do
assert :ignore = Pg.child_spec([])
end
end

describe "register_session/3 and lookup_session/2" do
test "registers and looks up a session" do
assert {:error, :not_found} = Pg.lookup_session(@registry_name, "sess-1")

assert :ok = Pg.register_session(@registry_name, "sess-1", self())
assert {:ok, pid} = Pg.lookup_session(@registry_name, "sess-1")
assert pid == self()
end

test "returns :not_found for unknown session" do
assert {:error, :not_found} = Pg.lookup_session(@registry_name, "nonexistent")
end
end

describe "unregister_session/2" do
test "removes a session" do
:ok = Pg.register_session(@registry_name, "sess-2", self())
assert {:ok, _} = Pg.lookup_session(@registry_name, "sess-2")

:ok = Pg.unregister_session(@registry_name, "sess-2")
assert {:error, :not_found} = Pg.lookup_session(@registry_name, "sess-2")
end

test "is a no-op for unknown session" do
assert :ok = Pg.unregister_session(@registry_name, "nonexistent")
end
end

describe "automatic cleanup on process exit" do
test "session is removed when the owning process exits" do
{pid, ref} =
spawn_monitor(fn ->
receive do
:stop -> :ok
end
end)

:ok = Pg.register_session(@registry_name, "sess-3", pid)
assert {:ok, ^pid} = Pg.lookup_session(@registry_name, "sess-3")

send(pid, :stop)
assert_receive {:DOWN, ^ref, :process, ^pid, :normal}

# :pg cleanup is async — give it a moment
Process.sleep(50)
assert {:error, :not_found} = Pg.lookup_session(@registry_name, "sess-3")
end
end
end