Skip to content

Commit 2387be5

Browse files
hansentclaude and co-author authored
feat(sam3): enable SDK-based remote execution for SAM3 workflow blocks (#2042)
* feat(sam3): enable SDK-based remote execution for SAM3 workflow blocks SAM3 workflow blocks previously always used the API inference proxy for remote execution, regardless of configuration. This change decouples the workflow step execution mode from SAM3_EXEC_MODE, enabling SAM3 blocks to use the standard SDK-based remote execution pattern used by all other model blocks when WORKFLOWS_STEP_EXECUTION_MODE=remote. Also adds SAM3 SDK client methods (concept_segment, visual_segment, embed_image) and a new SAM3_FINE_TUNED_MODELS_ENABLED env var to control fine-tuned model access independently of execution mode. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * make style --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9105c2c commit 2387be5

File tree

6 files changed

+500
-47
lines changed

6 files changed

+500
-47
lines changed

inference/core/env.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,13 @@
173173
raise ValueError(
174174
f"Invalid SAM3 execution mode in ENVIRONMENT var SAM3_EXEC_MODE (local or remote): {SAM3_EXEC_MODE}"
175175
)
176+
# Whether fine-tuned SAM3 models (non-sam3/ prefix) are allowed.
177+
# Defaults to False when SAM3_EXEC_MODE=remote (backward compat with existing proxy deployments),
178+
# True otherwise (self-hosted users can use fine-tuned models).
179+
_sam3_fine_tuned_default = "False" if SAM3_EXEC_MODE == "remote" else "True"
180+
SAM3_FINE_TUNED_MODELS_ENABLED = str2bool(
181+
os.getenv("SAM3_FINE_TUNED_MODELS_ENABLED", _sam3_fine_tuned_default)
182+
)
176183

177184
# Flag to enable GAZE core model, default is True
178185
CORE_MODEL_GAZE_ENABLED = str2bool(os.getenv("CORE_MODEL_GAZE_ENABLED", True))

inference/core/interfaces/http/http_api.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@
175175
ROBOFLOW_INTERNAL_SERVICE_SECRET,
176176
ROBOFLOW_SERVICE_SECRET,
177177
SAM3_EXEC_MODE,
178+
SAM3_FINE_TUNED_MODELS_ENABLED,
178179
USE_INFERENCE_MODELS,
179180
WEBRTC_WORKER_ENABLED,
180181
WORKFLOWS_MAX_CONCURRENT_STEPS,
@@ -2586,12 +2587,14 @@ def sam3_segment_image(
25862587
countinference: Optional[bool] = None,
25872588
service_secret: Optional[str] = None,
25882589
):
2589-
if SAM3_EXEC_MODE == "remote":
2590+
if not SAM3_FINE_TUNED_MODELS_ENABLED:
25902591
if not inference_request.model_id.startswith("sam3/"):
25912592
raise HTTPException(
25922593
status_code=501,
2593-
detail="Fine-tuned SAM3 models are not supported in remote execution mode yet. Please use a workflow or self-host the server.",
2594+
detail="Fine-tuned SAM3 models are not supported on this deployment. Please use a workflow or self-host the server.",
25942595
)
2596+
2597+
if SAM3_EXEC_MODE == "remote":
25952598
endpoint = f"{API_BASE_URL}/inferenceproxy/seg-preview"
25962599

25972600
# Construct payload for remote API

inference/core/workflows/core_steps/models/foundation/segment_anything3/v1.py

Lines changed: 91 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@
1717
from inference.core.entities.responses.sam3 import Sam3SegmentationPrediction
1818
from inference.core.env import (
1919
API_BASE_URL,
20+
HOSTED_CORE_MODEL_URL,
21+
LOCAL_INFERENCE_API_URL,
2022
ROBOFLOW_INTERNAL_SERVICE_NAME,
2123
ROBOFLOW_INTERNAL_SERVICE_SECRET,
2224
SAM3_EXEC_MODE,
25+
WORKFLOWS_REMOTE_API_TARGET,
2326
)
2427
from inference.core.managers.base import ModelManager
2528
from inference.core.roboflow_api import build_roboflow_api_headers
@@ -54,6 +57,7 @@
5457
WorkflowBlock,
5558
WorkflowBlockManifest,
5659
)
60+
from inference_sdk import InferenceHTTPClient
5761

5862
DETECTIONS_CLASS_NAME_FIELD = "class_name"
5963
DETECTION_ID_FIELD = "detection_id"
@@ -169,30 +173,26 @@ def run(
169173
else:
170174
raise ValueError(f"Invalid class names type: {type(class_names)}")
171175

172-
exec_mode = self._step_execution_mode
173-
if SAM3_EXEC_MODE == "local":
174-
exec_mode = self._step_execution_mode
175-
elif SAM3_EXEC_MODE == "remote":
176-
exec_mode = (
177-
StepExecutionMode.REMOTE
178-
) # if SAM3_EXEC_MODE == "remote" then force remote execution mode only
179-
else:
180-
raise ValueError(
181-
f"Invalid SAM3 execution mode in ENVIRONMENT var SAM3_EXEC_MODE (local or remote): {SAM3_EXEC_MODE}"
176+
if SAM3_EXEC_MODE == "remote":
177+
logger.debug("Running SAM3 v1 via inference proxy (SAM3_EXEC_MODE=remote)")
178+
return self.run_via_request(
179+
images=images,
180+
class_names=class_names,
181+
threshold=threshold,
182182
)
183-
184-
if exec_mode is StepExecutionMode.LOCAL:
185-
logger.debug(f"Running SAM3 locally")
183+
elif self._step_execution_mode is StepExecutionMode.LOCAL:
184+
logger.debug("Running SAM3 v1 locally")
186185
return self.run_locally(
187186
images=images,
188187
model_id=model_id,
189188
class_names=class_names,
190189
threshold=threshold,
191190
)
192-
elif exec_mode is StepExecutionMode.REMOTE:
193-
logger.debug(f"Running SAM3 remotely")
194-
return self.run_via_request(
191+
elif self._step_execution_mode is StepExecutionMode.REMOTE:
192+
logger.debug("Running SAM3 v1 remotely via SDK")
193+
return self.run_remotely(
195194
images=images,
195+
model_id=model_id,
196196
class_names=class_names,
197197
threshold=threshold,
198198
)
@@ -276,6 +276,81 @@ def run_locally(
276276
predictions=predictions,
277277
)
278278

279+
def run_remotely(
280+
self,
281+
images: Batch[WorkflowImageData],
282+
model_id: str,
283+
class_names: Optional[List[str]],
284+
threshold: float,
285+
) -> BlockResult:
286+
predictions = []
287+
if class_names is None:
288+
class_names = []
289+
if len(class_names) == 0:
290+
class_names.append(None)
291+
292+
api_url = (
293+
LOCAL_INFERENCE_API_URL
294+
if WORKFLOWS_REMOTE_API_TARGET != "hosted"
295+
else HOSTED_CORE_MODEL_URL
296+
)
297+
client = InferenceHTTPClient(
298+
api_url=api_url,
299+
api_key=self._api_key,
300+
)
301+
if WORKFLOWS_REMOTE_API_TARGET == "hosted":
302+
client.select_api_v0()
303+
304+
for single_image in images:
305+
prompt_class_ids: List[Optional[int]] = []
306+
prompt_class_names: List[Optional[str]] = []
307+
prompt_detection_ids: List[Optional[str]] = []
308+
309+
http_prompts: List[dict] = []
310+
for class_name in class_names:
311+
http_prompts.append({"type": "text", "text": class_name})
312+
313+
resp_json = client.sam3_concept_segment(
314+
inference_input=single_image.base64_image,
315+
prompts=http_prompts,
316+
model_id=model_id,
317+
output_prob_thresh=threshold,
318+
)
319+
320+
class_predictions: List[InstanceSegmentationPrediction] = []
321+
for prompt_result in resp_json.get("prompt_results", []):
322+
idx = prompt_result.get("prompt_index", 0)
323+
class_name = class_names[idx] if idx < len(class_names) else None
324+
raw_predictions = prompt_result.get("predictions", [])
325+
adapted_predictions = [SimpleNamespace(**p) for p in raw_predictions]
326+
class_pred = convert_sam3_segmentation_response_to_inference_instances_seg_response(
327+
sam3_segmentation_predictions=adapted_predictions, # type: ignore[arg-type]
328+
image=single_image,
329+
prompt_class_ids=prompt_class_ids,
330+
prompt_class_names=prompt_class_names,
331+
prompt_detection_ids=prompt_detection_ids,
332+
threshold=threshold,
333+
text_prompt=class_name,
334+
specific_class_id=idx,
335+
)
336+
class_predictions.extend(class_pred.predictions)
337+
338+
image_width = single_image.numpy_image.shape[1]
339+
image_height = single_image.numpy_image.shape[0]
340+
final_inference_prediction = InstanceSegmentationInferenceResponse(
341+
predictions=class_predictions,
342+
image=InferenceResponseImage(width=image_width, height=image_height),
343+
)
344+
predictions.append(final_inference_prediction)
345+
346+
predictions = [
347+
e.model_dump(by_alias=True, exclude_none=True) for e in predictions
348+
]
349+
return self._post_process_result(
350+
images=images,
351+
predictions=predictions,
352+
)
353+
279354
def run_via_request(
280355
self,
281356
images: Batch[WorkflowImageData],

inference/core/workflows/core_steps/models/foundation/segment_anything3/v2.py

Lines changed: 101 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@
1717
from inference.core.entities.responses.sam3 import Sam3SegmentationPrediction
1818
from inference.core.env import (
1919
API_BASE_URL,
20+
HOSTED_CORE_MODEL_URL,
21+
LOCAL_INFERENCE_API_URL,
2022
ROBOFLOW_INTERNAL_SERVICE_NAME,
2123
ROBOFLOW_INTERNAL_SERVICE_SECRET,
2224
SAM3_EXEC_MODE,
25+
WORKFLOWS_REMOTE_API_TARGET,
2326
)
2427
from inference.core.managers.base import ModelManager
2528
from inference.core.roboflow_api import build_roboflow_api_headers
@@ -54,6 +57,7 @@
5457
WorkflowBlock,
5558
WorkflowBlockManifest,
5659
)
60+
from inference_sdk import InferenceHTTPClient
5761

5862
DETECTIONS_CLASS_NAME_FIELD = "class_name"
5963
DETECTION_ID_FIELD = "detection_id"
@@ -223,20 +227,18 @@ def run(
223227
else:
224228
raise ValueError(f"Invalid class names type: {type(class_names)}")
225229

226-
exec_mode = self._step_execution_mode
227-
if SAM3_EXEC_MODE == "local":
228-
exec_mode = self._step_execution_mode
229-
elif SAM3_EXEC_MODE == "remote":
230-
exec_mode = (
231-
StepExecutionMode.REMOTE
232-
) # if SAM3_EXEC_MODE == "remote" then force remote execution mode only
233-
else:
234-
raise ValueError(
235-
f"Invalid SAM3 execution mode in ENVIRONMENT var SAM3_EXEC_MODE (local or remote): {SAM3_EXEC_MODE}"
230+
if SAM3_EXEC_MODE == "remote":
231+
logger.debug("Running SAM3 v2 via inference proxy (SAM3_EXEC_MODE=remote)")
232+
return self.run_via_request(
233+
images=images,
234+
class_names=class_names,
235+
confidence=confidence,
236+
per_class_confidence=per_class_confidence,
237+
apply_nms=apply_nms,
238+
nms_iou_threshold=nms_iou_threshold,
236239
)
237-
238-
if exec_mode is StepExecutionMode.LOCAL:
239-
logger.debug(f"Running SAM3 locally")
240+
elif self._step_execution_mode is StepExecutionMode.LOCAL:
241+
logger.debug("Running SAM3 v2 locally")
240242
return self.run_locally(
241243
images=images,
242244
model_id=model_id,
@@ -246,10 +248,11 @@ def run(
246248
apply_nms=apply_nms,
247249
nms_iou_threshold=nms_iou_threshold,
248250
)
249-
elif exec_mode is StepExecutionMode.REMOTE:
250-
logger.debug(f"Running SAM3 remotely")
251-
return self.run_via_request(
251+
elif self._step_execution_mode is StepExecutionMode.REMOTE:
252+
logger.debug("Running SAM3 v2 remotely via SDK")
253+
return self.run_remotely(
252254
images=images,
255+
model_id=model_id,
253256
class_names=class_names,
254257
confidence=confidence,
255258
per_class_confidence=per_class_confidence,
@@ -348,6 +351,88 @@ def run_locally(
348351
predictions=predictions,
349352
)
350353

354+
def run_remotely(
355+
self,
356+
images: Batch[WorkflowImageData],
357+
model_id: str,
358+
class_names: Optional[List[str]],
359+
confidence: float,
360+
per_class_confidence: Optional[List[float]] = None,
361+
apply_nms: bool = True,
362+
nms_iou_threshold: float = 0.9,
363+
) -> BlockResult:
364+
predictions = []
365+
if class_names is None:
366+
class_names = []
367+
if len(class_names) == 0:
368+
class_names.append(None)
369+
370+
api_url = (
371+
LOCAL_INFERENCE_API_URL
372+
if WORKFLOWS_REMOTE_API_TARGET != "hosted"
373+
else HOSTED_CORE_MODEL_URL
374+
)
375+
client = InferenceHTTPClient(
376+
api_url=api_url,
377+
api_key=self._api_key,
378+
)
379+
if WORKFLOWS_REMOTE_API_TARGET == "hosted":
380+
client.select_api_v0()
381+
382+
for single_image in images:
383+
prompt_class_ids: List[Optional[int]] = []
384+
prompt_class_names: List[Optional[str]] = []
385+
prompt_detection_ids: List[Optional[str]] = []
386+
387+
http_prompts: List[dict] = []
388+
for idx, class_name in enumerate(class_names):
389+
prompt_data = {"type": "text", "text": class_name}
390+
if per_class_confidence is not None:
391+
prompt_data["output_prob_thresh"] = per_class_confidence[idx]
392+
http_prompts.append(prompt_data)
393+
394+
resp_json = client.sam3_concept_segment(
395+
inference_input=single_image.base64_image,
396+
prompts=http_prompts,
397+
model_id=model_id,
398+
output_prob_thresh=confidence,
399+
nms_iou_threshold=nms_iou_threshold if apply_nms else None,
400+
)
401+
402+
class_predictions: List[InstanceSegmentationPrediction] = []
403+
for prompt_result in resp_json.get("prompt_results", []):
404+
idx = prompt_result.get("prompt_index", 0)
405+
class_name = class_names[idx] if idx < len(class_names) else None
406+
raw_predictions = prompt_result.get("predictions", [])
407+
adapted_predictions = [SimpleNamespace(**p) for p in raw_predictions]
408+
class_pred = convert_sam3_segmentation_response_to_inference_instances_seg_response(
409+
sam3_segmentation_predictions=adapted_predictions, # type: ignore[arg-type]
410+
image=single_image,
411+
prompt_class_ids=prompt_class_ids,
412+
prompt_class_names=prompt_class_names,
413+
prompt_detection_ids=prompt_detection_ids,
414+
confidence=confidence,
415+
text_prompt=class_name,
416+
specific_class_id=idx,
417+
)
418+
class_predictions.extend(class_pred.predictions)
419+
420+
image_width = single_image.numpy_image.shape[1]
421+
image_height = single_image.numpy_image.shape[0]
422+
final_inference_prediction = InstanceSegmentationInferenceResponse(
423+
predictions=class_predictions,
424+
image=InferenceResponseImage(width=image_width, height=image_height),
425+
)
426+
predictions.append(final_inference_prediction)
427+
428+
predictions = [
429+
e.model_dump(by_alias=True, exclude_none=True) for e in predictions
430+
]
431+
return self._post_process_result(
432+
images=images,
433+
predictions=predictions,
434+
)
435+
351436
def run_via_request(
352437
self,
353438
images: Batch[WorkflowImageData],

0 commit comments

Comments (0)