Add examples/sam-3d notebooks

leeclemnet · leeclemnet · commit 919b499a2b70 · 2026-02-26T19:30:02.000Z
diff --git a/.gitignore b/.gitignore
@@ -208,4 +208,7 @@ tests/inference_sdk/unit_tests/http/inference_profiling
 inference_models/tests/integration_tests/models/assets/
 inference_models/tests/e2e_platform_tests/assets/
 
-inference_testing
+inference_testing
+
+# rerun.io recordings
+*.rrd
diff --git a/examples/sam-3d/README.md b/examples/sam-3d/README.md
@@ -0,0 +1,22 @@
+# Single-view 3D reconstruction with SAM-3D Objects
+
+SAM-3D is a 3D object generation model that converts 2D images with segmentation masks into 3D assets (meshes and Gaussian splats) and estimates their layout. You can learn more about it by visiting this [Roboflow Blog post](https://blog.roboflow.com/sam-3d/) and the [Meta AI project page](https://ai.meta.com/research/sam3d/).
+
+![SAM-3D examples](https://blog.roboflow.com/content/images/size/w1000/2025/11/SAM3d_object_example.png)
+
+## Requirements
+
+To run these examples you will need a self-hosted inference server with a 32GB+ VRAM GPU and the `SAM3_3D_OBJECTS_ENABLED` flag turned on. We recommend using the Docker workflow below:
+
+```bash
+docker build -t roboflow/roboflow-inference-server-gpu-3d:dev -f docker/dockerfiles/Dockerfile.onnx.gpu.3d .
+```
+
+```bash
+docker run --gpus all -p 9001:9001 -v ./inference:/app/inference roboflow/roboflow-inference-server-gpu-3d:dev
+```
+
+## Example notebooks
+
+* [Single-view 3D reconstruction with SAM-3D Objects](sam-3d-detect.ipynb)
+* [Monocular 3D object tracking with SAM-3D Objects](sam-3d-track.ipynb)
diff --git a/examples/sam-3d/requirements.txt b/examples/sam-3d/requirements.txt
@@ -0,0 +1,7 @@
+numpy
+Pillow
+torch
+pytorch3d
+supervision[assets]
+rerun-sdk[notebook]
+-e ../../../inference  # install inference-development for now
diff --git a/examples/sam-3d/sam-3d-detect.ipynb b/examples/sam-3d/sam-3d-detect.ipynb
@@ -0,0 +1,321 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1182c4c3",
+   "metadata": {},
+   "source": [
+    "# Single-view 3D reconstruction with SAM-3D Objects\n",
+    "\n",
+    "This notebook requires a self-hosted inference server with a 32GB+ VRAM GPU. See the README for the recommended setup."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8274b79d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "%pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c67abc01",
+   "metadata": {},
+   "source": [
+    "Set up the notebook to point at your inference server instance and use your API key to download model weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e879d5e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os  # API_KEY is read from the environment so real keys stay out of the saved notebook\n",
+    "API_URL = \"http://localhost:9001\"  # address of the self-hosted inference server\n",
+    "API_KEY = os.environ.get(\"ROBOFLOW_API_KEY\", \"YOUR_API_KEY\")  # falls back to the placeholder if the env var is unset\n",
+    "SEGMENTATION_MODEL_ID = \"rfdetr-seg-preview\"\n",
+    "SAM3_3D_MODEL_ID = \"sam3-3d-objects\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "717682ab",
+   "metadata": {},
+   "source": [
+    "Set input data and output directory for logging the annotated image and 3D view."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be41fb82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from supervision.assets import download_assets, VideoAssets\n",
+    "\n",
+    "# INPUT_VIDEO_PATH = download_assets(VideoAssets.MILK_BOTTLING_PLANT)\n",
+    "INPUT_VIDEO_PATH = download_assets(VideoAssets.VEHICLES)\n",
+    "\n",
+    "OUTPUT_DIR = \"sam-3d-detect\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16a4b598",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "\n",
+    "if os.path.exists(OUTPUT_DIR):\n",
+    "    shutil.rmtree(OUTPUT_DIR)\n",
+    "os.makedirs(OUTPUT_DIR)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b6ddc4b",
+   "metadata": {},
+   "source": [
+    "Step 1: Load an input image and make sure it looks how we expect."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d64d6a9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import supervision as sv\n",
+    "\n",
+    "image = next(sv.get_video_frames_generator(INPUT_VIDEO_PATH))\n",
+    "sv.plot_image(image)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "439764ea",
+   "metadata": {},
+   "source": [
+    "Step 2: Generate 2D object masks by running an instance segmentation model like RF-DETR Seg."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58066e1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from inference_sdk import InferenceHTTPClient\n",
+    "import time\n",
+    "\n",
+    "client = InferenceHTTPClient(api_url=API_URL, api_key=API_KEY)\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "\n",
+    "seg_result = client.infer(image, model_id=SEGMENTATION_MODEL_ID)\n",
+    "\n",
+    "print(f\"{SEGMENTATION_MODEL_ID} inference took {(time.perf_counter() - start):.2f} sec\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9418f08b",
+   "metadata": {},
+   "source": [
+    "Let's take a look at the detections to check if they make sense."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f21ce7aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "detections = sv.Detections.from_inference(seg_result)\n",
+    "\n",
+    "# remove low-confidence detections\n",
+    "detections = detections[detections.confidence > 0.5]\n",
+    "\n",
+    "labels = [\n",
+    "    f\"#{i} ({class_name})\" for i, class_name in enumerate(detections.data[\"class_name\"])\n",
+    "]\n",
+    "mask_annotator = sv.MaskAnnotator()\n",
+    "label_annotator = sv.LabelAnnotator()\n",
+    "annotated = mask_annotator.annotate(scene=image.copy(), detections=detections)\n",
+    "annotated = label_annotator.annotate(scene=annotated, detections=detections, labels=labels)\n",
+    "\n",
+    "sv.plot_image(annotated)\n",
+    "\n",
+    "with sv.ImageSink(target_dir_path=OUTPUT_DIR) as sink:\n",
+    "    sink.save_image(annotated, \"annotated.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c915798",
+   "metadata": {},
+   "source": [
+    "Step 3: Pass the input image and object masks to SAM-3D to generate 3D reconstructions of each object. \n",
+    "\n",
+    "This will take a few minutes the first time as the model weights need to be downloaded to the server. Subsequent inference calls can take anywhere from seconds to minutes depending on the number of objects and the inference configuration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33f2694e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# flatten polygons to the expected [x1 y1 x2 y2 ... xN yN] format\n",
+    "mask_input = [\n",
+    "    np.array(sv.mask_to_polygons(mask)[0]).flatten().tolist()\n",
+    "    for mask in detections.mask\n",
+    "]\n",
+    "\n",
+    "start = time.perf_counter()\n",
+    "sam3_3d_result = client.sam3_3d_infer(\n",
+    "    inference_input=image,\n",
+    "    mask_input=mask_input,\n",
+    "    model_id=SAM3_3D_MODEL_ID,\n",
+    "    # 'Fast' SAM-3D config\n",
+    "    output_meshes=False,\n",
+    "    output_scene=False,\n",
+    "    with_mesh_postprocess=False,\n",
+    "    with_texture_baking=False,\n",
+    "    use_distillations=True,\n",
+    ")\n",
+    "print(f\"SAM-3D inference took {(time.perf_counter() - start):.2f} sec\")\n",
+    "\n",
+    "detections.data[\"sam3_3d\"] = sam3_3d_result[\"objects\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3dc8d557",
+   "metadata": {},
+   "source": [
+    "Step 4: Transform the 3D objects into a common global frame using their layout metadata, and draw them in [Rerun.io](https://rerun.io).\n",
+    "\n",
+    "When `output_scene=True`, SAM-3D outputs a combined 3D asset containing all 3D objects in a common frame. The code below uses the same Y-up frame convention to draw the objects, so it's consistent with what SAM-3D provides natively.\n",
+    "\n",
+    "Rerun will log to disk at `OUTPUT_DIR/rerun_log.rrd`. You can then visualize this file in the notebook or using the standalone Rerun viewer `rerun [OUTPUT_DIR]/rerun_log.rrd`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fcb2839d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from base64 import b64decode\n",
+    "from io import BytesIO\n",
+    "\n",
+    "import torch\n",
+    "from pytorch3d.io import IO\n",
+    "from pytorch3d.transforms.rotation_conversions import quaternion_to_matrix\n",
+    "\n",
+    "import rerun as rr\n",
+    "\n",
+    "rr.init(\"sam-3d-detect\")\n",
+    "rr.save(os.path.join(OUTPUT_DIR, \"rerun_log.rrd\"))\n",
+    "rr.log(\"/\", rr.ViewCoordinates.RIGHT_HAND_Y_UP, rr.TransformAxes3D(0.5), static=True)\n",
+    "\n",
+    "rr.set_time(\"tick\", sequence=0)\n",
+    "\n",
+    "rr.log(\"/camera/image\", rr.Image(annotated, color_model=\"bgr\"))\n",
+    "\n",
+    "# Coordinate transforms used in make_scene_glb\n",
+    "z_to_y_up = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], dtype=torch.float)\n",
+    "y_to_z_up = torch.tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]], dtype=torch.float)\n",
+    "R_view = torch.tensor([[-1, 0, 0], [0, 0, -1], [0, -1, 0]], dtype=torch.float)\n",
+    "\n",
+    "for i in range(len(detections)):\n",
+    "    det = detections[i]\n",
+    "    obj_id = f\"#{i}\"\n",
+    "    if \"sam3_3d\" not in det.data:\n",
+    "        print(f\"No 3D data available for {obj_id}\")\n",
+    "        continue\n",
+    "    obj_sam3_3d = det.data[\"sam3_3d\"][0]\n",
+    "\n",
+    "    obj_ply = IO().load_pointcloud(BytesIO(b64decode(obj_sam3_3d[\"gaussian_ply\"])))\n",
+    "    obj_pts = obj_ply.points_list()[0]\n",
+    "    obj_pts = obj_pts[::100, :]  # Keep 1% of points to speed up rendering\n",
+    "    obj_box_size = (obj_pts.amax(dim=0) - obj_pts.amin(dim=0))\n",
+    "    obj_rgb = sv.annotators.utils.resolve_color(sv.ColorPalette.DEFAULT, detections, i).as_rgb()\n",
+    "\n",
+    "    metadata = obj_sam3_3d[\"metadata\"]\n",
+    "    t = torch.tensor(metadata[\"translation\"], dtype=torch.float)\n",
+    "    R = quaternion_to_matrix(torch.tensor(metadata[\"rotation\"], dtype=torch.float))\n",
+    "    s = torch.tensor(metadata[\"scale\"], dtype=torch.float)\n",
+    "    # 1. Z-up → Y-up coordinate conversion (row-vector convention throughout SAM3D)\n",
+    "    # 2. PyTorch3D quaternion_to_matrix is column-vector (R @ v), but SAM3D uses it\n",
+    "    #    row-vector (v @ R), so pass R.T to Rerun's column-vector mat3x3\n",
+    "    # 3. R_view: global scene correction from make_scene_glb, applied in world space\n",
+    "    t = t @ z_to_y_up @ R_view\n",
+    "    R = R_view @ y_to_z_up @ R.T @ z_to_y_up\n",
+    "\n",
+    "    rr.log(\n",
+    "        f\"objects/{obj_id}\",\n",
+    "        rr.Boxes3D(sizes=obj_box_size, colors=obj_rgb, labels=obj_id),\n",
+    "        rr.Transform3D(translation=t, mat3x3=R, scale=s),\n",
+    "    )\n",
+    "    rr.log(\n",
+    "        f\"objects/{obj_id}/pts\",\n",
+    "        rr.Points3D(positions=obj_pts, colors=obj_rgb),\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab9ba8e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can also use the standalone viewer app\n",
+    "# rerun [OUTPUT_DIR]/rerun_log.rrd\n",
+    "rr.notebook_show()\n",
+    "rr.log_file_from_path(os.path.join(OUTPUT_DIR, \"rerun_log.rrd\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/sam-3d/sam-3d-track.ipynb b/examples/sam-3d/sam-3d-track.ipynb