Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ pip install git+https://github.com/NVlabs/describe-anything
# You can also clone the repo and install it locally
git clone https://github.com/NVlabs/describe-anything
cd describe-anything
pip install -v .
# If SAM 2 support is required, download its weights and config first.
./get_sam2_weight.sh
pip install -v .  # use pip install -v ".[sam2]" if SAM 2 is required
```

We also provide a self-contained script for detailed localized image descriptions without installing additional dependencies. Please refer to the [examples/dam_with_sam_self_contained.py](examples/dam_with_sam_self_contained.py) or [this Colab](https://colab.research.google.com/drive/1bQqAMRH2vdjoWpDpuMSb3SBf_OAntadZ?usp=sharing) for more details.
Expand Down
120 changes: 120 additions & 0 deletions configs/sam2.1/sam2.1_hiera_l.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# @package _global_

# Hydra config for the SAM 2.1 Hiera-Large model: each `_target_` node is
# instantiated recursively into the corresponding sam2 class.

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
2 changes: 1 addition & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def describe_without_streaming(image_base64: str, mask_base64: str, query: str):
demo._block_thread = demo.block_thread
demo.block_thread = lambda: None
demo.launch(
share=False,
share=True,
server_name=args.server_addr,
server_port=args.server_port,
ssr_mode=False,
Expand Down
2 changes: 1 addition & 1 deletion demo_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def apply_sam(image, input_points):
)

demo.launch(
share=False,
share=True,
server_name=args_cli.server_addr,
server_port=args_cli.server_port
)
2 changes: 1 addition & 1 deletion demo_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def describe_video(video_path, annotated_frame):
)

demo.launch(
share=False,
share=True,
server_name=args_cli.server_addr,
server_port=args_cli.server_port,
)
75 changes: 75 additions & 0 deletions get_sam2_weight.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
# Download the SAM 2.1 Hiera-Large checkpoint and its Hydra config file into
# the locations this repo expects, skipping anything already present.
# Exits non-zero if either file is still missing at the end, so callers
# (CI, README instructions) can detect a failed setup.

# --- Configuration ---
WEIGHTS_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt"
WEIGHTS_DIR="checkpoints"
WEIGHTS_FILENAME="sam2.1_hiera_large.pt"
WEIGHTS_PATH="${WEIGHTS_DIR}/${WEIGHTS_FILENAME}"

CONFIG_URL="https://raw.githubusercontent.com/facebookresearch/sam2/main/configs/sam2.1/sam2.1_hiera_l.yaml"
CONFIG_DIR="configs/sam2.1"
CONFIG_FILENAME="sam2.1_hiera_l.yaml"
CONFIG_PATH="${CONFIG_DIR}/${CONFIG_FILENAME}"

# fetch_if_missing <label> <path> <url>
# Download <url> to <path> unless the file already exists. On failure, remove
# any empty/partial file so a re-run retries cleanly, and return non-zero.
fetch_if_missing() {
    local label="$1" path="$2" url="$3"
    echo -e "\n--- Checking ${label} ($(basename "$path")) ---"
    if [ -f "$path" ]; then
        echo "${label} already exists: $path"
        return 0
    fi
    echo "${label} not found. Downloading..."
    # wget [options] -O [destination path] [URL]
    if wget --show-progress -O "$path" "$url"; then
        echo "${label} downloaded successfully."
    else
        echo "Error downloading ${label}. Please check the URL or network connection."
        rm -f "$path"
        return 1
    fi
}

# --- Main logic ---
echo "==========================================="
echo " Setting up SAM2 Weights and Config File "
echo "==========================================="

# 1. Create target directories
echo -e "\n--- Creating directories (if they don't exist) ---"
mkdir -p "$WEIGHTS_DIR"
echo "Checked/Created directory: $WEIGHTS_DIR"
mkdir -p "$CONFIG_DIR"
echo "Checked/Created directory: $CONFIG_DIR"

# 2. Check for / download the weights file
fetch_if_missing "Weights file" "$WEIGHTS_PATH" "$WEIGHTS_URL"

# 3. Check for / download the config file
fetch_if_missing "Config file" "$CONFIG_PATH" "$CONFIG_URL"

echo -e "\n==========================================="
echo " Setup process finished."
echo "==========================================="

# Final verification; STATUS drives the script's exit code.
STATUS=0
echo -e "\nFinal Check:"
if [ -f "$WEIGHTS_PATH" ]; then
    echo "✅ Weights file exists: $WEIGHTS_PATH"
else
    echo "❌ Weights file is missing: $WEIGHTS_PATH"
    STATUS=1
fi

if [ -f "$CONFIG_PATH" ]; then
    echo "✅ Config file exists: $CONFIG_PATH"
else
    echo "❌ Config file is missing: $CONFIG_PATH"
    STATUS=1
fi

exit $STATUS
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ eval = [
"matplotlib",
]
sam = ["segment-anything @ git+https://github.com/facebookresearch/segment-anything.git"]
sam2 = ["sam2 @ git+https://github.com/facebookresearch/sam2.git"]
sam2 = ["sam-2 @ git+https://github.com/facebookresearch/sam2.git"]

[tool.black]
line-length = 120
Expand Down