Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ pip install git+https://github.com/NVlabs/describe-anything
# You can also clone the repo and install it locally
git clone https://github.com/NVlabs/describe-anything
cd describe-anything
pip install -v .
# If SAM 2 support is required, download its weights and config first.
./get_sam2_weight.sh
pip install -v .  # use pip install -v ".[sam2]" if SAM 2 is required
```

We also provide a self-contained script for detailed localized image descriptions without installing additional dependencies. Please refer to the [examples/dam_with_sam_self_contained.py](examples/dam_with_sam_self_contained.py) or [this Colab](https://colab.research.google.com/drive/1bQqAMRH2vdjoWpDpuMSb3SBf_OAntadZ?usp=sharing) for more details.
Expand Down
120 changes: 120 additions & 0 deletions configs/sam2.1/sam2.1_hiera_l.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# @package _global_

# Hydra config for the SAM 2.1 Hiera-Large model: each `_target_` node is
# instantiated recursively into the corresponding sam2 class.

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
2 changes: 1 addition & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def describe_without_streaming(image_base64: str, mask_base64: str, query: str):
demo._block_thread = demo.block_thread
demo.block_thread = lambda: None
demo.launch(
share=False,
share=True,
server_name=args.server_addr,
server_port=args.server_port,
ssr_mode=False,
Expand Down
2 changes: 1 addition & 1 deletion demo_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def apply_sam(image, input_points):
)

demo.launch(
share=False,
share=True,
server_name=args_cli.server_addr,
server_port=args_cli.server_port
)
2 changes: 1 addition & 1 deletion demo_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def describe_video(video_path, annotated_frame):
)

demo.launch(
share=False,
share=True,
server_name=args_cli.server_addr,
server_port=args_cli.server_port,
)
75 changes: 75 additions & 0 deletions get_sam2_weight.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
# Download the SAM 2.1 Hiera-Large checkpoint and its Hydra config file into
# the locations this repo expects, skipping anything already present.
# Exits non-zero if either file is still missing at the end, so callers
# (CI, README instructions) can detect a failed setup.

# --- Configuration ---
WEIGHTS_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt"
WEIGHTS_DIR="checkpoints"
WEIGHTS_FILENAME="sam2.1_hiera_large.pt"
WEIGHTS_PATH="${WEIGHTS_DIR}/${WEIGHTS_FILENAME}"

CONFIG_URL="https://raw.githubusercontent.com/facebookresearch/sam2/main/configs/sam2.1/sam2.1_hiera_l.yaml"
CONFIG_DIR="configs/sam2.1"
CONFIG_FILENAME="sam2.1_hiera_l.yaml"
CONFIG_PATH="${CONFIG_DIR}/${CONFIG_FILENAME}"

# fetch_if_missing <label> <path> <url>
# Download <url> to <path> unless the file already exists. On failure, remove
# any empty/partial file so a re-run retries cleanly, and return non-zero.
fetch_if_missing() {
    local label="$1" path="$2" url="$3"
    echo -e "\n--- Checking ${label} ($(basename "$path")) ---"
    if [ -f "$path" ]; then
        echo "${label} already exists: $path"
        return 0
    fi
    echo "${label} not found. Downloading..."
    # wget [options] -O [destination path] [URL]
    if wget --show-progress -O "$path" "$url"; then
        echo "${label} downloaded successfully."
    else
        echo "Error downloading ${label}. Please check the URL or network connection."
        rm -f "$path"
        return 1
    fi
}

# --- Main logic ---
echo "==========================================="
echo " Setting up SAM2 Weights and Config File "
echo "==========================================="

# 1. Create target directories
echo -e "\n--- Creating directories (if they don't exist) ---"
mkdir -p "$WEIGHTS_DIR"
echo "Checked/Created directory: $WEIGHTS_DIR"
mkdir -p "$CONFIG_DIR"
echo "Checked/Created directory: $CONFIG_DIR"

# 2. Check for / download the weights file
fetch_if_missing "Weights file" "$WEIGHTS_PATH" "$WEIGHTS_URL"

# 3. Check for / download the config file
fetch_if_missing "Config file" "$CONFIG_PATH" "$CONFIG_URL"

echo -e "\n==========================================="
echo " Setup process finished."
echo "==========================================="

# Final verification; STATUS drives the script's exit code.
STATUS=0
echo -e "\nFinal Check:"
if [ -f "$WEIGHTS_PATH" ]; then
    echo "✅ Weights file exists: $WEIGHTS_PATH"
else
    echo "❌ Weights file is missing: $WEIGHTS_PATH"
    STATUS=1
fi

if [ -f "$CONFIG_PATH" ]; then
    echo "✅ Config file exists: $CONFIG_PATH"
else
    echo "❌ Config file is missing: $CONFIG_PATH"
    STATUS=1
fi

exit $STATUS
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ eval = [
"matplotlib",
]
sam = ["segment-anything @ git+https://github.com/facebookresearch/segment-anything.git"]
sam2 = ["sam2 @ git+https://github.com/facebookresearch/sam2.git"]
sam2 = ["sam-2 @ git+https://github.com/facebookresearch/sam2.git"]

[tool.black]
line-length = 120
Expand Down