From 973498f5a1c4fa369d4d43a22c0c86af8040acd6 Mon Sep 17 00:00:00 2001 From: hitdahit Date: Tue, 6 May 2025 14:32:54 +0900 Subject: [PATCH 1/2] =?UTF-8?q?sam2=20weight,=20config=20=ED=8C=8C?= =?UTF-8?q?=EC=9D=BC=20=EC=A0=9C=EA=B3=B5=20=EB=B0=8F=20sam2=20=EA=B4=80?= =?UTF-8?q?=EB=A0=A8=20toml=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/sam2.1/sam2.1_hiera_l.yaml | 120 +++++++++++++++++++++++++++++ demo/app.py | 2 +- demo_simple.py | 2 +- demo_video.py | 2 +- get_sam2_weight.sh | 75 ++++++++++++++++++ pyproject.toml | 2 +- 6 files changed, 199 insertions(+), 4 deletions(-) create mode 100644 configs/sam2.1/sam2.1_hiera_l.yaml create mode 100755 get_sam2_weight.sh diff --git a/configs/sam2.1/sam2.1_hiera_l.yaml b/configs/sam2.1/sam2.1_hiera_l.yaml new file mode 100644 index 0000000..a564c89 --- /dev/null +++ b/configs/sam2.1/sam2.1_hiera_l.yaml @@ -0,0 +1,120 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 144 + num_heads: 2 + stages: [2, 6, 36, 4] + global_att_blocks: [23, 33, 43] + window_pos_embed_bkg_spatial_size: [7, 7] + window_spec: [8, 4, 16, 8] + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [1152, 576, 288, 144] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + 
dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [64, 64] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [64, 64] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + no_obj_embed_spatial: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: true + proj_tpos_enc_in_obj_ptrs: true + use_signed_tpos_enc_to_obj_ptrs: true + 
only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: False \ No newline at end of file diff --git a/demo/app.py b/demo/app.py index e0f4bd9..54e40c1 100644 --- a/demo/app.py +++ b/demo/app.py @@ -181,7 +181,7 @@ def describe_without_streaming(image_base64: str, mask_base64: str, query: str): demo._block_thread = demo.block_thread demo.block_thread = lambda: None demo.launch( - share=False, + share=True, server_name=args.server_addr, server_port=args.server_port, ssr_mode=False, diff --git a/demo_simple.py b/demo_simple.py index 64703f4..b682a9f 100644 --- a/demo_simple.py +++ b/demo_simple.py @@ -212,7 +212,7 @@ def apply_sam(image, input_points): ) demo.launch( - share=False, + share=True, server_name=args_cli.server_addr, server_port=args_cli.server_port ) diff --git a/demo_video.py b/demo_video.py index ab647c2..d78e304 100644 --- a/demo_video.py +++ b/demo_video.py @@ -314,7 +314,7 @@ def describe_video(video_path, annotated_frame): ) demo.launch( - share=False, + share=True, server_name=args_cli.server_addr, server_port=args_cli.server_port, ) \ No newline at end of file diff --git a/get_sam2_weight.sh b/get_sam2_weight.sh new file mode 100755 index 0000000..c930297 --- /dev/null +++ b/get_sam2_weight.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# --- 설정 --- +WEIGHTS_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt" +WEIGHTS_DIR="checkpoints" +WEIGHTS_FILENAME="sam2.1_hiera_large.pt" +WEIGHTS_PATH="${WEIGHTS_DIR}/${WEIGHTS_FILENAME}" + +CONFIG_URL="https://raw.githubusercontent.com/facebookresearch/sam2/main/configs/sam2.1/sam2.1_hiera_l.yaml" +CONFIG_DIR="configs/sam2.1" 
+CONFIG_FILENAME="sam2.1_hiera_l.yaml" +CONFIG_PATH="${CONFIG_DIR}/${CONFIG_FILENAME}" + +# --- 메인 로직 --- +echo "===========================================" +echo " Setting up SAM2 Weights and Config File " +echo "===========================================" + +# 1. 디렉토리 생성 +echo -e "\n--- Creating directories (if they don't exist) ---" +mkdir -p "$WEIGHTS_DIR" +echo "Checked/Created directory: $WEIGHTS_DIR" +mkdir -p "$CONFIG_DIR" +echo "Checked/Created directory: $CONFIG_DIR" + +# 2. 가중치 파일 확인 및 다운로드 +echo -e "\n--- Checking Weights File ($WEIGHTS_FILENAME) ---" +if [ -f "$WEIGHTS_PATH" ]; then + echo "Weights file already exists: $WEIGHTS_PATH" +else + echo "Weights file not found. Downloading..." + # wget [옵션] -O [저장될 파일 경로] [URL] + wget --show-progress -O "$WEIGHTS_PATH" "$WEIGHTS_URL" + if [ $? -eq 0 ]; then + echo "Weights file downloaded successfully." + else + echo "Error downloading weights file. Please check the URL or network connection." + # 실패 시 생성된 빈 파일이나 부분 파일을 삭제할 수 있습니다. + rm -f "$WEIGHTS_PATH" + fi +fi + +# 3. 설정 파일 확인 및 다운로드 +echo -e "\n--- Checking Config File ($CONFIG_FILENAME) ---" +if [ -f "$CONFIG_PATH" ]; then + echo "Config file already exists: $CONFIG_PATH" +else + echo "Config file not found. Downloading..." + # wget [옵션] -O [저장될 파일 경로] [URL] + wget --show-progress -O "$CONFIG_PATH" "$CONFIG_URL" + if [ $? -eq 0 ]; then + echo "Config file downloaded successfully." + else + echo "Error downloading config file. Please check the URL or network connection." + rm -f "$CONFIG_PATH" + fi +fi + +echo -e "\n===========================================" +echo " Setup process finished." 
+echo "===========================================" + +# 최종 확인 +echo -e "\nFinal Check:" +if [ -f "$WEIGHTS_PATH" ]; then + echo "✅ Weights file exists: $WEIGHTS_PATH" +else + echo "❌ Weights file is missing: $WEIGHTS_PATH" +fi + +if [ -f "$CONFIG_PATH" ]; then + echo "✅ Config file exists: $CONFIG_PATH" +else + echo "❌ Config file is missing: $CONFIG_PATH" +fi \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1ed729c..a247a99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ eval = [ "matplotlib", ] sam = ["segment-anything @ git+https://github.com/facebookresearch/segment-anything.git"] -sam2 = ["sam2 @ git+https://github.com/facebookresearch/sam2.git"] +sam2 = ["sam-2 @ git+https://github.com/facebookresearch/sam2.git"] [tool.black] line-length = 120 From e0fc5070245869b564d335d30be721671d7d9dc8 Mon Sep 17 00:00:00 2001 From: hitdahit Date: Tue, 6 May 2025 14:49:57 +0900 Subject: [PATCH 2/2] readme update --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4ef54e..d37449a 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,9 @@ pip install git+https://github.com/NVlabs/describe-anything # You can also clone the repo and install it locally git clone https://github.com/NVlabs/describe-anything cd describe-anything -pip install -v . +# Optional: if you need SAM 2, download its weights and config file first. +./get_sam2_weight.sh +pip install -v . # or: pip install -v ".[sam2]" to also install the SAM 2 dependency ``` We also provide a self-contained script for detailed localized image descriptions without installing additional dependencies. Please refer to the [examples/dam_with_sam_self_contained.py](examples/dam_with_sam_self_contained.py) or [this Colab](https://colab.research.google.com/drive/1bQqAMRH2vdjoWpDpuMSb3SBf_OAntadZ?usp=sharing) for more details.