From a6f867607d91286268b8e7abe1af8144bd40a63d Mon Sep 17 00:00:00 2001
From: Phil Butler
Date: Mon, 9 Dec 2024 18:41:15 -0500
Subject: [PATCH] Update SAM AMG README with more descriptive install
 instructions (#1337)

---
 examples/sam2_amg_server/README.md       | 22 ++++++++++++++-
 .../sam2/configs/sam2/sam2_hiera_b+.yaml | 28 +++++++++----------
 .../sam2/configs/sam2/sam2_hiera_s.yaml  | 28 +++++++++----------
 .../sam2/configs/sam2/sam2_hiera_t.yaml  | 28 +++++++++----------
 4 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/examples/sam2_amg_server/README.md b/examples/sam2_amg_server/README.md
index 43fc2b2528..c09b012c26 100644
--- a/examples/sam2_amg_server/README.md
+++ b/examples/sam2_amg_server/README.md
@@ -41,12 +41,32 @@ The 'ao' mode is a copy of the baseline with modifications to make the code more
 
 ### 0. Download checkpoints and install requirements
 ```
-pip install -r requirements.txt
+# From the top-level "ao" directory
+
+# If necessary, create and activate a virtual environment
+# Ex:
+python -m venv venv && source venv/bin/activate
+
+# Install requirements for this example
+pip install -r examples/sam2_amg_server/requirements.txt
+
+# If you have an older version of torch in your current environment, uninstall it first
+pip uninstall torch
+
+# Install torch nightly
+pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
+
+# Build ao from source for now
+python setup.py develop
+
+# On your mark, get set...
+cd examples/sam2_amg_server/
 ```
 
 Download `sam2.1_hiera_large.pt` from https://github.com/facebookresearch/sam2?tab=readme-ov-file#download-checkpoints and put it into `~/checkpoints/sam2`
 
 ### 1. Create a random subset of 1000 images
+Using images with corresponding mask annotations, such as those from the Segment Anything Video (SA-V) [Dataset](https://github.com/facebookresearch/sam2/tree/main/sav_dataset#download-the-dataset), is suggested so you can later measure any accuracy drop from `--furious` (which uses `torch.float16`).
 ```
 find sav_val -type f > sav_val_image_paths
 shuf -n 1000 sav_val_image_paths > sav_val_image_paths_shuf_1000
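Note (not part of the patch): the checkpoint step above can also be scripted. Below is a minimal Python sketch that fetches `sam2.1_hiera_large.pt` into the `~/checkpoints/sam2` directory the README expects; the direct download URL is an assumption taken from the sam2 repository's `download_ckpts.sh` and may change upstream.

```python
# Minimal sketch: download sam2.1_hiera_large.pt into ~/checkpoints/sam2.
# The URL below is an assumption based on sam2's download_ckpts.sh; verify it
# against https://github.com/facebookresearch/sam2 before relying on it.
import os
import urllib.request

ckpt_dir = os.path.expanduser("~/checkpoints/sam2")
os.makedirs(ckpt_dir, exist_ok=True)

url = "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt"
dest = os.path.join(ckpt_dir, "sam2.1_hiera_large.pt")
if not os.path.exists(dest):
    print(f"Downloading {url} -> {dest}")
    urllib.request.urlretrieve(url, dest)
```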
diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml
index 58f3eb8155..b3ba469471 100644
--- a/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml
+++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml
@@ -2,18 +2,18 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 112
       num_heads: 2
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
       position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -24,17 +24,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -45,7 +45,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -57,23 +57,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml
index 26e5d4d39f..b051d3be63 100644
--- a/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml
+++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml
@@ -2,21 +2,21 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 96
       num_heads: 1
       stages: [1, 2, 11, 2]
       global_att_blocks: [7, 10, 13]
       window_pos_embed_bkg_spatial_size: [7, 7]
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
       position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -27,17 +27,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -48,7 +48,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -60,23 +60,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
       scale: null
       temperature: 10000
     mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
       kernel_size: 3
       stride: 2
       padding: 1
     fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
       layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
         dim: 256
         kernel_size: 7
         padding: 3
diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml
index a62c903aaa..6b108e708f 100644
--- a/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml
+++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml
@@ -2,21 +2,21 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 96
       num_heads: 1
       stages: [1, 2, 7, 2]
       global_att_blocks: [5, 7, 9]
       window_pos_embed_bkg_spatial_size: [7, 7]
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
       position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -27,17 +27,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -48,7 +48,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -60,23 +60,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
       scale: null
       temperature: 10000
     mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
       kernel_size: 3
      stride: 2
      padding: 1
    fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
      layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
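Note (not part of the patch): every YAML change above rewrites a Hydra `_target_` path from the pip-installed `sam2` package to torchao's vendored copy under `torchao._models.sam2`. As a minimal sketch of what that key does, assuming `hydra-core` and `omegaconf` are installed alongside the source build of ao: Hydra imports the dotted path and calls it with the sibling keys as keyword arguments, so the retargeted configs construct the vendored SAM2 classes instead of the upstream package's.

```python
# Minimal sketch of Hydra's _target_ mechanism: instantiate() imports the
# dotted path and calls it with the remaining keys as kwargs. This is why the
# patch rewrites sam2.* -> torchao._models.sam2.*: the configs now build
# torchao's vendored SAM2 modules rather than the pip-installed `sam2` package.
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Config snippet copied from the memory_encoder.position_encoding block above.
cfg = OmegaConf.create({
    "_target_": "torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine",
    "num_pos_feats": 64,
    "normalize": True,
    "scale": None,
    "temperature": 10000,
})
pos_enc = instantiate(cfg)  # imports the class and calls it with the kwargs above
```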