Merged
3 changes: 3 additions & 0 deletions tests/cache_artifacts.sh
@@ -18,6 +18,9 @@ SMALL_MODEL_URLS=(
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-03082024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-tune-llama3-05052024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-reward-07122024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-meta-vision-10172024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-vision-10172024.pt"

)
FULL_MODEL_URL=("s3://pytorch-multimodal/llama2-7b-torchtune.pt")
TOKENIZER_URLS=(
135 changes: 117 additions & 18 deletions tests/recipes/test_eleuther_eval.py
@@ -13,11 +13,40 @@
import pytest

from tests.common import TUNE_PATH
from tests.recipes.utils import llama2_test_config, write_hf_ckpt_config
from tests.test_utils import CKPT_MODEL_PATHS
from tests.recipes.utils import (
llama2_test_config,
llama3_2_vision_test_config,
write_hf_ckpt_config,
write_hf_vision_ckpt_config,
)
from tests.test_utils import CKPT_MODEL_PATHS, gpu_test


class TestEleutherEval:
@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.fixture
def expected_vision_acc(self):
return {
"Science": 0.35,
"Biology": 0.25,
"Chemistry": 0.25,
"Geography": 0.5,
"Math": 0.0,
"Physics": 0.75,
}

@pytest.mark.parametrize(
"eval_name, expected_acc, bsz",
[
@@ -74,22 +103,9 @@ def test_torchtune_checkpoint_eval_results(
acc_result = float(search_results.group(1))
assert math.isclose(acc_result, expected_acc, abs_tol=0.05)

@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.mark.integration_test
@pytest.mark.usefixtures("hide_correct_version_number")
def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -123,7 +139,7 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):

@pytest.mark.integration_test
def test_eval_recipe_errors_with_quantization_hf_checkpointer(
self, capsys, monkeypatch, tmpdir
self, monkeypatch, tmpdir
):
ckpt = "llama2_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
@@ -162,7 +178,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer(
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -194,3 +210,86 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir
match="QAT quantizers should only be used during quantization aware training",
):
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
@gpu_test(gpu_count=1)
def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_meta"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelMetaCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}] \
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=4 \
dtype=bf16 \
device=cuda \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])

@pytest.mark.integration_test
@gpu_test(gpu_count=1)
def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

# Config file needed for model conversion.
write_hf_vision_ckpt_config(ckpt_dir)

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelHFCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}]\
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=4 \
dtype=bf16 \
device=cuda \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])
73 changes: 73 additions & 0 deletions tests/recipes/utils.py
@@ -128,6 +128,58 @@ def llama3_test_config() -> List[str]:
]


def llama3_2_vision_test_config() -> List[str]:
return [
"model=tests.recipes.utils.dummy_vision_model",
"tokenizer._component_=torchtune.models.llama3_2_vision._transform.Llama3VisionTransform",
"tokenizer.patch_size=9",
"tokenizer.max_num_tiles=2",
"tokenizer.tile_size=18",
"tokenizer.max_seq_len=4096",
]


def dummy_vision_model():
from torchtune.models.llama3_2_vision._component_builders import (
llama3_2_vision_decoder,
llama3_2_vision_encoder,
)
from torchtune.modules.model_fusion import DeepFusionModel

vision_encoder = llama3_2_vision_encoder(
clip_embed_dim=128,
clip_num_layers=4,
num_heads=4,
tile_size=18,
patch_size=9,
max_num_tiles=2,
in_channels=3,
clip_hidden_states=[0, 1],
num_layers_projection=2,
decoder_embed_dim=128,
)
vision_decoder = llama3_2_vision_decoder(
vocab_size=128256,
num_layers=4,
fusion_interval=2,
num_special_tokens=2,
num_heads=8,
num_kv_heads=4,
embed_dim=128,
max_seq_len=4096,
encoder_max_seq_len=4096,
)

model = DeepFusionModel(
encoder=vision_encoder,
decoder=vision_decoder,
encoder_trainable=False,
decoder_trainable=False,
fusion_trainable=False,
)
return model


def lora_llama2_test_config(
lora_attn_modules,
apply_lora_to_mlp: bool = False,
@@ -199,6 +251,27 @@ def write_hf_ckpt_config(ckpt_dir: str):
json.dump(config, f)


def write_hf_vision_ckpt_config(ckpt_dir: str):
config = {
"text_config": {
"num_attention_heads": 8,
"num_key_value_heads": 4,
"hidden_size": 128,
"vocab_size": 128256,
"cross_attention_layers": [1, 4],
},
"vision_config": {
"hidden_size": 128,
"image_size": 18,
"max_num_tiles": 2,
"supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]],
},
}
config_file = Path.joinpath(Path(ckpt_dir), "config.json")
with config_file.open("w") as f:
json.dump(config, f)


MODEL_TEST_CONFIGS = {
"llama2": llama2_test_config(),
"llama3": llama3_test_config(),
2 changes: 2 additions & 0 deletions tests/test_utils.py
@@ -34,6 +34,8 @@
"llama2_reward_hf": "/tmp/test-artifacts/small-ckpt-hf-reward-07122024.pt",
"llama3_tune": "/tmp/test-artifacts/small-ckpt-tune-llama3-05052024.pt",
"llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt",
"llama3_2_vision_hf": "/tmp/test-artifacts/small-ckpt-hf-vision-10172024.pt",
"llama3_2_vision_meta": "/tmp/test-artifacts/small-ckpt-meta-vision-10172024.pt",
}

TOKENIZER_PATHS = {
8 changes: 8 additions & 0 deletions torchtune/modules/attention.py
@@ -255,6 +255,14 @@ def forward(
)
k = self.kv_cache.k_cache
v = self.kv_cache.v_cache

# If needed, expand the key and value tensors to have the same shape
# as the query tensor by copying values across the relevant dim
# k,v shape: [b, n_kv, s, h_d] -> [b, n_h, s, h_d]
if self.num_heads != self.num_kv_heads:
Collaborator: Is there a test that can catch the error you noticed before you made this change?

Contributor Author: The tests in this PR will catch it (it's how I found it), since we perform generation with a vision model with KV-caches enabled. The most atomic test that should catch it is setting up a cross attention layer with KV-caches and kv_heads < num_heads somewhere like here.
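A standalone sketch of what such a check could exercise (hedged; this is not the torchtune test itself, and the shapes below are dummies standing in for the cached tensors):

    import torch

    # Hedged sketch, not the actual torchtune test: it only exercises the k/v
    # expansion the fix performs when num_kv_heads < num_heads, using dummy shapes.
    b, n_h, n_kv, s, h_d = 2, 8, 4, 16, 32
    q_per_kv = n_h // n_kv
    k = torch.randn(b, n_kv, s, h_d)  # stands in for kv_cache.k_cache

    expand_shape = (-1, -1, q_per_kv, -1, -1)
    k_expanded = k.unsqueeze(2).expand(expand_shape).flatten(1, 2)

    # The cached keys should now line up with the query heads...
    assert k_expanded.shape == (b, n_h, s, h_d)
    # ...and each kv head is simply repeated q_per_kv times.
    assert torch.equal(k_expanded[:, 0], k_expanded[:, 1])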

expand_shape = (-1, -1, q_per_kv, -1, -1)
Collaborator: Any chance you can use variable names here instead of -1, for explicitness?
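One possible spelling with named sizes instead of -1 (illustrative only, not the PR's code; the tensors here are dummies standing in for the cached k/v):

    import torch

    num_heads, num_kv_heads = 8, 4
    q_per_kv = num_heads // num_kv_heads
    k = torch.randn(2, num_kv_heads, 16, 32)
    v = torch.randn(2, num_kv_heads, 16, 32)

    # Derive the named dimensions from the cached tensor itself.
    bsz, _, seq_len, head_dim = k.shape
    expand_shape = (bsz, num_kv_heads, q_per_kv, seq_len, head_dim)
    k = k.unsqueeze(2).expand(expand_shape).flatten(1, 2)
    v = v.unsqueeze(2).expand(expand_shape).flatten(1, 2)
    assert k.shape == (bsz, num_heads, seq_len, head_dim)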

k = k.unsqueeze(2).expand(expand_shape).flatten(1, 2)
v = v.unsqueeze(2).expand(expand_shape).flatten(1, 2)
else:
# Update k and v shape, positional embeddings, and normalization

2 changes: 1 addition & 1 deletion torchtune/modules/transformer.py
@@ -781,7 +781,7 @@ def setup_caches(
isinstance(l, TransformerCrossAttentionLayer) for l in self.modules()
)
has_decoder_layers = any(
isinstance(l, TransformerSelfAttentionLayer) for l in self.layers
isinstance(l, TransformerSelfAttentionLayer) for l in self.modules()
Collaborator: :O
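For context on the one-line change above: iterating self.layers only yields the top-level entries of that ModuleList, while self.modules() recurses into nested submodules, so a self-attention layer wrapped inside another module (as presumably happens when fusion layers wrap decoder layers) is only found by the latter. A minimal sketch of the difference, using hypothetical stand-in classes rather than the real torchtune ones:

    import torch.nn as nn

    class SelfAttnLayer(nn.Module):  # plays the role of TransformerSelfAttentionLayer
        pass

    class FusionWrapper(nn.Module):  # plays the role of a wrapper around a decoder layer
        def __init__(self, inner: nn.Module):
            super().__init__()
            self.inner = inner

    layers = nn.ModuleList([FusionWrapper(SelfAttnLayer()), SelfAttnLayer()])

    # Iterating the list only sees the two top-level entries, so the wrapped layer is missed.
    top_level = [m for m in layers if isinstance(m, SelfAttnLayer)]
    # modules() recurses into submodules, so the wrapped layer is found as well.
    recursive = [m for m in layers.modules() if isinstance(m, SelfAttnLayer)]

    assert len(top_level) == 1
    assert len(recursive) == 2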

)
if has_encoder_layers:
if encoder_max_seq_len is not None: