Merged (changes from 4 commits)
4 changes: 2 additions & 2 deletions .github/workflows/unit_tests.yaml
@@ -100,7 +100,7 @@ jobs:
           source install_everything.sh
       - name: Run interactive (bf16)
         run: |
-          JAX_PLATFORMS=cpu python run_interactive.py --size=tiny --batch_size=1 --max_cache_length=2048 --tokenizer_path=jetstream_pt/third_party/llama/tokenizer.model --model_name=llama-2 --sharding_config=default_shardings/llama.yaml --quantize_weights=0 --quantize_kv_cache=0
+          JAX_PLATFORMS=cpu python run_interactive.py --size=tiny --batch_size=1 --max_cache_length=2048 --tokenizer_path=jetstream_pt/third_party/llama/tokenizer.model --model_name=llama-2 --sharding_config=default_shardings/llama-2.yaml --quantize_weights=0 --quantize_kv_cache=0
       - name: Run interactive (int8)
         run: |
-          JAX_PLATFORMS=cpu python run_interactive.py --size=tiny --batch_size=1 --max_cache_length=2048 --tokenizer_path=jetstream_pt/third_party/llama/tokenizer.model --model_name=llama-2 --sharding_config=default_shardings/llama.yaml --quantize_weights=1 --quantize_type="int8_per_channel" --quantize_kv_cache=1
+          JAX_PLATFORMS=cpu python run_interactive.py --size=tiny --batch_size=1 --max_cache_length=2048 --tokenizer_path=jetstream_pt/third_party/llama/tokenizer.model --model_name=llama-2 --sharding_config=default_shardings/llama-2.yaml --quantize_weights=1 --quantize_type="int8_per_channel" --quantize_kv_cache=1
11 changes: 8 additions & 3 deletions README.md
@@ -84,17 +84,22 @@ export tokenizer_path=tokenizer model file path
 
 ## Llama-2 7b
 ```bash
-python run_interactive.py --size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+python run_interactive.py --size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
 ```
 
 ## Llama-2 13b
 ```bash
-python run_interactive.py --size=13b --model_name=$model_name --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+python run_interactive.py --size=13b --model_name=$model_name --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
 ```
 
 ## Llama-3 8b
 ```bash
-python run_interactive.py --size=8b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+python run_interactive.py --size=8b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
 ```
 
+## Llama-3 70b
+```bash
+python run_interactive.py --size=70b --model_name=$model_name --batch_size=8 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+```
+
 ## Gemma 7b
8 changes: 5 additions & 3 deletions convert_checkpoints.py
@@ -179,17 +179,19 @@ def _merge_llama_weights(
         f"{len(tensors)} shards (shape = {tensors[0].shape}) for {key})"
     )
     state_dict_for_key = {}
-    for pattern, kind in llama_model.get_weight_sharding_type.items():
+    for pattern, kind in llama_model.Transformer.get_weight_sharding_type(
+        model_name=FLAGS.model_name
+    ).items():
       if not key.endswith(pattern):
         continue
       with torch.no_grad():
         if kind in ("ParallelEmbedding", "RowParallelLinear"):
           state_dict_for_key[key] = torch.cat(tensors, 1)
-        elif kind == "ColumnParallelLinear":
+        elif kind in ("ColumnParallelLinear", "VocabParallelEmbedding"):
           state_dict_for_key[key] = torch.cat(tensors, 0)
         else:
           if not all(
-              torch.allclose(tensors[0], tensor, atol=1e-6)
+              torch.allclose(tensors[0], tensor, atol=1e-2)
Review thread on this change:

Collaborator: Any reason to loosen the condition by four orders of magnitude?

Collaborator (author): The layer norm weights in llama-3 are not consistent across shards. I don't know why this is the case. These weights are expected to be replicated. It errors out if we don't reduce the precision here.

Collaborator: @qihqi are you ok with the 1e-2 gap? I feel it's risky to loosen the condition by four orders of magnitude for a single tensor.

Collaborator: Yeah, that is fine.
               for tensor in tensors[1:]
           ):
             raise ValueError(
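For readers following the thread above, here is a minimal, self-contained sketch of the replicated-weight path being discussed; it is not the repo's exact code (`merge_replicated` and the sample tensors are hypothetical), but it shows why shards that should be identical now need the relaxed `atol=1e-2` comparison.

```python
import torch


def merge_replicated(key, tensors, atol=1e-2):
  """Verify shards of a replicated tensor agree within `atol`, then keep one copy."""
  if not all(torch.allclose(tensors[0], t, atol=atol) for t in tensors[1:]):
    raise ValueError(f"Replicated tensor {key} differs across shards (atol={atol})")
  return tensors[0]


# A norm weight whose shards drift by ~1e-3 passes at atol=1e-2
# but would have failed the previous atol=1e-6 check.
shard0 = torch.ones(4096)
shard1 = shard0 + 1e-3
merged = merge_replicated("layers.0.ffn_norm.weight", [shard0, shard1])
```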
File renamed without changes.
28 changes: 28 additions & 0 deletions default_shardings/llama-3.yaml
@@ -0,0 +1,28 @@

# Sharding config for llama-3
# Sharding should either be an int between 0 and rank - 1
# signifying the axis to shard or -1 / null signifying replicated


freqs_cis : -1 # torch.complex64 (2048, 64)
tok_embeddings.weight : 0 # torch.float32 (vocab_size, 4096)
Review thread on this file:

Collaborator: The sharding file seems to be the same as llama-2. What's the difference between the llama-2 and llama-3 sharding files? From the change in convert_checkpoints.py, it seems that the llama-3 weights are sharded in a different way, but this sharding file is only used for model sharding during runtime. If that is the case, we don't need another sharding yaml file.

Collaborator (author): The tok_embeddings.weight is sharded differently between llama-2 and llama-3. For llama-2, embeddings are sharded along axis 1, and for llama-3 they are sharded along axis 0. But I agree that it shouldn't make a difference in accuracy during runtime. If you think it is better to keep the same sharding for both of them, I can revert this change.

Collaborator: They shouldn't be sharded differently -- the only difference would be performance; let's run with both and keep the faster one.

tok_embeddings.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wo.weight : 1 # torch.int8 (4096, 4096)
layers.*.attention.wo.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wq.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wq.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wk.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wk.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention.wv.weight : 0 # torch.int8 (4096, 4096)
layers.*.attention.wv.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.feed_forward.w1.weight : 0 # torch.float32 (11008, 4096)
layers.*.feed_forward.w1.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.feed_forward.w2.weight : 1 # torch.float32 (4096, 11008)
layers.*.feed_forward.w2.weight_scaler : 0 # torch.bfloat16 (11008,)
layers.*.feed_forward.w3.weight : 0 # torch.float32 (11008, 4096)
layers.*.feed_forward.w3.weight_scaler : 0 # torch.bfloat16 (4096,)
layers.*.attention_norm.weight : -1 # torch.float32 (4096,)
layers.*.ffn_norm.weight : -1 # torch.float32 (4096,)
norm.weight : -1 # torch.float32 (4096,)
output.weight : 0 # torch.float32 (vocab_size, 4096)
output.weight_scaler : 0 # torch.float32 (4096,)
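To make the review thread earlier in this file concrete, here is a small illustrative sketch of the two layouts for `tok_embeddings.weight` (shapes only; `num_shards` is a hypothetical value and this is not the repo's sharding code): axis 0 splits the vocabulary rows (the VocabParallelEmbedding-style layout used for llama-3 here), while axis 1 splits the hidden dimension (the ParallelEmbedding-style layout used for llama-2).

```python
import torch

vocab_size, dim, num_shards = 128256, 4096, 8
tok_embeddings = torch.randn(vocab_size, dim)

# Axis 0: each shard holds a slice of the vocabulary (llama-3 in this PR).
row_shards = torch.chunk(tok_embeddings, num_shards, dim=0)
print(row_shards[0].shape)  # torch.Size([16032, 4096])

# Axis 1: each shard holds a slice of the hidden dimension (llama-2).
col_shards = torch.chunk(tok_embeddings, num_shards, dim=1)
print(col_shards[0].shape)  # torch.Size([128256, 512])
```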
13 changes: 13 additions & 0 deletions jetstream_pt/third_party/llama/model_args.py
@@ -90,6 +90,19 @@ def get_arg(
         "norm_eps": 1e-05,
         "rope_theta": 500000.0,
     }
+  elif model_name == "llama-3-70b":
+    data = {
+        "dim": 8192,
+        "ffn_dim_multiplier": 1.3,
+        "multiple_of": 4096,
+        "n_heads": 64,
+        "n_kv_heads": 8,
+        "n_layers": 80,
+        "norm_eps": 1e-05,
+        "vocab_size": 128256,
+        "rope_theta": 500000.0,
+    }
+
   return ModelArgs(
       max_seq_len=seqlen,
       max_batch_size=batch_size,
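As a quick sanity check on the new llama-3-70b entry, here is a sketch of the reference Llama feed-forward sizing rule, which maps `dim`, `multiple_of`, and `ffn_dim_multiplier` to the FFN hidden size. This assumes the repo follows the upstream Llama computation; `ffn_hidden_dim` is a hypothetical helper, not part of this file.

```python
def ffn_hidden_dim(dim, multiple_of, ffn_dim_multiplier=None):
  """Reference-Llama style feed-forward hidden size."""
  hidden_dim = int(2 * (4 * dim) / 3)
  if ffn_dim_multiplier is not None:
    hidden_dim = int(ffn_dim_multiplier * hidden_dim)
  # Round up to the nearest multiple of `multiple_of`.
  return multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)


# With the llama-3-70b values above: dim=8192, multiple_of=4096, multiplier=1.3
print(ffn_hidden_dim(8192, 4096, 1.3))  # 28672
```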
15 changes: 12 additions & 3 deletions jetstream_pt/third_party/llama/model_exportable.py
@@ -225,13 +225,17 @@ def get_quantized_embedding_weight_to_scaler_map():
     }
 
   @staticmethod
-  def get_weight_sharding_type():
+  def get_weight_sharding_type(model_name: str = ""):
     # ParallelEmbedding is col partitioned across the shards.
+    # VocabParallelEmbedding is row partitioned across the shards.
     # ColumnParallelLinear is row partitioned across shards due to transpose.
     # RowParallelLinear is col partitioned across shards due to transpose.
     # None is no partitioning and tensor should be identical across shards
-    return {
-        "tok_embeddings.weight": "ParallelEmbedding",
+    expected_model_names = ("llama-2", "llama-3")
+    assert (
+        model_name in expected_model_names
+    ), f"Expected model_name to be one of {expected_model_names}"
+    sharding_dict = {
         "rope.freqs": None,
         "attention.wq.weight": "ColumnParallelLinear",
         "attention.wk.weight": "ColumnParallelLinear",
@@ -245,3 +249,8 @@ def get_weight_sharding_type():
         "norm.weight": None,
         "output.weight": "ColumnParallelLinear",
     }
+    if model_name == "llama-2":
+      sharding_dict["tok_embeddings.weight"] = "ParallelEmbedding"
+    elif model_name == "llama-3":
+      sharding_dict["tok_embeddings.weight"] = "VocabParallelEmbedding"
+    return sharding_dict
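Finally, a brief usage sketch mirroring the convert_checkpoints.py change above: look up the sharding kind per weight and map it to the concatenation axis used when merging checkpoint shards. It assumes the repo is on PYTHONPATH; `merge_axis` is a hypothetical helper that just follows the comments in `get_weight_sharding_type`.

```python
from jetstream_pt.third_party.llama import model_exportable as llama_model


def merge_axis(kind):
  """Concatenation axis for merging checkpoint shards of a given sharding kind."""
  if kind in ("ParallelEmbedding", "RowParallelLinear"):
    return 1
  if kind in ("ColumnParallelLinear", "VocabParallelEmbedding"):
    return 0
  return None  # replicated; shards are expected to (almost) match


sharding = llama_model.Transformer.get_weight_sharding_type(model_name="llama-3")
print(sharding["tok_embeddings.weight"])              # VocabParallelEmbedding
print(merge_axis(sharding["tok_embeddings.weight"]))  # 0
```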