@@ -463,7 +463,12 @@ class QwenVLOnlineEagle3Model(Eagle3Model):
     """

     def __init__(
-        self, target_model, draft_model: Eagle3DraftModel, processor, length: int = 7
+        self,
+        target_model,
+        draft_model: Eagle3DraftModel,
+        processor,
+        length: int = 7,
+        attention_backend: str = "sdpa",
     ):
         """
         Args:
@@ -476,6 +481,7 @@ def __init__(
         self.draft_model = draft_model
         self.processor = processor
         self.length = length
+        self.attention_backend = attention_backend

     @torch.no_grad()
     def _prepare_data(
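For context, a hypothetical call site showing the new keyword; `target_model`, `draft_model`, and `processor` below are placeholders for objects built elsewhere in the training script, and `"sdpa"` remains the default when the argument is omitted.

```python
# Hypothetical construction of the online Eagle3 wrapper with the new argument.
model = QwenVLOnlineEagle3Model(
    target_model=target_model,
    draft_model=draft_model,
    processor=processor,
    length=7,
    attention_backend="flex_attention",  # or the default "sdpa"
)
```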
@@ -605,11 +611,20 @@ def forward(
             pixel_values: batch image pixel values, used for VLM models
             image_grid_thw: (batch, 3), image grid thw, used for VLM models
         """
-        # Step 1: prepare data with the target model
+        # Step 0: prepare data with the target model
         hidden_states, target, loss_mask, input_ids = self._prepare_data(
             input_ids, attention_mask, loss_mask, pixel_values, image_grid_thw
         )

+        # Step 1: handle vocab size
+        target_p_padded, position_mask = _compute_target_p_padded(
+            target=target,
+            t2d=self.draft_model.t2d,
+            loss_mask=loss_mask,
+            length=self.length,
+        )
+        del target
+
         # basic info
         batch_size, seq_length, _ = hidden_states.shape
         seq_length_with_past = seq_length
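`_compute_target_p_padded` is defined elsewhere in this change. A minimal sketch of what it plausibly does, reconstructed from the per-step "handle vocab size" block that this diff removes from the TTT loop and from the `[:, idx : idx + seq_length, :]` slicing added below, could look like the following; the exact signature and the padding amount are assumptions, not the actual implementation.

```python
import torch
import torch.nn.functional as F


def _compute_target_p_padded(target, t2d, loss_mask, length):
    # Sketch only: hoists the old in-loop "handle vocab size" logic out of the
    # TTT loop and right-pads the sequence dimension so every step can take a
    # fixed-size slice target_p_padded[:, idx : idx + seq_length, :].
    with torch.no_grad():
        target_max_token = target.argmax(-1)
        target_mask = t2d[target_max_token][..., None].int()
        position_mask = target_mask * loss_mask
        target_head = target[..., t2d].float()
        target_p = torch.softmax(target_head, dim=-1)
        # pad (length - 1) zero rows at the end of the sequence dimension
        target_p_padded = F.pad(target_p, (0, 0, 0, length - 1))
    return target_p_padded, position_mask
```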
@@ -656,21 +671,28 @@ def forward(
             dtype=torch.bool,
             device=hidden_states.device,
         )
-        attention_mask = self.draft_model.prepare_decoder_attention_mask(
-            attention_mask=attention_mask,
-            hidden_states=hidden_states,
-            batch_size=batch_size,
-            seq_length=seq_length,
-            past_key_values_length=past_key_values_length,
-        )
+        if self.attention_backend == "sdpa":
+            attention_mask = self.draft_model.prepare_decoder_attention_mask(
+                attention_mask=attention_mask,
+                hidden_states=hidden_states,
+                batch_size=batch_size,
+                seq_length=seq_length,
+                past_key_values_length=past_key_values_length,
+            )

         # Step 5: run TTT
         plosses = []
         vlosses = []
         acces = []
-        cache_hidden = [[], []]
+        if self.attention_backend == "sdpa":
+            cache_hidden = [[], []]
+            past_key_values = None
+        elif self.attention_backend == "flex_attention":
+            cache_hidden = None
+            past_key_values = DynamicCache()

         for idx in range(self.length):
+            target_p = target_p_padded[:, idx : idx + seq_length, :].contiguous()
             is_last = idx == self.length - 1

             # Step 5.1: embed the input ids
@@ -685,55 +707,44 @@ def forward(
                 cache_hidden=cache_hidden,
                 attention_mask=attention_mask,
                 position_ids=position_ids,
+                past_key_values=past_key_values,
                 use_cache=True,
             )

-            # Step 5.3: handle vocab size
-            with torch.no_grad():
-                target_head = target
-                target_max_token = target_head.argmax(-1)
-                target_mask = self.draft_model.t2d[target_max_token]
-                target_mask = target_mask[..., None].int()
-                position_mask = target_mask * loss_mask
-                target_head = target_head[..., self.draft_model.t2d]
-                target_head = target_head.float()
-                target_p = nn.Softmax(dim=2)(target_head)
-                target_p = target_p.detach()
-
             # update hidden states for next step
             hidden_states = hidden_states_out

             # Step 5.4: get logits
             logits = self.draft_model.compute_logits(hidden_states)
-            logits = logits.float()
-
-            # Step 5.5: calculate loss
-            out_logp = nn.LogSoftmax(dim=2)(logits)
-            plogp = target_p * out_logp
-            loss = -torch.sum(position_mask * plogp, 2).mean()

-            # Step 5.6: record metrics
-            plosses.append(loss)
+            # Step 5.5: record metrics first as we in-place modify logits
             with torch.no_grad():
                 acces.append(
-                    (
-                        (logits.argmax(-1) == target_p.argmax(-1))
-                        * position_mask.squeeze(-1)
+                    _compute_metric_acc(
+                        logits=logits,
+                        target_p=target_p,
+                        position_mask=position_mask,
+                        loss_mask=loss_mask,
                     )
-                    .sum()
-                    .item()
-                    / (loss_mask.sum().item() + 1e-6)
                 )

+            # Step 5.6: calculate loss, in-place modifies logits!
+            loss = LogSoftmaxLoss.apply(logits, target_p, position_mask)
+            plosses.append(loss)
+
             if not is_last:
                 # Step 5.7: we need to update the loss mask
                 input_ids = padding(input_ids, left=False)
-                target = padding(target, left=False)
+                position_mask = padding(position_mask, left=False)
                 loss_mask = padding(loss_mask, left=False)
-                ind = torch.arange(seq_length, device=attention_mask.device)
-                ind0 = ind[idx:]
-                ind1 = ind[: seq_length - idx]
-                attention_mask[:, :, ind0, ind1] = torch.finfo(attention_mask.dtype).min
+                if self.attention_backend == "sdpa":
+                    ind = torch.arange(seq_length, device=attention_mask.device)
+                    ind0 = ind[idx:]
+                    ind1 = ind[: seq_length - idx]
+                    attention_mask[:, :, ind0, ind1] = torch.finfo(
+                        attention_mask.dtype
+                    ).min
+                # Flex attention mask shrinking is handled inside the attention module
         return plosses, vlosses, acces


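`LogSoftmaxLoss` itself is not shown in this excerpt; the comment above only tells us that it overwrites `logits` in place so no extra full-vocabulary buffer has to be kept for backward. One possible shape of such a custom autograd function, written as a sketch under that assumption rather than as the actual implementation (the real code may handle the in-place bookkeeping differently), is:

```python
import torch


class LogSoftmaxLoss(torch.autograd.Function):
    """Sketch of a soft cross-entropy whose backward reuses the logits buffer."""

    @staticmethod
    def forward(ctx, logits, target_p, position_mask):
        logp = torch.log_softmax(logits.float(), dim=-1)
        loss = -torch.sum(position_mask * (target_p * logp), dim=2).mean()
        # d(loss)/d(logits) = position_mask * (softmax(logits) - target_p) / (B * S);
        # write it into the logits tensor itself instead of allocating a new buffer.
        grad = position_mask * (logp.exp() - target_p)
        grad = grad / (logits.shape[0] * logits.shape[1])
        logits.copy_(grad.to(logits.dtype))
        ctx.save_for_backward(logits)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        (grad_logits,) = ctx.saved_tensors
        return (grad_output * grad_logits).to(grad_logits.dtype), None, None
```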
@@ -775,4 +786,4 @@ def _compute_target_p(target, t2d, loss_mask):
 def _compute_metric_acc(logits, target_p, position_mask, loss_mask):
     return (
         (logits.argmax(-1) == target_p.argmax(-1)) * position_mask.squeeze(-1)
-    ).sum().item() / (loss_mask.sum().item() + 1e-6)
+    ).sum() / loss_mask.sum().clamp_min(1e-6)
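Because `_compute_metric_acc` now returns a tensor instead of a Python float, callers can defer the GPU-to-CPU sync until they actually log the metric. A hypothetical logging-side helper (not part of this diff) might look like:

```python
import torch


def summarize_acc(acces: list[torch.Tensor]) -> float:
    # One .item() at logging time instead of one per TTT step.
    return torch.stack([a.detach() for a in acces]).mean().item()
```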