Skip to content

Commit 511b5d5

Browse files
SStas and claude committed
Add cross-model text baseline pipeline for GSM8K 2-agent
New pipeline_text_cross_model.py: Agent A (model X) generates text, Agent B (model Y) reads that text as context. This is the text baseline needed to interpret rosetta cross-model results — without it, we can't tell if rosetta projection adds value over simply piping text between different models. Integrated into run_gsm8k_2agent.py as --mode text_cross_model. Model B loading shared with rosetta mode to avoid duplicate loads. Usage: python benchmarks/gsm8k_2agent/run_gsm8k_2agent.py \ --mode text_cross_model \ --model_name Qwen/Qwen2.5-7B-Instruct \ --model_b meta-llama/Llama-3.2-3B-Instruct \ --max_samples 200 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 60f7406 commit 511b5d5

File tree

2 files changed

+229
-8
lines changed

2 files changed

+229
-8
lines changed
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
"""Cross-model text pipeline: 2-agent chain where Researcher (model A) passes text to Solver (model B).
2+
3+
This is the text baseline for cross-model comparison. Both agents communicate via
4+
text (like pipeline_text.py), but each agent runs on a different model. This lets
5+
us measure whether rosetta projection adds value over simply piping text between
6+
different models.
7+
"""
8+
9+
import time
10+
from typing import Any, Dict, List
11+
12+
from benchmarks.shared.generation import generate_text, render_prompt, tokenize_prompt
13+
from benchmarks.shared.metrics import gpu_memory_tracker
14+
from .agents import AGENTS, build_text_prompt
15+
from .evaluate import extract_gold, extract_gsm8k_answer, check_correct
16+
17+
18+
def run_text_cross_model_pipeline(
    model_a: Any,
    tokenizer_a: Any,
    model_b: Any,
    tokenizer_b: Any,
    device: str,
    question: str,
    gold_solution: str,
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
    verbose: bool = False,
) -> Dict:
    """Run the 2-agent cross-model text pipeline on a single GSM8K problem.

    Researcher (model A) generates a text analysis; Solver (model B) receives
    that text verbatim in its prompt and generates the final answer. Because
    the models may come from different families, all token counts are computed
    with each model's own tokenizer.

    Args:
        model_a / tokenizer_a: Researcher model and its tokenizer.
        model_b / tokenizer_b: Solver model and its tokenizer.
        device: torch device string (e.g. "cuda").
        question: GSM8K question text.
        gold_solution: raw GSM8K solution string; gold answer is extracted
            via ``extract_gold``.
        max_new_tokens / temperature / top_p: sampling parameters, shared by
            both agents.
        verbose: if True, print truncated agent outputs as they are produced.

    Returns:
        Result dict with prediction, correctness, timing / token accounting,
        per-agent traces, and ``"mode": "text_cross_model"``.
    """
    with gpu_memory_tracker(device) as mem:
        t0 = time.perf_counter()

        researcher = AGENTS[0]
        solver = AGENTS[1]

        # --- Agent 1: Researcher on model A (no upstream context) ---
        researcher_text, researcher_trace = _agent_turn(
            agent=researcher,
            model=model_a,
            tokenizer=tokenizer_a,
            device=device,
            messages=build_text_prompt(researcher.role, question),
            model_label="model_a",
            context_tokens=0,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )

        if verbose:
            print(f" [{researcher.name} (A)] output ({len(researcher_text)} chars): "
                  f"{researcher_text[:200]}...")

        # --- Agent 2: Solver on model B ---
        # Context cost = Researcher's text re-tokenized by model B's tokenizer,
        # since that is what model B actually pays to read it.
        context_encoded = tokenizer_b(researcher_text, add_special_tokens=False)
        context_token_count = len(context_encoded["input_ids"])

        solver_text, solver_trace = _agent_turn(
            agent=solver,
            model=model_b,
            tokenizer=tokenizer_b,
            device=device,
            messages=build_text_prompt(solver.role, question, researcher_text),
            model_label="model_b",
            context_tokens=context_token_count,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )

        if verbose:
            print(f" [{solver.name} (B)] output ({len(solver_text)} chars): "
                  f"{solver_text[:200]}...")

        agent_traces: List[Dict] = [researcher_trace, solver_trace]

        wall_time = time.perf_counter() - t0

        total_prompt_tokens = sum(t["prompt_tokens"] for t in agent_traces)
        total_output_tokens = sum(t["output_tokens"] for t in agent_traces)
        total_context_tokens = sum(t["context_tokens"] for t in agent_traces)
        total_tokens = total_prompt_tokens + total_output_tokens
        # Guard against a degenerate zero wall time (e.g. mocked generation).
        tokens_per_sec = total_tokens / wall_time if wall_time > 0 else 0

        gold = extract_gold(gold_solution)
        prediction = extract_gsm8k_answer(agent_traces[-1]["output"])
        correct = check_correct(prediction, gold)

        return {
            "question": question,
            "gold": gold,
            "prediction": prediction,
            "raw_output": agent_traces[-1]["output"],
            "correct": correct,
            "wall_time": wall_time,
            "total_prompt_tokens": total_prompt_tokens,
            "total_output_tokens": total_output_tokens,
            "total_tokens": total_tokens,
            "total_context_tokens": total_context_tokens,
            "tokens_per_sec": tokens_per_sec,
            "peak_memory_mb": mem["peak_memory_mb"],
            "agents": agent_traces,
            "mode": "text_cross_model",
        }


def _agent_turn(
    agent: Any,
    model: Any,
    tokenizer: Any,
    device: str,
    messages: List[Dict],
    model_label: str,
    context_tokens: int,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> tuple:
    """Run one agent turn: render + tokenize the prompt, generate, build a trace.

    Shared by both agents so prompt rendering, generation, and token
    accounting stay identical across models.

    Returns:
        (generated_text, trace_dict) where trace_dict matches the per-agent
        schema used by the other pipelines.
    """
    turn_t0 = time.perf_counter()

    prompt_text = render_prompt(tokenizer, messages)
    input_ids, attention_mask = tokenize_prompt(tokenizer, prompt_text, device)
    prompt_tokens = int(input_ids.shape[-1])

    text, _ = generate_text(
        model, tokenizer, input_ids, attention_mask, device,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )

    # Output token count = generated text re-tokenized without special tokens,
    # matching the accounting convention of the sibling pipelines.
    output_encoded = tokenizer(text, add_special_tokens=False)
    output_tokens = len(output_encoded["input_ids"])

    trace = {
        "name": agent.name,
        "role": agent.role,
        "model": model_label,
        "prompt_tokens": prompt_tokens,
        "output_tokens": output_tokens,
        "context_tokens": context_tokens,
        "agent_time_ms": (time.perf_counter() - turn_t0) * 1000,
        "output": text,
    }
    return text, trace
147+
148+
149+
def run_text_cross_model_benchmark(
    model_a: Any,
    tokenizer_a: Any,
    model_b: Any,
    tokenizer_b: Any,
    device: str,
    dataset: List[Dict],
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
    verbose: bool = False,
) -> List[Dict]:
    """Run cross-model text pipeline on a list of GSM8K samples.

    Args:
        model_a / tokenizer_a: Researcher model and tokenizer.
        model_b / tokenizer_b: Solver model and tokenizer.
        device: torch device string.
        dataset: list of samples, each with "question" and "answer" keys.
        max_new_tokens / temperature / top_p: sampling parameters forwarded
            to each per-sample run.
        verbose: per-sample detail vs. one-line progress output.

    Returns:
        One result dict per sample (see ``run_text_cross_model_pipeline``).
    """
    results: List[Dict] = []
    # Running tally of correct answers; avoids re-scanning the full results
    # list on every iteration (previously O(n) per sample).
    num_correct = 0
    for i, sample in enumerate(dataset):
        if verbose:
            print(f"\n[TextCrossModel] Sample {i + 1}/{len(dataset)}: "
                  f"{sample['question'][:80]}...")

        result = run_text_cross_model_pipeline(
            model_a, tokenizer_a, model_b, tokenizer_b, device,
            question=sample["question"],
            gold_solution=sample["answer"],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            verbose=verbose,
        )
        results.append(result)
        if result["correct"]:
            num_correct += 1

        if verbose:
            status = "CORRECT" if result["correct"] else "WRONG"
            print(f" => {status} (pred={result['prediction']}, gold={result['gold']}, "
                  f"time={result['wall_time']:.1f}s)")
        else:
            print(f" [TextCrossModel] {i + 1}/{len(dataset)} "
                  f"({num_correct}/{i + 1} correct, {result['wall_time']:.1f}s)",
                  flush=True)

    return results

benchmarks/gsm8k_2agent/run_gsm8k_2agent.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def parse_args() -> argparse.Namespace:
3939
)
4040
parser.add_argument(
4141
"--mode",
42-
choices=["latent", "text", "direct", "rosetta", "both", "all"],
42+
choices=["latent", "text", "direct", "rosetta", "text_cross_model", "both", "all"],
4343
default="all",
4444
help="Pipeline(s) to run (default: all)",
4545
)
@@ -111,19 +111,20 @@ def run_benchmark(config: dict) -> dict:
111111
run_latent = mode in ("latent", "both", "all")
112112
run_text = mode in ("text", "both", "all")
113113
run_rosetta = mode in ("rosetta", "all")
114+
run_text_cross_model = mode in ("text_cross_model", "all")
114115

115116
print(f"Device: {device}")
116117
print(f"Mode: {mode}")
117118
print(f"Model A: {model_name}")
118-
if run_rosetta:
119+
if run_rosetta or run_text_cross_model:
119120
print(f"Model B: {model_b_name}")
120121
print(f"Samples: {max_samples}")
121122
print(f"Latent steps: {latent_steps}")
122123
print(f"Max new tokens: {max_new_tokens}")
123124
print(f"Temperature: {temperature}")
124125
print(f"Seed: {seed}")
125126
print(f"Pipelines: direct={run_direct}, text={run_text}, latent={run_latent}, "
126-
f"rosetta={run_rosetta}")
127+
f"rosetta={run_rosetta}, text_cross_model={run_text_cross_model}")
127128
print()
128129

129130
dataset = load_dataset(max_samples)
@@ -133,6 +134,7 @@ def run_benchmark(config: dict) -> dict:
133134
latent_results = None
134135
text_results = None
135136
rosetta_results = None
137+
text_cross_model_results = None
136138

137139
if run_direct:
138140
from benchmarks.gsm8k_2agent.pipeline_direct import run_direct_benchmark
@@ -178,6 +180,29 @@ def run_benchmark(config: dict) -> dict:
178180
top_p=top_p, verbose=verbose,
179181
)
180182

183+
# Load model B if needed for cross-model modes
184+
model_b = tokenizer_b = connector_b = identity_b = None
185+
if run_rosetta or run_text_cross_model:
186+
model_b, tokenizer_b, connector_b, identity_b = load_model(model_b_name, device)
187+
188+
if run_text_cross_model:
189+
from benchmarks.gsm8k_2agent.pipeline_text_cross_model import run_text_cross_model_benchmark
190+
191+
print("\n" + "=" * 50)
192+
print("Running TEXT CROSS-MODEL (A generates text → B reads text) pipeline...")
193+
print(f" Model A (Researcher): {model_name}")
194+
print(f" Model B (Solver): {model_b_name}")
195+
print("=" * 50)
196+
set_seed(seed)
197+
198+
text_cross_model_results = run_text_cross_model_benchmark(
199+
model_a=model, tokenizer_a=tokenizer,
200+
model_b=model_b, tokenizer_b=tokenizer_b,
201+
device=device, dataset=dataset,
202+
max_new_tokens=max_new_tokens, temperature=temperature,
203+
top_p=top_p, verbose=verbose,
204+
)
205+
181206
if run_rosetta:
182207
from benchmarks.gsm8k_2agent.pipeline_rosetta import run_rosetta_benchmark
183208
from avp.rosetta.calibrate import calibrate
@@ -189,9 +214,6 @@ def run_benchmark(config: dict) -> dict:
189214
print("=" * 50)
190215
set_seed(seed)
191216

192-
# Load model B (model A is already loaded)
193-
model_b, tokenizer_b, connector_b, identity_b = load_model(model_b_name, device)
194-
195217
# Calibrate once — instant for same-family vocab-mediated
196218
print("Calibrating Rosetta Stone projection...")
197219
avp_map = calibrate(
@@ -213,7 +235,8 @@ def run_benchmark(config: dict) -> dict:
213235
num_transfer_states=num_transfer_states,
214236
)
215237

216-
# Free model B to reclaim GPU memory
238+
# Free model B to reclaim GPU memory
239+
if model_b is not None:
217240
del model_b, tokenizer_b, connector_b, identity_b
218241
if device == "cuda":
219242
import torch
@@ -231,6 +254,8 @@ def run_benchmark(config: dict) -> dict:
231254
modes.append(("Text", 13, text_results))
232255
if rosetta_results is not None:
233256
modes.append(("Rosetta", 13, rosetta_results))
257+
if text_cross_model_results is not None:
258+
modes.append(("Text Cross-Model", 16, text_cross_model_results))
234259

235260
# Compute agreement across available modes
236261
available = {}
@@ -242,6 +267,8 @@ def run_benchmark(config: dict) -> dict:
242267
available["latent"] = latent_results
243268
if rosetta_results is not None:
244269
available["rosetta"] = rosetta_results
270+
if text_cross_model_results is not None:
271+
available["text_cross_model"] = text_cross_model_results
245272
agreement_data = compute_agreement(available) if len(available) > 1 else None
246273

247274
print_summary(
@@ -262,7 +289,7 @@ def run_benchmark(config: dict) -> dict:
262289
"config": {
263290
"benchmark": "gsm8k_2agent",
264291
"model_a": model_name,
265-
"model_b": model_b_name if run_rosetta else None,
292+
"model_b": model_b_name if (run_rosetta or run_text_cross_model) else None,
266293
"device": device,
267294
"mode": mode,
268295
"max_samples": max_samples,
@@ -293,6 +320,11 @@ def run_benchmark(config: dict) -> dict:
293320
"summary": compute_stats(rosetta_results),
294321
"samples": rosetta_results,
295322
}
323+
if text_cross_model_results is not None:
324+
output_data["text_cross_model"] = {
325+
"summary": compute_stats(text_cross_model_results),
326+
"samples": text_cross_model_results,
327+
}
296328
if agreement_data is not None:
297329
output_data["agreement"] = agreement_data
298330

0 commit comments

Comments
 (0)