-
Notifications
You must be signed in to change notification settings - Fork 9
Fix: Bad Gateway Error 502 #61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
2a33e4f
3e418d2
8edd59d
79709ec
a946bba
c7d734c
0a8a80e
de89aa0
6c23b2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |
| """ | ||
|
|
||
| import ast | ||
| import asyncio | ||
| import gc | ||
| import hashlib | ||
| import importlib.util | ||
|
|
@@ -25,6 +26,7 @@ | |
| import sys | ||
| import time | ||
| import traceback | ||
| import uuid as _uuid | ||
| from collections.abc import Iterator | ||
| from contextlib import contextmanager | ||
| from dataclasses import dataclass | ||
|
|
@@ -33,6 +35,7 @@ | |
| import torch | ||
| import torch.nn.functional as F | ||
| from fastapi import FastAPI | ||
| from fastapi.responses import JSONResponse | ||
| from pydantic import BaseModel, Field | ||
|
|
||
| from crusades.core.security_defs import ( | ||
|
|
@@ -2922,6 +2925,13 @@ def _timer_divergence(a: float, b: float) -> float: | |
|
|
||
| app = FastAPI(title="Templar MFU Evaluation", version="2.0.0") | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # In-memory jobs table for async evaluation | ||
| # --------------------------------------------------------------------------- | ||
| # Maps job_id -> {"status": "pending"|"done"|"failed", "result": dict|None} | ||
| _jobs: dict[str, dict] = {} | ||
| _jobs_lock = asyncio.Lock() | ||
|
|
||
| # Global actor instance (reused for efficiency) | ||
| _actor: Actor | None = None | ||
|
|
||
|
|
@@ -2989,12 +2999,82 @@ async def health(): | |
| } | ||
|
|
||
|
|
||
| _current_torchrun: asyncio.subprocess.Process | None = None | ||
|
|
||
|
|
||
def _get_descendant_pids(pid: int, proc_root: str = "/proc") -> list[int]:
    """Collect every descendant PID of *pid* by walking the proc filesystem.

    Each ``<proc_root>/<pid>/task/<tid>/children`` file lists only the
    children created by that one thread, so every thread directory of a
    process is scanned, not just the main thread's.

    Args:
        pid: Root process whose descendants are wanted (not included in the
            result).
        proc_root: Mount point of the proc filesystem; overridable so the
            walk can be pointed at a different tree.

    Returns:
        Descendant PIDs in pre-order (a child, then that child's own
        descendants, then the next child), without duplicates. Processes
        that vanish or cannot be read during the walk are skipped, so the
        result is a best-effort snapshot and may be empty.
    """
    descendants: list[int] = []
    seen: set[int] = {pid}

    def _children_of(parent: int) -> list[int]:
        """Return the direct children of *parent* across all of its threads."""
        task_dir = os.path.join(proc_root, str(parent), "task")
        try:
            tids = os.listdir(task_dir)
        except OSError:
            # Process already gone, or /proc not available/readable.
            return []
        # Main thread first so the common single-threaded case keeps the
        # order the kernel reports; remaining threads in a stable order.
        tids.sort(key=lambda tid: (tid != str(parent), tid))
        children: list[int] = []
        for tid in tids:
            try:
                with open(os.path.join(task_dir, tid, "children")) as f:
                    children.extend(int(tok) for tok in f.read().split())
            except (OSError, ValueError):
                # Thread exited mid-walk or the file held unexpected data.
                continue
        return children

    def _walk(parent: int) -> None:
        for cpid in _children_of(parent):
            if cpid in seen:
                continue
            seen.add(cpid)
            descendants.append(cpid)
            _walk(cpid)

    _walk(pid)
    return descendants
|
|
||
|
|
||
def _kill_torchrun_group(proc: asyncio.subprocess.Process) -> None:
    """Forcefully terminate a torchrun launcher and everything it spawned.

    Killing the launcher's process group is not enough on its own: torchrun's
    elastic agent may place workers in a different process group. The
    descendant list is therefore captured via ``_get_descendant_pids``
    *before* any signal is sent, then SIGKILL goes to the process group, to
    each descendant individually, and finally to the launcher itself.

    Every step is best-effort; OS errors (process already gone, not
    permitted, ...) are ignored. Nothing happens if ``proc`` has already
    exited.
    """
    import signal

    # A set return code means the launcher has already exited.
    if proc.returncode is not None:
        return

    pid = proc.pid

    # Snapshot the descendants first, before any signal is sent.
    victims = _get_descendant_pids(pid)

    # Step 1: the launcher's whole process group.
    try:
        pgid = os.getpgid(pid)
        os.killpg(pgid, signal.SIGKILL)
        logger.warning(f"Killed torchrun process group (pgid={pgid})")
    except OSError:
        pass

    # Step 2: each descendant by PID, in case it lives in another group.
    killed_extra = 0
    for victim in victims:
        try:
            os.kill(victim, signal.SIGKILL)
        except OSError:
            continue
        killed_extra += 1
    if killed_extra:
        logger.warning(f"Also killed {killed_extra} descendant processes of torchrun (pid={pid})")

    # Step 3: the launcher handle itself.
    try:
        proc.kill()
    except OSError:
        pass
|
|
||
|
|
||
| async def _evaluate_via_torchrun(request: EvaluateRequest) -> dict: | ||
| """Spawn torchrun for multi-GPU Basilica evaluation (uvicorn is single-process).""" | ||
| global _current_torchrun | ||
| import asyncio as _aio | ||
| import json as _json | ||
| import tempfile | ||
|
|
||
| if _current_torchrun is not None and _current_torchrun.returncode is None: | ||
| logger.warning( | ||
| f"Killing stale torchrun (pid={_current_torchrun.pid}) before new evaluation" | ||
| ) | ||
| _kill_torchrun_group(_current_torchrun) | ||
| try: | ||
| await _aio.wait_for(_current_torchrun.wait(), timeout=10) | ||
| except TimeoutError: | ||
| logger.warning("Stale torchrun launcher did not exit within 10s after kill") | ||
| await _aio.sleep(30) | ||
| _current_torchrun = None | ||
|
|
||
| params_path = None | ||
| script_path = None | ||
| try: | ||
|
|
@@ -3039,14 +3119,19 @@ async def main(): | |
| f.write(eval_script) | ||
| script_path = f.name | ||
|
|
||
| master_port = 29500 + random.randint(0, 10000) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: find . -name "env.py" -path "*/templar/*" | head -20Repository: one-covenant/crusades Length of output: 94 🏁 Script executed: sed -n '3070,3080p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 457 🏁 Script executed: sed -n '3040,3090p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 1904 🏁 Script executed: sed -n '3000,3045p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 1553 🏁 Script executed: grep -n "_reset_torch_state" ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 170 🏁 Script executed: sed -n '271,310p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 1641 🏁 Script executed: sed -n '310,330p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 716 🏁 Script executed: sed -n '3000,3015p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 617 🏁 Script executed: sed -n '2240,2260p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 821 🏁 Script executed: grep -n "def _evaluate" ./environments/templar/env.py | head -10Repository: one-covenant/crusades Length of output: 137 🏁 Script executed: grep -n "def _evaluate" ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 137 🏁 Script executed: grep -n "def evaluate" ./environments/templar/env.py | head -5Repository: one-covenant/crusades Length of output: 164 🏁 Script executed: sed -n '3154,3180p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 1344 🏁 Script executed: sed -n '1692,1750p' ./environments/templar/env.pyRepository: one-covenant/crusades Length of output: 2571 🏁 Script executed: head -1 
./environments/templar/env.py && grep -n "import random" ./environments/templar/env.py | head -10
Repository: one-covenant/crusades
Length of output: 85

🏁 Script executed: sed -n '2240,2280p' ./environments/templar/env.py
Repository: one-covenant/crusades
Length of output: 1689

Use a cryptographically trusted entropy source for `master_port`.

The single-GPU evaluation path executes untrusted miner code in-process. If untrusted code patches the `random` module, the port chosen by `random.randint` can no longer be trusted. Capture a `random.SystemRandom()`-backed function and use it instead.

Fix:

    _secure_randrange = random.SystemRandom().randrange
    - master_port = 29500 + random.randint(0, 10000)
    + master_port = 29500 + _secure_randrange(10001)

🧰 Tools
🪛 Ruff (0.15.6)
[error] 3075-3075: Standard pseudo-random generators are not suitable for cryptographic purposes (S311)

🤖 Prompt for AI Agents |
||
| proc = await _aio.create_subprocess_exec( | ||
| "torchrun", | ||
| "--nproc_per_node", | ||
| str(request.num_gpus), | ||
| "--master_port", | ||
| str(master_port), | ||
| script_path, | ||
| stdout=_aio.subprocess.PIPE, | ||
| stderr=_aio.subprocess.STDOUT, | ||
| start_new_session=True, | ||
| ) | ||
| _current_torchrun = proc | ||
|
|
||
| collected_lines: list[str] = [] | ||
|
|
||
|
|
@@ -3078,6 +3163,8 @@ async def _read_and_tee(): | |
| "wall_time_seconds": 0.0, | ||
| } | ||
| except TimeoutError: | ||
| if _current_torchrun is not None: | ||
| _kill_torchrun_group(_current_torchrun) | ||
| return { | ||
| "task_id": request.task_id, | ||
| "success": False, | ||
|
|
@@ -3089,6 +3176,8 @@ async def _read_and_tee(): | |
| "wall_time_seconds": 0.0, | ||
| } | ||
| except Exception as e: | ||
| if _current_torchrun is not None: | ||
| _kill_torchrun_group(_current_torchrun) | ||
| return { | ||
| "task_id": request.task_id, | ||
| "success": False, | ||
|
|
@@ -3108,49 +3197,114 @@ async def _read_and_tee(): | |
| pass | ||
|
|
||
|
|
||
| @app.post("/evaluate", response_model=EvaluateResponse) | ||
| async def evaluate(request: EvaluateRequest) -> EvaluateResponse: | ||
| """Evaluate miner's code. Spawns torchrun when num_gpus > 1.""" | ||
| async def _run_evaluation(request: EvaluateRequest) -> dict: | ||
| """Run the actual evaluation (sync helper used by background task).""" | ||
| if request.num_gpus > 1: | ||
| result = await _evaluate_via_torchrun(request) | ||
| else: | ||
| actor = get_actor() | ||
| result = await actor.evaluate( | ||
| task_id=request.task_id, | ||
| seed=request.seed, | ||
| model_url=request.model_url, | ||
| data_url=request.data_url, | ||
| steps=request.steps, | ||
| batch_size=request.batch_size, | ||
| timeout=request.timeout, | ||
| sequence_length=request.sequence_length, | ||
| data_samples=request.data_samples, | ||
| code=request.code, | ||
| max_loss_difference=request.max_loss_difference, | ||
| use_random_init=request.use_random_init, | ||
| min_trainable_params_ratio=request.min_trainable_params_ratio, | ||
| min_params_changed_ratio=request.min_params_changed_ratio, | ||
| weight_relative_error_max=request.weight_relative_error_max, | ||
| timer_divergence_threshold=request.timer_divergence_threshold, | ||
| gpu_peak_tflops=request.gpu_peak_tflops, | ||
| max_plausible_mfu=request.max_plausible_mfu, | ||
| min_mfu=request.min_mfu, | ||
| require_cuda_timing=True, | ||
| num_gpus=request.num_gpus, | ||
| ) | ||
| return await _evaluate_via_torchrun(request) | ||
|
|
||
| actor = get_actor() | ||
| return await actor.evaluate( | ||
| task_id=request.task_id, | ||
| seed=request.seed, | ||
| model_url=request.model_url, | ||
| data_url=request.data_url, | ||
| steps=request.steps, | ||
| batch_size=request.batch_size, | ||
| timeout=request.timeout, | ||
| sequence_length=request.sequence_length, | ||
| data_samples=request.data_samples, | ||
| code=request.code, | ||
| max_loss_difference=request.max_loss_difference, | ||
| use_random_init=request.use_random_init, | ||
| min_trainable_params_ratio=request.min_trainable_params_ratio, | ||
| min_params_changed_ratio=request.min_params_changed_ratio, | ||
| weight_relative_error_max=request.weight_relative_error_max, | ||
| timer_divergence_threshold=request.timer_divergence_threshold, | ||
| gpu_peak_tflops=request.gpu_peak_tflops, | ||
| max_plausible_mfu=request.max_plausible_mfu, | ||
| min_mfu=request.min_mfu, | ||
| require_cuda_timing=True, | ||
| num_gpus=request.num_gpus, | ||
| ) | ||
|
Comment on lines
+3205
to
+3228
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
async def _evaluation_background(job_id: str, request: EvaluateRequest) -> None:
    """Run one evaluation to completion and record its outcome in ``_jobs``.

    On success the job entry becomes ``{"status": "done", "result": ...}``
    with whatever ``_run_evaluation`` returned. Any ``Exception`` raised
    along the way is converted into a zeroed-out failure payload and stored
    with status ``"failed"``; the exception itself is logged, not re-raised.
    """

    def _failure_payload(exc: Exception) -> dict:
        # Same keys as a normal result, with metrics zeroed out.
        return dict(
            task_id=request.task_id,
            success=False,
            error=f"Background evaluation failed: {exc}",
            seed=request.seed,
            mfu=0.0,
            tps=0.0,
            total_tokens=0,
            wall_time_seconds=0.0,
        )

    try:
        outcome = await _run_evaluation(request)
        async with _jobs_lock:
            _jobs[job_id] = {"status": "done", "result": outcome}
        logger.info(f"[JOB {job_id}] Evaluation finished successfully")
    except Exception as exc:
        failed_entry = {"status": "failed", "result": _failure_payload(exc)}
        async with _jobs_lock:
            _jobs[job_id] = failed_entry
        logger.error(f"[JOB {job_id}] Evaluation failed: {exc}")
|
|
||
|
|
||
# Strong references to in-flight evaluation tasks. The event loop itself only
# keeps weak references to tasks, so a task nobody else references can be
# garbage-collected before it finishes.
_background_tasks: set[asyncio.Task] = set()


@app.post("/evaluate")
async def evaluate(request: EvaluateRequest):
    """Accept evaluation request, start in background, return job_id immediately.

    Returns HTTP 202 with {"job_id": "..."} so the caller can poll
    GET /eval-status/{job_id} for results. This avoids proxy timeouts
    on long-running evaluations.
    """
    job_id = _uuid.uuid4().hex

    # Register the job as pending *before* scheduling the task, so the
    # background coroutine can never write its result first and then have it
    # overwritten by this "pending" entry.
    async with _jobs_lock:
        _jobs[job_id] = {"status": "pending", "result": None}

    # Keep the task alive until it completes, then drop the reference.
    task = asyncio.create_task(_evaluation_background(job_id, request))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    logger.info(
        f"[JOB {job_id}] Evaluation accepted (task_id={request.task_id}, "
        f"num_gpus={request.num_gpus})"
    )
    return JSONResponse(status_code=202, content={"job_id": job_id})
|
|
||
|
|
||
| @app.get("/eval-status/{job_id}") | ||
| async def eval_status(job_id: str): | ||
| """Poll for evaluation result. | ||
|
|
||
| Returns: | ||
| - 200 {"status": "pending"} while evaluation is running | ||
| - 200 {"status": "done", "result": {...}} when evaluation is complete | ||
| - 200 {"status": "failed", "result": {...}} on evaluation error | ||
| - 404 {"error": "unknown job_id"} if job_id is not found | ||
| """ | ||
| async with _jobs_lock: | ||
| job = _jobs.get(job_id) | ||
| if job is None: | ||
| return JSONResponse(status_code=404, content={"error": "unknown job_id"}) | ||
|
|
||
| if job["status"] == "pending": | ||
| return {"status": "pending"} | ||
|
|
||
| return EvaluateResponse( | ||
| task_id=result.get("task_id", request.task_id), | ||
| result = job["result"] | ||
| response = EvaluateResponse( | ||
| task_id=result.get("task_id", 0), | ||
| mfu=result.get("mfu", 0.0), | ||
| tps=result.get("tps", 0.0), | ||
| total_tokens=result.get("total_tokens", 0), | ||
| wall_time_seconds=result.get("wall_time_seconds", 0.0), | ||
| success=result.get("success", False), | ||
| error=result.get("error"), | ||
| error_code=result.get("error_code"), | ||
| seed=result.get("seed", request.seed), | ||
| seed=result.get("seed", ""), | ||
| diagnostics=result.get("diagnostics", {}), | ||
| ) | ||
| # Clean up to avoid unbounded memory growth | ||
| async with _jobs_lock: | ||
| _jobs.pop(job_id, None) | ||
|
Comment on lines
+3304
to
+3306
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't delete a finished job on the first status read. If the 🤖 Prompt for AI Agents |
||
| return {"status": job["status"], "result": response.model_dump()} | ||
|
|
||
|
|
||
| # Entry point when running directly (for local testing) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 94
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 97
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 952
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 197
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 1245
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 1693
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 343
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 751
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 47
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 2201
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 1009
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 1603
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 546
🏁 Script executed:
Repository: one-covenant/crusades
Length of output: 1143
Fix race condition in multi-GPU cleanup: track process identity per request.

The global `_current_torchrun` can be overwritten by concurrent requests, causing one request's exception handler to kill another request's subprocess. Since the `/evaluate` endpoint is async with no concurrency control, overlapping multi-GPU calls will interleave: Request A sets `_current_torchrun = proc_A`, Request B overwrites it with `_current_torchrun = proc_B`, and if Request A times out or errors, it kills `proc_B` instead. The unconditional stale-process kill at lines 3020–3024 also aborts unrelated jobs without task identity verification.

Use a local `proc` variable to track the process started by each request. Kill only the local `proc` in exception handlers (lines 3119–3120, 3132–3133), and only clear the global if it still points to the same process in the finally block:

Fix outline

```diff
 async def _evaluate_via_torchrun(request: EvaluateRequest) -> dict:
     global _current_torchrun
+    proc: asyncio.subprocess.Process | None = None
     import asyncio as _aio
     import json as _json
     import tempfile

     if _current_torchrun is not None and _current_torchrun.returncode is None:
         logger.warning(
             f"Killing stale torchrun (pid={_current_torchrun.pid}) before new evaluation"
         )
         _kill_torchrun_group(_current_torchrun)
         try:
             await _aio.wait_for(_current_torchrun.wait(), timeout=5)
         except TimeoutError:
             pass
         _current_torchrun = None

     # ... setup code ...
     try:
         proc = await _aio.create_subprocess_exec(...)
         _current_torchrun = proc
         # ... rest of execution ...
     except TimeoutError:
-        if _current_torchrun is not None:
-            _kill_torchrun_group(_current_torchrun)
+        if proc is not None:
+            _kill_torchrun_group(proc)
         return { ... }
     except Exception as e:
-        if _current_torchrun is not None:
-            _kill_torchrun_group(_current_torchrun)
+        if proc is not None:
+            _kill_torchrun_group(proc)
         return { ... }
     finally:
+        if _current_torchrun is proc:
+            _current_torchrun = None
         for p in (params_path, script_path):
             if p:
                 try:
                     os.unlink(p)
                 except OSError:
                     pass
```

🤖 Prompt for AI Agents