1 file changed: +3 -2 lines changed
@@ -33,11 +33,12 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
3333
3434# TODO: the memory check will fail due to an OOM; tracked at https://github.com/NVIDIA-NeMo/RL/issues/263
3535
36+ # Revert to `mean(data["timing/train/total_step_time"], 2) < 10` once https://github.com/NVIDIA-NeMo/RL/issues/1719 is resolved
3637# Only run metrics if the target step is reached
3738if [[ $( jq ' to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS ) -ge $MAX_STEPS ]]; then
3839 uv run tests/check_metrics.py $JSON_METRICS \
3940 ' data["train/loss"]["1"] < 1.0' \
4041 ' data["train/loss"]["50"] < 0.8' \
41- ' max(data["ray/node.0.gpu.0.mem_gb"]) < 50 ' \
42- ' mean(data["timing/train/total_step_time"], 2) < 10 '
42+ ' max(data["ray/node.0.gpu.0.mem_gb"]) < 60 ' \
43+ ' mean(data["timing/train/total_step_time"], 2) < 30 '
4344fi
You can’t perform that action at this time.
0 commit comments