From 3479e912071db63ac3de60d3f4e6ef0350868dc1 Mon Sep 17 00:00:00 2001 From: Mark Obozov Date: Sun, 13 Oct 2024 17:34:40 +0300 Subject: [PATCH 1/3] More consistent trace name --- torchtune/training/_profiler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py index f8004a356e..b79693db71 100644 --- a/torchtune/training/_profiler.py +++ b/torchtune/training/_profiler.py @@ -6,6 +6,7 @@ import os +import socket import time from functools import partial from pathlib import Path @@ -98,7 +99,9 @@ def trace_handler( # Use tensorboard trace handler rather than directly exporting chrome traces since # tensorboard doesn't seem to be able to parse traces with prof.export_chrome_trace exporter = tensorboard_trace_handler( - curr_trace_dir, worker_name=f"rank{rank}", use_gzip=True + curr_trace_dir, + worker_name=f"rank{rank}_" + f"{socket.gethostname()}_{os.getpid()}", + use_gzip=True, ) exporter(prof) From 8cc85242efabc9cb18534b4d5f62fc4bbcf2f97e Mon Sep 17 00:00:00 2001 From: krammnic Date: Mon, 14 Oct 2024 16:09:04 -0400 Subject: [PATCH 2/3] last file trick --- torchtune/training/_profiler.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py index b79693db71..94eafcade7 100644 --- a/torchtune/training/_profiler.py +++ b/torchtune/training/_profiler.py @@ -5,8 +5,9 @@ # LICENSE file in the root directory of this source tree. +import datetime +import glob import os -import socket import time from functools import partial from pathlib import Path @@ -100,11 +101,21 @@ def trace_handler( # tensorboard doesn't seem to be able to parse traces with prof.export_chrome_trace exporter = tensorboard_trace_handler( curr_trace_dir, - worker_name=f"rank{rank}_" + f"{socket.gethostname()}_{os.getpid()}", + worker_name="rank0", use_gzip=True, ) exporter(prof) + latest_trace = max( + glob.glob(curr_trace_dir + "/*.pt.trace.json.gz"), key=os.path.getctime + ) + + now = datetime.datetime.now() + os.rename( + latest_trace, + f"{curr_trace_dir}/r0-{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}.pt.trace.json.gz", + ) + if rank == 0: log.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds") From bf7d1acb803599248027e5b7f56a45e69fe122f2 Mon Sep 17 00:00:00 2001 From: krammnic Date: Mon, 14 Oct 2024 18:03:47 -0400 Subject: [PATCH 3/3] just date as trace name --- torchtune/training/_profiler.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py index 94eafcade7..d296006b5d 100644 --- a/torchtune/training/_profiler.py +++ b/torchtune/training/_profiler.py @@ -6,7 +6,6 @@ import datetime -import glob import os import time from functools import partial @@ -99,23 +98,16 @@ def trace_handler( # Use tensorboard trace handler rather than directly exporting chrome traces since # tensorboard doesn't seem to be able to parse traces with prof.export_chrome_trace + + now = datetime.datetime.now() + exporter = tensorboard_trace_handler( curr_trace_dir, - worker_name="rank0", + worker_name=f"r0-{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}", use_gzip=True, ) exporter(prof) - latest_trace = max( - glob.glob(curr_trace_dir + "/*.pt.trace.json.gz"), key=os.path.getctime - ) - - now = datetime.datetime.now() - os.rename( - latest_trace, - f"{curr_trace_dir}/r0-{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}.pt.trace.json.gz", - ) - if rank == 0: log.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds")