From fb2f813d82dbfe60beaf7b4fbbcaf2bcd88d8ab6 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Tue, 10 Mar 2026 15:17:22 -0600 Subject: [PATCH 1/8] Simpler profiling from scratch --- .../routers/compute_provider.py | 5 + api/transformerlab/routers/experiment/jobs.py | 28 ++ .../schemas/compute_providers.py | 8 + lab-sdk/src/lab/profiling.py | 385 ++++++++++++++++++ lab-sdk/src/lab/remote_trap.py | 38 ++ .../Tasks/EmbeddableStreamingOutput.tsx | 57 ++- .../Experiment/Tasks/ProfilingReport.tsx | 205 ++++++++++ .../Experiment/Tasks/QueueTaskModal.tsx | 54 +++ .../components/Experiment/Tasks/Tasks.tsx | 8 + .../Tasks/ViewOutputModalStreaming.tsx | 7 +- src/renderer/lib/api-client/endpoints.ts | 2 + 11 files changed, 778 insertions(+), 19 deletions(-) create mode 100644 lab-sdk/src/lab/profiling.py create mode 100644 src/renderer/components/Experiment/Tasks/ProfilingReport.tsx diff --git a/api/transformerlab/routers/compute_provider.py b/api/transformerlab/routers/compute_provider.py index f160ddc13..151685909 100644 --- a/api/transformerlab/routers/compute_provider.py +++ b/api/transformerlab/routers/compute_provider.py @@ -1713,6 +1713,11 @@ async def launch_template_on_provider( if request.enable_trackio: env_vars["TLAB_TRACKIO_AUTO_INIT"] = "true" + if request.enable_profiling: + env_vars["_TFL_PROFILING"] = "1" + if request.enable_profiling_torch: + env_vars["_TFL_PROFILING_TORCH"] = "1" + # Get TFL_STORAGE_URI from storage context tfl_storage_uri = None try: diff --git a/api/transformerlab/routers/experiment/jobs.py b/api/transformerlab/routers/experiment/jobs.py index 296388306..d8bde0607 100644 --- a/api/transformerlab/routers/experiment/jobs.py +++ b/api/transformerlab/routers/experiment/jobs.py @@ -1691,3 +1691,31 @@ async def generate(): return StreamingResponse( generate(), media_type=media_type, headers={"Content-Disposition": f'inline; filename="{filename}"'} ) + + +@router.get("/{job_id}/profiling_report") +async def get_profiling_report( + job_id: str, + experimentId: str, + session: AsyncSession = Depends(get_async_session), + user_and_team: dict = Depends(get_user_and_team), +): + """ + Return the profiling_report.json written by tfl-remote-trap when _TFL_PROFILING=1. + + Returns 404 if profiling was not enabled or the job has not yet completed profiling. + """ + from lab.dirs import get_job_dir + + job_dir = await get_job_dir(job_id) + report_path = storage.join(job_dir, "profiling_report.json") + + if not await storage.exists(report_path): + raise HTTPException(status_code=404, detail="Profiling report not found for this job") + + try: + async with await storage.open(report_path, "r", encoding="utf-8") as f: + content = await f.read() + return json.loads(content) + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Failed to read profiling report: {exc}") from exc diff --git a/api/transformerlab/schemas/compute_providers.py b/api/transformerlab/schemas/compute_providers.py index 61dcc48ff..26b8830ca 100644 --- a/api/transformerlab/schemas/compute_providers.py +++ b/api/transformerlab/schemas/compute_providers.py @@ -166,6 +166,14 @@ class ProviderTemplateLaunchRequest(BaseModel): default=False, description="When True, set TLAB_TRACKIO_AUTO_INIT=true in the job environment so lab SDK can auto-integrate with Trackio.", ) + enable_profiling: Optional[bool] = Field( + default=False, + description="When True, set _TFL_PROFILING=1 to enable system-level CPU/GPU/memory sampling via tfl-remote-trap.", + ) + enable_profiling_torch: Optional[bool] = Field( + default=False, + description="When True (requires enable_profiling), also set _TFL_PROFILING_TORCH=1 to inject torch.profiler and export a Chrome trace.", + ) class ProviderTemplateFileUploadResponse(BaseModel): diff --git a/lab-sdk/src/lab/profiling.py b/lab-sdk/src/lab/profiling.py new file mode 100644 index 000000000..d04921a0d --- /dev/null +++ b/lab-sdk/src/lab/profiling.py @@ -0,0 +1,385 @@ +""" +Job profiling: background sampler for CPU, memory, and GPU resource usage. + +Usage in tfl-remote-trap (or any process wrapper): + + import subprocess + from lab.profiling import maybe_start_profiling, finalize_profiling + + proc = subprocess.Popen(...) + profiling_thread = maybe_start_profiling(proc.pid, job_dir) + + exit_code = proc.wait() + wall_time = time.monotonic() - start_time + + finalize_profiling(profiling_thread, job_dir, wall_time) + +Activation: + Set _TFL_PROFILING=1 in the job environment. + Set _TFL_PROFILING_INTERVAL= to change sampling interval (default 5). + Set _TFL_PROFILING_TORCH=1 to also inject torch.profiler tracing. +""" +from __future__ import annotations + +import json +import os +import subprocess +import tempfile +import threading +import time +from typing import Any, Dict, List, Optional + +_PROFILING_SAMPLES_FILE = "profiling_samples.jsonl" +_PROFILING_REPORT_FILE = "profiling_report.json" +_TORCH_PROFILE_DIR = "torch_profile" +_DEFAULT_INTERVAL_SEC = 5.0 + + +def _sample_cpu_memory(pid: int) -> Dict[str, Any]: + """Return CPU percent and RSS memory (MB) for the pid and its children.""" + try: + import psutil # type: ignore[import-not-found] + except ImportError: + return {} + + try: + parent = psutil.Process(pid) + procs = [parent] + parent.children(recursive=True) + cpu_total = 0.0 + rss_total = 0.0 + for p in procs: + try: + cpu_total += p.cpu_percent(interval=None) + rss_total += p.memory_info().rss + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return { + "cpu_percent": round(cpu_total, 2), + "memory_rss_mb": round(rss_total / (1024 * 1024), 2), + } + except Exception: + return {} + + +def _sample_gpus_nvidia() -> List[Dict[str, Any]]: + """Try pynvml first, then fall back to nvidia-smi subprocess.""" + # Try pynvml + try: + import pynvml # type: ignore[import-not-found] + + pynvml.nvmlInit() + count = pynvml.nvmlDeviceGetCount() + gpus = [] + for i in range(count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + util = pynvml.nvmlDeviceGetUtilizationRates(handle) + mem = pynvml.nvmlDeviceGetMemoryInfo(handle) + gpus.append( + { + "index": i, + "util_percent": util.gpu, + "mem_used_mb": round(mem.used / (1024 * 1024), 2), + "mem_total_mb": round(mem.total / (1024 * 1024), 2), + } + ) + return gpus + except Exception: + pass + + # Fall back to nvidia-smi + try: + result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=index,utilization.gpu,memory.used,memory.total", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0 or not result.stdout.strip(): + return [] + gpus = [] + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) < 4: + continue + try: + gpus.append( + { + "index": int(parts[0]), + "util_percent": float(parts[1]), + "mem_used_mb": float(parts[2]), + "mem_total_mb": float(parts[3]), + } + ) + except (ValueError, IndexError): + pass + return gpus + except Exception: + return [] + + +def _sample_gpus_amd() -> List[Dict[str, Any]]: + """Sample AMD GPU stats via rocm-smi.""" + try: + result = subprocess.run( + ["rocm-smi", "--showuse", "--showmemuse", "--csv"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0 or not result.stdout.strip(): + return [] + lines = result.stdout.strip().splitlines() + # rocm-smi CSV format can vary; best-effort parse + gpus = [] + for i, line in enumerate(lines[1:]): # skip header + parts = [p.strip() for p in line.split(",")] + if len(parts) < 2: + continue + try: + gpus.append( + { + "index": i, + "util_percent": float(parts[1].rstrip("%")), + "mem_used_mb": None, + "mem_total_mb": None, + } + ) + except (ValueError, IndexError): + pass + return gpus + except Exception: + return [] + + +def _sample_gpus() -> List[Dict[str, Any]]: + """Detect and sample GPU stats from NVIDIA or AMD hardware.""" + import shutil + + if shutil.which("nvidia-smi"): + return _sample_gpus_nvidia() + if shutil.which("rocm-smi"): + return _sample_gpus_amd() + return [] + + +class _ProfilingThread(threading.Thread): + """Background thread that periodically samples resource stats and writes to JSONL.""" + + def __init__(self, pid: int, job_dir: str, interval_sec: float = _DEFAULT_INTERVAL_SEC) -> None: + super().__init__(daemon=True, name="tfl-profiler") + self.pid = pid + self.job_dir = job_dir + self.interval_sec = interval_sec + self._stop_event = threading.Event() + self.samples: List[Dict[str, Any]] = [] + + def stop(self) -> None: + self._stop_event.set() + + def run(self) -> None: + samples_path = os.path.join(self.job_dir, _PROFILING_SAMPLES_FILE) + # Initialise cpu_percent (first call always returns 0.0 for psutil) + try: + import psutil # type: ignore[import-not-found] + + psutil.Process(self.pid).cpu_percent(interval=None) + except Exception: + pass + + try: + f = open(samples_path, "w", encoding="utf-8") + except OSError: + return + + try: + while not self._stop_event.wait(self.interval_sec): + sample: Dict[str, Any] = {"timestamp": time.time()} + sample.update(_sample_cpu_memory(self.pid)) + gpus = _sample_gpus() + if gpus: + sample["gpus"] = gpus + self.samples.append(sample) + try: + f.write(json.dumps(sample) + "\n") + f.flush() + except OSError: + pass + finally: + try: + f.close() + except OSError: + pass + + +def _aggregate_samples(samples: List[Dict[str, Any]], wall_time_sec: float, interval_sec: float) -> Dict[str, Any]: + """Summarise a list of samples into a profiling report dict.""" + report: Dict[str, Any] = { + "wall_time_sec": round(wall_time_sec, 2), + "sample_count": len(samples), + "interval_sec": interval_sec, + } + + if not samples: + return report + + cpu_values = [s["cpu_percent"] for s in samples if "cpu_percent" in s] + mem_values = [s["memory_rss_mb"] for s in samples if "memory_rss_mb" in s] + + if cpu_values: + report["cpu"] = { + "peak_percent": round(max(cpu_values), 2), + "avg_percent": round(sum(cpu_values) / len(cpu_values), 2), + } + if mem_values: + report["memory"] = { + "peak_rss_mb": round(max(mem_values), 2), + "avg_rss_mb": round(sum(mem_values) / len(mem_values), 2), + } + + # GPU aggregation: group by index + gpu_samples: Dict[int, List[Dict[str, Any]]] = {} + for s in samples: + for g in s.get("gpus", []): + idx = g.get("index", 0) + gpu_samples.setdefault(idx, []).append(g) + + if gpu_samples: + gpu_summaries = [] + for idx in sorted(gpu_samples.keys()): + gs = gpu_samples[idx] + utils = [g["util_percent"] for g in gs if g.get("util_percent") is not None] + mems_used = [g["mem_used_mb"] for g in gs if g.get("mem_used_mb") is not None] + mem_total = next((g["mem_total_mb"] for g in reversed(gs) if g.get("mem_total_mb") is not None), None) + entry: Dict[str, Any] = {"index": idx} + if utils: + entry["peak_util_percent"] = round(max(utils), 2) + entry["avg_util_percent"] = round(sum(utils) / len(utils), 2) + if mems_used: + entry["peak_mem_used_mb"] = round(max(mems_used), 2) + entry["avg_mem_used_mb"] = round(sum(mems_used) / len(mems_used), 2) + if mem_total is not None: + entry["mem_total_mb"] = round(mem_total, 2) + gpu_summaries.append(entry) + report["gpus"] = gpu_summaries + + return report + + +def maybe_start_profiling(pid: int, job_dir: str) -> Optional[_ProfilingThread]: + """ + Start a profiling thread if _TFL_PROFILING=1 is set in the environment. + + Returns the thread (caller must call finalize_profiling later) or None if profiling + is disabled or the job_dir is unavailable. + """ + if os.environ.get("_TFL_PROFILING") != "1": + return None + if not job_dir or not os.path.isdir(job_dir): + return None + try: + interval = float(os.environ.get("_TFL_PROFILING_INTERVAL", str(_DEFAULT_INTERVAL_SEC))) + except ValueError: + interval = _DEFAULT_INTERVAL_SEC + + thread = _ProfilingThread(pid=pid, job_dir=job_dir, interval_sec=interval) + thread.start() + return thread + + +def finalize_profiling( + thread: Optional[_ProfilingThread], + job_dir: str, + wall_time_sec: float, +) -> None: + """ + Stop the profiling thread and write profiling_report.json to job_dir. + + Safe to call even when thread is None (profiling disabled). + """ + if thread is None: + return + try: + thread.stop() + thread.join(timeout=10) + except Exception: + pass + + try: + report = _aggregate_samples(thread.samples, wall_time_sec, thread.interval_sec) + report_path = os.path.join(job_dir, _PROFILING_REPORT_FILE) + with open(report_path, "w", encoding="utf-8") as f: + json.dump(report, f, indent=2) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Optional PyTorch profiler injection via sitecustomize.py +# --------------------------------------------------------------------------- + +_SITECUSTOMIZE_TEMPLATE = """\ +# Auto-injected by tfl-profile-trap (lab-sdk profiling). +# Activates torch.profiler.profile() and exports a Chrome trace to the job dir. +import os as _os +import atexit as _atexit + +_TFL_TORCH_PROFILE_DIR = _os.environ.get("_TFL_TORCH_PROFILE_DIR", "") +if _TFL_TORCH_PROFILE_DIR: + try: + import torch # noqa: F401 + from torch.profiler import profile as _profile, ProfilerActivity as _PA + + _prof = _profile( + activities=[_PA.CPU, _PA.CUDA], + with_stack=False, + record_shapes=False, + ) + _prof.__enter__() + + def _export_trace(): + try: + _prof.__exit__(None, None, None) + import pathlib as _pl + _pl.Path(_TFL_TORCH_PROFILE_DIR).mkdir(parents=True, exist_ok=True) + _trace_path = _pl.Path(_TFL_TORCH_PROFILE_DIR) / "trace.json" + _prof.export_chrome_trace(str(_trace_path)) + except Exception: + pass + + _atexit.register(_export_trace) + except Exception: + pass +""" + + +def inject_torch_profiler(job_dir: str, env: dict) -> str: + """ + If _TFL_PROFILING_TORCH=1, write a sitecustomize.py to a temp dir and + prepend it to PYTHONPATH in env so torch.profiler auto-activates in the job. + + Returns the temp dir path (caller should clean up after the job exits). + """ + if os.environ.get("_TFL_PROFILING_TORCH") != "1": + return "" + + try: + torch_profile_dir = os.path.join(job_dir, _TORCH_PROFILE_DIR) + os.makedirs(torch_profile_dir, exist_ok=True) + + tmp_dir = tempfile.mkdtemp(prefix="tfl_sitecustomize_") + sitecustomize_path = os.path.join(tmp_dir, "sitecustomize.py") + with open(sitecustomize_path, "w", encoding="utf-8") as f: + f.write(_SITECUSTOMIZE_TEMPLATE) + + env["_TFL_TORCH_PROFILE_DIR"] = torch_profile_dir + existing_pythonpath = env.get("PYTHONPATH", "") + env["PYTHONPATH"] = ( + f"{tmp_dir}{os.pathsep}{existing_pythonpath}" if existing_pythonpath else tmp_dir + ) + return tmp_dir + except Exception: + return "" diff --git a/lab-sdk/src/lab/remote_trap.py b/lab-sdk/src/lab/remote_trap.py index 503753d51..596994824 100644 --- a/lab-sdk/src/lab/remote_trap.py +++ b/lab-sdk/src/lab/remote_trap.py @@ -4,10 +4,12 @@ import os import subprocess import sys +import time from typing import List from lab import Job, storage from lab.job_status import JobStatus +from lab.profiling import finalize_profiling, inject_torch_profiler, maybe_start_profiling async def _set_live_status_async(job_id: str, status: str) -> None: @@ -171,20 +173,43 @@ def main(argv: List[str] | None = None) -> int: _set_live_status("started") _set_status(JobStatus.RUNNING) + # Resolve job directory for profiling output (same path used by _write_provider_logs). + job_id = os.environ.get("_TFL_JOB_ID") + job_dir: str = "" + if job_id: + try: + from lab.dirs import get_job_dir + + async def _get_job_dir() -> str: + return await get_job_dir(job_id) + + job_dir = asyncio.run(_get_job_dir()) + except Exception: + job_dir = "" + + # Optionally inject torch.profiler via sitecustomize.py before spawning the process. + proc_env = os.environ.copy() + torch_tmp_dir = inject_torch_profiler(job_dir, proc_env) if job_dir else "" + # Run the original command in the shell so it behaves exactly as submitted. # Stream output line-by-line to avoid buffering large logs in memory (training # jobs can produce GBs of output). stdout and stderr are merged into a single # stream (stderr redirected to stdout) so we can tee to both the console and # the provider_logs.txt file. log_lines: List[str] = [] + start_time = time.monotonic() proc = subprocess.Popen( command_str, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + env=proc_env if torch_tmp_dir else None, ) + # Start profiling sidecar thread (no-op if _TFL_PROFILING is not set). + profiling_thread = maybe_start_profiling(proc.pid, job_dir) if job_dir else None + assert proc.stdout is not None for line in proc.stdout: try: @@ -195,10 +220,23 @@ def main(argv: List[str] | None = None) -> int: log_lines.append(line) exit_code = proc.wait() + wall_time = time.monotonic() - start_time combined_logs = "".join(log_lines) _write_provider_logs(combined_logs) + # Finalise profiling: stop sampler thread and write profiling_report.json. + finalize_profiling(profiling_thread, job_dir, wall_time) + + # Clean up torch sitecustomize temp dir (best-effort). + if torch_tmp_dir: + try: + import shutil + + shutil.rmtree(torch_tmp_dir, ignore_errors=True) + except Exception: + pass + # Update live_status based on outcome (best-effort). if exit_code == 0: _set_live_status("finished") diff --git a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx index d6ae69c67..c3e2ee4b5 100644 --- a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx +++ b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx @@ -26,6 +26,7 @@ import * as chatAPI from 'renderer/lib/transformerlab-api-sdk'; import { useExperimentInfo } from 'renderer/lib/ExperimentInfoContext'; import { jobChipColor } from 'renderer/lib/utils'; import PollingOutputTerminal from './PollingOutputTerminal'; +import ProfilingReport from './ProfilingReport'; interface ProviderLogsTerminalProps { logsText: string; @@ -198,30 +199,33 @@ function RefreshIndicator({ ); } -const TAB_OPTIONS: { value: 'output' | 'provider'; label: string }[] = [ +type TabValue = 'output' | 'provider' | 'profiling'; + +const TAB_OPTIONS: { value: TabValue; label: string }[] = [ { value: 'output', label: 'Lab SDK Output' }, { value: 'provider', label: 'Machine Logs' }, + { value: 'profiling', label: 'Profiling' }, ]; export interface EmbeddableStreamingOutputProps { jobId: number; /** Which tabs to show, in order. e.g. ['output', 'provider'] or ['provider'] for interactive tasks. */ - tabs?: ('output' | 'provider')[]; + tabs?: TabValue[]; /** Current job status string (e.g. 'RUNNING', 'COMPLETE'). Passed from the parent to avoid extra polling. */ jobStatus?: string; } export default function EmbeddableStreamingOutput({ jobId, - tabs: tabsProp = ['output', 'provider'], + tabs: tabsProp = ['output', 'provider', 'profiling'], jobStatus = '', }: EmbeddableStreamingOutputProps) { const { experimentInfo } = useExperimentInfo(); - const [activeTab, setActiveTab] = useState<'output' | 'provider'>('output'); + const [activeTab, setActiveTab] = useState('output'); const [viewLiveProviderLogs, setViewLiveProviderLogs] = useState(false); - const tabs = tabsProp.length > 0 ? tabsProp : ['output', 'provider']; + const tabs = tabsProp.length > 0 ? tabsProp : ['output', 'provider', 'profiling']; const showTabList = tabs.length > 1; const tabsKey = tabs.join(','); @@ -229,7 +233,7 @@ export default function EmbeddableStreamingOutput({ setActiveTab((current) => tabs.includes(current) ? current - : ((tabs[0] ?? 'output') as 'output' | 'provider'), + : ((tabs[0] ?? 'output') as TabValue), ); setViewLiveProviderLogs(false); // tabsKey is a stable serialization of tabs to avoid array reference churn @@ -332,9 +336,11 @@ export default function EmbeddableStreamingOutput({ onChange={(_event, value) => { if ( typeof value === 'string' && - (value === 'output' || value === 'provider') + (value === 'output' || + value === 'provider' || + value === 'profiling') ) { - setActiveTab(value); + setActiveTab(value as TabValue); } }} > @@ -391,13 +397,19 @@ export default function EmbeddableStreamingOutput({ )} - + {activeTab !== 'profiling' && ( + + )} + ) : activeTab === 'profiling' ? ( + + + ) : ( = 1024) return `${(mb / 1024).toFixed(1)} GB`; + return `${mb.toFixed(0)} MB`; +} + +function formatPct(v: number | undefined): string { + if (v == null) return '—'; + return `${v.toFixed(1)}%`; +} + +function formatSec(sec: number | undefined): string { + if (sec == null) return '—'; + if (sec < 60) return `${sec.toFixed(1)}s`; + const m = Math.floor(sec / 60); + const s = Math.round(sec % 60); + return `${m}m ${s}s`; +} + +interface StatCardProps { + label: string; + peak: string; + avg: string; +} + +function StatCard({ label, peak, avg }: StatCardProps) { + return ( + + + + {label} + + {peak} + + avg {avg} + + + + ); +} + +interface ProfilingReportProps { + jobId: number; +} + +export default function ProfilingReport({ jobId }: ProfilingReportProps) { + const { experimentInfo } = useExperimentInfo(); + + const url = + jobId !== -1 && experimentInfo?.id + ? chatAPI.Endpoints.Experiment.GetProfilingReport( + experimentInfo.id, + String(jobId), + ) + : null; + + const { data, isLoading, isError } = useSWR(url); + + if (!url || isLoading) { + return ( + + + Loading profiling report… + + + ); + } + + if (isError || !data) { + return ( + + + No profiling report available. Enable profiling when launching the job + to capture CPU and GPU metrics. + + + ); + } + + const report = data as ProfilingData; + + return ( + + + {/* Summary row */} + + + + + Wall Time + + + {formatSec(report.wall_time_sec)} + + + {report.sample_count ?? 0} samples /{' '} + {report.interval_sec ?? 5}s + + + + + {report.cpu && ( + + )} + + {report.memory && ( + + )} + + + {/* GPU table */} + {report.gpus && report.gpus.length > 0 && ( + <> + + GPU Summary + + + + + + + + + + + + + + {report.gpus.map((g) => ( + + + + + + + + + ))} + +
GPUPeak UtilAvg UtilPeak MemAvg MemTotal Mem
GPU {g.index}{formatPct(g.peak_util_percent)}{formatPct(g.avg_util_percent)}{formatMb(g.peak_mem_used_mb)}{formatMb(g.avg_mem_used_mb)}{formatMb(g.mem_total_mb)}
+
+ + )} + + {/* Note about torch trace */} + {report.gpus === undefined && !report.cpu && ( + + No resource samples were collected. The job may have been too short + to capture data. + + )} +
+
+ ); +} diff --git a/src/renderer/components/Experiment/Tasks/QueueTaskModal.tsx b/src/renderer/components/Experiment/Tasks/QueueTaskModal.tsx index 54a8bbbc3..35308aaeb 100644 --- a/src/renderer/components/Experiment/Tasks/QueueTaskModal.tsx +++ b/src/renderer/components/Experiment/Tasks/QueueTaskModal.tsx @@ -106,6 +106,8 @@ export default function QueueTaskModal({ const [lowerIsBetter, setLowerIsBetter] = React.useState(true); const [jobSlurmFlags, setJobSlurmFlags] = React.useState(['']); const [useTrackio, setUseTrackio] = React.useState(false); + const [useProfiling, setUseProfiling] = React.useState(false); + const [useProfilingTorch, setUseProfilingTorch] = React.useState(false); const loadingMessages = React.useMemo( () => [ 'Contacting compute provider…', @@ -612,6 +614,15 @@ export default function QueueTaskModal({ config.enable_trackio = true; } + // Profiling: when enabled, backend will set _TFL_PROFILING=1 so tfl-remote-trap + // samples CPU/GPU/memory during the job and writes profiling_report.json. + if (useProfiling) { + config.enable_profiling = true; + if (useProfilingTorch) { + config.enable_profiling_torch = true; + } + } + onSubmit(config); }; @@ -1313,6 +1324,49 @@ export default function QueueTaskModal({ + {/* Profiling Section */} + + Profiling + + { + setUseProfiling(e.target.checked); + if (!e.target.checked) setUseProfilingTorch(false); + }} + disabled={isSubmitting} + /> + + Enable CPU & GPU profiling for this run + + + + Samples CPU%, memory, and GPU utilization every few seconds + during the job. Results are available in the Profiling tab after + the job completes. + + {useProfiling && ( + + setUseProfilingTorch(e.target.checked)} + disabled={isSubmitting} + /> + + Also capture PyTorch op-level trace (Chrome trace format) + + + )} + + + + {/* Sweep Configuration Section */} = { output: 'Lab SDK Output', provider: 'Machine Logs', + profiling: 'Profiling', }; interface ViewOutputModalStreamingProps { jobId: number; setJobId: (jobId: number) => void; /** Which tabs to show, in order. e.g. ['output', 'provider'] or ['provider'] for interactive tasks. */ - tabs?: ('output' | 'provider')[]; + tabs?: ('output' | 'provider' | 'profiling')[]; /** Current job status string (e.g. 'RUNNING', 'COMPLETE'). */ jobStatus?: string; } @@ -19,7 +20,7 @@ interface ViewOutputModalStreamingProps { function ViewOutputModalStreaming({ jobId, setJobId, - tabs = ['output', 'provider'], + tabs = ['output', 'provider', 'profiling'], jobStatus = '', }: ViewOutputModalStreamingProps) { if (jobId === -1) { @@ -61,7 +62,7 @@ function ViewOutputModalStreaming({ } ViewOutputModalStreaming.defaultProps = { - tabs: ['output', 'provider'], + tabs: ['output', 'provider', 'profiling'], jobStatus: '', }; diff --git a/src/renderer/lib/api-client/endpoints.ts b/src/renderer/lib/api-client/endpoints.ts index e540d4247..5f54ac514 100644 --- a/src/renderer/lib/api-client/endpoints.ts +++ b/src/renderer/lib/api-client/endpoints.ts @@ -468,6 +468,8 @@ Endpoints.Experiment = { tailLines: number = 1000, ) => `${API_URL()}experiment/${experimentId}/jobs/${jobId}/tunnel_info?tail_lines=${tailLines}`, + GetProfilingReport: (experimentId: string, jobId: string) => + `${API_URL()}experiment/${experimentId}/jobs/${jobId}/profiling_report`, GetAdditionalDetails: ( experimentId: string, jobId: string, From 6a3586a6cc47309cd05e32c6f96dfdfad42d6293 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Tue, 10 Mar 2026 15:17:40 -0600 Subject: [PATCH 2/8] prettier --- .../Experiment/Tasks/EmbeddableStreamingOutput.tsx | 11 ++++------- .../components/Experiment/Tasks/ProfilingReport.tsx | 3 +-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx index c3e2ee4b5..68039755c 100644 --- a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx +++ b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx @@ -225,15 +225,14 @@ export default function EmbeddableStreamingOutput({ const [viewLiveProviderLogs, setViewLiveProviderLogs] = useState(false); - const tabs = tabsProp.length > 0 ? tabsProp : ['output', 'provider', 'profiling']; + const tabs = + tabsProp.length > 0 ? tabsProp : ['output', 'provider', 'profiling']; const showTabList = tabs.length > 1; const tabsKey = tabs.join(','); useEffect(() => { setActiveTab((current) => - tabs.includes(current) - ? current - : ((tabs[0] ?? 'output') as TabValue), + tabs.includes(current) ? current : ((tabs[0] ?? 'output') as TabValue), ); setViewLiveProviderLogs(false); // tabsKey is a stable serialization of tabs to avoid array reference churn @@ -403,9 +402,7 @@ export default function EmbeddableStreamingOutput({ activeTab === 'output' ? outputCountdown : providerCountdown } isRefreshing={ - activeTab === 'output' - ? outputIsValidating - : providerIsValidating + activeTab === 'output' ? outputIsValidating : providerIsValidating } onRefresh={handleManualRefresh} /> diff --git a/src/renderer/components/Experiment/Tasks/ProfilingReport.tsx b/src/renderer/components/Experiment/Tasks/ProfilingReport.tsx index d51cec587..eb2fc751f 100644 --- a/src/renderer/components/Experiment/Tasks/ProfilingReport.tsx +++ b/src/renderer/components/Experiment/Tasks/ProfilingReport.tsx @@ -135,8 +135,7 @@ export default function ProfilingReport({ jobId }: ProfilingReportProps) { {formatSec(report.wall_time_sec)} - {report.sample_count ?? 0} samples /{' '} - {report.interval_sec ?? 5}s + {report.sample_count ?? 0} samples / {report.interval_sec ?? 5}s From 0d2c8338ea06401bcfba15c5be0c80e8bc64ab29 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 13 Mar 2026 11:55:01 -0600 Subject: [PATCH 3/8] Changes to sync after run --- .../routers/compute_provider.py | 5 +- api/transformerlab/routers/experiment/jobs.py | 11 +-- lab-sdk/src/lab/dirs.py | 11 +++ lab-sdk/src/lab/job.py | 6 ++ lab-sdk/src/lab/lab_facade.py | 18 +++++ lab-sdk/src/lab/profiling.py | 76 +++++++++++++------ lab-sdk/src/lab/remote_trap.py | 45 ++++++----- 7 files changed, 124 insertions(+), 48 deletions(-) diff --git a/api/transformerlab/routers/compute_provider.py b/api/transformerlab/routers/compute_provider.py index 0bc6ceb6b..869c81a94 100644 --- a/api/transformerlab/routers/compute_provider.py +++ b/api/transformerlab/routers/compute_provider.py @@ -1694,7 +1694,10 @@ async def launch_template_on_provider( # Ensure transformerlab SDK is available on remote machines for live_status tracking and other helpers. # This runs after AWS credentials are configured so we have access to any remote storage if needed. if provider.type != ProviderType.LOCAL.value: - setup_commands.append("pip install -q transformerlab") + # setup_commands.append("pip install -q transformerlab") + setup_commands.append( + "git clone https://github.com/transformerlab/transformerlab-app; cd transformerlab-app; git checkout add/profiler; pip install -e lab-sdk/; cd ~;" + ) # Add GitHub clone setup if enabled if request.github_repo_url: diff --git a/api/transformerlab/routers/experiment/jobs.py b/api/transformerlab/routers/experiment/jobs.py index c462702f2..3b557a825 100644 --- a/api/transformerlab/routers/experiment/jobs.py +++ b/api/transformerlab/routers/experiment/jobs.py @@ -1708,14 +1708,15 @@ async def get_profiling_report( user_and_team: dict = Depends(get_user_and_team), ): """ - Return the profiling_report.json written by tfl-remote-trap when _TFL_PROFILING=1. + Return the profiling_report.json from the job's profiling folder (written when + _TFL_PROFILING=1 and copied on lab.finish/error or when the remote trap exits). - Returns 404 if profiling was not enabled or the job has not yet completed profiling. + Returns 404 if profiling was not enabled or the report is not yet available. """ - from lab.dirs import get_job_dir + from lab.dirs import get_job_profiling_dir - job_dir = await get_job_dir(job_id) - report_path = storage.join(job_dir, "profiling_report.json") + profiling_dir = await get_job_profiling_dir(job_id) + report_path = storage.join(profiling_dir, "profiling_report.json") if not await storage.exists(report_path): raise HTTPException(status_code=404, detail="Profiling report not found for this job") diff --git a/lab-sdk/src/lab/dirs.py b/lab-sdk/src/lab/dirs.py index bc1929999..b40d7b0c2 100644 --- a/lab-sdk/src/lab/dirs.py +++ b/lab-sdk/src/lab/dirs.py @@ -267,6 +267,17 @@ async def get_job_artifacts_dir(job_id: str | int) -> str: return path +async def get_job_profiling_dir(job_id: str | int) -> str: + """ + Return the profiling directory for a specific job, creating it if needed. + Example: ~/.transformerlab/workspace/jobs//profiling + """ + job_dir = await get_job_dir(job_id) + path = storage.join(job_dir, "profiling") + await storage.makedirs(path, exist_ok=True) + return path + + async def get_job_checkpoints_dir(job_id: str | int) -> str: """ Return the checkpoints directory for a specific job, creating it if needed. diff --git a/lab-sdk/src/lab/job.py b/lab-sdk/src/lab/job.py index f749b1eaa..0ac75a7b2 100644 --- a/lab-sdk/src/lab/job.py +++ b/lab-sdk/src/lab/job.py @@ -294,6 +294,12 @@ async def get_artifacts_dir(self): """ return await dirs.get_job_artifacts_dir(self.id) + async def get_profiling_dir(self): + """ + Get the profiling directory path for this job. + """ + return await dirs.get_job_profiling_dir(self.id) + async def get_checkpoint_paths(self): """ Get list of checkpoint paths for this job. diff --git a/lab-sdk/src/lab/lab_facade.py b/lab-sdk/src/lab/lab_facade.py index ed1fbc594..86bb5ca3d 100644 --- a/lab-sdk/src/lab/lab_facade.py +++ b/lab-sdk/src/lab/lab_facade.py @@ -538,6 +538,15 @@ def finish( Mark the job as successfully completed and set completion metadata. """ self._ensure_initialized() + # Copy profiling from temp dir into job's profiling folder (when run under remote trap). + try: + profiling_temp = os.environ.get("_TFL_PROFILING_TEMP_DIR") + if profiling_temp and self._job: + from lab.profiling import copy_profiling_to_job + + _run_async(copy_profiling_to_job(profiling_temp, str(self._job.id))) # type: ignore[union-attr] + except Exception: + pass _run_async(self._job.update_progress(100)) # type: ignore[union-attr] _run_async(self._job.update_status(JobStatus.COMPLETE)) # type: ignore[union-attr] _run_async(self._job.update_job_data_field("completion_status", "success")) # type: ignore[union-attr] @@ -1435,6 +1444,15 @@ def error( Mark the job as failed and set completion metadata. """ self._ensure_initialized() + # Copy profiling from temp dir into job's profiling folder (when run under remote trap). + try: + profiling_temp = os.environ.get("_TFL_PROFILING_TEMP_DIR") + if profiling_temp and self._job: + from lab.profiling import copy_profiling_to_job + + _run_async(copy_profiling_to_job(profiling_temp, str(self._job.id))) # type: ignore[union-attr] + except Exception: + pass _run_async(self._job.update_status(JobStatus.COMPLETE)) # type: ignore[union-attr] _run_async(self._job.update_job_data_field("completion_status", "failed")) # type: ignore[union-attr] _run_async(self._job.update_job_data_field("completion_details", message)) # type: ignore[union-attr] diff --git a/lab-sdk/src/lab/profiling.py b/lab-sdk/src/lab/profiling.py index d04921a0d..fa6618eef 100644 --- a/lab-sdk/src/lab/profiling.py +++ b/lab-sdk/src/lab/profiling.py @@ -1,18 +1,19 @@ """ Job profiling: background sampler for CPU, memory, and GPU resource usage. -Usage in tfl-remote-trap (or any process wrapper): - - import subprocess - from lab.profiling import maybe_start_profiling, finalize_profiling +Profiling writes to a temp directory during the run. The contents are copied into +the job's "profiling" folder (alongside "artifacts") when: + - lab.finish() or lab.error() is called (if _TFL_PROFILING_TEMP_DIR is set), or + - the remote trap exits after the child process (trap copies then). - proc = subprocess.Popen(...) - profiling_thread = maybe_start_profiling(proc.pid, job_dir) - - exit_code = proc.wait() - wall_time = time.monotonic() - start_time +Usage in tfl-remote-trap (or any process wrapper): - finalize_profiling(profiling_thread, job_dir, wall_time) + output_dir = tempfile.mkdtemp(prefix="tfl_profiling_") + os.environ["_TFL_PROFILING_TEMP_DIR"] = output_dir # so lab.finish/error can copy + profiling_thread = maybe_start_profiling(proc.pid, output_dir) + ... + finalize_profiling(profiling_thread, output_dir, wall_time) + await copy_profiling_to_job(output_dir, job_id) # or call from lab.finish/error Activation: Set _TFL_PROFILING=1 in the job environment. @@ -169,10 +170,10 @@ def _sample_gpus() -> List[Dict[str, Any]]: class _ProfilingThread(threading.Thread): """Background thread that periodically samples resource stats and writes to JSONL.""" - def __init__(self, pid: int, job_dir: str, interval_sec: float = _DEFAULT_INTERVAL_SEC) -> None: + def __init__(self, pid: int, output_dir: str, interval_sec: float = _DEFAULT_INTERVAL_SEC) -> None: super().__init__(daemon=True, name="tfl-profiler") self.pid = pid - self.job_dir = job_dir + self.output_dir = output_dir self.interval_sec = interval_sec self._stop_event = threading.Event() self.samples: List[Dict[str, Any]] = [] @@ -181,7 +182,7 @@ def stop(self) -> None: self._stop_event.set() def run(self) -> None: - samples_path = os.path.join(self.job_dir, _PROFILING_SAMPLES_FILE) + samples_path = os.path.join(self.output_dir, _PROFILING_SAMPLES_FILE) # Initialise cpu_percent (first call always returns 0.0 for psutil) try: import psutil # type: ignore[import-not-found] @@ -269,34 +270,38 @@ def _aggregate_samples(samples: List[Dict[str, Any]], wall_time_sec: float, inte return report -def maybe_start_profiling(pid: int, job_dir: str) -> Optional[_ProfilingThread]: +def maybe_start_profiling(pid: int, output_dir: str) -> Optional[_ProfilingThread]: """ Start a profiling thread if _TFL_PROFILING=1 is set in the environment. - Returns the thread (caller must call finalize_profiling later) or None if profiling - is disabled or the job_dir is unavailable. + output_dir: temp directory to write profiling_samples.jsonl (and later + profiling_report.json). Caller must create it and pass the same path to + finalize_profiling and copy_profiling_to_job. + + Returns the thread (caller must call finalize_profiling later) or None if + profiling is disabled or output_dir is unavailable. """ if os.environ.get("_TFL_PROFILING") != "1": return None - if not job_dir or not os.path.isdir(job_dir): + if not output_dir or not os.path.isdir(output_dir): return None try: interval = float(os.environ.get("_TFL_PROFILING_INTERVAL", str(_DEFAULT_INTERVAL_SEC))) except ValueError: interval = _DEFAULT_INTERVAL_SEC - thread = _ProfilingThread(pid=pid, job_dir=job_dir, interval_sec=interval) + thread = _ProfilingThread(pid=pid, output_dir=output_dir, interval_sec=interval) thread.start() return thread def finalize_profiling( thread: Optional[_ProfilingThread], - job_dir: str, + output_dir: str, wall_time_sec: float, ) -> None: """ - Stop the profiling thread and write profiling_report.json to job_dir. + Stop the profiling thread and write profiling_report.json to output_dir. Safe to call even when thread is None (profiling disabled). """ @@ -310,20 +315,39 @@ def finalize_profiling( try: report = _aggregate_samples(thread.samples, wall_time_sec, thread.interval_sec) - report_path = os.path.join(job_dir, _PROFILING_REPORT_FILE) + report_path = os.path.join(output_dir, _PROFILING_REPORT_FILE) with open(report_path, "w", encoding="utf-8") as f: json.dump(report, f, indent=2) except Exception: pass +async def copy_profiling_to_job(profiling_temp_dir: str, job_id: str) -> None: + """ + Copy profiling output from a temp directory into the job's profiling folder. + + Uses the storage abstraction so the destination may be local or remote (e.g. S3). + Safe to call if profiling_temp_dir is missing or empty; no-op on failure. + """ + if not profiling_temp_dir or not os.path.isdir(profiling_temp_dir): + return + try: + from lab.dirs import get_job_profiling_dir + from lab import storage + + dest_dir = await get_job_profiling_dir(job_id) + await storage.copy_dir(profiling_temp_dir, dest_dir) + except Exception: + pass + + # --------------------------------------------------------------------------- # Optional PyTorch profiler injection via sitecustomize.py # --------------------------------------------------------------------------- _SITECUSTOMIZE_TEMPLATE = """\ # Auto-injected by tfl-profile-trap (lab-sdk profiling). -# Activates torch.profiler.profile() and exports a Chrome trace to the job dir. +# Activates torch.profiler.profile() and exports a Chrome trace to the profiling output dir. import os as _os import atexit as _atexit @@ -356,18 +380,20 @@ def _export_trace(): """ -def inject_torch_profiler(job_dir: str, env: dict) -> str: +def inject_torch_profiler(profiling_output_dir: str, env: dict) -> str: """ If _TFL_PROFILING_TORCH=1, write a sitecustomize.py to a temp dir and prepend it to PYTHONPATH in env so torch.profiler auto-activates in the job. + Trace is written under profiling_output_dir/torch_profile so it is copied + with the rest of profiling data. - Returns the temp dir path (caller should clean up after the job exits). + Returns the sitecustomize temp dir path (caller should clean up after the job exits). """ if os.environ.get("_TFL_PROFILING_TORCH") != "1": return "" try: - torch_profile_dir = os.path.join(job_dir, _TORCH_PROFILE_DIR) + torch_profile_dir = os.path.join(profiling_output_dir, _TORCH_PROFILE_DIR) os.makedirs(torch_profile_dir, exist_ok=True) tmp_dir = tempfile.mkdtemp(prefix="tfl_sitecustomize_") diff --git a/lab-sdk/src/lab/remote_trap.py b/lab-sdk/src/lab/remote_trap.py index 596994824..0de26cc46 100644 --- a/lab-sdk/src/lab/remote_trap.py +++ b/lab-sdk/src/lab/remote_trap.py @@ -4,12 +4,13 @@ import os import subprocess import sys +import tempfile import time from typing import List from lab import Job, storage from lab.job_status import JobStatus -from lab.profiling import finalize_profiling, inject_torch_profiler, maybe_start_profiling +from lab.profiling import copy_profiling_to_job, finalize_profiling, inject_torch_profiler, maybe_start_profiling async def _set_live_status_async(job_id: str, status: str) -> None: @@ -173,23 +174,20 @@ def main(argv: List[str] | None = None) -> int: _set_live_status("started") _set_status(JobStatus.RUNNING) - # Resolve job directory for profiling output (same path used by _write_provider_logs). job_id = os.environ.get("_TFL_JOB_ID") - job_dir: str = "" - if job_id: + # Profiling writes to a temp dir; we copy it into job's "profiling" folder on exit + # (and lab.finish/error copy from _TFL_PROFILING_TEMP_DIR when the user calls them). + profiling_temp_dir: str = "" + if job_id and os.environ.get("_TFL_PROFILING") == "1": try: - from lab.dirs import get_job_dir + profiling_temp_dir = tempfile.mkdtemp(prefix="tfl_profiling_") + except OSError: + profiling_temp_dir = "" - async def _get_job_dir() -> str: - return await get_job_dir(job_id) - - job_dir = asyncio.run(_get_job_dir()) - except Exception: - job_dir = "" - - # Optionally inject torch.profiler via sitecustomize.py before spawning the process. proc_env = os.environ.copy() - torch_tmp_dir = inject_torch_profiler(job_dir, proc_env) if job_dir else "" + if profiling_temp_dir: + proc_env["_TFL_PROFILING_TEMP_DIR"] = profiling_temp_dir + torch_tmp_dir = inject_torch_profiler(profiling_temp_dir, proc_env) if profiling_temp_dir else "" # Run the original command in the shell so it behaves exactly as submitted. # Stream output line-by-line to avoid buffering large logs in memory (training @@ -208,7 +206,7 @@ async def _get_job_dir() -> str: ) # Start profiling sidecar thread (no-op if _TFL_PROFILING is not set). - profiling_thread = maybe_start_profiling(proc.pid, job_dir) if job_dir else None + profiling_thread = maybe_start_profiling(proc.pid, profiling_temp_dir) if profiling_temp_dir else None assert proc.stdout is not None for line in proc.stdout: @@ -225,8 +223,21 @@ async def _get_job_dir() -> str: combined_logs = "".join(log_lines) _write_provider_logs(combined_logs) - # Finalise profiling: stop sampler thread and write profiling_report.json. - finalize_profiling(profiling_thread, job_dir, wall_time) + # Finalise profiling: stop sampler thread and write report to profiling temp dir. + finalize_profiling(profiling_thread, profiling_temp_dir, wall_time) + + # Copy profiling output from temp dir into job's profiling folder (same as lab.finish/error). + if profiling_temp_dir and job_id: + try: + asyncio.run(copy_profiling_to_job(profiling_temp_dir, job_id)) + except Exception: + pass + try: + import shutil + + shutil.rmtree(profiling_temp_dir, ignore_errors=True) + except Exception: + pass # Clean up torch sitecustomize temp dir (best-effort). if torch_tmp_dir: From fe0ae8abcdf04054aee6971878b854c0e9930190 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 13 Mar 2026 12:12:07 -0600 Subject: [PATCH 4/8] move profiler modal and add job data flag --- lab-sdk/src/lab/profiling.py | 14 ++++-- .../Tasks/EmbeddableStreamingOutput.tsx | 46 +++++-------------- .../components/Experiment/Tasks/JobsList.tsx | 10 +++- .../components/Experiment/Tasks/Tasks.tsx | 8 ++++ .../Tasks/ViewOutputModalStreaming.tsx | 7 ++- .../Experiment/Tasks/ViewProfilingModal.tsx | 38 +++++++++++++++ 6 files changed, 81 insertions(+), 42 deletions(-) create mode 100644 src/renderer/components/Experiment/Tasks/ViewProfilingModal.tsx diff --git a/lab-sdk/src/lab/profiling.py b/lab-sdk/src/lab/profiling.py index fa6618eef..517055e49 100644 --- a/lab-sdk/src/lab/profiling.py +++ b/lab-sdk/src/lab/profiling.py @@ -20,6 +20,7 @@ Set _TFL_PROFILING_INTERVAL= to change sampling interval (default 5). Set _TFL_PROFILING_TORCH=1 to also inject torch.profiler tracing. """ + from __future__ import annotations import json @@ -327,6 +328,7 @@ async def copy_profiling_to_job(profiling_temp_dir: str, job_id: str) -> None: Copy profiling output from a temp directory into the job's profiling folder. Uses the storage abstraction so the destination may be local or remote (e.g. S3). + Sets has_profiling=True in job_data so the UI can show a "View Profiling" option. Safe to call if profiling_temp_dir is missing or empty; no-op on failure. """ if not profiling_temp_dir or not os.path.isdir(profiling_temp_dir): @@ -337,6 +339,14 @@ async def copy_profiling_to_job(profiling_temp_dir: str, job_id: str) -> None: dest_dir = await get_job_profiling_dir(job_id) await storage.copy_dir(profiling_temp_dir, dest_dir) + try: + from lab.job import Job + + job = await Job.get(job_id) + if job is not None: + await job.update_job_data_field("has_profiling", True) + except Exception: + pass except Exception: pass @@ -403,9 +413,7 @@ def inject_torch_profiler(profiling_output_dir: str, env: dict) -> str: env["_TFL_TORCH_PROFILE_DIR"] = torch_profile_dir existing_pythonpath = env.get("PYTHONPATH", "") - env["PYTHONPATH"] = ( - f"{tmp_dir}{os.pathsep}{existing_pythonpath}" if existing_pythonpath else tmp_dir - ) + env["PYTHONPATH"] = f"{tmp_dir}{os.pathsep}{existing_pythonpath}" if existing_pythonpath else tmp_dir return tmp_dir except Exception: return "" diff --git a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx index 68039755c..02f357d3d 100644 --- a/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx +++ b/src/renderer/components/Experiment/Tasks/EmbeddableStreamingOutput.tsx @@ -26,7 +26,6 @@ import * as chatAPI from 'renderer/lib/transformerlab-api-sdk'; import { useExperimentInfo } from 'renderer/lib/ExperimentInfoContext'; import { jobChipColor } from 'renderer/lib/utils'; import PollingOutputTerminal from './PollingOutputTerminal'; -import ProfilingReport from './ProfilingReport'; interface ProviderLogsTerminalProps { logsText: string; @@ -199,12 +198,11 @@ function RefreshIndicator({ ); } -type TabValue = 'output' | 'provider' | 'profiling'; +type TabValue = 'output' | 'provider'; const TAB_OPTIONS: { value: TabValue; label: string }[] = [ { value: 'output', label: 'Lab SDK Output' }, { value: 'provider', label: 'Machine Logs' }, - { value: 'profiling', label: 'Profiling' }, ]; export interface EmbeddableStreamingOutputProps { @@ -217,7 +215,7 @@ export interface EmbeddableStreamingOutputProps { export default function EmbeddableStreamingOutput({ jobId, - tabs: tabsProp = ['output', 'provider', 'profiling'], + tabs: tabsProp = ['output', 'provider'], jobStatus = '', }: EmbeddableStreamingOutputProps) { const { experimentInfo } = useExperimentInfo(); @@ -225,8 +223,7 @@ export default function EmbeddableStreamingOutput({ const [viewLiveProviderLogs, setViewLiveProviderLogs] = useState(false); - const tabs = - tabsProp.length > 0 ? tabsProp : ['output', 'provider', 'profiling']; + const tabs = tabsProp.length > 0 ? tabsProp : ['output', 'provider']; const showTabList = tabs.length > 1; const tabsKey = tabs.join(','); @@ -335,9 +332,7 @@ export default function EmbeddableStreamingOutput({ onChange={(_event, value) => { if ( typeof value === 'string' && - (value === 'output' || - value === 'provider' || - value === 'profiling') + (value === 'output' || value === 'provider') ) { setActiveTab(value as TabValue); } @@ -396,17 +391,13 @@ export default function EmbeddableStreamingOutput({ )}
- {activeTab !== 'profiling' && ( - - )} + - ) : activeTab === 'profiling' ? ( - - - ) : ( void; onViewJobDatasets?: (jobId: string) => void; onViewJobModels?: (jobId: string) => void; + onViewProfiling?: (jobId: string) => void; onViewFileBrowser?: (jobId: string) => void; loading: boolean; onViewTrackio?: (jobId: string) => void; @@ -69,6 +70,7 @@ const JobsList: React.FC = ({ onViewInteractive, onViewJobDatasets, onViewJobModels, + onViewProfiling, onViewFileBrowser, loading, onViewTrackio, @@ -348,7 +350,8 @@ const JobsList: React.FC = ({ {(job?.job_data?.artifacts || job?.job_data?.artifacts_dir || job?.job_data?.generated_datasets || - job?.job_data?.models) && ( + job?.job_data?.models || + job?.job_data?.has_profiling) && ( = ({ View Artifacts )} + {job?.job_data?.has_profiling && ( + onViewProfiling?.(job?.id)}> + View Profiling + + )} {job?.job_data?.generated_datasets && ( onViewJobDatasets?.(job?.id)} diff --git a/src/renderer/components/Experiment/Tasks/Tasks.tsx b/src/renderer/components/Experiment/Tasks/Tasks.tsx index 0dc956ea5..533c09054 100644 --- a/src/renderer/components/Experiment/Tasks/Tasks.tsx +++ b/src/renderer/components/Experiment/Tasks/Tasks.tsx @@ -21,6 +21,7 @@ import DeleteTaskConfirmModal from './DeleteTaskConfirmModal'; import QueueTaskModal from './QueueTaskModal'; import ViewOutputModalStreaming from './ViewOutputModalStreaming'; import ViewArtifactsModal from './ViewArtifactsModal'; +import ViewProfilingModal from './ViewProfilingModal'; import ViewCheckpointsModal from './ViewCheckpointsModal'; import ViewEvalResultsModal from './ViewEvalResultsModal'; import CompareEvalResultsModal from './CompareEvalResultsModal'; @@ -52,6 +53,7 @@ export default function Tasks({ subtype }: { subtype?: string }) { useState(-1); const [viewCheckpointsFromJob, setViewCheckpointsFromJob] = useState(-1); const [viewArtifactsFromJob, setViewArtifactsFromJob] = useState(-1); + const [viewProfilingFromJob, setViewProfilingFromJob] = useState(-1); const [viewEvalImagesFromJob, setViewEvalImagesFromJob] = useState(-1); const [viewOutputFromSweepJob, setViewOutputFromSweepJob] = useState(false); const [viewSweepResultsFromJob, setViewSweepResultsFromJob] = useState(-1); @@ -1270,6 +1272,7 @@ export default function Tasks({ subtype }: { subtype?: string }) { setViewCheckpointsFromJob(parseInt(jobId)) } onViewArtifacts={(jobId) => setViewArtifactsFromJob(parseInt(jobId))} + onViewProfiling={(jobId) => setViewProfilingFromJob(parseInt(jobId))} onViewEvalImages={(jobId) => setViewEvalImagesFromJob(parseInt(jobId)) } @@ -1333,6 +1336,11 @@ export default function Tasks({ subtype }: { subtype?: string }) { onClose={() => setViewArtifactsFromJob(-1)} jobId={viewArtifactsFromJob} /> + setViewProfilingFromJob(-1)} + jobId={viewProfilingFromJob} + /> setViewCheckpointsFromJob(-1)} diff --git a/src/renderer/components/Experiment/Tasks/ViewOutputModalStreaming.tsx b/src/renderer/components/Experiment/Tasks/ViewOutputModalStreaming.tsx index 06ba1ae84..12a30c545 100644 --- a/src/renderer/components/Experiment/Tasks/ViewOutputModalStreaming.tsx +++ b/src/renderer/components/Experiment/Tasks/ViewOutputModalStreaming.tsx @@ -5,14 +5,13 @@ import EmbeddableStreamingOutput from './EmbeddableStreamingOutput'; const TAB_LABELS: Record = { output: 'Lab SDK Output', provider: 'Machine Logs', - profiling: 'Profiling', }; interface ViewOutputModalStreamingProps { jobId: number; setJobId: (jobId: number) => void; /** Which tabs to show, in order. e.g. ['output', 'provider'] or ['provider'] for interactive tasks. */ - tabs?: ('output' | 'provider' | 'profiling')[]; + tabs?: ('output' | 'provider')[]; /** Current job status string (e.g. 'RUNNING', 'COMPLETE'). */ jobStatus?: string; } @@ -20,7 +19,7 @@ interface ViewOutputModalStreamingProps { function ViewOutputModalStreaming({ jobId, setJobId, - tabs = ['output', 'provider', 'profiling'], + tabs = ['output', 'provider'], jobStatus = '', }: ViewOutputModalStreamingProps) { if (jobId === -1) { @@ -62,7 +61,7 @@ function ViewOutputModalStreaming({ } ViewOutputModalStreaming.defaultProps = { - tabs: ['output', 'provider', 'profiling'], + tabs: ['output', 'provider'], jobStatus: '', }; diff --git a/src/renderer/components/Experiment/Tasks/ViewProfilingModal.tsx b/src/renderer/components/Experiment/Tasks/ViewProfilingModal.tsx new file mode 100644 index 000000000..eaec187df --- /dev/null +++ b/src/renderer/components/Experiment/Tasks/ViewProfilingModal.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import { Box, Modal, ModalClose, ModalDialog, Typography } from '@mui/joy'; +import ProfilingReport from './ProfilingReport'; + +interface ViewProfilingModalProps { + open: boolean; + onClose: () => void; + jobId: number; +} + +export default function ViewProfilingModal({ + open, + onClose, + jobId, +}: ViewProfilingModalProps) { + return ( + + + + + Profiling – Job {jobId} + + + {jobId !== -1 && } + + + + ); +} From 458e577efab7764596dddba232a2e85ba7243373 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 13 Mar 2026 12:26:28 -0600 Subject: [PATCH 5/8] sdk version --- api/pyproject.toml | 2 +- lab-sdk/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/pyproject.toml b/api/pyproject.toml index fada1a5c5..16017528d 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "soundfile==0.13.1", "tensorboardX==2.6.2.2", "timm==1.0.15", - "transformerlab==0.0.98", + "transformerlab==0.0.99", "transformerlab-inference==0.2.52", "transformers==4.57.1", "wandb==0.23.1", diff --git a/lab-sdk/pyproject.toml b/lab-sdk/pyproject.toml index 5c419da3d..d17f0fc8a 100644 --- a/lab-sdk/pyproject.toml +++ b/lab-sdk/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "transformerlab" -version = "0.0.98" +version = "0.0.99" description = "Python SDK for Transformer Lab" readme = "README.md" requires-python = ">=3.10" From bebdc729782ee73eb44a72f43759b2dc60033ce8 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 13 Mar 2026 12:28:58 -0600 Subject: [PATCH 6/8] restore the setup commands back --- api/transformerlab/routers/compute_provider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/transformerlab/routers/compute_provider.py b/api/transformerlab/routers/compute_provider.py index 869c81a94..846a565cb 100644 --- a/api/transformerlab/routers/compute_provider.py +++ b/api/transformerlab/routers/compute_provider.py @@ -1694,10 +1694,10 @@ async def launch_template_on_provider( # Ensure transformerlab SDK is available on remote machines for live_status tracking and other helpers. # This runs after AWS credentials are configured so we have access to any remote storage if needed. if provider.type != ProviderType.LOCAL.value: - # setup_commands.append("pip install -q transformerlab") - setup_commands.append( - "git clone https://github.com/transformerlab/transformerlab-app; cd transformerlab-app; git checkout add/profiler; pip install -e lab-sdk/; cd ~;" - ) + setup_commands.append("pip install -q transformerlab") + # Install torch as well if torch profiler is enabled + if request.enable_profiling_torch: + setup_commands.append("pip install -q torch") # Add GitHub clone setup if enabled if request.github_repo_url: From 0657f333643ac580f288ab40ccb38e7598b6e9f6 Mon Sep 17 00:00:00 2001 From: deep1401 Date: Fri, 13 Mar 2026 12:36:46 -0600 Subject: [PATCH 7/8] retrigger --- api/transformerlab/routers/compute_provider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/transformerlab/routers/compute_provider.py b/api/transformerlab/routers/compute_provider.py index 846a565cb..3e52e90d4 100644 --- a/api/transformerlab/routers/compute_provider.py +++ b/api/transformerlab/routers/compute_provider.py @@ -1695,6 +1695,7 @@ async def launch_template_on_provider( # This runs after AWS credentials are configured so we have access to any remote storage if needed. if provider.type != ProviderType.LOCAL.value: setup_commands.append("pip install -q transformerlab") + # Install torch as well if torch profiler is enabled if request.enable_profiling_torch: setup_commands.append("pip install -q torch") From 0611d499f032779b2bda218ee26c6cc4f062482e Mon Sep 17 00:00:00 2001 From: deep1401 Date: Mon, 16 Mar 2026 09:14:26 -0600 Subject: [PATCH 8/8] sdk --- api/pyproject.toml | 2 +- lab-sdk/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/pyproject.toml b/api/pyproject.toml index 16017528d..4da723887 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "soundfile==0.13.1", "tensorboardX==2.6.2.2", "timm==1.0.15", - "transformerlab==0.0.99", + "transformerlab==0.1.0", "transformerlab-inference==0.2.52", "transformers==4.57.1", "wandb==0.23.1", diff --git a/lab-sdk/pyproject.toml b/lab-sdk/pyproject.toml index d17f0fc8a..53dc601ab 100644 --- a/lab-sdk/pyproject.toml +++ b/lab-sdk/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "transformerlab" -version = "0.0.99" +version = "0.1.0" description = "Python SDK for Transformer Lab" readme = "README.md" requires-python = ">=3.10"