Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dependencies = [
"soundfile==0.13.1",
"tensorboardX==2.6.2.2",
"timm==1.0.15",
"transformerlab==0.1.1",
"transformerlab==0.1.2",
"transformerlab-inference==0.2.52",
"transformers==4.57.1",
"wandb==0.23.1",
Expand Down
24 changes: 23 additions & 1 deletion api/transformerlab/routers/compute_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -1814,9 +1814,27 @@ async def launch_template_on_provider(

# Enable Trackio auto-init for this job if requested. When set, the lab SDK
# running inside the remote script can automatically initialize Trackio
# and capture metrics for visualization in the Tasks UI.
# and capture metrics for visualization in the Tasks UI. For shared projects,
# pass project name and run name so the SDK can build trackio_runs/{experiment_id}/{project_name}/.
trackio_project_name_for_job: Optional[str] = None
trackio_run_name_for_job: Optional[str] = None
if request.enable_trackio:
env_vars["TLAB_TRACKIO_AUTO_INIT"] = "true"
project_name = (request.trackio_project_name or "").strip() or str(request.experiment_id)
trackio_run_name = f"{request.task_name or 'task'}-job-{job_id}"
trackio_project_name_for_job = project_name
trackio_run_name_for_job = trackio_run_name
env_vars["TLAB_TRACKIO_PROJECT_NAME"] = project_name
env_vars["TLAB_TRACKIO_RUN_NAME"] = trackio_run_name
# Create shared project dir so the SDK can sync into it; path is derived by dashboard when needed.
workspace_dir = await get_workspace_dir()
shared_path = storage.join(
workspace_dir,
"trackio_runs",
secure_filename(str(request.experiment_id)),
secure_filename(project_name),
)
await storage.makedirs(shared_path, exist_ok=True)

# Get TFL_STORAGE_URI from storage context
tfl_storage_uri = None
Expand Down Expand Up @@ -1984,6 +2002,10 @@ async def launch_template_on_provider(
job_data["workspace_dir"] = provider_config_dict["workspace_dir"]
if request.file_mounts is True and request.task_id:
job_data["task_id"] = request.task_id
if trackio_project_name_for_job is not None:
job_data["trackio_project_name"] = trackio_project_name_for_job
if trackio_run_name_for_job is not None:
job_data["trackio_run_name"] = trackio_run_name_for_job

for key, value in job_data.items():
if value is not None:
Expand Down
11 changes: 11 additions & 0 deletions api/transformerlab/routers/trackio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from transformerlab.routers.auth import get_user_and_team
from transformerlab.services.trackio_service import (
list_trackio_projects,
start_trackio_for_job,
stop_trackio_for_job,
)
Expand All @@ -23,6 +24,16 @@ async def trackio_start(job_id: str, user_and_team=Depends(get_user_and_team)) -
return await start_trackio_for_job(job_id, org_id=org_id, experiment_id=experiment_id)


@router.get("/trackio/projects")
async def trackio_projects(experiment_id: str, user_and_team=Depends(get_user_and_team)) -> dict:
    """
    Return the existing TrackIO project names for *experiment_id*.

    Used by the UI to populate the shared-project dropdown. The auth
    dependency runs purely for its side effect (request must be
    authenticated); its result is not needed here.
    """
    del user_and_team  # authentication enforced by the dependency; value unused
    return {"projects": await list_trackio_projects(experiment_id)}


@router.get("/trackio/stop")
async def trackio_stop(job_id: str, user_and_team=Depends(get_user_and_team)) -> dict:
"""
Expand Down
4 changes: 4 additions & 0 deletions api/transformerlab/schemas/compute_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ class ProviderTemplateLaunchRequest(BaseModel):
default=False,
description="When True, set TLAB_TRACKIO_AUTO_INIT=true in the job environment so lab SDK can auto-integrate with Trackio.",
)
trackio_project_name: Optional[str] = Field(
default=None,
description="TrackIO project name for shared project; used when enable_trackio=True. Omit or empty to use 'default'.",
)


class ProviderTemplateFileUploadResponse(BaseModel):
Expand Down
40 changes: 37 additions & 3 deletions api/transformerlab/services/trackio_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import shutil
import subprocess
import sys
from typing import Any, Dict
from typing import Any, Dict, List

from fastapi import HTTPException

from lab import HOME_DIR, storage
from lab.dirs import get_workspace_dir
from lab.job import Job

from werkzeug.utils import secure_filename
Expand Down Expand Up @@ -44,18 +45,27 @@ async def start_trackio_for_job(job_id: str, org_id: str | None, experiment_id:
raise HTTPException(status_code=400, detail="Job data is not a dictionary")

source_path = job_data.get("trackio_db_artifact_path")
if not source_path and job_data.get("trackio_project_name") and experiment_id:
# Shared project: derive path from workspace + experiment_id + project_name
workspace_dir = await get_workspace_dir()
source_path = storage.join(
workspace_dir,
"trackio_runs",
secure_filename(str(experiment_id)),
secure_filename(str(job_data["trackio_project_name"]).strip()),
)
if not source_path:
raise HTTPException(
status_code=404,
detail="Trackio metrics not found for this job (trackio_db_artifact_path missing)",
detail="Trackio metrics not found for this job (trackio_db_artifact_path or trackio_project_name missing)",
)

# If there's already a Trackio process for this job, just return its URL
existing = _TRACKIO_PROCESSES.get(job_id)
if existing and isinstance(existing.get("url"), str):
return {"url": existing["url"]}

project = job_data.get("trackio_project")
project = job_data.get("trackio_project") or job_data.get("trackio_project_name")

# Always run Trackio from a local temporary copy of the metrics directory.
# This works for both local and remote storage backends. Use HOME_DIR so the
Expand Down Expand Up @@ -240,3 +250,27 @@ async def stop_trackio_for_job(job_id: str) -> None:
except Exception:
# Ignore cleanup errors
pass


async def list_trackio_projects(experiment_id: str) -> List[str]:
    """
    Return the shared TrackIO project names recorded for an experiment.

    Scans workspace_dir/trackio_runs/{experiment_id}/ and returns the
    basenames of its immediate subdirectories, sorted alphabetically.
    A missing directory or a listing failure yields an empty list.
    """
    if not experiment_id:
        return []

    root = await get_workspace_dir()
    exp_dir = storage.join(root, "trackio_runs", secure_filename(str(experiment_id)))

    # Nothing has been recorded for this experiment yet.
    if not (await storage.exists(exp_dir) and await storage.isdir(exp_dir)):
        return []

    try:
        children = await storage.ls(exp_dir, detail=False)
    except Exception:
        # Best effort: treat any backend listing error as "no projects".
        return []

    names: List[str] = []
    for child in children:
        if not await storage.isdir(child):
            continue
        # Basename extraction that tolerates either path separator and a trailing slash.
        base = child.rstrip("/").split("/")[-1].split("\\")[-1]
        if base:
            names.append(base)
    return sorted(names)
2 changes: 1 addition & 1 deletion lab-sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "transformerlab"
version = "0.1.1"
version = "0.1.2"
description = "Python SDK for Transformer Lab"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
106 changes: 80 additions & 26 deletions lab-sdk/src/lab/lab_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from . import dirs
from .model import Model as ModelService
from . import storage
from werkzeug.utils import secure_filename
from .dataset import Dataset
from .task_template import TaskTemplate
from .generation import GenerationModel, load_generation_model as _load_generation_model
Expand Down Expand Up @@ -140,6 +141,8 @@ def init(self, experiment_id: str | None = None, config: Optional[Dict[str, Any]
self._trackio_available = False
self._trackio_managed = False
auto_init_trackio = os.environ.get("TLAB_TRACKIO_AUTO_INIT", "false").lower() == "true"
trackio_project_name_env = (os.environ.get("TLAB_TRACKIO_PROJECT_NAME") or "").strip()
trackio_run_name_env = (os.environ.get("TLAB_TRACKIO_RUN_NAME") or "").strip()
if auto_init_trackio:
try:
import trackio # type: ignore[import]
Expand All @@ -148,11 +151,29 @@ def init(self, experiment_id: str | None = None, config: Optional[Dict[str, Any]
self._trackio_available = True
existing_run = context_vars.current_run.get()
if existing_run is None:
# Use experiment_id as a natural default project name if available
project_name = str(experiment_id or "TransformerLab")
trackio.init(project=project_name)
self._trackio_managed = True
logger.info(f"📊 Trackio auto-init enabled for project '{project_name}'")
if trackio_project_name_env:
# Shared project: use temp dir, seed from shared path, init with project + run name
job_id_env = os.environ.get("_TFL_JOB_ID", "unknown")
temp_dir = f"/tmp/trackio/{job_id_env}"
os.makedirs(temp_dir, exist_ok=True)
os.environ["TRACKIO_DIR"] = temp_dir
_run_async(
self._seed_trackio_shared_path_async(
experiment_id or "", trackio_project_name_env, temp_dir
)
)
trackio.init(
project=trackio_project_name_env,
name=trackio_run_name_env or f"job-{job_id_env}",
)
self._trackio_managed = True
logger.info(f"📊 Trackio auto-init enabled for shared project '{trackio_project_name_env}'")
else:
# Legacy: per-job project name
project_name = str(experiment_id or "TransformerLab")
trackio.init(project=project_name)
self._trackio_managed = True
logger.info(f"📊 Trackio auto-init enabled for project '{project_name}'")
except Exception:
# Silently ignore any Trackio issues; lab core behavior must not be affected
self._trackio_available = False
Expand Down Expand Up @@ -1564,6 +1585,8 @@ def capture_trackio_metadata(self, db_path: str, project: Optional[str] = None)
async def async_capture_trackio_metadata(self, db_path: str, project: Optional[str] = None) -> str:
"""
Async implementation of capture_trackio_metadata().
When TLAB_TRACKIO_PROJECT_NAME is set (shared project), copies to trackio_runs/{experiment_id}/{project_name}/
and does not write trackio_db_artifact_path (dashboard derives path).
"""
self._ensure_initialized()

Expand All @@ -1575,32 +1598,63 @@ async def async_capture_trackio_metadata(self, db_path: str, project: Optional[s
if not os.path.exists(src):
raise FileNotFoundError(f"Trackio path does not exist: {src}")

# Resolve the job's artifacts directory and create a dedicated 'trackio' subfolder
artifacts_dir = await self._job.get_artifacts_dir() # type: ignore[union-attr]
trackio_dir = storage.join(artifacts_dir, "trackio")
trackio_project_name_env = (os.environ.get("TLAB_TRACKIO_PROJECT_NAME") or "").strip()
if trackio_project_name_env and self._experiment is not None:
# Shared project: copy to trackio_runs/{experiment_id}/{project_name}/ (merge, do not rm)
workspace_dir = await dirs.get_workspace_dir()
trackio_dir = storage.join(
workspace_dir,
"trackio_runs",
secure_filename(str(self._experiment.id)),
secure_filename(trackio_project_name_env),
)
await storage.makedirs(trackio_dir, exist_ok=True)
if os.path.isdir(src):
await storage.copy_dir(src, trackio_dir)
else:
base_name = posixpath.basename(src)
dest_file = storage.join(trackio_dir, base_name)
await storage.copy_file(src, dest_file)
# Do not write trackio_db_artifact_path; dashboard derives from trackio_project_name
logger.info(f"📊 Saved Trackio metrics to shared project: {trackio_dir}")
return trackio_dir
else:
# Legacy: per-job artifacts/trackio
artifacts_dir = await self._job.get_artifacts_dir() # type: ignore[union-attr]
trackio_dir = storage.join(artifacts_dir, "trackio")

# Ensure the destination directory exists and is clean
if await storage.exists(trackio_dir):
# Remove any previous Trackio data for this job to avoid stale metrics
await storage.rm_tree(trackio_dir)
if await storage.exists(trackio_dir):
await storage.rm_tree(trackio_dir)
await storage.makedirs(trackio_dir, exist_ok=True)

await storage.makedirs(trackio_dir, exist_ok=True)
if os.path.isdir(src):
await storage.copy_dir(src, trackio_dir)
else:
base_name = posixpath.basename(src)
dest_file = storage.join(trackio_dir, base_name)
await storage.copy_file(src, dest_file)

# Copy directory contents or single file into the trackio subfolder
if os.path.isdir(src):
await storage.copy_dir(src, trackio_dir)
else:
base_name = posixpath.basename(src)
dest_file = storage.join(trackio_dir, base_name)
await storage.copy_file(src, dest_file)
await self._job.update_job_data_field("trackio_db_artifact_path", trackio_dir) # type: ignore[union-attr]
if project is not None and isinstance(project, str) and project.strip() != "":
await self._job.update_job_data_field("trackio_project", project.strip()) # type: ignore[union-attr]

# Record the artifact location in job_data so the backend/UI can locate it
await self._job.update_job_data_field("trackio_db_artifact_path", trackio_dir) # type: ignore[union-attr]
if project is not None and isinstance(project, str) and project.strip() != "":
await self._job.update_job_data_field("trackio_project", project.strip()) # type: ignore[union-attr]
logger.info(f"📊 Saved Trackio metrics for job to: {trackio_dir}")
return trackio_dir

logger.info(f"📊 Saved Trackio metrics for job to: {trackio_dir}")
return trackio_dir
async def _seed_trackio_shared_path_async(self, experiment_id: str, project_name: str, dest_dir: str) -> None:
    """
    Best-effort: pre-populate *dest_dir* from the experiment's shared project dir.

    When workspace_dir/trackio_runs/{experiment_id}/{project_name}/ already
    exists, its contents are copied into *dest_dir* so a new run starts from
    the previously accumulated metrics. Every failure is swallowed and only
    debug-logged, because seeding must never break job startup.
    """
    try:
        base = await dirs.get_workspace_dir()
        source = storage.join(
            base,
            "trackio_runs",
            secure_filename(str(experiment_id)),
            secure_filename(project_name),
        )
        present = await storage.exists(source) and await storage.isdir(source)
        if present:
            await storage.copy_dir(source, dest_dir)
    except Exception as exc:
        logger.debug("Trackio shared path seed failed: %s", exc)

def _capture_existing_trackio_run(self) -> None:
"""
Expand Down
3 changes: 2 additions & 1 deletion src/renderer/components/Experiment/Tasks/JobsList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,8 @@ const JobsList: React.FC<JobsListProps> = ({
</Button>
)}

{job?.job_data?.trackio_db_artifact_path && (
{(job?.job_data?.trackio_db_artifact_path ||
job?.job_data?.trackio_project_name) && (
<Button
size="sm"
variant="plain"
Expand Down
Loading
Loading