diff --git a/.gitignore b/.gitignore index 9e1a5ff76cd..251c8e5e60b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,15 @@ .cline_storage /projects/hip/_build + +# Claude Code session data +.claude/ +**/.claude/ + +# Python +__pycache__/ +**/__pycache__/ +*.pyc +*.pyo + +# Analysis output generated during testing +rocpd-output-data/ diff --git a/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake b/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake index 69573ad447a..cd00f349994 100644 --- a/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake +++ b/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake @@ -56,6 +56,17 @@ function(rocprofiler_sdk_pc_sampling_disabled _VAR) set(CMAKE_MESSAGE_INDENT "[${PROJECT_NAME}]${ARG_PREFIX} ") rocprofiler_sdk_get_gfx_architectures(rocprofiler-sdk-tests-gfx-info ECHO) + # Guard against empty GPU list (e.g. build machine without GPUs) + list(LENGTH rocprofiler-sdk-tests-gfx-info _gfx_list_len) + if(_gfx_list_len EQUAL 0) + set(${_VAR} + TRUE + PARENT_SCOPE) + if(ARG_ECHO) + message(STATUS "PC Sampling is disabled (no GPUs detected)") + endif() + return() + endif() list(GET rocprofiler-sdk-tests-gfx-info 0 pc-sampling-gpu-0-gfx-info) if("${pc-sampling-gpu-0-gfx-info}" MATCHES "^gfx90a$" @@ -88,6 +99,17 @@ function(rocprofiler_sdk_pc_sampling_stochastic_disabled _VAR) set(CMAKE_MESSAGE_INDENT "[${PROJECT_NAME}]${ARG_PREFIX} ") rocprofiler_sdk_get_gfx_architectures(rocprofiler-sdk-tests-gfx-info ECHO) + # Guard against empty GPU list (e.g. 
build machine without GPUs) + list(LENGTH rocprofiler-sdk-tests-gfx-info _gfx_list_len) + if(_gfx_list_len EQUAL 0) + set(${_VAR} + TRUE + PARENT_SCOPE) + if(ARG_ECHO) + message(STATUS "Stochastic PC Sampling is disabled (no GPUs detected)") + endif() + return() + endif() list(GET rocprofiler-sdk-tests-gfx-info 0 pc-sampling-gpu-0-gfx-info) if("${pc-sampling-gpu-0-gfx-info}" MATCHES "^gfx94[0-9]$" diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 42158285ac5..4dfd7a38a9d 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -1291,7 +1291,10 @@ def _write_env_value(): args.output_format = ["rocpd"] update_env( - "ROCPROF_OUTPUT_FORMAT", ",".join(args.output_format), append=True, join_char="," + "ROCPROF_OUTPUT_FORMAT", + ",".join(args.output_format), + append=True, + join_char=",", ) if args.kokkos_trace: diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py index 7a932507e82..e92276b356e 100644 --- a/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py @@ -38,6 +38,7 @@ def main(argv=None, config=None): """ import argparse + from . import analyze from . import csv from . import merge from . 
import otf2 @@ -123,6 +124,27 @@ def main(argv=None, config=None): Aggregate 2 databases and output all summary files to HTML, only include HIP and MARKER regions, include domain summary $ rocpd summary -i db{0,1}.db --region-categories HIP MARKERS --domain-summary --format html +""" + + analyze_examples = """ + +Example usage: + + Analyze performance of a single database + $ rocpd analyze -i db0.db + + Analyze with output to file + $ rocpd analyze -i db0.db --format text -d ./output/ -o analysis + + Analyze top 20 kernels instead of default 10 + $ rocpd analyze -i db{0..3}.db --top-kernels 20 + + Analyze with a custom prompt (guides local analysis; enhances LLM output when --llm is used) + $ rocpd analyze -i db0.db --prompt "Why is my application slow?" + + Analyze with LLM-enhanced explanation + $ rocpd analyze -i db0.db --llm anthropic + """ input_help_string = "Input path and filename to one or more database(s). Wildcards accepted, as well as .rpdb folders" @@ -193,6 +215,14 @@ def add_required_args(_parser): epilog=summary_examples, ) + analyzer = subparsers.add_parser( + "analyze", + description="Analyze GPU performance traces with AI-powered insights", + allow_abbrev=False, + formatter_class=argparse.RawTextHelpFormatter, + epilog=analyze_examples, + ) + def get_output_type(val): return val.lower().replace("perfetto", "pftrace") @@ -213,6 +243,17 @@ def get_output_type(val): add_required_args(packager) add_required_args(query_reporter) add_required_args(generate_summary) + # analyze: -i is optional (not required when --source-dir is used for Tier 0) + _analyze_input_group = analyzer.add_argument_group("Required options") + _analyze_input_group.add_argument( + "-i", + "--input", + required=False, + default=None, + type=output_config.check_file_exists, + nargs="+", + help=input_help_string, + ) # converter: add args from any sub-modules process_converter_args = [] @@ -243,6 +284,12 @@ def get_output_type(val): 
process_generate_summary_args.append(summary.add_args(generate_summary)) process_generate_summary_args.append(time_window.add_args(generate_summary)) + # analyze: subparser args + process_analyzer_args = [] + process_analyzer_args.append(output_config.add_args(analyzer)) + process_analyzer_args.append(analyze.add_args(analyzer)) + process_analyzer_args.append(time_window.add_args(analyzer)) + # parse the command line arguments args = parser.parse_args(argv) @@ -355,6 +402,43 @@ def get_output_type(val): summary.generate_all_summaries(input, **summary_args) + # if the user requested AI analysis, execute the analyzer + elif args.command == "analyze": + # Validate: at least one of -i, --source-dir, or --interactive must be provided + has_input = bool(getattr(args, "input", None)) + has_source_dir = bool(getattr(args, "source_dir", None)) + has_interactive = bool(getattr(args, "interactive", None)) + if not has_input and not has_source_dir and not has_interactive: + analyzer.error( + "at least one of -i/--input, --source-dir, or --interactive is required.\n" + " Use -i output.db for trace analysis (Tier 1/2).\n" + " Use --source-dir ./src for source code analysis (Tier 0).\n" + ' Use --interactive "./my_app" for the 7-phase workflow.\n' + " Use both -i and --source-dir for combined analysis." + ) + + # construct the rocpd import data object (None if source-only) + if has_input: + input = RocpdImportData( + args.input, + automerge_limit=getattr( + args, "automerge_limit", package.IDEAL_NUMBER_OF_DATABASE_FILES + ), + ) + else: + input = None + + # analyzer subparser args + analyzer_args = {} + for pitr in process_analyzer_args: + analyzer_args.update(pitr(input, args)) + + # Pass source_dir if provided + if has_source_dir: + analyzer_args["source_dir"] = args.source_dir + + analyze.execute(input, **analyzer_args) + print("Done. 
Exiting...") diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md new file mode 100644 index 00000000000..d1c0d66b7b4 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md @@ -0,0 +1,796 @@ +# rocpd AI Analysis Module + +AI-powered GPU performance analysis for AMD ROCm profiling data. + +## Overview + +This module provides both CLI and Python API access to AI-powered analysis of GPU profiling traces. It analyzes rocpd database files and generates human-readable insights, bottleneck identification, and actionable optimization recommendations. + +### Key Features + +- **Local-first analysis** - Works offline, no API calls required by default +- **Tier 0 source analysis** - Scan GPU source code without a trace database (`analyze_source()`) +- **Optional LLM enhancement** - Natural language explanations via Anthropic Claude, OpenAI GPT, any OpenAI-compatible private server, or local Ollama +- **User-modifiable "fence"** - Customize LLM behavior by editing reference guide +- **Privacy-focused** - Data sanitization for LLM mode (kernel names, grid sizes redacted) +- **Multiple output formats** - Python objects, JSON, text, markdown, webview (interactive HTML) +- **Interactive session** - Menu-driven analysis loop with persistent multi-turn LLM conversation and session persistence +- **Type-safe API** - Dataclass-based with type hints + +## Quick Start + +### CLI Usage + +```bash +# Basic analysis (local mode) +rocpd analyze -i output.db + +# With LLM enhancement โ€” Anthropic or OpenAI +export ANTHROPIC_API_KEY="sk-ant-..." 
+rocpd analyze -i output.db --llm anthropic + +# Private/enterprise OpenAI-compatible server +export ROCPD_LLM_PRIVATE_URL="https://llm-api.example.com/OpenAI" +export ROCPD_LLM_PRIVATE_HEADERS='{"Ocp-Apim-Subscription-Key": "abc123", "api-version": "preview"}' +rocpd analyze -i output.db --llm private --llm-private-model gpt-4o + +# Local Ollama model +rocpd analyze -i output.db --llm-local ollama --llm-local-model llama3 + +# With custom prompt +rocpd analyze -i output.db --llm anthropic --prompt "Why is my matmul kernel slow?" + +# JSON output (produces analysis.json) +rocpd analyze -i output.db --format json -d ./output -o analysis + +# Markdown output (produces analysis.md) +rocpd analyze -i output.db --format markdown -d ./output -o analysis + +# Interactive HTML webview (produces analysis.html) +rocpd analyze -i output.db --format webview -d ./output -o analysis + +# Tier 0: source code analysis (no .db required) +rocpd analyze --source-dir ./my_app +rocpd analyze --source-dir ./my_app --format json -d ./output -o plan + +# Combined: Tier 0 + Tier 1/2 +rocpd analyze -i output.db --source-dir ./my_app + +# Interactive menu session (persistent LLM conversation, session-persistent) +rocpd analyze -i output.db --interactive +rocpd analyze -i output.db --interactive --llm anthropic +rocpd analyze --source-dir ./my_app --interactive "./my_app arg1" --llm private + +# Resume a previous interactive session +rocpd analyze -i output.db --interactive --resume-session 2026-03-10_14-23-01_myapp +``` + +### Python API Usage + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Analyze a database +result = analyze_database(Path("output.db")) + +# Access results +print(result.summary.overall_assessment) +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") + +# Get recommendations +for rec in result.recommendations.high_priority: + print(f"๐Ÿ”ด {rec.title}") + print(f" {rec.description}") +``` + +## Module Structure + +``` 
+ai_analysis/ +โ”œโ”€โ”€ __init__.py # Public API exports (incl. LLMConversation, load_reference_guide) +โ”œโ”€โ”€ api.py # Main API functions, AnalysisResult, SourceAnalysisResult +โ”œโ”€โ”€ llm_analyzer.py # Single-shot LLM integration with "fence" implementation +โ”œโ”€โ”€ llm_conversation.py # Persistent multi-turn LLM session (LLMConversation) +โ”œโ”€โ”€ exceptions.py # Exception classes (incl. SourceDirectoryNotFoundError) +โ”œโ”€โ”€ source_analyzer.py # Tier 0: static source code scanner +โ”œโ”€โ”€ interactive.py # Interactive session: InteractiveSession + WorkflowSession +โ”‚ # SessionData, SessionStore dataclasses +โ”œโ”€โ”€ tests/ +โ”‚ โ”œโ”€โ”€ __init__.py +โ”‚ โ”œโ”€โ”€ test_api_standalone.py # 23 AI analysis API unit tests +โ”‚ โ”œโ”€โ”€ test_interactive.py # 22 interactive session unit tests +โ”‚ โ””โ”€โ”€ test_llm_conversation.py # 51 LLMConversation + integration tests +โ”œโ”€โ”€ share/ +โ”‚ โ””โ”€โ”€ llm-reference-guide.md # LLM "fence" - user-modifiable reference guide +โ”œโ”€โ”€ docs/ +โ”‚ โ”œโ”€โ”€ AI_ANALYSIS_API.md # API documentation +โ”‚ โ”œโ”€โ”€ SCHEMA_CHANGELOG.md # JSON schema version history (current: v0.2.0) +โ”‚ โ””โ”€โ”€ LLM_REFERENCE_GUIDE.md # Fence documentation +โ””โ”€โ”€ README.md # This file +``` + +## Architecture: The "Fence" + +The LLM reference guide ("fence") is a **user-modifiable markdown file** that controls LLM behavior: + +**Location:** +- `/opt/rocm/lib/python3.12/site-packages/rocpd/ai_analysis/share/llm-reference-guide.md` (default) +- Can be overridden with `ROCPD_LLM_REFERENCE_GUIDE` environment variable + +**What's in the guide:** +- **ROCm Profiling Tools** - Correct tool names and commands (rocprofv3, rocprof-compute, rocprof-sys) +- **Tool Documentation Links** - Official ROCm documentation references +- **AMD GPU Hardware Specs** - MI100, MI210/MI250/MI250X, MI300A/MI300X/MI325X, MI350X/MI355X, RDNA2/RDNA3 specifications with ridge points +- **Performance Analysis Models** - Roofline, Speed-of-Light, 
Top-Down methodologies +- **Bottleneck Classification** - Rules for identifying compute/memory/latency bottlenecks +- **Optimization Techniques** - AMD-specific optimization strategies +- **Recommendation Standards** - Quality requirements for actionable recommendations +- **Output Format Rules** - Consistent plain text format across all LLM providers + +**Enforced Tool Usage:** +- โœ… `rocprofv3` - Kernel-level profiling, counters, API tracing +- โœ… `rocprof-compute` - Roofline analysis, memory hierarchy metrics +- โœ… `rocprof-sys` (also known as `rocsys`) - System-wide, MPI, call-stack sampling +- โŒ NEVER `rocprof` or `rocprof-v2` (deprecated tools) + +**How it works:** +1. LLMAnalyzer loads the reference guide at initialization +2. Guide is included in every LLM API request as system prompt +3. LLM generates analysis following the guide's rules strictly +4. **To change LLM behavior, just edit the guide - no code changes** +5. All profiling commands are validated against official ROCm documentation + +Example modification: + +```bash +# Edit the reference guide +sudo nano /opt/rocm/lib/python3.12/site-packages/rocpd/ai_analysis/share/llm-reference-guide.md + +# Add new GPU specs, update tool commands, or change priority thresholds +# Save and exit - changes take effect immediately on next analysis +``` + +See [LLM Reference Guide Documentation](docs/LLM_REFERENCE_GUIDE.md) for details. + +## Data Flow + +``` +rocprofv3 --sys-trace --pmc GRBM_COUNT -- ./app + โ†“ +output.db created (SQLite database) + โ†“ +rocpd analyze -i output.db --llm anthropic + โ†“ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ 1. 
Local Analysis (always runs) โ”‚ +โ”‚ - Parse database โ”‚ +โ”‚ - Calculate metrics โ”‚ +โ”‚ - Apply performance models โ”‚ +โ”‚ - Generate recommendations โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ 2. LLM Enhancement (optional) โ”‚ +โ”‚ - Load reference guide ("fence") โ”‚ +โ”‚ - Sanitize data (privacy) โ”‚ +โ”‚ - Call Anthropic/OpenAI API โ”‚ +โ”‚ - Generate natural language output โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +Analysis results (text/JSON/markdown/webview) +``` + +## Analysis Tiers + +| Tier | Data Required | Analysis Capabilities | +|------|---------------|----------------------| +| **Tier 0** | Source code directory (`--source-dir`) | Kernel detection, pattern scanning, profiling plan, suggested first command | +| **Tier 1** | Trace data (`-i db.db`) | Kernel hotspots, time breakdown, memory copy overhead | +| **Tier 2** | Trace + hardware counters (`--pmc`) | Roofline model, Speed-of-Light metrics, bottleneck classification | +| **Tier 3** | Trace + PC sampling (`--pc-sampling`) | Instruction-level hotspots within kernels | +| **Tier 4** | Trace + thread trace | Full instruction timeline, stall analysis | + +Tiers 0โ€“2 are implemented and production-ready. The interactive session automatically +suggests `ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling` (Tier 3) +once all Tier 1/2 data has been collected. 
+ +## API Reference + +### Main Functions + +```python +# Analyze database and return result object (Tier 1/2) +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult + +# Analyze source code directory and return profiling plan (Tier 0) +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult + +# Analyze and return JSON +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs +) -> str + +# Get filtered recommendations +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs +) -> List[Recommendation] + +# Validate database +def validate_database(database_path: Path) -> Dict[str, Any] +``` + +### Data Classes + +```python +@dataclass +class AnalysisResult: + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] + errors: List[str] + llm_enhanced_explanation: Optional[str] # If LLM enabled + tier0: Optional[SourceAnalysisResult] # If --source-dir also provided + + # Methods + def to_dict() -> Dict[str, Any] + def to_json(indent: int = 2) -> str + def to_text() -> str + def to_markdown() -> str + def to_webview() -> str # Self-contained interactive HTML report + +@dataclass +class SourceAnalysisResult: + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "PyTorch_HIP", etc. 
+ files_scanned: int + files_skipped: int + detected_kernels: List[Dict] # {name, file, line, launch_type} + kernel_count: int + detected_patterns: List[Dict] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + already_instrumented: bool + roctx_marker_count: int + recommendations: List[Dict] # Same shape as generate_recommendations() output + suggested_counters: List[str] + suggested_first_command: str + llm_explanation: Optional[str] +``` + +### Exceptions + +```python +AnalysisError (base) +โ”œโ”€โ”€ DatabaseNotFoundError +โ”œโ”€โ”€ DatabaseCorruptedError +โ”œโ”€โ”€ MissingDataError +โ”œโ”€โ”€ UnsupportedGPUError +โ”œโ”€โ”€ LLMAuthenticationError +โ”œโ”€โ”€ LLMRateLimitError +โ”œโ”€โ”€ ReferenceGuideNotFoundError +โ”œโ”€โ”€ SourceDirectoryNotFoundError # analyze_source(): directory not found +โ””โ”€โ”€ SourceAnalysisError # analyze_source(): scanning error +``` + +## LLM Enhancement + +### Enabling LLM Mode + +**Option 1: Environment variable** + +```bash +export ANTHROPIC_API_KEY="sk-ant-..." +rocpd analyze -i output.db --llm anthropic +``` + +**Option 2: Python API** + +```python +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + llm_api_key="sk-ant-..." +) +``` + +### Supported Providers + +- **Anthropic Claude** (recommended) + - Provider: `"anthropic"` + - Env var: `ANTHROPIC_API_KEY` + - Model: `claude-sonnet-4-20250514` + +- **OpenAI GPT** + - Provider: `"openai"` + - Env var: `OPENAI_API_KEY` + - Model: `gpt-4-turbo-preview` + +- **Private/enterprise server** (any OpenAI-compatible endpoint) + - Provider: `"private"` (`--llm private`) + - Required: `ROCPD_LLM_PRIVATE_URL` โ€” base URL of the server + - Required: `ROCPD_LLM_PRIVATE_MODEL` or `--llm-private-model` + - Optional: `ROCPD_LLM_PRIVATE_API_KEY` (default: `"dummy"` for header-authenticated servers) + - Optional: `ROCPD_LLM_PRIVATE_HEADERS` โ€” JSON or Python-dict of extra request headers + (e.g. 
`{"Ocp-Apim-Subscription-Key": "abc", "api-version": "preview"}`) + The `user` header is auto-set to `os.getlogin()` unless already present in `ROCPD_LLM_PRIVATE_HEADERS` + - Optional: `ROCPD_LLM_PRIVATE_VERIFY_SSL=0` โ€” disable SSL verification (requires `httpx`) + +- **Local Ollama** + - Provider: `--llm-local ollama` + - Env var: `ROCPD_LLM_LOCAL_URL` (default: `http://localhost:11434/v1`) + - Env var: `ROCPD_LLM_LOCAL_MODEL` (default: `codellama:13b`) + +### Data Sanitization + +When LLM mode is enabled, sensitive data is automatically redacted: + +| Original | Sanitized | +|----------|-----------| +| `conv2d_forward_kernel` | `[KERNEL_1]` | +| `[256, 256, 1]` | `[GRID_SIZE]` | +| `/home/user/app.cpp` | `[REDACTED]` | + +Aggregated metrics (time percentages, bottleneck classifications) are preserved. + +## Examples + +### Example 1: Basic Analysis + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) + +print(f"Summary: {result.summary.overall_assessment}") +print(f"Bottleneck: {result.summary.primary_bottleneck}") +print(f"Kernel time: {result.execution_breakdown.kernel_time_pct:.1f}%") +print(f"Memory copy: {result.execution_breakdown.memcpy_time_pct:.1f}%") + +print("\nHigh Priority Recommendations:") +for rec in result.recommendations.high_priority: + print(f" - {rec.title}") +``` + +### Example 2: With LLM Enhancement + +```python +import os +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." 
+ +result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + custom_prompt="Focus on memory bottlenecks" +) + +# LLM-generated natural language explanation +print(result.llm_enhanced_explanation) +``` + +### Example 3: JSON Output + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +json_output = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json") +) + +# JSON is also returned as string +import json +data = json.loads(json_output) +print(f"Analysis tier: {data['profiling_info']['analysis_tier']}") +``` + +### Example 4: Interactive HTML Webview + +```bash +# Generate a self-contained HTML report for browser viewing +# Output file extension is applied automatically (.html for webview) +rocpd analyze -i output.db --format webview -d ./reports -o my_trace +# Produces: ./reports/my_trace.html +``` + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) +html_report = result.to_webview() +Path("analysis.html").write_text(html_report) +``` + +The HTML report is a fully self-contained, offline-capable file with: +- **Light/Dark theme toggle** โ€” persisted in `localStorage`; defaults to AMD dark theme +- **Status summary badges** โ€” Critical/Warning counts visible in the header at a glance +- **Metric pills row** โ€” Runtime, kernel count, tier, timestamp, and DB path in the header +- **Status-colored KPI cards** โ€” Kernel %, bottleneck type, runtime, and tier cards each + have a green/amber/red top border reflecting health status +- **Priority icons on recommendations** โ€” ๐Ÿ”ด HIGH, ๐ŸŸ  MEDIUM, ๐ŸŸก LOW, โ„น INFO +- **FAB scroll-to-top button** โ€” Floating button appears after scrolling +- **Staggered fade-in animations** on section cards +- **Hover tooltips on every visual element** โ€” gauges, bars, table headers, counter rows, + and 
overview stats explain what each metric measures, target thresholds, and how to + act on issues. Hardware counter rows (GRBM_*, SQ_*, TCP/TCC, FETCH_SIZE, etc.) + include educational content about the underlying hardware event being counted. + +### Example 5: roc-optiq Integration + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +def load_trace_for_optiq(trace_path: str): + """Load trace and extract insights for Optiq UI""" + result = analyze_database(Path(trace_path)) + + return { + "summary": result.summary.overall_assessment, + "bottleneck": result.summary.primary_bottleneck, + "recommendations": [ + { + "title": rec.title, + "description": rec.description, + "priority": rec.priority + } + for rec in result.recommendations.high_priority[:3] + ], + "breakdown": { + "kernel_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_pct": result.execution_breakdown.memcpy_time_pct + } + } +``` + +## Interactive Session + +The interactive session (`--interactive`) launches a menu-driven loop for iterative profiling analysis. It maintains a **persistent multi-turn `LLMConversation`** across all calls within the same session โ€” the LLM accumulates full message history and doesn't repeat itself. 
+ +### Session menu + +``` +[p] Profile โ€” run a new rocprofv3 command and analyze the output .db +[a] Analyze โ€” re-analyze the current .db and update recommendations +[o] Optimize โ€” ask the LLM for optimization suggestions +[s] Save โ€” save session to disk +[q] Quit +``` + +### LLM conversation persistence + +`InteractiveSession` holds one `LLMConversation` for the entire session: +- All `[a]`, `[o]`, and code-edit LLM calls share the same conversation object +- The LLM sees the full message history from earlier in the session +- History is automatically compacted to stay within context limits (`--llm-compact-every N`, default 10 turns) +- Source files are tracked: a file sent once is not re-transmitted on repeat calls (only new files are sent); the file set is serialized into the session JSON and restored on `--resume-session` +- On `[s]` save, the conversation state is serialized into the session file +- On `--resume-session`, the conversation is restored so the LLM picks up exactly where it left off + +### Phase 1b: Quick workload analysis (WorkflowSession) + +Before presenting the initial profiling command in Phase 2, `WorkflowSession` runs a +lightweight workload analysis to pick the best starter flags: + +1. **App-command heuristics** โ€” always runs; inspects binary name and arguments: + - `python` + ML keywords (torch, jax, paddleโ€ฆ) โ†’ `python_ml`; adds `--hip-trace` + - `python` + LLM keywords (vllm, llama, gptโ€ฆ) โ†’ `llm_inference`; adds `--hip-trace` + - `python` without ML โ†’ `python_generic`; adds `--hip-trace` + - MPI/Slurm launchers (`mpirun`, `srun`โ€ฆ) โ†’ warns about multi-rank capture limits + - Compiled HIP/ROCm binary โ†’ `hip_compute`; uses default flag set + - Multi-process patterns (torchrun, DDP, DeepSpeed) โ†’ warns about worker capture + +2. 
**Tier 0 source analysis** โ€” if `--source-dir` was provided, runs `SourceAnalyzer` + on the source directory and extracts the recommended flags from its highest-priority + profiling recommendation; overrides the pure-heuristic flag set. + +3. **Fallback** โ€” if neither source analysis nor heuristics yield specific flags, the + safe default is used: `--sys-trace --kernel-trace --memory-copy-trace --stats`. + +The analysis output is printed before the command box so the user can see what was +detected and why specific flags were chosen. The user always confirms or edits the +command in Phase 2. + +**Example output:** +``` +โ”€โ”€ Quick Workload Analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + Detected: Python + ML framework (PyTorch / JAX / TF) + Source scan: 14 files, 3 kernels, model=hip_python + Source analysis suggests: rocprofv3 --sys-trace --hip-trace --kernel-trace --stats ... + Starter command basis: source analysis + +โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ +โ”‚ Profiling Command โ”‚ +โ”‚ rocprofv3 --sys-trace --hip-trace --kernel-trace --stats ... โ”‚ +โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + Would you like the interactive tool to run this command? 
[Y/n] +``` + +### Cycle prevention and going deeper (WorkflowSession) + +The 7-phase `WorkflowSession` (`--interactive ""`) automatically detects and +breaks counter-collection/API-tracing cycles: + +- **Fingerprint all collection flags** โ€” when deciding whether to re-suggest a command, + the session checks `--sys-trace`, `--hip-trace`, `--kernel-trace`, `--memory-copy-trace`, + `--hsa-trace`, `--stats`, and individual `--pmc` counter names. +- **Compares against all prior runs** โ€” the dedup check looks at the union of everything + collected across all previous trace runs, not just the last one. +- **Tier 3 escalation** โ€” once all Tier 1/2 data has been collected, Phase 5 shows a + "go deeper" menu: + - TraceLens interval + kernel-category analysis is already shown in the report. + - `[d]` builds a PC sampling command and wires it into Phase 7 as option `[3]`: + ``` + ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling \ + -d /tmp/rocpd_trace/run_ -o results -- + ``` + - `ENV=VALUE` prefixes in commands are automatically extracted and injected into the + subprocess environment (no `shell=True` needed). + +### AI-edit revert + +When the AI modifies source files (Phase 6), the session backs up each file to `.bak`. +Typing `revert` (or `undo` / `v` / `r`) in the recompile-wait prompt triggers the full revert +flow: + +1. **Ask for error context** โ€” if no errors were pasted yet, the session prompts: + ``` + What went wrong? Paste the error output or briefly describe the issue. + (Press Enter to skip and proceed without error context) + > + ``` +2. **File restored** from `.bak` backup immediately. +3. **LLM analysis** โ€” calls the LLM with the original code, the failed edit, and the + error description. Response is formatted as: + - `ANALYSIS:` root-cause explanation of what went wrong + - `ALTERNATIVE:` a concrete corrected approach with specific code changes +4. **Offer to apply** โ€” `Apply this alternative approach now? 
[y/N]` + - If yes: shows a unified diff and applies on confirmation (with new `.bak`) +5. **What-next menu**: + ``` + What would you like to do next? + [f] Try a different fix โ€” let the AI attempt another approach + [p] Continue to re-profiling (skip code changes this round) + [q] Exit session + ``` + - `[f]` is shown only in Phase 6 context (not after profiling failure in Phase 3) + - `[f]` re-enters the Phase 6 retry loop for a fresh LLM rewrite attempt + - `[p]` falls through to Phase 7 (re-profiling prompt) + - `[q]` saves the session and exits + +**Phase 3 failure revert** โ€” `[v]` also appears in the Phase 3 retry menu when AI edits +exist. In that context, `[f] Try a different fix` is not offered (a new edit can't be +applied until re-profiling has run); instead only `[p]` (continue) and `[q]` (exit) appear. + +### AI-suggested commands + +After the LLM responds to `[o]`, the session scans the response text for `rocprofv3 ...` commands and combines them with structured commands from the current recommendation list. If any are found, the user is offered a numbered menu to run one immediately. If run, the resulting `.db` is auto-analyzed and the LLM is notified. + +### Session persistence + +Both session classes save to `~/.rocpd/sessions/` automatically: + +| Session class | Save triggers | File pattern | Resume | +|---|---|---|---| +| `InteractiveSession` | `[s]` key, `[q]` quit, Ctrl+C | `_.json` | `--resume-session` or auto-detect | +| `WorkflowSession` | after Phase 3 trace run, after Phase 6 edit, on exit/Ctrl+C | `workflow__.json` | not supported (new state each run) | + +The session file path is printed in the session summary so you always know where to find it. + +> **Note:** `--resume-session` applies only to **`InteractiveSession`** (the menu-driven +> `[p]/[a]/[o]/[s]/[q]` mode, triggered by `rocpd analyze -i db.db --interactive` **without** +> a `""` argument). 
`WorkflowSession` (7-phase workflow) starts a fresh state +> each invocation and does not support resume. + +```bash +# Start a new InteractiveSession +rocpd analyze -i output.db --interactive --llm anthropic + +# With private enterprise server +rocpd analyze -i output.db --interactive --llm private + +# Control compaction interval (default 10 turns) +rocpd analyze -i output.db --interactive --llm anthropic --llm-compact-every 5 + +# List available session IDs (files in ~/.rocpd/sessions/) +ls ~/.rocpd/sessions/*.json | xargs -I{} python3 -c \ + "import json,sys; d=json.load(open('{}'));print(d['session_id'],'|',d['source_dir'])" + +# Resume an existing session โ€” restores LLM conversation, sent files, and history +# Session ID format: YYYY-MM-DD_HH-MM-SS_ +rocpd analyze -i output.db --interactive --resume-session 2026-03-10_14-23-01_myapp + +# If the source dir matches a previous session, the tool auto-prompts to resume +# (no --resume-session needed) +rocpd analyze -i output.db --source-dir ./my_app --interactive +``` + +--- + +## Testing + +### Unit Tests + +```bash +# Run from /tmp to avoid circular import of libpyrocpd +ROCPD_SYS=/opt/rocm-7.0.0/lib/python3.12/site-packages +TEST_DIR=/path/to/rocm-systems-dev/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/tests + +# All tests +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR} --noconftest -v + +# Interactive session tests only +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR}/test_interactive.py --noconftest -v + +# LLMConversation + integration tests +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR}/test_llm_conversation.py --noconftest -v +``` + +### Integration Tests + +```bash +cd rocm-systems-dev/projects/rocprofiler-sdk/build +ctest -R rocpd-ai-analysis +``` + +### Manual Testing + +```bash +# Generate test trace +rocprofv3 --sys-trace --pmc GRBM_COUNT SQ_WAVES -- ./sample_app + +# Analyze +rocpd analyze -i output.db + +# With LLM 
(requires API key) +export ANTHROPIC_API_KEY="sk-ant-..." +rocpd analyze -i output.db --llm anthropic +``` + +## Configuration + +### Environment Variables + +| Variable | Purpose | +|---|---| +| `ANTHROPIC_API_KEY` | Anthropic Claude API key | +| `OPENAI_API_KEY` | OpenAI GPT API key | +| `ROCPD_LLM_MODEL` | Override default model for anthropic or openai provider | +| `ROCPD_LLM_REFERENCE_GUIDE` | Path to custom reference guide (overrides package default) | +| `ROCPD_LLM_PRIVATE_URL` | Base URL for private/enterprise OpenAI-compatible server (required for `--llm private`) | +| `ROCPD_LLM_PRIVATE_MODEL` | Model name for private server | +| `ROCPD_LLM_PRIVATE_API_KEY` | API key for private server (default: `"dummy"`) | +| `ROCPD_LLM_PRIVATE_HEADERS` | JSON or Python-dict of extra HTTP request headers (e.g. `{"Ocp-Apim-Subscription-Key": "..."}`) | +| `ROCPD_LLM_PRIVATE_VERIFY_SSL` | Set to `0` or `false` to disable SSL cert verification (requires `httpx`) | +| `ROCPD_LLM_LOCAL_URL` | Base URL for local Ollama endpoint (default: `http://localhost:11434/v1`) | +| `ROCPD_LLM_LOCAL_MODEL` | Model name for local Ollama (default: `codellama:13b`) | + +### Reference Guide Location + +Default: `/opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md` + +Override: +```bash +export ROCPD_LLM_REFERENCE_GUIDE=/path/to/custom-guide.md +``` + +## Documentation + +- **[AI Analysis API Documentation](../../../docs/AI_ANALYSIS_API.md)** - Complete API reference +- **[LLM Reference Guide Documentation](../../../docs/LLM_REFERENCE_GUIDE.md)** - How to customize LLM behavior +- **[rocpd README](../README.md)** - Main rocpd documentation + +## Development + +### Adding New Analysis Features + +1. Add analysis logic to `analyze.py` (main rocpd module) +2. Update `api.py` to expose new data in `AnalysisResult` +3. Update reference guide if LLM should use new feature +4. 
Add tests + +### Modifying LLM Behavior + +**Don't modify code.** Edit the reference guide instead: + +```bash +sudo nano /opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md +``` + +See [LLM Reference Guide Documentation](../../../docs/LLM_REFERENCE_GUIDE.md) for examples. + +## Troubleshooting + +### Reference Guide Not Found + +```bash +# Check which path is being used +python3 -c "from rocpd.ai_analysis.llm_analyzer import get_reference_guide_path; print(get_reference_guide_path())" + +# Copy from source +sudo cp share/llm-reference-guide.md /opt/rocm/share/rocprofiler-sdk/ + +# Or use environment variable +export ROCPD_LLM_REFERENCE_GUIDE=/path/to/guide.md +``` + +### LLM Authentication Errors + +```bash +# Verify API key is set +echo $ANTHROPIC_API_KEY + +# Test API key directly +python3 << EOF +import anthropic +client = anthropic.Anthropic(api_key="sk-ant-...") +print("API key valid!") +EOF +``` + +### Database Errors + +```bash +# Validate database +python3 << EOF +from rocpd.ai_analysis import validate_database +from pathlib import Path + +validation = validate_database(Path("output.db")) +print(f"Valid: {validation['is_valid']}") +print(f"Tier: {validation['tier']}") +print(f"Tables: {validation['tables']}") +EOF +``` + +## Contributing + +- Follow existing code style (PEP 8) +- Add type hints +- Write docstrings (Google style) +- Add unit tests +- Update documentation + +## License + +MIT License - Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ +## Support + +- File issues on GitHub +- See [rocprofiler-sdk documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/) +- ROCm community: https://rocm.docs.amd.com/ diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py new file mode 100644 index 00000000000..f94f130a27b --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +AI Analysis Module for rocpd + +This module provides AI-powered GPU performance analysis with optional +LLM enhancement. The analysis is guided by a user-modifiable reference +guide (the "fence") that ensures high-quality, actionable insights. 
+ +Key Features: +- Local-first analysis (always available, no internet required) +- Optional LLM enhancement (Anthropic Claude, OpenAI GPT) +- User-modifiable reference guide for customizing LLM behavior +- Data sanitization for privacy in LLM mode +- JSON, text, and markdown output formats + +Usage: + from rocpd.ai_analysis import analyze_database + + result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic" + ) + + print(result.summary.overall_assessment) +""" + +from .api import ( + analyze_database, + analyze_database_to_json, + analyze_source, + get_kernel_analysis, + get_recommendations, + validate_database, + AnalysisResult, + SourceAnalysisResult, + OutputFormat, +) + +from .exceptions import ( + AnalysisError, + DatabaseNotFoundError, + DatabaseCorruptedError, + MissingDataError, + UnsupportedGPUError, + LLMAuthenticationError, + LLMRateLimitError, + ReferenceGuideNotFoundError, + SourceDirectoryNotFoundError, + SourceAnalysisError, +) + +from .llm_analyzer import LLMAnalyzer, AnalysisContext, load_reference_guide +from .llm_conversation import LLMConversation + + +def _get_interactive(): + from .interactive import InteractiveSession, SessionStore, SessionData + + return InteractiveSession, SessionStore, SessionData + + +def __getattr__(name): + if name in ("InteractiveSession", "SessionStore", "SessionData"): + InteractiveSession, SessionStore, SessionData = _get_interactive() + # Cache in module globals to avoid repeated import on subsequent accesses + import sys + + mod = sys.modules[__name__] + mod.InteractiveSession = InteractiveSession + mod.SessionStore = SessionStore + mod.SessionData = SessionData + return getattr(mod, name) + raise AttributeError(f"module 'rocpd.ai_analysis' has no attribute {name!r}") + + +__all__ = [ + # Main API functions + "analyze_database", + "analyze_database_to_json", + "analyze_source", + "get_kernel_analysis", + "get_recommendations", + "validate_database", + # Data 
classes + "AnalysisResult", + "SourceAnalysisResult", + "OutputFormat", + # Exceptions + "AnalysisError", + "DatabaseNotFoundError", + "DatabaseCorruptedError", + "MissingDataError", + "UnsupportedGPUError", + "LLMAuthenticationError", + "LLMRateLimitError", + "ReferenceGuideNotFoundError", + "SourceDirectoryNotFoundError", + "SourceAnalysisError", + # Interactive session + "InteractiveSession", + "SessionStore", + "SessionData", + # LLM integration + "LLMAnalyzer", + "AnalysisContext", + "load_reference_guide", + "LLMConversation", +] + +__version__ = "0.1.0" diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py new file mode 100644 index 00000000000..bc12aac38ed --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py @@ -0,0 +1,1137 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +Public Python API for rocpd AI analysis. + +This module provides a simple function-based API for programmatic access +to AI-powered GPU performance analysis. Designed for integration with +tools like Optiq. 
+ +Example: + from rocpd.ai_analysis import analyze_database + from pathlib import Path + + result = analyze_database(Path("output.db")) + print(result.summary.overall_assessment) + + for rec in result.recommendations.high_priority: + print(f"- {rec.title}") +""" + +from dataclasses import dataclass, field, asdict +from enum import Enum +from pathlib import Path +from typing import List, Optional, Dict, Any + +try: + from importlib.metadata import version as _pkg_version + + _ROCPD_VERSION = _pkg_version("rocpd") +except Exception: + _ROCPD_VERSION = "0.1.0" # fallback if metadata not available (common in dev / ROCm system installs) + +from ..analyze import ( + compute_time_breakdown, + identify_hotspots, + analyze_memory_copies, + analyze_hardware_counters, + generate_recommendations, + format_analysis_output, + _detect_already_collected, +) +from ..tracelens_port import ( + compute_interval_timeline, + analyze_kernels_by_category, + analyze_short_kernels, +) +from .llm_analyzer import AnalysisContext, LLMAnalyzer +from .exceptions import ( + DatabaseNotFoundError, + DatabaseCorruptedError, + LLMAuthenticationError, + LLMRateLimitError, + SourceDirectoryNotFoundError, +) + + +class OutputFormat(Enum): + """Output format options""" + + PYTHON_OBJECT = "python_object" # Returns dataclass + JSON = "json" + TEXT = "text" + MARKDOWN = "markdown" + WEBVIEW = "webview" # Self-contained interactive HTML + + +@dataclass +class AnalysisMetadata: + """Metadata about the analysis""" + + rocpd_version: str + analysis_version: str = "0.1.0" + database_file: str = "" + analysis_timestamp: str = "" + analysis_duration_ms: int = 0 + custom_prompt: Optional[str] = None + + +@dataclass +class GPUInfo: + """GPU device information""" + + name: str + architecture: str + agent_id: int = 0 + + +@dataclass +class ProfilingInfo: + """Profiling session information""" + + total_duration_ns: int + profiling_mode: str # "sys_trace_only", "sys_trace_with_counters", "pc_sampling" + 
analysis_tier: int # 1=trace, 2=counters, 3=pc_sampling + gpus: List[GPUInfo] = field(default_factory=list) + + +@dataclass +class AnalysisSummary: + """High-level summary of analysis""" + + overall_assessment: str + primary_bottleneck: str # "compute", "memory", "latency", "mixed", "unknown" + confidence: float # 0.0 to 1.0 + key_findings: List[str] = field(default_factory=list) + + +@dataclass +class ExecutionBreakdown: + """Time distribution breakdown""" + + kernel_time_ns: int + kernel_time_pct: float + memcpy_time_ns: int + memcpy_time_pct: float + api_overhead_ns: int = 0 + api_overhead_pct: float = 0.0 + idle_time_ns: int = 0 + idle_time_pct: float = 0.0 + + +@dataclass +class Recommendation: + """Single recommendation""" + + id: str + priority: str # "high", "medium", "low" + category: str # "memory", "compute", "occupancy", "memory_transfer", etc. + title: str + description: str + estimated_impact: str + next_steps: List[str] = field(default_factory=list) + + +@dataclass +class RecommendationSet: + """Prioritized recommendations""" + + high_priority: List[Recommendation] = field(default_factory=list) + medium_priority: List[Recommendation] = field(default_factory=list) + low_priority: List[Recommendation] = field(default_factory=list) + + +@dataclass +class AnalysisWarning: + """Warning message""" + + severity: str # "warning", "info" + message: str + recommendation: Optional[str] = None + + +@dataclass +class SourceAnalysisResult: + """ + Tier 0 analysis result from static source code scanning. + + Produced by analyze_source() and attached to AnalysisResult.tier0 + when --source-dir is provided alongside -i. + """ + + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "OpenCL", "PyTorch_HIP", etc. 
+ + files_scanned: int + files_skipped: int + + detected_kernels: List[Dict[str, Any]] # {name, file, line, launch_type} + kernel_count: int + + detected_patterns: List[ + Dict[str, Any] + ] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + + already_instrumented: bool + roctx_marker_count: int + + recommendations: List[Dict[str, Any]] # same structure as generate_recommendations() + suggested_counters: List[str] + suggested_first_command: str + + llm_explanation: Optional[str] = None + + +def _plan_to_source_result(plan) -> "SourceAnalysisResult": + """Convert a ProfilingPlan to a SourceAnalysisResult dataclass. + + Centralizes the conversion logic so both api.py:analyze_source() and + analyze.py:analyze_source_code() produce identical SourceAnalysisResult + objects without duplicating the field-mapping code. + """ + return SourceAnalysisResult( + source_dir=plan.source_dir, + analysis_timestamp=plan.analysis_timestamp, + programming_model=plan.programming_model, + files_scanned=plan.files_scanned, + files_skipped=plan.files_skipped, + detected_kernels=[ + { + "name": k.name, + "file": k.file, + "line": k.line, + "launch_type": k.launch_type, + } + for k in plan.detected_kernels + ], + kernel_count=plan.kernel_count, + detected_patterns=[ + { + "pattern_id": p.pattern_id, + "severity": p.severity, + "category": p.category, + "description": p.description, + "count": p.count, + "locations": p.locations, + } + for p in plan.detected_patterns + ], + risk_areas=plan.risk_areas, + already_instrumented=plan.already_instrumented, + roctx_marker_count=plan.roctx_marker_count, + recommendations=plan.recommendations, + suggested_counters=plan.suggested_counters, + suggested_first_command=plan.suggested_first_command, + ) + + +@dataclass +class AnalysisResult: + """ + Complete analysis result structure. + + This is the main return type for analyze_database(). + Contains all analysis data and can be serialized to JSON/text/markdown. 
+ """ + + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] = field(default_factory=list) + errors: List[str] = field(default_factory=list) + + # Optional LLM-enhanced natural language explanation + llm_enhanced_explanation: Optional[str] = None + + # Tier 0 source code analysis (populated when analyze_source() is also run) + tier0: Optional[SourceAnalysisResult] = None + + # TraceLens-derived analysis (Phase 1) + kernel_categories: List[dict] = field(default_factory=list) + short_kernels: dict = field(default_factory=dict) + interval_timeline: dict = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return asdict(self) + + def to_json(self, indent: int = 2) -> str: + """Serialize to schema-conformant JSON (analysis-output.schema.json v0.1.0). + + Delegates to format_analysis_output() to ensure the output matches the + normative JSON schema. Falls back to dataclass serialization if raw + analysis data is not available. + """ + raw = getattr(self, "_raw", None) + if raw: + return format_analysis_output( + time_breakdown=raw["time_breakdown"], + hotspots=raw["hotspots"], + memory_analysis=raw["memory_analysis"], + recommendations=raw["recommendations_raw"], + hardware_counters=raw["hardware_counters"], + database_path=raw["database_path"], + output_format="json", + interval_timeline=raw.get("interval_timeline"), # NEW + kernel_categories=raw.get("kernel_categories"), # NEW + short_kernels=raw.get("short_kernels"), # NEW + ) + raise RuntimeError( + "Raw analysis data not available. " + "Use analyze_database() to create the result, " + "or use to_dict() for a non-schema-conformant dict." + ) + + def to_webview(self) -> str: + """Generate self-contained interactive HTML report. 
+ + Returns the same AMD-themed webview HTML produced by the rocpd CLI + ``--format webview`` option. Requires that the result was created via + :func:`analyze_database` (which populates the raw data cache). + + Raises: + RuntimeError: If the result was not created via analyze_database(). + """ + raw = getattr(self, "_raw", None) + if not raw: + raise RuntimeError( + "Raw analysis data not available. " + "Use analyze_database() to create the result." + ) + return format_analysis_output( + time_breakdown=raw["time_breakdown"], + hotspots=raw["hotspots"], + memory_analysis=raw["memory_analysis"], + recommendations=raw["recommendations_raw"], + hardware_counters=raw["hardware_counters"], + database_path=raw["database_path"], + output_format="webview", + interval_timeline=raw.get("interval_timeline"), # NEW + kernel_categories=raw.get("kernel_categories"), # NEW + short_kernels=raw.get("short_kernels"), # NEW + ) + + def to_text(self) -> str: + """Generate plain text report. + + Works without ``_raw`` attached; renders from dataclass fields directly. + Does NOT guarantee schema conformance (use ``to_json()`` for that). 
+ """ + lines = [] + + # Header + lines.append("=" * 80) + lines.append("GPU PERFORMANCE ANALYSIS REPORT") + lines.append("=" * 80) + lines.append(f"Database: {self.metadata.database_file}") + lines.append(f"Analysis Date: {self.metadata.analysis_timestamp}") + lines.append(f"Analysis Tier: {self.profiling_info.analysis_tier}") + if self.metadata.custom_prompt: + lines.append(f"Custom Prompt: {self.metadata.custom_prompt}") + lines.append("") + + # Summary + lines.append("SUMMARY") + lines.append("-" * 80) + lines.append(self.summary.overall_assessment) + lines.append(f"Primary Bottleneck: {self.summary.primary_bottleneck}") + lines.append(f"Confidence: {self.summary.confidence:.0%}") + lines.append("") + + # Key findings + if self.summary.key_findings: + lines.append("Key Findings:") + for finding in self.summary.key_findings: + lines.append(f" โ€ข {finding}") + lines.append("") + + # Execution breakdown + lines.append("EXECUTION BREAKDOWN") + lines.append("-" * 80) + lines.append( + f"Kernel Execution: {self.execution_breakdown.kernel_time_pct:6.1f}%" + ) + lines.append( + f"Memory Copies: {self.execution_breakdown.memcpy_time_pct:6.1f}%" + ) + lines.append( + f"API Overhead: {self.execution_breakdown.api_overhead_pct:6.1f}%" + ) + lines.append("") + + # Recommendations + lines.append("RECOMMENDATIONS") + lines.append("-" * 80) + + for priority, recs in [ + ("HIGH PRIORITY", self.recommendations.high_priority), + ("MEDIUM PRIORITY", self.recommendations.medium_priority), + ("LOW PRIORITY", self.recommendations.low_priority), + ]: + if recs: + lines.append(f"\n{priority}:") + for rec in recs: + lines.append(f"\n {rec.title}") + lines.append(f" {rec.description}") + lines.append(f" Estimated Impact: {rec.estimated_impact}") + if rec.next_steps: + lines.append(" Next Steps:") + for step in rec.next_steps: + lines.append(f" - {step}") + + # LLM-enhanced explanation (if available) + if self.llm_enhanced_explanation: + lines.append("\n") + lines.append("=" * 80) + 
lines.append("AI-ENHANCED EXPLANATION") + lines.append("=" * 80) + lines.append(self.llm_enhanced_explanation) + + # Warnings + if self.warnings: + lines.append("\n") + lines.append("WARNINGS") + lines.append("-" * 80) + for warning in self.warnings: + lines.append(f"โš ๏ธ {warning.message}") + if warning.recommendation: + lines.append(f" Recommendation: {warning.recommendation}") + + lines.append("\n" + "=" * 80) + return "\n".join(lines) + + def to_markdown(self) -> str: + """Generate markdown report. + + Works without ``_raw`` attached; renders from dataclass fields directly. + Does NOT guarantee schema conformance (use ``to_json()`` for that). + """ + lines = [] + + # Header + lines.append("# GPU Performance Analysis Report") + lines.append("") + lines.append(f"**Database:** `{self.metadata.database_file}`") + lines.append(f"**Analysis Date:** {self.metadata.analysis_timestamp}") + lines.append(f"**Analysis Tier:** {self.profiling_info.analysis_tier}") + if self.metadata.custom_prompt: + lines.append(f"**Custom Prompt:** _{self.metadata.custom_prompt}_") + lines.append("") + + # Summary + lines.append("## Summary") + lines.append("") + lines.append(self.summary.overall_assessment) + lines.append("") + lines.append(f"- **Primary Bottleneck:** {self.summary.primary_bottleneck}") + lines.append(f"- **Confidence:** {self.summary.confidence:.0%}") + lines.append("") + + # Key findings + if self.summary.key_findings: + lines.append("### Key Findings") + lines.append("") + for finding in self.summary.key_findings: + lines.append(f"- {finding}") + lines.append("") + + # Execution breakdown + lines.append("## Execution Breakdown") + lines.append("") + lines.append("| Category | Percentage |") + lines.append("|----------|------------|") + lines.append( + f"| Kernel Execution | {self.execution_breakdown.kernel_time_pct:.1f}% |" + ) + lines.append( + f"| Memory Copies | {self.execution_breakdown.memcpy_time_pct:.1f}% |" + ) + lines.append( + f"| API Overhead | 
{self.execution_breakdown.api_overhead_pct:.1f}% |" + ) + lines.append("") + + # Recommendations + lines.append("## Recommendations") + lines.append("") + + for priority, recs, emoji in [ + ("High Priority", self.recommendations.high_priority, "๐Ÿ”ด"), + ("Medium Priority", self.recommendations.medium_priority, "๐ŸŸก"), + ("Low Priority", self.recommendations.low_priority, "๐ŸŸข"), + ]: + if recs: + lines.append(f"### {emoji} {priority}") + lines.append("") + for rec in recs: + lines.append(f"#### {rec.title}") + lines.append("") + lines.append(rec.description) + lines.append("") + lines.append(f"**Estimated Impact:** {rec.estimated_impact}") + lines.append("") + if rec.next_steps: + lines.append("**Next Steps:**") + for step in rec.next_steps: + lines.append(f"- {step}") + lines.append("") + + # LLM-enhanced explanation + if self.llm_enhanced_explanation: + lines.append("---") + lines.append("") + lines.append("## AI-Enhanced Explanation") + lines.append("") + lines.append(self.llm_enhanced_explanation) + lines.append("") + + # Warnings + if self.warnings: + lines.append("## Warnings") + lines.append("") + for warning in self.warnings: + lines.append(f"โš ๏ธ **{warning.severity.upper()}:** {warning.message}") + if warning.recommendation: + lines.append(f" - Recommendation: {warning.recommendation}") + lines.append("") + + return "\n".join(lines) + + +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + llm_thinking_tokens: Optional[int] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult: + """ + Analyze a rocpd database file and return AI-powered insights. + + This is the main entry point for programmatic analysis. + Performs local analysis (always) and optional LLM enhancement. 
+ + Args: + database_path: Path to .rpd or .db file + custom_prompt: Optional user question to guide analysis + enable_llm: Enable LLM-powered natural language enhancement + llm_provider: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider (or set env var) + llm_thinking_tokens: Enable extended thinking with this token budget. + Only supported with the Anthropic provider and compatible models + (claude-opus-4, claude-sonnet-4-5, claude-3-7-sonnet). + output_format: Desired output format + verbose: Enable verbose logging + top_kernels: Number of top kernels to analyze + + Returns: + AnalysisResult object with complete analysis + + Raises: + DatabaseNotFoundError: Database file doesn't exist + DatabaseCorruptedError: Database schema is invalid + MissingDataError: Required tables are missing + + Example: + >>> from rocpd.ai_analysis import analyze_database + >>> from pathlib import Path + >>> + >>> result = analyze_database(Path("output.db")) + >>> print(result.summary.overall_assessment) + >>> for rec in result.recommendations.high_priority: + ... print(f"- {rec.title}") + """ + # Validate database exists + if not database_path.exists(): + raise DatabaseNotFoundError(f"Database file not found: {database_path}") + + if verbose: + print(f"[Analysis] Analyzing database: {database_path}") + print(f"[Analysis] Enable LLM: {enable_llm}") + if custom_prompt: + print(f"[Analysis] Custom prompt: {custom_prompt}") + + # Perform local analysis by calling individual analysis functions directly. + # NOTE: We do NOT call analyze_performance() โ€” it returns a formatted str, + # not a dict. We need raw data to build the AnalysisResult dataclass. + try: + from ..importer import RocpdImportData + + # RocpdImportData's internal sanitize_input_list() iterates over its + # argument. Passing a plain str would iterate over characters. Pass a + # list with the single path string to ensure correct behavior. 
+ connection = RocpdImportData([str(database_path)]) + + time_breakdown = compute_time_breakdown(connection) + hotspots = identify_hotspots(connection, top_n=top_kernels) + memory_analysis = analyze_memory_copies(connection) + hardware_counters = analyze_hardware_counters(connection) + already_collected = _detect_already_collected(connection) + + # TraceLens-derived analysis + interval_timeline = compute_interval_timeline(connection) + kernel_categories = analyze_kernels_by_category( + connection, interval_timeline["total_wall_ns"] + ) + short_kernels_data = analyze_short_kernels(connection) + + recommendations = generate_recommendations( + time_breakdown, + hotspots, + memory_analysis, + hardware_counters, + already_collected, + short_kernels=short_kernels_data, + interval_timeline=interval_timeline, + ) + + if verbose: + print("[Analysis] Local analysis complete") + + except Exception as e: + raise DatabaseCorruptedError(f"Failed to analyze database: {e}") + + # Build AnalysisResult from raw analysis payloads + result = _build_analysis_result( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + custom_prompt=custom_prompt, + ) + + result.kernel_categories = kernel_categories + result.short_kernels = short_kernels_data + result.interval_timeline = interval_timeline + + # Also write into _raw so to_json() / to_webview() include them + result._raw["interval_timeline"] = interval_timeline + result._raw["kernel_categories"] = kernel_categories + result._raw["short_kernels"] = short_kernels_data + + # Optional LLM enhancement + if enable_llm and llm_provider: + try: + if verbose: + print(f"[Analysis] Enhancing with {llm_provider} LLM...") + + analyzer = LLMAnalyzer( + provider=llm_provider, + api_key=llm_api_key, + verbose=verbose, + thinking_budget_tokens=llm_thinking_tokens, + ) + + # Convert result to dict for LLM + 
analysis_data = _convert_result_to_llm_format(result) + + # Build AnalysisContext so _select_tags() gates reference guide sections + # (including tracelens_metrics when TraceLens data is present) + has_counters = hardware_counters.get("has_counters", False) + analysis_tier = 2 if has_counters else 1 + context = AnalysisContext( + tier=analysis_tier, + has_counters=has_counters, + custom_prompt=custom_prompt, + kernel_categories=result.kernel_categories or [], + interval_timeline={ + k: v + for k, v in result.interval_timeline.items() + if k.endswith("_pct") + }, + short_kernel_summary=( + { + "threshold_us": result.short_kernels.get("threshold_us", 10), + "short_kernel_count": result.short_kernels.get( + "short_kernel_count", 0 + ), + "wasted_pct_of_kernel_time": result.short_kernels.get( + "wasted_pct_of_kernel_time", 0 + ), + } + if result.short_kernels + else None + ), + ) + + # Get LLM enhancement + llm_explanation = analyzer.analyze_with_llm( + analysis_data, + custom_prompt=custom_prompt, + context=context, + ) + + result.llm_enhanced_explanation = llm_explanation + + if verbose: + print("[Analysis] LLM enhancement complete") + + except (LLMAuthenticationError, LLMRateLimitError): + # Auth and rate-limit errors must propagate โ€” the caller needs to + # know their credentials are invalid or exhausted. + raise + except Exception as e: + # Other LLM errors are non-critical: add a warning and continue + # with local-only results. 
+ result.warnings.append( + AnalysisWarning( + severity="warning", + message=f"LLM enhancement failed: {e}", + recommendation="Analysis continues with local-only results", + ) + ) + + if verbose: + print(f"[Analysis] LLM enhancement failed: {e}") + + return result + + +def _build_analysis_result( + time_breakdown: Dict[str, Any], + hotspots: List[Dict[str, Any]], + memory_analysis: Dict[str, Any], + recommendations: List[Dict[str, Any]], + hardware_counters: Dict[str, Any], + database_path: Path, + custom_prompt: Optional[str], +) -> AnalysisResult: + """Build AnalysisResult from raw analysis payloads returned by analyze.py functions. + + Key mapping from generate_recommendations() output: + rec["issue"] โ†’ Recommendation.title + rec["suggestion"] โ†’ Recommendation.description + rec["estimated_impact"] โ†’ Recommendation.estimated_impact + rec["actions"] โ†’ Recommendation.next_steps + rec["priority"] โ†’ "HIGH"/"MEDIUM"/"INFO" (uppercase) โ†’ normalized to lowercase + """ + from datetime import datetime + + # Build metadata + metadata = AnalysisMetadata( + rocpd_version=_ROCPD_VERSION, + analysis_version="0.1.0", # schema version, not module version + database_file=str(database_path), + analysis_timestamp=datetime.now().isoformat(), + custom_prompt=custom_prompt, + ) + + # Build profiling info + has_counters = hardware_counters.get("has_counters", False) + profiling_mode = "sys_trace_with_counters" if has_counters else "sys_trace_only" + analysis_tier = 2 if has_counters else 1 + + profiling_info = ProfilingInfo( + total_duration_ns=int(time_breakdown.get("total_runtime", 0)), + profiling_mode=profiling_mode, + analysis_tier=analysis_tier, + gpus=[], + ) + + # Build summary โ€” mirrors _build_summary() logic in analyze.py + primary_bottleneck = "mixed" + confidence = 0.50 + + memcpy_pct = time_breakdown.get("memcpy_percent", 0) + kernel_pct = time_breakdown.get("kernel_percent", 0) + overhead_pct = time_breakdown.get("overhead_percent", 0) + if memcpy_pct > 30: 
+ primary_bottleneck = "memory_transfer" + confidence = 0.85 + elif memcpy_pct > 20: + primary_bottleneck = "memory_transfer" + confidence = 0.70 + elif overhead_pct > 25: + primary_bottleneck = "latency" + confidence = 0.75 + elif kernel_pct > 70 and has_counters: + primary_bottleneck = "compute" + confidence = 0.80 + elif kernel_pct > 70: + primary_bottleneck = "compute" + confidence = 0.60 + + summary = AnalysisSummary( + overall_assessment=f"Analysis complete. {len(hotspots)} kernels analyzed.", + primary_bottleneck=primary_bottleneck, + confidence=confidence, + key_findings=[ + f"Total kernel execution time: {kernel_pct:.1f}%", + f"Memory copy overhead: {memcpy_pct:.1f}%", + f"Top kernel: {hotspots[0]['name'] if hotspots else 'N/A'}", + ], + ) + + # Build execution breakdown + execution_breakdown = ExecutionBreakdown( + kernel_time_ns=int(time_breakdown.get("total_kernel_time", 0)), + kernel_time_pct=kernel_pct, + memcpy_time_ns=int(time_breakdown.get("total_memcpy_time", 0)), + memcpy_time_pct=memcpy_pct, + api_overhead_pct=time_breakdown.get("overhead_percent", 0.0), + ) + + # Build recommendations โ€” map keys from generate_recommendations() output. + # generate_recommendations() uses: issue, suggestion, estimated_impact, actions, + # priority (uppercase: "HIGH"/"MEDIUM"/"INFO"), category, commands. 
+ rec_set = RecommendationSet() + for i, rec in enumerate(recommendations, 1): + priority_upper = rec.get("priority", "MEDIUM").upper() + recommendation = Recommendation( + id=f"rec_{i:03d}", + priority=priority_upper.lower(), + category=rec.get("category", "general"), + title=rec.get("issue", "Optimization opportunity"), + description=rec.get("suggestion", ""), + estimated_impact=rec.get("estimated_impact", "Unknown"), + next_steps=rec.get("actions", []), + ) + + if priority_upper == "HIGH": + rec_set.high_priority.append(recommendation) + elif priority_upper in ("MEDIUM", "INFO"): + rec_set.medium_priority.append(recommendation) + else: + rec_set.low_priority.append(recommendation) + + # Build warnings + warnings = [] + if not has_counters: + warnings.append( + AnalysisWarning( + severity="warning", + message="No hardware counters collected. Analysis limited to Tier 1 (trace data only).", + recommendation="Collect counters with: rocprofv3 --pmc GRBM_COUNT SQ_WAVES -- ./app", + ) + ) + + result = AnalysisResult( + metadata=metadata, + profiling_info=profiling_info, + summary=summary, + execution_breakdown=execution_breakdown, + recommendations=rec_set, + warnings=warnings, + ) + + # Attach raw payloads as a dynamic attribute so to_json()/to_webview() can + # delegate serialization to format_analysis_output() for schema conformance. + result._raw = { + "time_breakdown": time_breakdown, + "hotspots": hotspots, + "memory_analysis": memory_analysis, + "recommendations_raw": recommendations, + "hardware_counters": hardware_counters, + "database_path": str(database_path), + } + + return result + + +def _convert_result_to_llm_format(result: AnalysisResult) -> Dict[str, Any]: + """Convert AnalysisResult to the format expected by LLMAnalyzer._sanitize_data(). + + Populates all sections from the raw analysis payloads stored on the result + so the LLM receives real profiling data rather than empty placeholders. 
+ """ + raw = getattr(result, "_raw", {}) + hotspots = raw.get("hotspots", []) + memory_analysis = raw.get("memory_analysis", {}) + hardware_counters = raw.get("hardware_counters", {}) + + return { + # GPU info โ€” arch not currently stored in the DB views; keep as generic + "gpu": {"name": "AMD GPU", "arch": "unknown"}, + "execution_breakdown": { + "kernel_time_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_time_pct": result.execution_breakdown.memcpy_time_pct, + "api_overhead_pct": result.execution_breakdown.api_overhead_pct, + }, + # Real kernel hotspot data + "kernels": [ + { + "name": k.get("name"), + "calls": k.get("calls"), + "total_duration_ns": k.get("total_duration"), + "avg_duration_ns": k.get("avg_duration"), + "percent_of_total": k.get("percent_of_total"), + } + for k in hotspots + ], + # Real memory transfer data keyed by direction + "memory_ops": { + direction: { + "count": info.get("count"), + "total_bytes": info.get("total_bytes"), + "avg_duration_ns": info.get("avg_duration"), + } + for direction, info in memory_analysis.items() + }, + "has_counters": hardware_counters.get("has_counters", False), + # Derived hardware metrics (gpu_utilization_percent, avg_waves, etc.) 
+ "hardware_metrics": hardware_counters.get("metrics", {}), + "has_pc_sampling": result.profiling_info.analysis_tier >= 3, + "interval_timeline": { + k: v + for k, v in result.interval_timeline.items() + if k.endswith("_pct") # pct fields only โ€” omit _ns fields to reduce tokens + }, + "kernel_categories": [ + {k: v for k, v in c.items() if k != "total_ns" and k != "avg_duration_ns"} + for c in result.kernel_categories + ], + "short_kernel_summary": { + "threshold_us": result.short_kernels.get("threshold_us", 10), + "short_kernel_count": result.short_kernels.get("short_kernel_count", 0), + "wasted_pct_of_kernel_time": result.short_kernels.get( + "wasted_pct_of_kernel_time", 0 + ), + }, + } + + +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs, +) -> str: + """ + Analyze database and return/save JSON output. + + Args: + database_path: Path to .rpd or .db file + output_json_path: Optional path to save JSON file + **kwargs: Additional arguments passed to analyze_database() + + Returns: + JSON string + + Example: + >>> json_output = analyze_database_to_json( + ... Path("output.db"), + ... output_json_path=Path("analysis.json") + ... ) + """ + result = analyze_database(database_path, **kwargs) + json_output = result.to_json() + + if output_json_path: + output_json_path.write_text(json_output) + + return json_output + + +def get_kernel_analysis(database_path: Path, kernel_name: str, **kwargs) -> Dict: + """ + Get analysis for a specific kernel. 
+ + Args: + database_path: Path to .rpd or .db file + kernel_name: Exact kernel name or pattern + **kwargs: Additional arguments + + Returns: + Kernel analysis data + """ + # TODO: Implement kernel-specific analysis + raise NotImplementedError("Kernel-specific analysis not yet implemented") + + +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs, +) -> List[Recommendation]: + """ + Get filtered recommendations from analysis. + + Args: + database_path: Path to .rpd or .db file + priority_filter: Filter by priority ("high", "medium", "low") + category_filter: Filter by category + **kwargs: Additional arguments + + Returns: + List of Recommendation objects + """ + result = analyze_database(database_path, **kwargs) + + recommendations = [] + if priority_filter == "high" or priority_filter is None: + recommendations.extend(result.recommendations.high_priority) + if priority_filter == "medium" or priority_filter is None: + recommendations.extend(result.recommendations.medium_priority) + if priority_filter == "low" or priority_filter is None: + recommendations.extend(result.recommendations.low_priority) + + if category_filter: + recommendations = [ + rec for rec in recommendations if rec.category == category_filter + ] + + return recommendations + + +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult: + """ + Analyze a source code directory and return a Tier 0 profiling plan. + + No database file is required. Scans .hip, .cpp, .cu, .cl, .py, .h, + .hpp files for GPU programming patterns and generates structured + recommendations for what to profile and with which commands. 
+ + Args: + source_dir: Path to source code directory + custom_prompt: Optional user question to guide LLM analysis + enable_llm: Enable LLM-powered explanation of the profiling plan + llm_provider: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider (or set env var) + verbose: Enable verbose logging + + Returns: + SourceAnalysisResult with profiling plan + + Raises: + SourceDirectoryNotFoundError: Source directory doesn't exist + SourceAnalysisError: Error during source scanning + + Example: + >>> from rocpd.ai_analysis import analyze_source + >>> from pathlib import Path + >>> + >>> result = analyze_source(Path("./my_app/src")) + >>> print(result.programming_model) + >>> print(result.suggested_first_command) + >>> for rec in result.recommendations: + ... print(f"[{rec['priority']}] {rec['category']}: {rec['issue']}") + """ + if not source_dir.exists() or not source_dir.is_dir(): + raise SourceDirectoryNotFoundError( + f"Source directory not found or not a directory: {source_dir}" + ) + + if verbose: + print(f"[Tier0] Scanning source directory: {source_dir}") + + from .source_analyzer import SourceAnalyzer + + scanner = SourceAnalyzer(source_dir, verbose=verbose) + plan = scanner.analyze() + + if verbose: + print( + f"[Tier0] Scanned {plan.files_scanned} files, " + f"found {plan.kernel_count} kernels, " + f"programming model: {plan.programming_model}" + ) + + # Convert ProfilingPlan to SourceAnalysisResult dataclass + result = _plan_to_source_result(plan) + + # Optional LLM enhancement + if enable_llm and llm_provider: + try: + if verbose: + print(f"[Tier0] Enhancing with {llm_provider} LLM...") + + analyzer = LLMAnalyzer( + provider=llm_provider, + api_key=llm_api_key, + verbose=verbose, + ) + context = AnalysisContext(tier=0, custom_prompt=custom_prompt) + result.llm_explanation = analyzer.analyze_source_with_llm( + result, custom_prompt=custom_prompt, context=context + ) + + if verbose: + print("[Tier0] LLM enhancement complete") + + 
except (LLMAuthenticationError, LLMRateLimitError): + raise + except Exception as e: + if verbose: + print(f"[Tier0] LLM enhancement failed: {e}") + + return result + + +def validate_database(database_path: Path) -> Dict[str, Any]: + """ + Validate database schema and contents without performing analysis. + + Args: + database_path: Path to .rpd or .db file + + Returns: + Validation result dictionary + + Example: + >>> validation = validate_database(Path("output.db")) + >>> print(f"Valid: {validation['is_valid']}") + >>> print(f"Analysis tier: {validation['tier']}") + """ + if not database_path.exists(): + raise DatabaseNotFoundError(f"Database not found: {database_path}") + + try: + from ..importer import RocpdImportData, execute_statement + + connection = RocpdImportData([str(database_path)]) + + # Check for required tables AND views (kernels/memory_copies are views, + # not raw tables, in rocprofv3 databases created by the rocpd importer) + tables_query = "SELECT name FROM sqlite_master WHERE type IN ('table','view')" + tables = [ + row[0] for row in execute_statement(connection, tables_query).fetchall() + ] + + has_kernels = "kernels" in tables + has_memory_copies = "memory_copies" in tables + has_counters = "pmc_events" in tables + has_pc_sampling = "pc_sampling" in tables + + # Determine tier + tier = 1 + if has_counters: + tier = 2 + if has_pc_sampling: + tier = 3 + + return { + "is_valid": has_kernels, + "tier": tier, + "has_kernels": has_kernels, + "has_memory_copies": has_memory_copies, + "has_counters": has_counters, + "has_pc_sampling": has_pc_sampling, + "tables": tables, + } + + except Exception as e: + raise DatabaseCorruptedError(f"Database validation failed: {e}") diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md new file mode 100644 index 00000000000..41ff6e09ded --- /dev/null +++ 
b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md
@@ -0,0 +1,1645 @@
+# rocpd AI Analysis Python API Documentation
+
+**Version:** 0.2.0
+**Module:** `rocpd.ai_analysis`
+
+---
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Installation](#installation)
+3. [Quick Start](#quick-start)
+4. [API Reference](#api-reference)
+5. [Data Classes](#data-classes)
+6. [Output Formats](#output-formats)
+7. [LLM Enhancement](#llm-enhancement)
+8. [Error Handling](#error-handling)
+9. [Integration Examples](#integration-examples)
+10. [Bug Fixes & Behavioral Changes](#bug-fixes--behavioral-changes)
+
+---
+
+## Overview
+
+The rocpd AI Analysis API provides programmatic access to AI-powered GPU performance analysis. It's designed for integration with visualization tools (like Optiq), automated analysis pipelines, and custom workflows.
+
+**Key Features:**
+
+- ✅ **Local-first analysis** - Works offline, no API calls required
+- ✅ **Tier 0 source analysis** - Scan source code without a trace database (`analyze_source()`)
+- ✅ **Optional LLM enhancement** - Natural language explanations via Anthropic Claude, OpenAI GPT, any OpenAI-compatible private server, or local Ollama
+- ✅ **Multiple output formats** - Python objects, JSON, text, markdown, webview (interactive HTML)
+- ✅ **Privacy-focused** - Data sanitization for LLM mode
+- ✅ **User-modifiable** - Customize LLM behavior via reference guide
+- ✅ **Persistent conversations** - `LLMConversation` class for multi-turn streaming sessions
+- ✅ **Type-safe** - Dataclass-based API with type hints
+
+---
+
+## Installation
+
+The AI analysis module is included with rocprofiler-sdk 6.3.0 or later.
+ +```bash +# rocprofiler-sdk is typically installed at: +/opt/rocm/lib/python3.12/site-packages/rocpd/ + +# No additional installation needed for local-only analysis + +# For LLM enhancement, install provider SDKs: +pip install anthropic # For Anthropic Claude +pip install openai # For OpenAI GPT +``` + +--- + +## Quick Start + +### Basic Analysis (Local Mode) + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Analyze a database file +result = analyze_database(Path("output.db")) + +# Access results +print(result.summary.overall_assessment) +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") +print(f"Confidence: {result.summary.confidence:.0%}") + +# Get recommendations +for rec in result.recommendations.high_priority: + print(f"๐Ÿ”ด {rec.title}") + print(f" {rec.description}") + print(f" Impact: {rec.estimated_impact}") +``` + +### With LLM Enhancement + +```python +import os +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Set API key +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." + +# Analyze with LLM enhancement +result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + custom_prompt="Why is my matmul kernel slow?" 
+) + +# LLM-enhanced natural language explanation +print(result.llm_enhanced_explanation) +``` + +### JSON Output + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +# Generate JSON output +json_output = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json") # Optional: save to file +) + +# JSON string is also returned +print(json_output) +``` + +### Webview (Interactive HTML) + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) +Path("analysis.html").write_text(result.to_webview()) +# Open analysis.html in any browser - no server required +``` + +Or via CLI (file extension applied automatically): + +```bash +rocpd analyze -i output.db --format webview -d ./output -o analysis +# Produces: ./output/analysis.html +``` + +--- + +## API Reference + +### Main Functions + +#### `analyze_database()` + +Main entry point for performance analysis. 
+ +```python +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file (.rpd or .db) +- `custom_prompt` (str, optional): Natural language question to guide analysis + - Example: `"Why is kernel X slow?"` +- `enable_llm` (bool): Enable LLM-powered enhancements (default: False) +- `llm_provider` (str, optional): LLM provider ("anthropic" or "openai") +- `llm_api_key` (str, optional): API key (or use environment variable) +- `output_format` (OutputFormat): Output format (default: PYTHON_OBJECT) +- `verbose` (bool): Enable verbose logging (default: False) +- `top_kernels` (int): Number of top kernels to analyze (default: 10) + +**Returns:** + +- `AnalysisResult`: Complete analysis results object + +**Raises:** + +- `DatabaseNotFoundError`: Database file doesn't exist +- `DatabaseCorruptedError`: Database schema is invalid +- `MissingDataError`: Required tables missing +- `LLMAuthenticationError`: LLM API key invalid (if enable_llm=True) + +**Example:** + +```python +from rocpd.ai_analysis import analyze_database, OutputFormat +from pathlib import Path + +result = analyze_database( + database_path=Path("output.db"), + custom_prompt="Focus on memory bottlenecks", + enable_llm=True, + llm_provider="anthropic", + verbose=True, + top_kernels=20 +) +``` + +--- + +#### `analyze_database_to_json()` + +Analyze database and return JSON output. 
+ +```python +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs +) -> str: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file +- `output_json_path` (Path, optional): Save JSON to this file +- `**kwargs`: Additional arguments passed to `analyze_database()` + +**Returns:** + +- `str`: JSON string + +**Example:** + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +json_str = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json"), + enable_llm=True, + llm_provider="anthropic" +) +``` + +--- + +#### `get_recommendations()` + +Get filtered recommendations from analysis. + +```python +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs +) -> List[Recommendation]: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file +- `priority_filter` (str, optional): Filter by priority ("high", "medium", "low") +- `category_filter` (str, optional): Filter by category ("memory", "compute", etc.) +- `**kwargs`: Additional arguments passed to `analyze_database()` + +**Returns:** + +- `List[Recommendation]`: Filtered recommendations + +**Example:** + +```python +from rocpd.ai_analysis import get_recommendations +from pathlib import Path + +# Get only high-priority recommendations +high_priority_recs = get_recommendations( + database_path=Path("output.db"), + priority_filter="high" +) + +for rec in high_priority_recs: + print(f"{rec.title}: {rec.estimated_impact}") +``` + +--- + +#### `validate_database()` + +Validate database without performing full analysis. 
+ +```python +def validate_database(database_path: Path) -> Dict[str, Any]: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file + +**Returns:** + +- `Dict`: Validation results with keys: + - `is_valid` (bool): Database is valid + - `tier` (int): Analysis tier (1=trace, 2=counters, 3=pc_sampling) + - `has_kernels` (bool): Has kernel data + - `has_memory_copies` (bool): Has memory copy data + - `has_counters` (bool): Has hardware counters + - `has_pc_sampling` (bool): Has PC sampling data + - `tables` (List[str]): List of table names + +**Example:** + +```python +from rocpd.ai_analysis import validate_database +from pathlib import Path + +validation = validate_database(Path("output.db")) + +print(f"Valid: {validation['is_valid']}") +print(f"Analysis tier: {validation['tier']}") +print(f"Has counters: {validation['has_counters']}") +``` + +--- + +#### `analyze_source()` + +Analyze source code directory (Tier 0) and return a profiling plan. No database required. 
+ +```python +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult: +``` + +**Parameters:** + +- `source_dir` (Path): Directory containing GPU source code (`.hip`, `.cpp`, `.cu`, `.cl`, `.py`, `.h`, `.hpp`) +- `custom_prompt` (str, optional): Natural language question to guide LLM analysis +- `enable_llm` (bool): Enable LLM-powered explanation of the profiling plan (default: False) +- `llm_provider` (str, optional): LLM provider ("anthropic" or "openai") +- `llm_api_key` (str, optional): API key (or use environment variable) +- `verbose` (bool): Enable verbose logging (default: False) + +**Returns:** + +- `SourceAnalysisResult`: Profiling plan with detected kernels, patterns, risk areas, and suggested commands + +**Raises:** + +- `SourceDirectoryNotFoundError`: Source directory doesn't exist +- `SourceAnalysisError`: Error during source scanning + +**Example:** + +```python +from rocpd.ai_analysis import analyze_source +from pathlib import Path + +result = analyze_source(Path("./my_app/src")) +print(f"Programming model: {result.programming_model}") +print(f"Kernels found: {result.kernel_count}") +print(f"Suggested first command:\n {result.suggested_first_command}") + +for rec in result.recommendations: + print(f"[{rec['priority']}] {rec['category']}: {rec['issue']}") +``` + +**CLI equivalent:** + +```bash +rocpd analyze --source-dir ./my_app/src +rocpd analyze --source-dir ./my_app/src --format json -d ./out -o plan # โ†’ plan.json + +# Combined with trace database +rocpd analyze -i output.db --source-dir ./my_app/src +``` + +--- + +### Recommendation Deduplication + +The engine automatically detects what was already collected in the profiled run and +suppresses redundant suggestions: + +| Already in database | Commands suppressed | +|---|---| +| `kernels` rows | `rocprofv3 
--kernel-trace` | +| `memory_copies` rows | `rocprofv3 --memory-copy-trace` | +| `kernels` + `regions` rows | All `--sys-trace`-equivalent flags | +| `pmc_events` counter `X` | `--pmc X` in any `rocprofv3` command | + +**PMC counter example**: if the trace was collected with +`--pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES`, a "Low occupancy" recommendation that +would have suggested `--pmc SQ_WAVES SQ_WAVE_CYCLES TA_TA_BUSY` will be trimmed to +`--pmc SQ_WAVE_CYCLES TA_TA_BUSY` (only the uncollected counters). If *all* suggested +counters are already present the entire `rocprofv3` command is dropped. + +`rocprof-compute` commands are **never** dropped โ€” they always represent new deep +hardware counter analysis beyond what `rocprofv3` captures. + +--- + +## Data Classes + +### `AnalysisResult` + +Main result object containing all analysis data. + +**Attributes:** + +```python +@dataclass +class AnalysisResult: + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] + errors: List[str] + llm_enhanced_explanation: Optional[str] # Only if enable_llm=True +``` + +**Methods:** + +- `to_dict() -> Dict[str, Any]`: Convert to dictionary +- `to_json(indent: int = 2) -> str`: Serialize to JSON +- `to_text() -> str`: Generate plain text report +- `to_markdown() -> str`: Generate markdown report +- `to_webview() -> str`: Generate self-contained interactive HTML report + +**Example:** + +```python +result = analyze_database(Path("output.db")) + +# Convert to different formats +json_str = result.to_json() +text_report = result.to_text() +markdown_report = result.to_markdown() + +# Access structured data +print(f"Kernel time: {result.execution_breakdown.kernel_time_pct:.1f}%") +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") +``` + +--- + +### `Recommendation` + +Single optimization recommendation. 
+ +```python +@dataclass +class Recommendation: + id: str + priority: str # "high", "medium", "low" + category: str # "memory", "compute", "occupancy", etc. + title: str + description: str + estimated_impact: str + next_steps: List[str] +``` + +**Example:** + +```python +for rec in result.recommendations.high_priority: + print(f"ID: {rec.id}") + print(f"Title: {rec.title}") + print(f"Category: {rec.category}") + print(f"Impact: {rec.estimated_impact}") + print("Next steps:") + for step in rec.next_steps: + print(f" - {step}") +``` + +--- + +### `SourceAnalysisResult` + +Tier 0 analysis result from static source code scanning (returned by `analyze_source()`). + +**Attributes:** + +```python +@dataclass +class SourceAnalysisResult: + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "OpenCL", "PyTorch_HIP", etc. + + files_scanned: int + files_skipped: int + + detected_kernels: List[Dict] # {name, file, line, launch_type} + kernel_count: int + + detected_patterns: List[Dict] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + + already_instrumented: bool # True if ROCTx markers detected + roctx_marker_count: int + + recommendations: List[Dict] # Same structure as generate_recommendations() output + suggested_counters: List[str] # Recommended --pmc counters for this codebase + suggested_first_command: str # First rocprofv3 command to run + + llm_explanation: Optional[str] # Only if enable_llm=True +``` + +**Example:** + +```python +result = analyze_source(Path("./my_app")) + +# Programming model detection +print(result.programming_model) # "HIP+ROCm_Libraries" + +# Discovered kernels +for k in result.detected_kernels: + print(f" {k['name']} in {k['file']}:{k['line']}") + +# Risk patterns +for p in result.detected_patterns: + print(f"[{p['severity'].upper()}] {p['category']}: {p['description']}") + +# Suggested profiling workflow +print(result.suggested_first_command) +# e.g.: 
rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -- ./app +``` + +--- + +### Other Data Classes + +- `AnalysisMetadata`: Metadata about analysis (timestamps, versions, etc.) +- `ProfilingInfo`: Profiling session info (duration, mode, GPUs) +- `AnalysisSummary`: High-level summary (assessment, bottleneck, findings) +- `ExecutionBreakdown`: Time distribution (kernel, memcpy, API overhead) +- `RecommendationSet`: Prioritized recommendations (high/medium/low) +- `AnalysisWarning`: Warning messages + +See inline docstrings for complete documentation. + +--- + +## Output Formats + +### Python Object (Default) + +Returns `AnalysisResult` dataclass with full type safety. + +```python +result = analyze_database(Path("output.db")) +print(result.summary.overall_assessment) +``` + +### JSON + +Machine-readable structured data. Output file extension: `.json`. + +```python +from rocpd.ai_analysis import analyze_database, OutputFormat + +result = analyze_database( + Path("output.db"), + output_format=OutputFormat.JSON +) + +json_str = result.to_json(indent=2) +``` + +**JSON Output conforms to `analysis-output.schema.json` (v0.1.0):** + +```json +{ + "schema_version": "0.1.0", + "metadata": { + "rocpd_version": "6.3.0", + "analysis_version": "0.1.0", + "database_file": "/path/to/output.db", + "analysis_timestamp": "2026-02-07T14:30:00Z" + }, + "execution_breakdown": { + "kernel_time_pct": 40.0, + "memcpy_time_pct": 55.0, + "api_overhead_pct": 5.0, + "idle_time_pct": 0.0, + "total_runtime_ns": 5000000000 + }, + "hotspots": [ + { + "rank": 1, + "name": "conv2d_kernel", + "calls": 100, + "total_duration_ns": 2000000000, + "avg_duration_ns": 20000000, + "pct_of_total": 40.0 + } + ], + "memory_analysis": { ... }, + "hardware_counters": { ... 
}, + "recommendations": [ + { + "priority": "HIGH", + "category": "Low Occupancy", + "issue": "Average wave occupancy is low", + "suggestion": "Increase occupancy by reducing VGPR usage", + "estimated_impact": "15-20% performance improvement", + "actions": ["Use rocprof-compute to measure occupancy", ...], + "commands": [...] + } + ], + "warnings": [...] +} +``` + +> See `docs/analysis-output.schema.json` for the normative schema definition and +> `docs/SCHEMA_CHANGELOG.md` for version history. + +### Text + +Human-readable plain text report. Output file extension: `.txt`. + +```python +result = analyze_database(Path("output.db")) +text_report = result.to_text() +print(text_report) +``` + +### Markdown + +Markdown-formatted report with syntax highlighting. Output file extension: `.md`. + +```python +result = analyze_database(Path("output.db")) +markdown_report = result.to_markdown() +Path("report.md").write_text(markdown_report) +``` + +### Webview (Interactive HTML) + +Self-contained single-file HTML report with light/dark theme, sortable tables, interactive +recommendation cards, status-colored KPI cards, and SVG performance gauges. No external +dependencies โ€” works fully offline. Output file extension: `.html`. + +```python +result = analyze_database(Path("output.db")) +html_report = result.to_webview() +Path("report.html").write_text(html_report) +``` + +**CLI usage:** + +```bash +# Produces output/analysis.html automatically +rocpd analyze -i output.db --format webview -d ./output -o analysis +``` + +**Features of the HTML report:** + +- **Light/Dark theme toggle**: Persisted in `localStorage`; defaults to AMD dark. Header + always uses AMD gradient branding regardless of active theme. +- **Status summary badges**: Critical/Warning/Low/Info recommendation counts shown in the + sticky header โ€” key issues visible without scrolling. 
+- **Metric pills row**: Runtime (ms), kernel dispatch count, analysis tier, generation
+  timestamp, and DB file path in a compact row below the header.
+- **Status-colored KPI cards**: Kernel %, bottleneck type, total runtime, and tier cards
+  with colored top border (green/amber/red) reflecting health status.
+- **Priority icons on recommendations**: 🔴 HIGH, 🟠 MEDIUM, 🟡 LOW, ℹ INFO icons on each card.
+- **Overview panel**: Assessment text (blockquote style), status KPI grid, key findings list.
+- **Execution breakdown**: Gradient segment bars + grid-aligned legend rows.
+- **Recommendations**: Collapsible cards color-coded by priority (HIGH auto-expanded);
+  one-click copy of profiling commands; section-level Critical/Warning count badges.
+- **Hotspot table**: Sortable by any column; rows with >20% of total time highlighted.
+- **Memory transfers**: Per-direction table (H2D, D2H, D2D, P2P).
+- **Hardware counters**: GPU utilization and wave occupancy gauges (Tier 2); gauges have
+  background fill and hover border effect.
+- **FAB scroll-to-top**: Floating action button appears after scrolling 250 px.
+- **Staggered animations**: Section cards fade in with `@keyframes fadeInUp` on load.
+- **Embedded data**: Full JSON payload included for programmatic inspection.
+- **Hover tooltips**: Every graph, gauge, bar, table column, and counter row shows a
+  floating tooltip on hover explaining what the metric means, why it matters, good/bad
+  thresholds, and how to address issues. Coverage includes:
+  - *Gauges*: counter formula (e.g.
`GRBM_GUI_ACTIVE รท GRBM_COUNT`), target thresholds, + current status assessment + - *Breakdown bars*: what each category measures, optimization guidance + - *Overview stats*: per-bottleneck type explanation with specific fix advice, + Tier 1 vs Tier 2 distinction with upgrade command + - *Hotspot columns*: semantics of Calls, Total/Avg/Min time, % Total + - *Memory directions*: H2D/D2H/D2D/P2P with PCIe vs HBM bandwidth context + - *Counter rows*: educational content for 20+ known AMD GPU counters + (GRBM_*, SQ_*, TCP/TCC cache, FETCH_SIZE, WRITE_SIZE, etc.); + unknown counters receive a generic fallback message + +--- + +## LLM Enhancement + +### Overview + +LLM enhancement provides natural language explanations of performance data. It's **optional** and **privacy-focused**. + +### How It Works + +1. **Local analysis runs first** (always) +2. **Data is sanitized** (kernel names โ†’ [KERNEL_1], grid sizes โ†’ [REDACTED]) +3. **Reference guide loaded** (the "fence" - defines analysis rules) +4. **LLM called with sanitized data + reference guide** +5. **Natural language explanation returned** + +### Enabling LLM Enhancement + +**Option 1: Environment Variable** + +```bash +export ANTHROPIC_API_KEY="sk-ant-..." +``` + +```python +from rocpd.ai_analysis import analyze_database + +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic" +) +``` + +**Option 2: Pass API Key Directly** + +```python +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + llm_api_key="sk-ant-..." 
+)
+```
+
+### Supported Providers
+
+- **Anthropic Claude** (recommended)
+  - Provider: `"anthropic"`
+  - Environment variable: `ANTHROPIC_API_KEY`
+  - Default model: `claude-sonnet-4-20250514`
+
+- **OpenAI GPT**
+  - Provider: `"openai"`
+  - Environment variable: `OPENAI_API_KEY`
+  - Default model: `gpt-4-turbo-preview`
+  - **Model compatibility**: newer models (gpt-5, o1, o3, gpt-4o-2024-11-20+) require
+    `max_completion_tokens` instead of `max_tokens`. This is handled automatically —
+    `max_completion_tokens` is tried first and falls back to `max_tokens` if needed.
+
+- **Private/enterprise server** (any OpenAI-compatible endpoint)
+  - Provider: `"private"` (`--llm private`)
+  - Required env var: `ROCPD_LLM_PRIVATE_URL` — base URL (e.g. `https://llm-api.example.com/OpenAI`)
+  - Required: `ROCPD_LLM_PRIVATE_MODEL` or `--llm-private-model`
+  - Optional: `ROCPD_LLM_PRIVATE_API_KEY` (default: `"dummy"` for header-authenticated servers)
+  - Optional: `ROCPD_LLM_PRIVATE_HEADERS` — JSON object of extra request headers;
+    must be a JSON object (`{...}`), not an array or scalar — a `ValueError` is raised
+    if the parsed value is not a dict; the `user` header is auto-set to `os.getlogin()`
+    unless already provided
+  - Optional: `ROCPD_LLM_PRIVATE_VERIFY_SSL=0` — disable SSL certificate verification (requires `httpx`)
+
+  ```bash
+  export ROCPD_LLM_PRIVATE_URL="https://llm-api.example.com/OpenAI"
+  export ROCPD_LLM_PRIVATE_HEADERS='{"Ocp-Apim-Subscription-Key": "abc123", "api-version": "preview"}'
+  rocpd analyze -i output.db --llm private --llm-private-model gpt-4o
+  ```
+
+- **Local Ollama**
+  - Provider: `--llm-local ollama`
+  - Env var: `ROCPD_LLM_LOCAL_URL` (default: `http://localhost:11434/v1`)
+  - Env var: `ROCPD_LLM_LOCAL_MODEL` (default: `codellama:13b`)
+
+**Override the model at runtime** (anthropic/openai providers):
+
+```bash
+export ROCPD_LLM_MODEL="claude-opus-4-6"  # Use a different Anthropic model
+export ROCPD_LLM_MODEL="gpt-4o"           # Use a
different OpenAI model
+```
+
+### Custom Prompts
+
+Guide the LLM with specific questions:
+
+```python
+result = analyze_database(
+    Path("output.db"),
+    enable_llm=True,
+    llm_provider="anthropic",
+    custom_prompt="Why is my convolution kernel slow? Focus on memory access patterns."
+)
+
+print(result.llm_enhanced_explanation)
+```
+
+### Data Sanitization
+
+When LLM mode is enabled, sensitive data is automatically redacted:
+
+| Data Type | Original | Sanitized |
+|-----------|----------|-----------|
+| Kernel names | `conv2d_forward_kernel` | `[KERNEL_1]` |
+| Grid sizes | `[256, 256, 1]` | `[GRID_SIZE]` |
+| Workgroup sizes | `[256, 1, 1]` | `[WORKGROUP_SIZE]` |
+| File paths | `/home/user/app.cpp` | `[REDACTED]` |
+
+**Preserved Data** (aggregated/classified):
+- Bottleneck classifications (compute-bound, memory-bound)
+- Aggregated metrics (time percentages, utilization %)
+- GPU architecture (gfx908, gfx90a, gfx942, gfx950, gfx1030, gfx1100)
+
+---
+
+## Error Handling
+
+### Exception Hierarchy
+
+```python
+AnalysisError (base)
+├── DatabaseNotFoundError
+├── DatabaseCorruptedError
+├── MissingDataError
+├── UnsupportedGPUError
+├── LLMAuthenticationError
+├── LLMRateLimitError
+├── ReferenceGuideNotFoundError
+├── SourceDirectoryNotFoundError  # analyze_source(): directory doesn't exist
+└── SourceAnalysisError           # analyze_source(): error during scanning
+```
+
+### Example Error Handling
+
+```python
+from rocpd.ai_analysis import (
+    analyze_database,
+    DatabaseNotFoundError,
+    MissingDataError,
+    LLMAuthenticationError
+)
+from pathlib import Path
+
+try:
+    result = analyze_database(
+        Path("output.db"),
+        enable_llm=True,
+        llm_provider="anthropic"
+    )
+
+except DatabaseNotFoundError as e:
+    print(f"Database not found: {e}")
+
+except MissingDataError as e:
+    print(f"Missing data: {e}")
+    print(f"Missing tables: {e.missing_tables}")
+    print("Suggestion: Collect additional profiling data")
+
+except
LLMAuthenticationError as e: + print(f"LLM authentication failed: {e}") + print("Check your API key and environment variables") + +except Exception as e: + print(f"Unexpected error: {e}") +``` + +### Graceful Degradation + +**Authentication and rate-limit errors propagate** โ€” if `enable_llm=True` and your key is +invalid or exhausted, `LLMAuthenticationError` / `LLMRateLimitError` will be raised so you +know immediately rather than silently getting local-only results. + +Other transient LLM failures (network timeouts, unexpected API errors) produce a warning +and fall back to local-only results without raising: + +```python +try: + result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic" + ) +except LLMAuthenticationError: + print("Invalid API key โ€” check ANTHROPIC_API_KEY") + raise + +# If a transient error occurred, llm_enhanced_explanation will be None +if result.llm_enhanced_explanation: + print("LLM enhancement available") +else: + print("Local-only analysis (LLM enhancement failed or disabled)") + +# Check warnings for details on any transient failure +for warning in result.warnings: + print(f"โš ๏ธ {warning.message}") +``` + +--- + +## Integration Examples + +### Optiq Integration + +```python +# Optiq UI integration example +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +def load_trace_with_ai_insights(trace_file_path: str): + """ + Optiq function to load trace and get AI insights. 
+ """ + result = analyze_database(Path(trace_file_path)) + + # Extract insights for UI + insights = { + "summary": result.summary.overall_assessment, + "bottleneck": result.summary.primary_bottleneck, + "confidence": result.summary.confidence, + "top_recommendations": [ + { + "title": rec.title, + "description": rec.description, + "impact": rec.estimated_impact, + "priority": rec.priority + } + for rec in result.recommendations.high_priority[:3] + ], + "execution_breakdown": { + "kernel_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_pct": result.execution_breakdown.memcpy_time_pct, + "overhead_pct": result.execution_breakdown.api_overhead_pct + } + } + + return insights + +# Usage in Optiq +insights = load_trace_with_ai_insights("/path/to/output.db") +display_ai_panel(insights) +``` + +### Automated Analysis Pipeline + +```python +from rocpd.ai_analysis import analyze_database, get_recommendations +from pathlib import Path +import sys + +def automated_analysis_pipeline(trace_files: list[Path]): + """ + Analyze multiple trace files and generate reports. 
+ """ + for trace_file in trace_files: + print(f"Analyzing {trace_file}...") + + try: + # Analyze + result = analyze_database( + trace_file, + enable_llm=True, + llm_provider="anthropic" + ) + + # Generate markdown report + report_path = trace_file.with_suffix(".md") + report_path.write_text(result.to_markdown()) + print(f" โœ… Report saved: {report_path}") + + # Check for high-priority issues + high_priority = result.recommendations.high_priority + if high_priority: + print(f" ๐Ÿ”ด {len(high_priority)} high-priority issues found") + for rec in high_priority: + print(f" - {rec.title}") + + except Exception as e: + print(f" โŒ Analysis failed: {e}") + +# Run pipeline +trace_files = list(Path("./traces").glob("*.db")) +automated_analysis_pipeline(trace_files) +``` + +### Batch Comparison + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path +import pandas as pd + +def compare_traces(baseline_path: Path, optimized_path: Path): + """ + Compare baseline vs optimized traces. 
+ """ + baseline = analyze_database(baseline_path) + optimized = analyze_database(optimized_path) + + # Build comparison dataframe + comparison = pd.DataFrame({ + "Metric": [ + "Kernel Time %", + "Memory Copy %", + "API Overhead %", + "Primary Bottleneck", + "Confidence" + ], + "Baseline": [ + f"{baseline.execution_breakdown.kernel_time_pct:.1f}%", + f"{baseline.execution_breakdown.memcpy_time_pct:.1f}%", + f"{baseline.execution_breakdown.api_overhead_pct:.1f}%", + baseline.summary.primary_bottleneck, + f"{baseline.summary.confidence:.0%}" + ], + "Optimized": [ + f"{optimized.execution_breakdown.kernel_time_pct:.1f}%", + f"{optimized.execution_breakdown.memcpy_time_pct:.1f}%", + f"{optimized.execution_breakdown.api_overhead_pct:.1f}%", + optimized.summary.primary_bottleneck, + f"{optimized.summary.confidence:.0%}" + ] + }) + + print(comparison.to_markdown(index=False)) + +# Usage +compare_traces(Path("baseline.db"), Path("optimized.db")) +``` + +--- + +## See Also + +- [LLM Reference Guide Documentation](LLM_REFERENCE_GUIDE.md) - How to customize LLM behavior +- [CLI Documentation](../README.md) - Using `rocpd analyze` command +- [rocprofiler-sdk Documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/) + +--- + +### `LLMConversation` โ€” Persistent Multi-Turn Streaming Session + +`LLMConversation` provides a stateful multi-turn LLM session with streaming output, +automatic compaction, and disk archiving. It is used internally by `InteractiveSession` +and is also available as a public API for custom workflows. 
+ +```python +from rocpd.ai_analysis import LLMConversation + +conv = LLMConversation( + provider="anthropic", # "anthropic" | "openai" | "private" | "local" + api_key=None, # or pass directly; falls back to env vars + model=None, # or override default model + compact_every=10, # compact history every N turns (default 10) + keep_recent_turns=6, # keep this many turns after compaction + history_path=None, # optional Path for JSONL disk archive +) + +# Set the system prompt once (include the reference guide / "fence" here) +from rocpd.ai_analysis import load_reference_guide +conv.initialize("You are an AMD GPU expert.\n\n" + load_reference_guide()) + +# Stream a response token-by-token +response = conv.send( + "What is the bottleneck in this trace?", + on_token=lambda t: print(t, end="", flush=True), +) + +# Serialize / restore across sessions +state = conv.to_dict() # does NOT include api_key +conv2 = LLMConversation.from_dict(state, api_key="sk-ant-...") +``` + +**Constructor parameters:** + +| Parameter | Default | Description | +|---|---|---| +| `provider` | โ€” | `"anthropic"`, `"openai"`, `"private"`, or `"local"` | +| `api_key` | `None` | API key; falls back to `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` / `ROCPD_LLM_PRIVATE_API_KEY` | +| `model` | `None` | Model override; falls back to `ROCPD_LLM_MODEL` then built-in default | +| `compact_every` | `10` | Trigger LLM-based history compaction every N turns | +| `keep_recent_turns` | `6` | Number of recent turns preserved verbatim after compaction | +| `history_path` | `None` | JSONL file path for append-only message archive | + +**Methods:** + +- `initialize(system_prompt: str)` โ€” Set system prompt (call once before `send()`) +- `send(user_message, *, max_tokens=4096, on_token=None) -> str` โ€” Append user turn, stream response +- `to_dict() -> dict` โ€” Serialize state (api_key excluded) +- `from_dict(d, *, api_key=None, model=None) -> LLMConversation` โ€” Restore from serialized state + +**Properties:** 
`turn_count: int`, `messages: List[dict]` + +--- + +### `load_reference_guide()` โ€” Load the LLM Fence + +Returns the full content of the LLM reference guide (the "fence") as a string. +Useful when building a custom system prompt for `LLMConversation.initialize()`. + +```python +from rocpd.ai_analysis import load_reference_guide + +guide = load_reference_guide() +# guide is the full markdown text of share/llm-reference-guide.md + +conv.initialize("You are an expert AMD GPU engineer.\n\n" + guide) +``` + +The guide is loaded from (in order): +1. `ROCPD_LLM_REFERENCE_GUIDE` environment variable path +2. Module-relative `share/llm-reference-guide.md` +3. `/opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md` + +--- + +### Context-Aware LLM Guide Loading + +`LLMAnalyzer` accepts an optional `AnalysisContext` to reduce the reference guide +tokens sent per call. Build the context from already-computed analysis results: + +```python +from rocpd.ai_analysis import AnalysisContext +from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + +ctx = AnalysisContext( + tier=2, # 0=source-only, 1=trace, 2=counters + has_counters=True, + bottleneck_type="compute", # triggers compiler section + custom_prompt="why is my kernel slow?", +) + +analyzer = LLMAnalyzer(provider="anthropic", api_key="...", verbose=True) +result = analyzer.analyze_with_llm(data, context=ctx) +``` + +When `context=None` (default), the full guide is used โ€” backward compatible. + +Token savings by scenario: +- Tier 1 trace-only: ~47% fewer tokens +- Tier 0 source-only: ~51% fewer tokens +- Tier 2 with latency bottleneck: ~18% fewer tokens + +See `docs/LLM_GUIDE_SECTIONS.md` for the full tag vocabulary and how to add +new sections or tags. 
+ +--- + +## Support + +For issues, questions, or feature requests: +- File an issue on GitHub +- See [CONTRIBUTING.md](../CONTRIBUTING.md) +- ROCm documentation: https://rocm.docs.amd.com/ + +--- + +## Bug Fixes & Behavioral Changes + +This section documents behavioral changes made during code review that affect +how callers interact with the API. Changes are grouped by category. + +### LLM Layer + +**`LLMAnalyzer()` construction no longer raises `LLMAuthenticationError`** + +Previously, constructing `LLMAnalyzer(provider="anthropic")` without setting +`ANTHROPIC_API_KEY` would raise `LLMAuthenticationError` immediately. This blocked +use cases where the analyzer is constructed ahead of time and the API key is +supplied later (e.g., via a configuration reload). + +The key validation is now **deferred** โ€” `LLMAuthenticationError` is raised only +when an actual API call is made (`analyze_with_llm()`, `_call_anthropic()`, etc.). +Construction always succeeds as long as `provider` is valid. + +```python +# This now works even without an API key set +from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer +analyzer = LLMAnalyzer(provider="anthropic") # no longer raises + +# The error fires here instead, when the call is actually made +import os +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." # set key before calling +result = analyzer.analyze_with_llm(data) +``` + +**`LLMAnalyzer(model=...)` is now honored** + +Previously, the `model` parameter was stored but the `ROCPD_LLM_MODEL` environment +variable was checked first at call time, silently overriding any explicit `model=` +argument. The priority is now: + +1. `model=` constructor argument (highest priority) +2. `ROCPD_LLM_MODEL` environment variable +3. 
Built-in default (`DEFAULT_ANTHROPIC_MODEL` or `DEFAULT_OPENAI_MODEL`) + +**`analyze_source()` now passes `AnalysisContext(tier=0)` to the LLM automatically** + +When `enable_llm=True`, `analyze_source()` constructs an `AnalysisContext(tier=0, +custom_prompt=...)` and passes it to `analyze_source_with_llm()`. This ensures the +LLM reference guide is filtered to Tier 0-relevant sections (reducing token cost by +~51%) and that compiler optimization guidance is included. + +Callers who create `LLMAnalyzer` directly and call `analyze_source_with_llm()` +should also pass `context=AnalysisContext(tier=0)` for the same benefit. + +**Timeout parameter added to all LLM API calls** + +All Anthropic and OpenAI API calls now include `timeout=120` (seconds). Previously, +LLM calls could hang indefinitely on slow or unavailable network connections. If the +call takes longer than 120 seconds a network timeout exception is raised and wrapped +as a non-fatal warning (local analysis continues). + +### Output & Serialization + +**`AnalysisResult.to_json()` now raises `RuntimeError` when `_raw` is absent** + +Previously, calling `to_json()` on an `AnalysisResult` constructed manually (not via +`analyze_database()`) would silently return non-schema-conformant JSON โ€” a plain +`asdict()` serialization missing `schema_version`, `hotspots`, and other required +fields. + +It now raises `RuntimeError("Raw analysis data not available. ...")` immediately, +making the problem visible. Use `to_dict()` for non-schema-conformant dict output, +or use `analyze_database()` (which populates `_raw`) to get schema-conformant JSON. + +```python +# Manual construction โ€” to_json() now raises: +result = AnalysisResult(...) 
+result.to_json() # raises RuntimeError โ€” use to_dict() instead +result.to_dict() # works โ€” returns plain asdict() dict + +# Via analyze_database() โ€” to_json() works: +result = analyze_database(Path("output.db")) +result.to_json() # works โ€” schema-conformant, schema_version="0.1.0" +``` + +**`analyze_memory_copies()` bandwidth now uses actual transfer sizes** + +Previously the `size` column in the `memory_copies` table was not reliably +populated and bandwidth calculations returned 0. The column is now read and +`bandwidth_bytes_per_sec` (and `bandwidth_gbps`) are computed from real transfer +sizes when available. The "Low memory bandwidth" recommendation (< 10 GB/s threshold) +can now fire based on actual measurements. + +### Analysis Correctness + +**`overhead_percent` is now guaranteed to be โ‰ฅ 0** + +In some trace databases where kernel + memcpy time slightly exceeds the computed +total runtime (due to timestamp rounding), `overhead_percent` could be negative. +`compute_time_breakdown()` now applies `max(0.0, raw_overhead_pct)` before +returning the result. The field is always non-negative in the output. + +**Bottleneck classification no longer triggers `compute` from `has_counters` alone** + +Previously, the `_build_summary()` bottleneck classifier in `api.py` could produce +`primary_bottleneck="compute"` based on `kernel_pct > 70 AND has_counters=True`, +even when `kernel_pct` was well below 70%. The condition now uses the correct +threshold check: `kernel_pct > 70` is evaluated first, then `has_counters` is used +only to raise the confidence from 0.60 to 0.80 โ€” not to change the bottleneck type. + +**`analyze_source_code()` raises `SourceDirectoryNotFoundError` for missing directories** + +The `analyze_source_code()` function in `analyze.py` (CLI path) now raises +`SourceDirectoryNotFoundError` (not a generic `Exception`) when the `source_dir` +argument does not exist or is not a directory. 
This matches the behavior of the +Python API's `analyze_source()`. + +### Interactive Session (LLM Providers) + +**`"private"` provider now correctly routed in `_apply_suggestions_via_llm` and `_llm_rewrite_file`** + +Previously, both `InteractiveSession._apply_suggestions_via_llm` and +`WorkflowSession._llm_rewrite_file` dispatched any unrecognized provider to +`_call_local()` (Ollama). This caused the `"private"` provider to attempt a connection +to `http://localhost:11434/v1` and fail with a connection error instead of calling the +configured enterprise server. + +Both methods now explicitly handle `"private"` by routing to `_call_private()`. + +**`InteractiveSession` uses `LLMConversation` for persistent multi-turn context** + +The previous `SessionContext` dataclass (compact per-session summary: analyses, suggestions, commands) +has been replaced by a persistent `LLMConversation` object that holds the full message history. +All LLM calls within a session (`[o]`, `[a]` annotations, code rewrites) share the same conversation +so the LLM accumulates full context rather than receiving a condensed summary block. + +Key behavioral changes: +- History is compacted via `--llm-compact-every N` (default 10 turns) using an LLM-generated summary, not a rule-based snippet +- Source files are tracked in `_sent_source_files`; a file already sent in this session is not re-transmitted +- Conversation state (`conv.to_dict()`) is serialized into the session JSON on `[s]` save +- On `--resume-session`, the conversation is restored with `LLMConversation.from_dict()` + +### Source Scanner + +**`SourceAnalyzer` adds a truncation warning to `risk_areas` when `_MAX_FILES` is hit** + +When the number of source files in the scanned directory exceeds `_MAX_FILES` (500), +scanning stops early. The scanner now appends a human-readable warning to +`plan.risk_areas` noting how many files were skipped and suggesting a more targeted +`--source-dir` path. Previously the truncation was silent. 
+ +```python +plan = SourceAnalyzer(Path("./huge_repo")).analyze() +# If > 500 files found: +assert any("truncat" in r.lower() for r in plan.risk_areas) +``` + +### WorkflowSession โ€” Cycle Prevention and Tier 3 Escalation + +**Collection fingerprint expanded to all trace flags** + +The PMC-dedup logic that prevents infinite `[r] โ†’ re-profile โ†’ same INFO` loops now +fingerprints **all named trace collection flags** in addition to individual `--pmc` +counter names: + +``` +--sys-trace --hip-trace --kernel-trace --memory-copy-trace --hsa-trace --stats +``` + +Previously only `--pmc` counters were tracked, causing the session to cycle between +sys-trace and counter-collection runs indefinitely. + +**All-history comparison (not just last run)** + +The dedup check now compares the suggested command's fingerprint against the **union** +of everything collected across all previous trace runs: + +```python +already_fp = frozenset().union(*( + _collection_fingerprint(t.command) for t in self._state.trace_history +)) +if suggested_fp and suggested_fp.issubset(already_fp): + ai_rec_cmd = None # every suggested collection already performed +``` + +**Tier 3 escalation when Tier 1/2 exhausted** + +When all Tier 1/2 data has been collected and there is nothing new to suggest, Phase 5 +now shows a "go deeper" menu instead of just printing "stuck": + +- TraceLens interval + kernel-category analysis: already embedded in the Phase 4 report. 
+- `[d]` builds a PC sampling command and sets it as the Phase 7 option `[3]`: + ``` + ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling \ + -d /tmp/rocpd_trace/run_ -o results -- + ``` + +**ENV=VALUE command prefix support in Phase 3** + +`_phase3_run_profiler` now strips leading `KEY=VALUE` tokens from the command string +and injects them into the subprocess environment via `env=` rather than `shell=True`: + +``` +# This works directly โ€” ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 is extracted +# and added to the child process env before rocprofv3 is exec'd. +ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling ... +``` + +### WorkflowSession โ€” AI Edit Revert + +**`_revert_last_edit(failure_reason="")` helper** + +Restores the most recently AI-modified file from its `.bak` backup and removes the +`_EditRecord` from `edit_history`. Accepts an optional `failure_reason` string. + +When `failure_reason` is non-empty and an `LLMConversation` is active, two messages +are injected directly into the conversation history (a `user` message describing the +failure and an `assistant` acknowledgement): + +```python +feedback = ( + f"IMPORTANT: The previous code edit to {file} was reverted " + f"because it caused errors.\n\nFailure details:\n{failure_reason}\n\n" + f"Do NOT suggest the same pattern again..." +) +conv._messages.append({"role": "user", "content": feedback}) +conv._messages.append({"role": "assistant", "content": "Understood. ..."}) +``` + +This teaches the LLM what failed without requiring a separate API call. + +**Phase 3 (run profiler) โ€” `[v]` revert on profiling failure** + +When the profiling command exits non-zero and `edit_history` is non-empty, the retry +menu now includes `[v] Revert last AI edit and retry`. The exit code is passed as the +failure reason so the LLM conversation records it. 
+ +**Phase 6 (recompile wait) โ€” accumulate and pass error text** + +The recompile-wait loop accumulates all lines the user types as potential compilation +errors. When the user types `revert`/`undo`/`v`, the accumulated error lines are passed +to `_revert_last_edit(failure_reason=...)` so the LLM conversation receives the exact +compiler output. Example: + +``` +Changes applied. Please recompile your application. +Type 'done' when compiled, 'revert' to undo the AI edit, +'abort' to exit, or paste compilation errors. +> error: use of undeclared identifier '__builtin_amdgcn_sin' + Error noted. Type 'done' when fixed or 'revert' to undo the edit. +> revert + โœ“ Reverted: inefficient_demo.cpp (backup kept at inefficient_demo.cpp.bak) +``` + +### LLM Fence โ€” Invalid HIP Intrinsics + +**`__builtin_amdgcn_sin` / `__builtin_amdgcn_cos` added to the prohibited list** + +The reference guide now explicitly bans these non-existent HIP device functions with a +`โŒ` rule. The `__builtin_amdgcn_*` namespace covers hardware-specific operations +(lane reads, DS swizzle) but **not** transcendental math. Suggesting them causes: + +``` +error: use of undeclared identifier '__builtin_amdgcn_sin' +``` + +The guide documents the correct HIP math API: use `sinf()`, `cosf()`, `sqrtf()`, etc. +โ€” amdclang++ maps these to OCML hardware-optimized implementations automatically. + + +### WorkflowSession โ€” Phase 1b Quick Workload Analysis + +**New pre-Phase-2 step selects the best starter profiling command** + +Before presenting the profiling command to the user in Phase 2, `WorkflowSession` now +runs `_phase1b_quick_workload_analysis()` which combines two analysis paths: + +**1. 
App-command heuristics (`_classify_app_command`)** + +Inspects the binary name and arguments to detect workload type: + +| Detected workload | `workload_type` | Extra flags added | +|---|---|---| +| Python + ML framework (torch/jax/tf/paddle) | `python_ml` | `--hip-trace` | +| Python + LLM inference (vllm/llama/gpt/โ€ฆ) | `llm_inference` | `--hip-trace` | +| Python without ML framework | `python_generic` | `--hip-trace` | +| Compiled HIP/ROCm binary | `hip_compute` | none | +| MPI/Slurm launcher | `mpi_multi` | warning only | + +Multi-process patterns (torchrun, DDP, DeepSpeed, NCCL) trigger a warning about +worker-process GPU kernel capture limitations regardless of workload type. + +**2. Tier 0 source analysis** + +If `--source-dir` paths are provided, `SourceAnalyzer.analyze()` is called on the +first path. The flags from `plan.suggested_first_command` (the highest-priority +recommendation) replace the heuristic flags. The `-d ` and `-o ` components +are updated to a fresh timestamped directory before the command is shown. + +**Precedence and fallback:** + +``` +Source analysis flags > Heuristic extra flags > default set +(if source dir given) (always appended) (--sys-trace --kernel-trace + --memory-copy-trace --stats) +``` + +**Return value:** The method returns the full suggested command string. `run()` falls +back to `_build_profiling_command()` (pure default) only if the method returns `None`, +which only happens if both paths raise exceptions. + +### --resume-session Scope (InteractiveSession only) + +`--resume-session` restores a previously saved `InteractiveSession` by ID. It applies +**only** to the menu-driven `InteractiveSession` (triggered by +`rocpd analyze -i db.db --interactive` **without** a `""` string). + +`WorkflowSession` (7-phase workflow, triggered by `rocpd analyze --interactive ""`) +starts fresh each invocation. It does not support session resume. + +**How resume works:** + +1. 
The session ID (format: `YYYY-MM-DD_HH-MM-SS_<app>`) is passed to + `InteractiveSession(resume_session_id=...)`. +2. `_init_session(resume_id)` loads the session JSON from `~/.rocpd/sessions/`. +3. `_restore_or_create_conv(loaded)` reconstructs the `LLMConversation` from the + serialized `loaded.conversation` dict via `LLMConversation.from_dict()`. +4. `_sent_source_files` is restored from `loaded.sent_source_files`. + +**Auto-detect (no `--resume-session` needed):** `_init_session` also calls +`self._store.find_by_source_dir(self._source_dir)` and, if matching sessions exist, +prompts the user to choose one. This means repeat invocations against the same +`--source-dir` will automatically offer resume without needing the session ID. + +**Session ID discovery:** + +```bash +ls ~/.rocpd/sessions/*.json | xargs -I{} python3 -c \ + "import json; d=json.load(open('{}'));print(d['session_id'],'|',d['source_dir'])" +``` + +--- + +### WorkflowSession — Session Checkpoints + +Each AI source-file edit creates a git-worktree checkpoint so the user can roll back to +any prior state and blacklist approaches that caused regressions. + +#### Overview + +``` +Phase 6 AI edit + └─► git commit all modified files + └─► git update-ref refs/rocpd/<session-id>/cp-N (GC-pinned ref, not a branch) + └─► git worktree add --detach ~/.rocpd/sessions/<session-id>/cp-N + └─► CheckpointRecord appended to WorkflowState.checkpoints + ├── cp_id, commit_hash, ref_name, worktree_path + ├── files_modified, file_snapshots (full file contents for offline restore) + ├── run_index ← set in Phase 3 after profiling succeeds + ├── performance_delta_pct ← set in Phase 4 after analysis history appended + └── blacklisted, blacklist_category, blacklist_description +``` + +When the session exits (normally or via Ctrl+C), `_teardown_checkpoints()` removes all +worktrees. Refs (`refs/rocpd/…`) are kept so the commits survive GC until the user +explicitly runs a cleanup command. 
+ +#### Dataclasses + +**`CheckpointRecord`** (in `interactive.py`): + +| Field | Type | Description | +|---|---|---| +| `cp_id` | `int` | Sequential checkpoint index (0-based) | +| `commit_hash` | `str` | Full git commit SHA | +| `ref_name` | `str` | `refs/rocpd/<session-id>/cp-<N>` | +| `worktree_path` | `str` | Absolute path to the detached worktree | +| `timestamp` | `str` | ISO-8601 timestamp | +| `files_modified` | `List[str]` | Repo-relative paths of files in this edit batch | +| `edit_summary` | `str` | First non-blank line of the LLM suggestion (≤80 chars) | +| `file_snapshots` | `Dict[str, str]` | Full file contents keyed by relative path | +| `run_index` | `Optional[int]` | Which trace run followed this edit (set in Phase 3) | +| `performance_delta_pct` | `Optional[float]` | Runtime change % vs prior run (set in Phase 4) | +| `blacklisted` | `bool` | Whether this approach has been blacklisted | +| `blacklist_category` | `str` | Equal to `edit_summary` (used for deduplication) | +| `blacklist_description` | `str` | Human-readable description injected into LLM prompt | + +**`WorkflowState` additions:** + +| Field | Type | Description | +|---|---|---| +| `repo_root` | `str` | Absolute path to git repo root (empty when no git) | +| `baseline_commit` | `str` | HEAD at session start — rollback target `cp_id=-1` | +| `checkpoints` | `List[CheckpointRecord]` | All checkpoints in this session | +| `active_checkpoint` | `Optional[int]` | Currently restored checkpoint (or `None`) | +| `blacklisted_approaches` | `List[str]` | Persistent list of blacklist descriptions; **not truncated by rollback** | + +#### GitCheckpointManager + +All git operations are isolated in `GitCheckpointManager`: + +```python +gcm = GitCheckpointManager(repo_root="/path/to/repo", session_id="2026-03-13_myapp") + +# Detect repo (static — does not require a known repo_root) +repo_root = GitCheckpointManager.detect_repo(cwd="/path/to/project") + +# Core checkpoint operations +hash_ = 
gcm.commit_files(files=["src/kernel.cpp"], message="rocpd: checkpoint 0") +gcm.tag_checkpoint(commit_hash=hash_, cp_id=0) # creates refs/rocpd/.../cp-0 +gcm.add_worktree(commit_hash=hash_, cp_id=0) # git worktree add --detach +gcm.remove_worktree(worktree_path="/path/to/wt") + +# Introspection +gcm.get_head() # current HEAD SHA +gcm.files_in_commit(commit_hash) # list of relative paths +gcm.list_worktrees() # all registered worktrees +gcm.restore_files_from_commit(commit_hash, files) # git checkout -- +``` + +`commit_files` uses `-c user.email=rocpd@local -c user.name=rocpd` overrides and +`--no-verify` to work in any git environment regardless of hooks or missing config. + +#### Rollback + +Triggered by `[b]` in the Phase 5 recommendations menu (shown only when checkpoints +exist). `_show_checkpoint_picker()` displays a table of all checkpoints with performance +delta and edit summary: + +``` + Checkpoints + โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + [-1] Baseline (no AI edits) + [ 0] Reduce memcpy by using zero-copy buffers Run #1 -12.3% + [ 1] Optimize wave occupancy via LDS padding Run #2 +4.1% โ† regression + [ 2] Unroll inner loop and vectorize memory accesses Run #3 -8.7% +``` + +Regression checkpoints (+delta) are flagged and the user is prompted to blacklist them +before the rollback is applied. The blacklist description is appended to +`WorkflowState.blacklisted_approaches` (never truncated by rollback) so future LLM +calls avoid the same approach. + +**Rollback strategy:** + +1. **git fast path**: `git checkout -- ` for each file in the target + checkpoint. Falls back to snapshot path on any `CheckpointError`. +2. **Snapshot fallback**: Writes `file_snapshots` contents directly. Works in any + environment including those where git is not available post-session-start. +3. 
**Baseline rollback** (`cp_id = -1`): Restores to `baseline_commit` via git, or + writes all accumulated snapshots in reverse order as a last resort. + +After rollback, `WorkflowState.checkpoints` is truncated to `checkpoints[:target+1]` +and `_save_session()` is called unconditionally. + +#### Blacklist Injection + +When `_build_blacklist_block()` returns a non-empty string, it is prepended to the LLM +suggestion prompt in Phase 6 before `_llm_rewrite_file()` is called: + +``` +# Blacklisted approaches (do NOT use these): + +- Reduce memcpy by using zero-copy buffers (caused +4.1% regression on run #2) +- ... +``` + +The blacklist is built from `WorkflowState.blacklisted_approaches` (persistent) so it +survives rollbacks that truncate the `checkpoints` list. Entries are deduplicated by +exact string match. + +#### Session lifecycle + +``` +WorkflowSession.run() + โ”œโ”€ Phase 1: validate sources + โ”œโ”€ _init_checkpoints() โ† detect git, record baseline (dirty tree OK) + โ”œโ”€ _prune_stale_worktrees() โ† remove orphaned worktrees with no session JSON + โ”œโ”€ Phase 1b โ€ฆ Phase 7 loop + โ””โ”€ finally: + _teardown_checkpoints() โ† remove all worktrees (refs kept for GC protection) + _save_session() +``` + +**Dirty working tree**: No issue. `commit_files` stages only the specific files modified +by each AI edit (`git add -- `), so other in-progress changes in the working tree +are never touched or included in checkpoint commits. + +**No-git graceful fallback**: When git is not detected or any checkpoint operation +fails, `self._gcm` is set to `None` and checkpoints are silently skipped. All other +session functionality continues normally. 
diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md new file mode 100644 index 00000000000..dafb96b300d --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md @@ -0,0 +1,112 @@ +# LLM Reference Guide โ€” Section Tagging System + +This document explains how `llm-reference-guide.md` is split into context-aware +sections to reduce per-call LLM token cost by 18โ€“51%. + +--- + +## Why Section Filtering Exists + +The full reference guide is ~72 KB / ~18,000 tokens. Sending it with every LLM call +is wasteful: a Tier 1 trace-only analysis does not need the Hardware Counter Reference +(7,979 chars) or the Compiler Optimization section (10,873 chars). + +Context-aware filtering selects only the sections relevant to the current analysis, +saving 18โ€“51% of token cost depending on the scenario: + +| Scenario | Approx. 
token saving | +|----------|---------------------| +| Tier 1 trace-only | ~47% | +| Tier 0 source-only | ~51% | +| Tier 1 + compiler trigger | ~32% | +| Tier 2 full analysis (no compiler) | ~18% | + +--- + +## Tag Vocabulary + +Each `## Section` in `llm-reference-guide.md` carries a tag comment on the line +immediately after the heading: + +```text +## Hardware Counter Reference +<!-- tags: tier2 --> +``` + +| Tag | Meaning | Sections | +|-----|---------|----------| +| `always` | Included in every LLM call | Critical rules, role, output format, what not to do, summary | +| `tier1` | Trace data available (Tier 1+) | Profiling workflow, tool reference, common bottleneck types | +| `tier2` | PMC counter data available | Hardware counters, memory hierarchy, perf models, GPU specs, AMD optimizations | +| `compiler` | Compiler optimization is relevant | Compiler Optimization Flags and Options | +| `source` | Reserved for future Tier 0 guidance | *(empty — no sections use this tag yet)* | + +**Fallback rule:** A section with **no tag comment** is always included. This +ensures user-added sections are never silently dropped. + +--- + +## `AnalysisContext` Fields + +`AnalysisContext` (importable from `rocpd.ai_analysis`) tells the system which tags +to activate: + +| Field | Type | Controls | +|-------|------|---------| +| `tier` | `int` | `0` → source + compiler tags; `1` → tier1; `≥2` → tier1 + tier2 | +| `has_counters` | `bool` | `True` adds `tier2` even when `tier == 1` | +| `bottleneck_type` | `str \| None` | `"compute"` or `"memory"` adds `compiler` tag | +| `gpu_arch` | `str \| None` | Reserved for future per-GPU section filtering | +| `custom_prompt` | `str \| None` | Adds `compiler` tag when it contains compiler/flag/build/compile | + +--- + +## How to Add a New Section + +1. Add the section to `llm-reference-guide.md` with a `## <Section Name>` heading. +2. On the **very next line** (line 1 of the section body), add: + ``` + <!-- tags: TAG --> + ``` + where TAG is one of the known vocabulary values above. 
+3. If unsure which tag to use, use `always` โ€” the section will always be included. +4. Run the integrity tests to confirm no typos: + ```bash + PYTHONPATH=/opt/rocm-7.2.0/lib/python3.12/site-packages \ + pytest --noconftest tests/rocprofv3/rocpd/test_guide_filter_standalone.py \ + -v -k "TestGuideIntegrity" + ``` + +--- + +## How to Add a New Tag + +1. Add the new tag to `_select_tags()` in `source/lib/python/rocpd/ai_analysis/llm_analyzer.py`. +2. Add the tag to `TestGuideIntegrity.KNOWN_TAGS` in `tests/rocprofv3/rocpd/test_guide_filter_standalone.py`. +3. Add a row to the tag vocabulary table above. +4. Update the `AnalysisContext` docstring if the new tag is driven by a new field. + +--- + +## Debugging: Verbose Mode + +Pass `verbose=True` to `LLMAnalyzer` to see which sections were loaded: + +```python +analyzer = LLMAnalyzer(provider="anthropic", api_key="...", verbose=True) +analyzer.analyze_with_llm(data, context=ctx) +# โ†’ [LLM] Guide filtered: 34800 / 72513 chars (48% of full guide) +``` + +--- + +## Tag Selection Logic (for reference) + +``` +tier == 0 โ†’ always + source + compiler +tier >= 1 โ†’ always + tier1 +has_counters == True OR tier >= 2 โ†’ also adds tier2 +bottleneck_type in compute/memory โ†’ also adds compiler +custom_prompt contains + compiler/flag/build/compile โ†’ also adds compiler +``` diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md new file mode 100644 index 00000000000..45ab0d1294f --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md @@ -0,0 +1,2103 @@ +# LLM Reference Guide for GPU Performance Analysis + +**Purpose**: This document is provided to the LLM as context when analyzing GPU profiling data. It defines boundaries, provides reference information, and guides analysis quality. 
+ +--- + +## CRITICAL REQUIREMENTS + + +### Hardware Counter Per-Block Limits โ€” MUST NOT EXCEED + +**THIS IS A HARD HARDWARE CONSTRAINT.** Violating it crashes rocprofv3 (error code 38: "Request exceeds the capabilities of the hardware to collect"). + +AMD GPUs limit how many counters from the **same hardware block** can be collected in one rocprofv3 pass. The block name is the prefix before the first `_` in the counter name (e.g., `SQ_WAVES` โ†’ block `SQ`). + +**Safe per-block limits** (conservative defaults โ€” actual limits vary by GPU): +| Block | Examples | Limit per pass | +|-------|----------|----------------| +| `SQ` | `SQ_WAVES`, `SQ_INSTS_VALU`, `SQ_INSTS_VMEM_RD`, `SQ_INSTS_VMEM_WR`, `SQ_INSTS_LDS` | 4 (up to 8 on gfx942) | +| `GRBM` | `GRBM_COUNT`, `GRBM_GUI_ACTIVE` | 4 | +| `FETCH` | `FETCH_SIZE` | 2 | +| `WRITE` | `WRITE_SIZE` | 2 | +| `TCP`, `TCC`, `TA`, `TD` | Cache counters | 4 | + +**Mandatory rules for `--pmc` commands you generate:** +1. Count counters **per block separately** โ€” do NOT count across different blocks together +2. If any block would exceed its limit โ†’ split into **multiple separate rocprofv3 runs** (pass 1, pass 2, โ€ฆ) each with its own `-d`/`-o` +3. Different blocks CAN coexist in the same pass as long as each block's count stays within its limit +4. `rocprof-compute` is EXEMPT โ€” it handles multi-pass collection internally + +**ADDITIONAL RULE โ€” FETCH_SIZE and WRITE_SIZE are TCC-derived metrics**: +These are NOT raw hardware counters. rocprofv3 expands them internally to TCC hardware counters: +- `FETCH_SIZE` โ†’ `TCC_BUBBLE + TCC_EA0_RDREQ + GRBM_GUI_ACTIVE` (TCC block, 32 instances) +- `WRITE_SIZE` โ†’ `TCC_EA0_WRREQ + TCC_EA0_WRREQ_64B` (TCC block, 32 instances) +**Rules**: +1. FETCH_SIZE and WRITE_SIZE MUST each be in their own dedicated pass. +2. They cannot share a pass with each other (combined 5 TCC hardware counters > limit). +3. They cannot share a pass with SQ counters. 
+ +**Examples:** +```bash +# โœ… SAFE โ€” 3 passes: SQ/GRBM | FETCH_SIZE | WRITE_SIZE +# Pass 1: GPU utilization + occupancy (raw hardware counters) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_VMEM_RD \ + SQ_INSTS_VMEM_WR SQ_INSTS_LDS -d ./out -o baseline_pass1 -- ./app +# Pass 2: HBM read bandwidth +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o baseline_pass2 -- ./app +# Pass 3: HBM write bandwidth +rocprofv3 --sys-trace --pmc WRITE_SIZE -d ./out -o baseline_pass3 -- ./app + +# โœ… SAFE โ€” GRBMร—2 + SQร—1 only (no bandwidth needed) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -d ./out -o p1 -- ./app + +# โœ… SAFE โ€” FETCH_SIZE alone (3 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o fetch -- ./app + +# โŒ UNSAFE โ€” FETCH_SIZE + WRITE_SIZE in same pass โ†’ 5 TCC hardware counters โ†’ error 38 +rocprofv3 --sys-trace --pmc FETCH_SIZE WRITE_SIZE -d ./out -o bw -- ./app # โ† WILL CRASH + +# โŒ UNSAFE โ€” SQ counters + FETCH_SIZE/WRITE_SIZE in the same pass โ†’ error code 38 +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_VMEM_RD \ + SQ_INSTS_VMEM_WR SQ_INSTS_LDS FETCH_SIZE WRITE_SIZE -- ./app # โ† WILL CRASH +``` + +--- + +### Profiling Tools - Use Current Generation Tools ONLY + +**IMPORTANT**: All profiling commands MUST use current generation ROCm profiling tools, NOT deprecated tools. 
+ +โŒ **NEVER use**: `rocprof`, `rocprof-v2`, or any other deprecated variant +โœ… **ALWAYS use**: `rocprofv3`, `rocprof-compute`, or `rocprof-sys` (also known as `rocsys`) + +**Tool Name Aliases**: +- `rocprof-sys` = `rocsys` (same tool, different names in documentation) +- `rocprofv3` is built on ROCprofiler-SDK โ€” the current generation, context-based profiling API +- `rocprof` / `rocprofv2` are deprecated; only critical bug fixes, EOL after ROCm 6.5 + +**Documentation References**: +- rocprofv3: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/ +- rocprof-compute: https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/ +- rocprof-sys (rocsys): https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/ + +--- + +## Output Format Requirements + + +Your response MUST be plain text with the following structure: + +1. **No markdown headers** - Use plain text, not ### or ## or # +2. **Consistent section structure**: + - Executive Summary (2-3 sentences) + - Key Findings (bullet points) + - Detailed Analysis (by bottleneck type) + - Actionable Recommendations (prioritized list) + - Next Profiling Steps (specific rocprofv3 commands) + +3. **Format each recommendation as**: + ``` + Priority: [HIGH/MEDIUM/LOW] + Issue: [description with metrics] + Suggestion: [what to do] + Actionable Steps: + - [specific step 1] + - [specific step 2] + Expected Impact: [quantified improvement estimate] + ``` + +4. **All profiling commands must use rocprofv3, rocprof-compute, or rocprof-sys** + +--- + +## Recommended AMD Profiling Workflow (3 Steps) + + +AMD's recommended performance analysis process is a progressive three-step methodology. +Never suggest all three steps when earlier data already exists โ€” only recommend the +**incremental next step** based on what is already in the database. + +### Step 1 โ€” System-Level Timeline (rocprof-sys) + +**Purpose**: Get a holistic view of the application before diving into kernel details. 
+Reveals CPU-GPU interaction, kernel call frequency, memory copy overhead, and identifies +the hottest kernels worth investigating. + +```bash +# Instrument binary once +rocprof-sys-instrument -- ./app + +# Run to collect timeline +rocprof-sys-run -- ./app.inst + +# For MPI applications +mpirun -n rocprof-sys-run -- ./mpi_app.inst +``` + +**What you learn**: +- Which kernels dominate execution time (Pareto/80-20 rule applies) +- CPU-GPU overlap (or lack thereof) +- Synchronization points and idle gaps +- Memory copy patterns and timing relative to kernels + +**When to recommend Step 1**: User has NO trace data yet. This is always the starting point. + +--- + +### Step 2 โ€” Kernel Hardware Counters (rocprofv3) + +**Purpose**: Collect hardware performance counters on the hot kernels identified in Step 1. +Enables bottleneck classification (compute-bound vs memory-bound), occupancy measurement, +and bandwidth utilization. + +โš ๏ธ **HARDWARE COUNTER LIMIT โ€” CRITICAL**: AMD GPUs limit how many counters from the same +hardware block can be collected in a single rocprofv3 pass. Exceeding this limit causes +rocprofv3 to abort with **error code 38**: "Request exceeds the capabilities of the hardware +to collect". See "Hardware Counter Collection Limits" section below before suggesting commands. 
+ +```bash +# Pass 1: GPU utilization + wave occupancy (GRBM block: 2, SQ block: 1 โ€” safe) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \ + -d ./counters -o pass1 -- ./app + +# Pass 2: HBM read bandwidth (FETCH_SIZE alone โ€” 3 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc FETCH_SIZE \ + -d ./counters -o pass2 -- ./app + +# Pass 3: HBM write bandwidth (WRITE_SIZE alone โ€” 2 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc WRITE_SIZE \ + -d ./counters -o pass3 -- ./app + +# Scope to the hot kernel (add --kernel-names to any pass) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \ + --kernel-names "hotKernelName" -d ./counters -o pass1 -- ./app +``` + +**What you learn**: +- GPU utilization (`GRBM_GUI_ACTIVE / GRBM_COUNT`) โ€” from Pass 1 +- Wave occupancy (`SQ_WAVES / (kernel_duration / clock_period)`) โ€” from Pass 1 +- HBM read bandwidth (FETCH_SIZE ร— 1024 / duration) โ€” from Pass 2 +- HBM write bandwidth (WRITE_SIZE ร— 1024 / duration) โ€” from Pass 3 +- Classify as compute-bound, memory-bound, or latency-bound + +**When to recommend Step 2**: User has timeline data (Step 1) but no hardware counters. +Also appropriate as a direct first step when the hottest kernel is already known. + +--- + +### Step 3 โ€” Deep Kernel Analysis (rocprof-compute) + +**Purpose**: Comprehensive hardware counter characterization with automated roofline model, +memory hierarchy breakdown (L1/L2/HBM), instruction mix, and compute unit metrics. 
+ +```bash +# Full characterization of all kernels +rocprof-compute profile -- ./app + +# Scope to the specific hot kernel +rocprof-compute profile --kernel "hotKernelName" -- ./app + +# Roofline only (faster) +rocprof-compute profile --roof-only -- ./app + +# Analyze results +rocprof-compute analyze --path ./workloads/mydata/MI300X +``` + +**What you learn**: +- Roofline model placement (how far from hardware limits) +- L1/L2/HBM cache hit rates and effective bandwidth +- Instruction mix: VALU, MFMA, VMEM, SALU, LDS +- Branch divergence, stalls, pipeline efficiency +- Per-block hardware counters (SQ, TCP, TA, TD, TCC, etc.) + +**When to recommend Step 3**: User has counter data (Step 2) and needs to understand +exactly what is limiting the hottest kernels. This is the most detailed and highest-overhead step. + +--- + +### Amdahl's Law โ€” Prioritization Principle + +Always apply Amdahl's Law: the maximum speedup from optimizing a kernel is bounded by +its fraction of total execution time. A kernel taking 5% of total time cannot give more +than 1/(1-0.05) = 1.05x speedup no matter how much it is optimized. + +**Rule**: Focus recommendations on kernels that represent >10% of total execution time. +Do not recommend deep analysis of kernels taking <5% of total time unless specifically asked. + +--- + +## Profiling Tool Reference + + +### 1. 
**rocprofv3** - Primary kernel-level profiler + +**Documentation**: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html + +**Purpose**: Kernel hotspots, hardware counters, API tracing, PC sampling, memory operations + +**Tracing Modes**: +```bash +# System trace (recommended for general profiling) +rocprofv3 --sys-trace -- ./app + +# Runtime trace (HIP runtime, markers, RCCL, memory ops, kernels) +rocprofv3 --runtime-trace -- ./app + +# HIP API tracing +rocprofv3 --hip-trace -- ./app +rocprofv3 --hip-runtime-trace -- ./app # Runtime APIs only +rocprofv3 --hip-compiler-trace -- ./app # Compiler-generated code + +# HSA API tracing +rocprofv3 --hsa-trace -- ./app # All HSA +rocprofv3 --hsa-core-trace -- ./app # Core API (hsa_*) +rocprofv3 --hsa-amd-trace -- ./app # AMD extensions + +# Specialized tracing +rocprofv3 --kernel-trace -- ./app # Kernel dispatches only +rocprofv3 --memory-copy-trace -- ./app # Memory copy operations +rocprofv3 --marker-trace -- ./app # ROCTx markers +rocprofv3 --kokkos-trace -- ./app # Kokkos instrumentation +rocprofv3 --rccl-trace -- ./app # RCCL communication +``` + +**Hardware Counter Collection**: +```bash +# List available counters +rocprofv3 --list-avail + +# Safe: 3 counters from 2 blocks (GRBMร—2 + SQร—1) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -- ./app + +# When collecting more counters, split into separate passes โ€” see limits below +# Pass 1: utilization + occupancy +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -d ./out -o pass1 -- ./app +# Pass 2: HBM read bandwidth (FETCH_SIZE alone โ€” must not share pass with WRITE_SIZE) +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o pass2 -- ./app +# Pass 3: HBM write bandwidth (WRITE_SIZE alone) +rocprofv3 --sys-trace --pmc WRITE_SIZE -d ./out -o pass3 -- ./app +``` + +**Hardware Counter Collection Limits** โš ๏ธ: + +AMD GPUs have a per-block limit on how many counters can be collected simultaneously. 
+The "block name" is the prefix before the first `_` in the counter name: + +| Block | Example counters | Safe per-pass limit | +|-------|-----------------|---------------------| +| `SQ` | `SQ_WAVES`, `SQ_INSTS_VALU`, `SQ_INSTS_VMEM_RD`, `SQ_INSTS_VMEM_WR`, `SQ_INSTS_LDS`, `SQ_WAVE_CYCLES` | 4 (up to 8 on gfx942) | +| `GRBM` | `GRBM_COUNT`, `GRBM_GUI_ACTIVE` | 4 | +| `FETCH` | `FETCH_SIZE` | 2 | +| `WRITE` | `WRITE_SIZE` | 2 | +| `TCP` | `TCP_TOTAL_CACHE_ACCESSES` | 4 | +| `TCC` | `TCC_*` | 4 | + +**Rules for generating `--pmc` commands**: +1. Count counters **per block** โ€” NEVER exceed the block's per-pass limit +2. If a query needs more counters than one block allows โ†’ split into **multiple separate `rocprofv3` runs** (pass 1, pass 2, ...) +3. Counters from DIFFERENT blocks may coexist in the same pass as long as each block's count stays within its limit +4. Each pass must be a complete, standalone rocprofv3 command with its own `-d`/`-o` +5. `rocprof-compute` is EXEMPT from this rule โ€” it handles multi-pass internally + +**Discovering available counters and limits:** +```bash +# List ALL available hardware counters on the current system / GPU model +rocprofv3 --list-avail + +# Filter by block name +rocprofv3 --list-avail | grep "^SQ" +rocprofv3 --list-avail | grep "^GRBM" +``` +Use `--list-avail` to: +- Verify a counter name is valid on this specific GPU before suggesting it +- Determine which hardware block a counter belongs to (for pass planning) +- Discover GPU-specific counters not covered in documentation +When unsure, recommend: `rocprofv3 --list-avail | grep ` + +**Kernel Filtering**: +```bash +# Filter by kernel name (exact match or substring) +rocprofv3 --kernel-names "myKernel" --pmc SQ_WAVES -- ./app + +# Filter by kernel name regex +rocprofv3 --kernel-include-regex "matmul.*" --pmc SQ_WAVES -- ./app +rocprofv3 --kernel-exclude-regex "small.*" --pmc SQ_WAVES -- ./app + +# Filter by iteration range +rocprofv3 --kernel-iteration-range [10-20] 
--pmc SQ_WAVES -- ./app +``` + +**PC Sampling (Beta)**: +```bash +# Enable PC sampling (requires environment variable) +export ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 +rocprofv3 --pc-sampling-beta-enabled --pc-sampling-unit instructions -- ./app +rocprofv3 --pc-sampling-unit cycles --pc-sampling-method stochastic -- ./app +``` + +**Output Control**: +```bash +# Specify output format (default: rocpd database) +rocprofv3 --sys-trace -f rocpd -- ./app # SQLite database +rocprofv3 --sys-trace -f json -- ./app # JSON format +rocprofv3 --sys-trace -f pftrace -- ./app # Perfetto trace +rocprofv3 --sys-trace -f csv -- ./app # CSV format +rocprofv3 --sys-trace -f rocpd json pftrace -- ./app # Multiple formats + +# Specify output location +rocprofv3 --sys-trace -o myoutput -d ./results -- ./app + +# Generate summary statistics +rocprofv3 --sys-trace --stats -S -- ./app # Display summary +rocprofv3 --sys-trace -D -- ./app # Per-domain summary +``` + +**Kernel Naming**: +```bash +# Use ROCTx markers to rename kernels +rocprofv3 --kernel-rename --marker-trace -- ./app + +# Show mangled names +rocprofv3 -M --sys-trace -- ./app + +# Truncate long kernel names +rocprofv3 -T --sys-trace -- ./app +``` + +**Process Attachment**: +```bash +# Attach to running process +rocprofv3 --attach --sys-trace -- ./monitor_command +``` + +**Use when**: Getting per-kernel hardware counters, API traces, or scoping data collection +to specific hot kernels. This is the workhorse for Steps 2 data collection. + +--- + +### 2. 
**rocprof-compute** - Detailed compute workload analyzer + +**Purpose**: Roofline analysis, memory hierarchy metrics, detailed compute characterization + +**Basic Commands**: +```bash +# Profile application and generate reports +rocprof-compute profile -- ./app + +# Profile with specific output directory +rocprof-compute profile -n mydata -- ./app + +# Filter by specific kernel +rocprof-compute profile -k "myKernel" -- ./app + +# Filter by dispatch ID +rocprof-compute profile -d 42 -- ./app + +# Collect specific metric blocks +rocprof-compute profile -b SQ -b TCP -- ./app + +# Roofline analysis only +rocprof-compute profile --roof-only -- ./app + +# Analyze existing data +rocprof-compute analyze --path ./workloads/mydata/MI300X + +# List available metrics for architecture +rocprof-compute --list-metrics gfx942 + +# List available analysis blocks +rocprof-compute --list-blocks gfx942 +``` + +**Use when**: Need the full roofline model, detailed memory hierarchy analysis (L1/L2/HBM +hit rates), or comprehensive compute characterization beyond what rocprofv3 counters provide. + +**Key Features**: +- Automated roofline analysis (achievable peaks, not just theoretical) +- Memory bandwidth and cache hierarchy metrics +- Compute unit utilization +- Hardware block-level counters (SQ, TCP, TA, TD, TCC, etc.) +- GUI analysis mode: `rocprof-compute analyze --path --gui` + +--- + +### 3. **rocprof-sys** (also known as **rocsys**) - System-wide profiler + +**Note**: This tool may be referred to as either `rocprof-sys` or `rocsys` in documentation +and outputs. Both names refer to the same tool (ROCm Systems Profiler). + +**Purpose**: Call-stack sampling, binary instrumentation, multi-process tracing, CPU-GPU +interaction. This is the recommended FIRST STEP in any profiling session. 
+ +**Basic Commands**: +```bash +# Statistical call-stack sampling (no recompilation needed) +rocprof-sys-sample -- ./app + +# Binary instrumentation workflow +rocprof-sys-instrument -- ./app # Creates ./app.inst +rocprof-sys-run -- ./app.inst # Run instrumented binary + +# MPI application profiling +mpirun -n 4 rocprof-sys-run -- ./mpi_app.inst + +# Python script profiling +rocprof-sys-python -- ./script.py + +# Generate configuration file +rocprof-sys-avail -G ~/.rocprof-sys.cfg + +# View available configuration options +rocprof-sys-avail -S + +# View hardware counters +rocprof-sys-avail -H + +# View available components +rocprof-sys-avail -C +``` + +**Key Environment Variables**: +```bash +# Enable tracing +export ROCPROFSYS_TRACE=ON + +# Enable sampling +export ROCPROFSYS_USE_SAMPLING=ON + +# Set sampling frequency (Hz) +export ROCPROFSYS_SAMPLING_FREQ=100 + +# Enable GPU hardware counters +export ROCPROFSYS_USE_ROCPROFILER=ON +export ROCPROFSYS_ROCM_EVENTS="SQ_WAVES,GRBM_COUNT" + +# Enable Kokkos instrumentation +export ROCPROFSYS_USE_KOKKOSP=ON + +# Enable OpenMP instrumentation +export ROCPROFSYS_USE_OMPT=ON + +# Network interface for MPI network counter collection (ROCm 6.4+) +export ROCPROFSYS_NETWORK_INTERFACE=hsn0 +``` + +**Multi-GPU and MPI Guidance**: +- Use `rocprof-sys` for multi-process and multi-node profiling โ€” it is MPI-aware +- Communication-computation overlap visible in the Perfetto timeline +- Network performance profiling available with `ROCPROFSYS_PAPI_EVENTS` (ROCm 6.4+) +- Rank-level breakdown: each MPI rank produces separate output files + +**Use when**: Getting a system-level timeline view, profiling MPI/multi-process workloads, +or understanding CPU-GPU interaction. Always the recommended first step. 
+ +**Key Features**: +- Statistical sampling (minimal overhead) +- Binary instrumentation (function-level detail) +- MPI-aware profiling +- Perfetto trace output (view at ui.perfetto.dev) +- Python profiling support +- Kokkos and OpenMP instrumentation + +--- + +### Tool Selection Decision Tree + +**Q: Do you need a system-level timeline and hotspot identification first?** +โ†’ YES: Use `rocprof-sys` (Step 1) + +**Q: Do you need per-kernel hardware counters or API traces?** +โ†’ YES: Use `rocprofv3` (Step 2) + +**Q: Do you need full roofline analysis or memory hierarchy characterization?** +โ†’ YES: Use `rocprof-compute` (Step 3) + +**Q: Do you need call-stack sampling or MPI multi-process profiling?** +โ†’ YES: Use `rocprof-sys` + +**Q: Do you need system-wide CPU-GPU interaction analysis?** +โ†’ YES: Use `rocprof-sys` + +--- + +**Why these tools**: These are the current generation profilers built on ROCprofiler-SDK, +with context-based service configuration, true multi-tool support, improved thread safety, +and full CDNA 3 (gfx942) support. The older `rocprof` and `rocprofv2` are deprecated. + +--- + +## Your Role + + +You are an expert GPU performance analyst specializing in AMD GPUs. Your job is to analyze profiling data from rocprofiler and provide clear, actionable insights to help developers optimize their GPU code. + +--- + +## Available Data Sources + + +You have access to the following data from the rocpd database: + +### Trace Data (Always Available) +- **Kernel Dispatches**: Kernel names, execution times, grid/workgroup sizes, register usage +- **Memory Copies**: H2D/D2H/D2D transfers, bytes, durations, bandwidth +- **API Calls**: HIP/HSA API function calls, timestamps, durations +- **GPU Information**: GPU name, architecture (gfx90a, gfx942), compute units, memory size + +### Hardware Counters (When Collected with `--pmc`) +- **Performance Counters**: GRBM_COUNT, GRBM_GUI_ACTIVE, SQ_WAVES, FETCH_SIZE, WRITE_SIZE, etc. 
+- **Enables**: Roofline analysis, Speed-of-Light metrics, bottleneck classification + +### PC Sampling Data (When Available) +- **Instruction Samples**: Program counter samples, instruction addresses +- **Enables**: Instruction-level hotspot identification within a kernel โ€” reveals which + instructions (load, ALU, branch, LDS) consume the most cycles + +--- + +## AMD GPU Hardware Specifications + + +### MI355X (gfx950) +- **Architecture**: CDNA 4 +- **Compute Units**: 256 (8 XCDs ร— 32 CUs per XCD) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 32 (โ†’ up to 128 waves per CU at โ‰ค16 VGPRs) +- **Peak FP64**: 78.6 TFLOPS +- **Peak FP32**: 157.3 TFLOPS +- **Peak FP16/BF16 (matrix)**: 5,033 TFLOPS +- **Peak FP8 (matrix)**: 10,066 TOPS +- **Memory**: 288 GB HBM3E +- **Memory Bandwidth**: 8 TB/s +- **L2 Cache**: ~256 MB (across all XCDs) +- **L1 Cache (per CU)**: 32 KB +- **LDS per CU**: 160 KB (**2.5ร— increase from CDNA3**) +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 (ArchVGPR) + 256 (AccVGPR) = 512 total +- **Ridge Point**: ~20 FLOP/Byte (157.3 TFLOPS FP32 / 8 TB/s) +- **CDNA4 key changes**: 160 KiB LDS (vs 64 KiB CDNA3), native FP4/FP6 support, doubled per-CU matrix throughput, new LDS read-with-transpose instructions + +### MI350X (gfx950) +- **Architecture**: CDNA 4 (same die as MI355X, lower TDP) +- **Compute Units**: 256 +- **Peak FP64**: 72.1 TFLOPS +- **Peak FP32**: 144.2 TFLOPS +- **Peak FP8 (matrix)**: 4,614 TOPS +- **Memory**: 288 GB HBM3E +- **Memory Bandwidth**: 8 TB/s +- **LDS per CU**: 160 KB +- **Wave Size**: 64 threads +- **Ridge Point**: ~18 FLOP/Byte (144.2 TFLOPS / 8 TB/s) + +### MI325X (gfx942) +- **Architecture**: CDNA 3 (memory-upgraded MI300X โ€” identical compute) +- **Compute Units**: 304 (same die as MI300X) +- **Peak FP64**: 81.7 TFLOPS +- **Peak FP32**: 163.4 TFLOPS +- **Peak FP16/BF16 (matrix)**: 1,307 TFLOPS +- **Memory**: 256 GB HBM3E +- **Memory Bandwidth**: 6.0 TB/s +- **L2 Cache**: 256 MB +- **L1 Cache (per 
CU)**: 32 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Ridge Point**: ~27 FLOP/Byte (163.4 TFLOPS / 6.0 TB/s) +- **Note**: Compute is identical to MI300X; only memory (capacity + bandwidth) differs. + +### MI300X (gfx942) +- **Architecture**: CDNA 3 +- **Compute Units**: 304 (8 XCDs ร— 38 CUs per XCD) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 32 (โ†’ 128 waves per CU maximum at โ‰ค16 VGPRs) +- **Peak FP64**: 81.7 TFLOPS +- **Peak FP32**: 163.4 TFLOPS +- **Peak FP16/BF16 (matrix)**: 1,307 TFLOPS +- **Peak FP8 (matrix)**: 2,615 TOPS +- **Memory**: 192 GB HBM3 +- **Memory Bandwidth**: 5.3 TB/s +- **L2 Cache**: 256 MB +- **L1 Cache (per CU)**: 32 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 (ArchVGPR) + 256 (AccVGPR) = 512 total +- **VGPR allocation granularity**: 16 VGPRs per block +- **Ridge Point**: ~31 FLOP/Byte (163.4 TFLOPS FP32 / 5.3 TB/s) + +### MI300A (gfx942) +- **Architecture**: CDNA 3 (APU โ€” CPU + GPU on unified HBM) +- **GPU Compute Units**: 228 (6 XCDs ร— 38 CUs per XCD) +- **CPU**: 24 Zen 4 cores (3 CPU chiplets) +- **Peak GPU FP64**: ~68 TFLOPS (estimated, proportional to 228/304 CUs vs MI300X) +- **Peak GPU FP32**: ~136 TFLOPS +- **Memory**: 128 GB HBM3 (unified CPU+GPU address space) +- **Memory Bandwidth**: 5.3 TB/s +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Key difference**: CPU and GPU share the same HBM pool; no PCIe transfers needed for host-device data. GPU has fewer CUs than MI300X but eliminates H2D/D2H latency. 
+ +### MI250X (gfx90a) +- **Architecture**: CDNA 2 +- **Compute Units**: 110 per GCD (220 total, 2 GCDs per card) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 8 (โ†’ 32 waves per CU maximum) +- **Peak FP64**: 47.9 TFLOPS per GCD (95.7 TFLOPS total) +- **Peak FP32**: 47.9 TFLOPS per GCD +- **Peak FP16/BF16**: 383 TFLOPS per GCD +- **Memory**: 128 GB HBM2e +- **Memory Bandwidth**: 3.2 TB/s +- **L2 Cache**: 8 MB per GCD +- **L1 Cache (per CU)**: 16 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 +- **Ridge Point**: ~15 FLOP/Byte (47.9 TFLOPS / 3.2 TB/s per GCD) + +### MI100 (gfx908) +- **Architecture**: CDNA 1 +- **Compute Units**: 120 +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 8 (โ†’ 32 waves per CU maximum) +- **Peak FP64**: 11.5 TFLOPS +- **Peak FP32**: 23.1 TFLOPS +- **Peak FP16**: 184.6 TFLOPS +- **Memory**: 32 GB HBM2 +- **Memory Bandwidth**: 1.23 TB/s +- **L2 Cache**: 8 MB +- **L1 Cache (per CU)**: 16 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 +- **Ridge Point**: ~19 FLOP/Byte (23.1 TFLOPS / 1.23 TB/s) + +### RDNA3 โ€” RX 7900 XTX (gfx1100) +- **Architecture**: RDNA3 (consumer/workstation GPU โ€” not datacenter/HPC) +- **Compute Units**: 96 +- **Peak FP32**: 61.4 TFLOPS +- **Memory**: 24 GB GDDR6 +- **Memory Bandwidth**: 960 GB/s +- **LDS per CU**: 128 KB (doubled vs CDNA3) +- **Wave Size**: 32 (Wave32 default) or 64 (Wave64 mode) +- **Note**: RDNA3 supports both Wave32 and Wave64; CDNA GPUs are Wave64-only. 
+- **Ridge Point**: ~64 FLOP/Byte (61.4 TFLOPS / 960 GB/s) + +### RDNA2 โ€” RX 6900 XT (gfx1030) +- **Architecture**: RDNA2 (consumer GPU โ€” not datacenter/HPC) +- **Compute Units**: 80 +- **Peak FP32**: 23.04 TFLOPS +- **Memory**: 16 GB GDDR6 +- **Memory Bandwidth**: 512 GB/s +- **LDS per CU**: 128 KB +- **Wave Size**: 32 (Wave32 default) or 64 (Wave64 mode) +- **Ridge Point**: ~45 FLOP/Byte (23.04 TFLOPS / 512 GB/s) + +### VGPR โ†’ Occupancy Table (CDNA3 / MI300X โ€” 512 VGPRs per EU) + +CDNA3 (MI300X, MI325X) allocates VGPRs in **blocks of 16**. The formula is: +``` +waves_per_EU = floor(512 / (ceil(VGPRs / 16) ร— 16)) +``` + +| VGPRs per work-item | Waves per EU (SIMD) | Notes | +|---|---|---| +| 1โ€“16 | 32 | Full occupancy | +| 17โ€“32 | 16 | 50% occupancy | +| 33โ€“64 | 8 | 25% occupancy | +| 65โ€“128 | 4 | 12.5% occupancy | +| 129โ€“176 | 3 | | +| 177โ€“256 | 2 | | +| 257โ€“512 | 1 | Minimum occupancy | + +**Occupancy goal for MI300X**: โ‰ฅ 1,024 total workgroups in the launch grid to keep all 304 CUs busy. +**VGPR reduction tip**: Reducing VGPRs from 33 to 32 doubles waves per EU (8 โ†’ 16). Always target the next lower 16-VGPR boundary. +**AccVGPR note**: MFMA accumulation registers (AccVGPRs) are a separate pool โ€” each pool has the same 16-VGPR granularity. + +--- + +## Hardware Counter Reference + + +### GRBM Block (Global Register Bus Manager โ€” system-wide) + +The GRBM block provides **system-wide** GPU activity metrics (not per-CU). 
+ +| Counter | What it measures | Use | +|---|---|---| +| `GRBM_COUNT` | Free-running GPU clock cycles (always incrementing) | Denominator for all utilization ratios | +| `GRBM_GUI_ACTIVE` | Cycles where the GPU pipeline is not idle | `GPU utilization = GRBM_GUI_ACTIVE / GRBM_COUNT` | +| `GRBM_CP_BUSY` | Cycles any Command Processor (CP) block is busy | Detect command-processor bottlenecks | +| `GRBM_SPI_BUSY` | Cycles any Shader Processor Input (SPI) is busy | Wave dispatch saturation | +| `GRBM_TA_BUSY` | Cycles any Texture Addressing (TA) unit is busy | Address-calculation load | +| `GRBM_TC_BUSY` | Cycles any Texture Cache block is busy | Cache load | +| `GRBM_CPC_BUSY` | Cycles the Command Processor-Compute (CPC) is busy | Compute dispatch overhead | +| `GRBM_CPF_BUSY` | Cycles the Command Processor-Fetcher (CPF) is busy | Fetch pipeline load | +| `GRBM_UTCL2_BUSY` | Cycles the Unified Translation Cache L2 is busy | TLB pressure | +| `GRBM_EA_BUSY` | Cycles the Efficiency Arbiter is busy | HBM arbitration load | + +**Key derived metric**: +``` +GPU Utilization (%) = 100 ร— GRBM_GUI_ACTIVE / GRBM_COUNT +``` + +### SQ Block (Shader Sequencer โ€” per compute unit) + +| Counter | What it measures | +|---|---| +| `SQ_WAVES` | Wavefronts dispatched to sequencers | +| `SQ_BUSY_CYCLES` | Cycles the SQ reports being busy | +| `SQ_INSTS` | Total instructions issued | +| `SQ_INSTS_VALU` | VALU instructions issued (**includes MFMA** as subset) | +| `SQ_INSTS_MFMA` | MFMA (Matrix FMA) instructions issued | +| `SQ_INSTS_VMEM_RD` | Vector memory read instructions (including flat) | +| `SQ_INSTS_VMEM_WR` | Vector memory write instructions (including flat) | +| `SQ_INSTS_SALU` | Scalar ALU instructions issued | +| `SQ_INSTS_LDS` | LDS instructions issued | +| `SQ_LEVEL_WAVES` | In-flight waves at sampling time (level counter) | +| `SQ_INST_LEVEL_VMEM` | In-flight vector memory instructions (level counter) | +| `SQ_INST_LEVEL_LDS` | In-flight LDS instructions (level counter) | 
+| `SQ_ACCUM_PREV_HIRES` | High-resolution level accumulator (see below) | + +**โš ๏ธ Level counter dependency โ€” `SQ_ACCUM_PREV_HIRES`**: +Level counters (`SQ_LEVEL_WAVES`, `SQ_INST_LEVEL_VMEM`, `SQ_INST_LEVEL_LDS`) report instantaneous snapshots. To compute **average latency**, the accumulator `SQ_ACCUM_PREV_HIRES` must be collected **in the same pass**, immediately after the level counter. + +``` +# Latency formulas (require same-pass collection): +Vector mem latency = SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM [cycles] +LDS latency = SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS [cycles] +Avg wave occupancy = SQ_ACCUM_PREV_HIRES / SQ_BUSY_CYCLES +``` + +**Note**: `rocprof-compute` handles this dependency automatically. + +### TCP Block (Texture Cache Per-CU โ€” Vector L1) + +Correct counter names for the L1 cache (per CU, instance index `[n]`): + +| Counter | What it measures | +|---|---| +| `TCP_TOTAL_ACCESSES[n]` | Total vector L1 accesses (reads + writes) | +| `TCP_TOTAL_READ[n]` | Total vector L1 read accesses | +| `TCP_TOTAL_WRITE[n]` | Total vector L1 write accesses | +| `TCP_TCC_READ_REQ[n]` | Read requests forwarded from L1 to L2 (L1 misses) | +| `TCP_TCC_WRITE_REQ[n]` | Write requests forwarded from L1 to L2 | + +**โš ๏ธ Common naming errors**: `TCP_TOTAL_CACHE_ACCESSES`, `TCP_TOTAL_HIT`, `TCP_TOTAL_MISS` are **not valid** AMD counter names. 
L1 miss rate is derived:
+```
+L1 miss rate = TCP_TCC_READ_REQ[n] / TCP_TOTAL_READ[n]
+```
+
+### TCC Block (Texture Cache Controller — L2 Cache)
+
+| Counter | What it measures | Notes |
+|---|---|---|
+| `TCC_HIT[n]` | L2 cache hits | |
+| `TCC_MISS[n]` | L2 cache misses | |
+| `TCC_READ[n]` | L2 read requests | |
+| `TCC_WRITE[n]` | L2 write requests | |
+| `TCC_EA_RDREQ[n]` | Read requests sent to HBM (**MI200 naming**) | 32- or 64-byte transactions |
+| `TCC_EA_WRREQ[n]` | Write requests sent to HBM (**MI200 naming**) | |
+| `TCC_EA0_RDREQ[n]` | Read requests sent to HBM (**MI300 naming**) | Same metric, MI300 prefix |
+| `TCC_EA0_WRREQ[n]` | Write requests sent to HBM (**MI300 naming**) | |
+
+**⚠️ MI200 vs MI300 naming**: Use `TCC_EA_*` for MI200 series (gfx90a); use `TCC_EA0_*` for MI300 series (gfx942). `rocprof-compute` abstracts this automatically.
+
+**L2 hit rate**:
+```
+L2 hit rate = TCC_HIT[n] / (TCC_HIT[n] + TCC_MISS[n])
+```
+
+### FETCH_SIZE and WRITE_SIZE — Derived Metrics (NOT raw hardware counters)
+
+`FETCH_SIZE` and `WRITE_SIZE` are **derived metrics** computed from TCC counters — they are not directly measured by a single hardware register.
+
+```
+FETCH_SIZE (KiB) ≈ sum(TCC_EA0_RDREQ[0..31]) × 32 bytes / 1024   [MI300]
+WRITE_SIZE (KiB) ≈ sum(TCC_EA0_WRREQ[0..31]) × 32 bytes / 1024   [MI300]
+
+HBM Read BW  = FETCH_SIZE × 1024 / kernel_duration_ns   [GB/s]
+HBM Write BW = WRITE_SIZE × 1024 / kernel_duration_ns   [GB/s]
+Total HBM BW = (FETCH_SIZE + WRITE_SIZE) × 1024 / duration_ns   [GB/s]
+```
+
+These measure **HBM traffic as seen from L2**: L2→HBM reads and L2→HBM writes. They include data for L2 misses, writebacks, and atomics. They do NOT include L1↔L2 traffic. 
+
+### Core Counters Summary Table
+
+| Counter | What it measures | Derived metric |
+|---|---|---|
+| `GRBM_COUNT` | Total GPU clock cycles | Denominator for utilization |
+| `GRBM_GUI_ACTIVE` | Cycles GPU pipeline active | `GPU util = GRBM_GUI_ACTIVE / GRBM_COUNT` |
+| `SQ_WAVES` | Cumulative wavefront dispatches (not instantaneous) | `Avg waves/CU ≈ SQ_WAVES / GRBM_COUNT` (time-averaged occupancy; max ~32 on CDNA3) |
+| `FETCH_SIZE` | KiB fetched from HBM (derived from TCC) | Read BW = `FETCH_SIZE × 1024 / duration_ns` GB/s |
+| `WRITE_SIZE` | KiB written to HBM (derived from TCC) | Write BW = `WRITE_SIZE × 1024 / duration_ns` GB/s |
+| `TCC_HIT[n]` | L2 cache hits | L2 hit rate = `TCC_HIT / (TCC_HIT + TCC_MISS)` |
+| `TCC_MISS[n]` | L2 cache misses | (used in hit rate formula above) |
+| `SQ_INSTS_VALU` | VALU instructions (includes MFMA) | Compute instruction rate |
+| `SQ_INSTS_MFMA` | MFMA matrix instructions | Matrix utilization rate |
+| `SQ_INSTS_VMEM_RD` | Vector memory reads | Memory instruction rate |
+| `SQ_INSTS_LDS` | LDS instructions | LDS utilization indicator |
+
+### Bandwidth Calculation Detail
+
+```
+HBM Read Bandwidth  = FETCH_SIZE (KiB) × 1024 / kernel_duration_ns   [GB/s]
+HBM Write Bandwidth = WRITE_SIZE (KiB) × 1024 / kernel_duration_ns   [GB/s]
+Total HBM Bandwidth = (FETCH_SIZE + WRITE_SIZE) × 1024 / duration_ns   [GB/s]
+
+Example (MI300X, peak 5,300 GB/s):
+  FETCH_SIZE = 500,000 KiB, duration = 10,000 ns:
+  Read BW = 500,000 × 1024 / 10,000 = 51,200 GB/s  (implausible → units error)
+  Correct check: confirm FETCH_SIZE is in KiB not raw cache-line count
+```
+
+### GPU Utilization Interpretation
+
+```
+GPU Utilization = GRBM_GUI_ACTIVE / GRBM_COUNT * 100%
+
+< 50%   → GPU is idle much of the time; likely launch overhead, CPU bottleneck,
+          or synchronization stalls. Investigate with rocprof-sys timeline.
+50–75%  → Moderate utilization; potential for overlap improvement. 
+> 75%   → Good utilization; focus analysis on per-kernel efficiency.
+```
+
+### Wave Occupancy Interpretation
+
+**SQ_WAVES is a cumulative counter** (total wavefront dispatches over the measurement window).
+**GRBM_COUNT** counts active clock cycles over the same window. Their ratio approximates
+average concurrent waves per CU over the active period:
+
+```
+Avg waves/CU ≈ SQ_WAVES / GRBM_COUNT
+
+Max waves per EU (SIMD): 8 on CDNA1/CDNA2 (MI100/MI200) and on CDNA3/CDNA4 (MI300+)
+Theoretical max waves per CU (CDNA3): 8 waves/EU × 4 EUs = up to 32 waves per CU
+
+Occupancy % = (Avg waves/CU / theoretical_max_waves_per_CU) * 100%
+            = (SQ_WAVES / GRBM_COUNT) / 32 * 100%   [CDNA3]
+
+Note: values of SQ_WAVES / GRBM_COUNT above 32 indicate a measurement or units error.
+
+< 25%   → Very low occupancy; VGPRs or LDS likely too high. High priority fix.
+25–50%  → Low-medium occupancy; room for improvement.
+50–75%  → Adequate; focus on other bottlenecks first.
+> 75%   → Good occupancy; diminishing returns from further improvement.
+```
+
+**CDNA3 occupancy interpretation note**: With 8 waves per EU × 4 EUs = 32 theoretical max,
+full occupancy requires low per-wave register usage (roughly ≤64 VGPRs per work-item with a
+512-entry VGPR file — confirm against the target's ISA guide). In practice, occupancy of
+8–16 waves per CU (25–50%) is typical for production kernels and may still be near-optimal
+if memory latency is well hidden.
+
+---
+
+## PC Sampling Interpretation
+
+
+PC sampling provides **instruction-level** insight into GPU kernel execution — the most detailed
+view available short of a full instruction trace. It answers: *which instructions consume the
+most cycles and why*.
+
+### What PC Sampling Data Contains
+
+Each sample is a stochastic hardware snapshot of the Program Counter (PC) taken at a
+configurable interval. 
Fields per sample: + +| Field | Description | +|---|---| +| `kernel_id` | Dispatch ID of the kernel being sampled | +| `wave_id` | Wave (wavefront) identifier within the CU | +| `hw_id` | Hardware slot ID (identifies SIMD / CU) | +| `exec_mask` | 64-bit mask โ€” which lanes were active | +| `sample_type` | `ISSUED`, `LATENCY`, or `INDETERMINATE` (see below) | +| `issue_reason` | Stall cause when `sample_type == LATENCY` | +| `pipeline` | Which execution pipeline (VALU, VMEM_TEX, LDS, MFMA, etc.) | +| `pc_offset` | Byte offset from kernel code object base โ€” maps to an ISA instruction | +| `timestamp` | GPU clock timestamp | + +**Collection command** (requires ROCm >= 7.0, CDNA3/CDNA4 GPU: gfx942 or gfx950): +```bash +export ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 +rocprofv3 --kernel-trace --output-format json \ + --pc-sampling-beta-enabled true \ + --pc-sampling-unit cycles \ + --pc-sampling-method stochastic \ + --pc-sampling-interval $((1024*1024)) \ + -- ./app +``` + +**Interval rules**: must be a power-of-2 between 2^8 (256) and 2^20 (1048576) cycles. +Shorter intervals โ†’ higher sample density but higher collection overhead. +Recommended default: `$((1024*1024))` (โ‰ˆ 1M cycles between samples) for low overhead. + +**Output format**: PC sampling data is currently only available in **JSON format** (not SQLite/rocpd). +When this tool receives PC sample data, it arrives as pre-aggregated statistics; raw per-sample +JSON files must be processed separately (e.g., with `pcsampling.py`). 
+ +--- + +### Three Sample Types (GFX9SampleResults) + +| Type | `wave_issued` | Meaning | Optimization relevance | +|---|---|---|---| +| `ISSUED` | 1 | Wave successfully issued an instruction this cycle | Counts toward useful work | +| `LATENCY` | 0 | Wave was ready but **stalled** โ€” see `issue_reason` | **Most actionable** | +| `INDETERMINATE` | 0 | Wave lost arbitration to another wave; both wanted to issue | Indicates resource contention | + +**Key rule from hardware**: When `wave_issued=1`, the `issue_reason` field is **undefined/noise** โ€” +do not interpret stall reasons for issued samples. Only `LATENCY` samples carry meaningful +`issue_reason` values. + +**Additional hardware quirk**: the destination instruction of a **taken branch** is blamed for a +`NO_INSTRUCTION_AVAILABLE` stall resulting from the branch's front-end bubble (not the branch +instruction itself). When you see high `NO_INSTRUCTION_AVAILABLE` counts at a specific PC, +check whether that address is the target of a frequently-taken branch. + +--- + +### Seven Execution Pipelines (GFX9Pipelines) + +| Pipeline | Instructions | Notes | +|---|---|---| +| `VALU` | Floating-point and integer arithmetic on all 64 lanes | The workhorse; VALU-bound โ†’ compute-bound | +| `MATRIX` (MFMA) | Matrix FMA instructions (`v_mfma_*`) | MI300X has 4 MFMA units per CU | +| `SCALAR` | Scalar ALU, scalar memory, branch instructions | Control flow and index computation | +| `VMEM_TEX` | Vector memory reads/writes, buffer, texture | Accesses go to HBM via L2/L1 (TEX pipeline) | +| `LDS` | Local Data Share reads/writes (`ds_read*`, `ds_write*`) | Shared memory within a workgroup | +| `FLAT` | Flat-addressing memory (`flat_load*`, `flat_store*`) | Generic pointer โ€” slower than typed VMEM or LDS | +| `MISC` | Barriers (`s_barrier`), messages (`s_sendmsg`), exports | Control/synchronization instructions | + +**FLAT vs VMEM**: Prefer `buffer_load`/`global_load` over `flat_load` when possible. 
+FLAT instructions add address-space disambiguation overhead and route through a slower path. +High FLAT samples in a kernel โ†’ the compiler could not prove the pointer targets device memory; +add `__restrict__` qualifiers or use typed pointer arguments. + +--- + +### Eight Stall Reasons (GFX9IssueReasons) for LATENCY Samples + +These apply only when `sample_type == LATENCY` (`wave_issued == 0`). + +| Stall Reason | Root Cause | Actionability | +|---|---|---| +| `NO_INSTRUCTION_AVAILABLE` | Instruction cache miss or front-end bubble (e.g., after a taken branch) | Indicates i-cache pressure or branch misprediction; usually not directly actionable | +| `ALU_DEPENDENCY` | Data hazard: wave waiting for a previous instruction's result. Also triggered by hardware-enforced interlocks (VALUโ†’LDS, VALUโ†’FLAT, VALUโ†’CBranch write-hazards) | Fix: reorder instructions to insert independent work between producer and consumer; software pipelining; increase ILP | +| `WAITCNT` | Wave hit an explicit `s_waitcnt` โ€” waiting for outstanding VMEM, LDS, or EXP operations to drain | Indicates insufficient memory-level parallelism; fix: issue more independent memory operations before the wait point; restructure access patterns | +| `INTERNAL_INSTRUCTION` | Hardware-injected stall (`s_sleep`, `s_setpc`, trap handler) | Usually not actionable | +| `BARRIER_WAIT` | Wave stalled at `s_barrier` / `__syncthreads()` โ€” other waves in the workgroup have not yet reached the barrier | Fix: balance work across all threads in the workgroup; reduce barrier frequency; check for divergent workloads | +| `ARBITER_NOT_WIN` | Wave was ready to issue but lost arbitration โ€” another wave was selected | Normal behavior at high occupancy; if dominant, may indicate scheduling imbalance across waves | +| `ARBITER_WIN_EX_STALL` | Wave **won** arbitration but the execution pipeline (VMEM, LDS, MFMA, etc.) is backed up | **Key bottleneck indicator**: the pipeline itself is the bottleneck. 
Fix depends on which pipeline (see interpretation below) | +| `OTHER_WAIT` / `NONE` | Miscellaneous or no stall (issued normally) | Not actionable | + +**Hardware-enforced interlocks (appear as `ALU_DEPENDENCY`)**: GFX9/CDNA hardware invisibly inserts +stall cycles between certain instruction pairs: +- VALU writes a VGPR โ†’ immediately followed by LDS instruction using that VGPR +- VALU writes a VGPR โ†’ immediately followed by FLAT instruction using that VGPR +- Scalar instruction writes SCC โ†’ immediately followed by `s_cbranch` reading SCC + +These produce `ALU_DEPENDENCY` stalls with `inst_type=NO_INST` (the hardware prevented issue +before the instruction could even be recognized). These are inherent pipeline constraints; mitigate +by inserting an independent instruction between the producer and consumer. + +--- + +### Interpreting PC Sample Reports + +When given PC sample data or aggregated sample statistics: + +**Step 1 โ€” Check overall ISSUED vs LATENCY ratio**: +- High LATENCY% (> 50% of all samples stalled): kernel is stall-dominated โ†’ examine `issue_reason` +- High ISSUED%: kernel is issuing well; bottleneck may be in throughput, not latency + +**Step 2 โ€” Diagnose by stall reason**: + +| Dominant stall pattern | Diagnosis | Recommended fix | +|---|---|---| +| `ALU_DEPENDENCY` โ€” VALU/MFMA pipeline | Long-latency chain in critical path (MFMA โ‰ˆ 64 cycles, VMEM โ‰ˆ 80โ€“200 cycles) | Software pipelining; reorder independent instructions; increase ILP | +| `WAITCNT` โ€” any pipeline | Insufficient memory-level parallelism; wave blocks waiting for memory | Issue more memory ops before the wait point; async prefetch patterns | +| `ARBITER_WIN_EX_STALL` โ€” VMEM_TEX pipeline | HBM bandwidth saturation or L1/L2 miss storms | Matches memory-bound classification; improve data locality, tiling, coalescing | +| `ARBITER_WIN_EX_STALL` โ€” LDS pipeline | LDS bank conflicts or LDS throughput limit | Check for 2-way/32-way bank conflicts; use XOR swizzling for 
b128 reads | +| `ARBITER_WIN_EX_STALL` โ€” MATRIX pipeline | MFMA units fully subscribed | Normal if MFMA utilization is intentionally 100%; otherwise increase tile size | +| `ARBITER_NOT_WIN` dominant | High-occupancy scheduling; many waves competing for same pipeline slot | Normal unless it prevents progress; may indicate over-occupancy reducing throughput | +| `BARRIER_WAIT` significant | Workgroup synchronization overhead | Reduce barrier calls; balance work distribution across threads | +| `NO_INSTRUCTION_AVAILABLE` dominant | Instruction cache pressure or frequent taken branches | Large kernels may overflow i-cache; check for hot branch targets | + +**Step 3 โ€” Examine hot PC offsets**: +- The most frequent PC offsets identify the *specific instructions* causing bottlenecks +- A PC offset with > 5% of all samples is a meaningful hotspot +- PC offsets < 1% of total samples are within statistical noise + +--- + +### Statistical Significance Rules + +- **Minimum sample count**: At least **1,000 total samples per kernel** for statistically reliable + stall-reason conclusions. Below 1,000 samples, treat results as directional only. +- **Hot PC threshold**: PC offsets representing < 1% of samples are noise; report offsets โ‰ฅ 2% +- **Interval trade-off**: shorter intervals increase density but add overhead that may perturb the + measurement. For production kernels, use interval โ‰ฅ 256K cycles; for fast micro-benchmarks + targeting specific instructions, 4Kโ€“64K cycles may be needed to gather enough samples. +- **Combining with Tier 1/2**: PC samples identify bottlenecks *within* a kernel; always cross-reference + with Tier 1 hotspot data to confirm the kernel is worth optimizing (Amdahl's Law applies here too). + +--- + +### Limitations (Always Disclose When Analyzing PC Samples) + +- PC sampling data is currently only available in **JSON format** (not SQLite/rocpd). 
This tool
+  receives pre-aggregated statistics — raw per-sample data is not embedded in the database.
+- Without code object (binary), exact ISA instruction text cannot be decoded. Report the PC offset
+  and advise the user to run `llvm-objdump` to decode it.
+- **Call-stack reconstruction** is not available in current rocprofv3 PC sampling.
+- Very short sampling intervals (< 256K cycles) cause measurable overhead that may alter
+  observed bottleneck ratios.
+- PC sampling requires a **CDNA3 or CDNA4 GPU** (gfx942 or gfx950) and **ROCm >= 7.0**.
+  On older hardware (MI200/MI100, gfx90a/gfx908), PC sampling is unavailable.
+
+---
+
+### ISA Inspection Commands
+
+When PC offset hotspots are identified, recommend these commands for the user to decode the
+specific instructions:
+
+```bash
+# Dump all offloaded code objects (lists all GPU kernels embedded in the binary)
+llvm-objdump --offloading ./app
+
+# Disassemble with source annotations (requires DWARF debug info — compile with -g)
+llvm-objdump -gd ./app.*-amdgcn-amd-amdhsa*
+
+# Then search for your kernel name and look up the PC offset
+# PC offset 0x1b1c → find the instruction at byte offset 0x1b1c in the kernel's code
+```
+
+**Note**: The `app.*-amdgcn-amd-amdhsa*` glob matches the offloaded code object embedded in the binary.
+Without `-g` (debug info), source line annotations are absent but ISA instructions are still visible.
+PC offsets in sample reports are byte offsets from the start of the kernel's code object.
+
+---
+
+## Memory Hierarchy
+
+
+AMD CDNA GPUs have a three-level memory hierarchy. Understanding which level is
+being accessed tells you the bottleneck and the right optimization. 
+ +``` +Thread โ†’ VGPR (registers) + โ†’ LDS (64 KB per CU on CDNA2/3; 160 KB per CU on CDNA4 โ€” shared within workgroup) + โ†’ L1 cache (per CU, 16โ€“32 KB, read-only for global memory) + โ†’ L2 cache (shared across CUs; 8 MB on MI250X, 256 MB on MI300X/MI325X/MI350X) + โ†’ HBM (main GPU memory; 1.23 TB/s on MI100 โ†’ 8 TB/s on MI350X) +``` + +### Cache Hit Rate Thresholds + +| Cache level | Good hit rate | Concern threshold | +|---|---|---| +| L1 (TCP) | > 80% | < 50% | +| L2 (TCC) | > 60% | < 40% | + +Low L2 hit rate with high FETCH_SIZE โ†’ working set exceeds L2; data is being fetched +from HBM on every access. Main fix: improve data locality or tiling. + +### LDS (Local Data Share) + +- **Capacity**: 64 KB per CU on CDNA1/CDNA2/CDNA3 (MI100/MI200/MI300 series) +- **Capacity**: **160 KB per CU on CDNA4** (MI350X/MI355X โ€” 2.5ร— increase) +- **Banks**: 32 banks; 32-way bank conflict possible if 32 threads access the same bank +- **Bank conflict detection**: use `SQ_INSTS_LDS` counter; rocprof-compute reports "LDS Bank Conflict Rate" +- **When to use LDS**: data accessed multiple times by threads in the same workgroup + (e.g., shared weights, partial sums in reductions, matrix tiles for MFMA, transpositions) +- **Occupancy impact (CDNA3, 64 KB)**: using >32 KB LDS per workgroup โ†’ max 2 workgroups/CU; + using all 64 KB โ†’ only 1 workgroup per CU regardless of VGPR count +- **Occupancy impact (CDNA4, 160 KB)**: using >80 KB LDS per workgroup โ†’ max 2 workgroups/CU; + full 160 KB โ†’ 1 workgroup per CU +- **128-bit LDS reads (ds_read_b128)**: maximize LDS bandwidth for MFMA tile loads, but + require XOR swizzling of the data layout to avoid 2-way bank conflicts (a default + consecutive-read layout causes bank conflicts with b128). Use `rocprof-compute` to check + the "LDS Bank Conflict Rate" โ€” unmitigated conflicts can reduce LDS bandwidth by up to 75%. + +--- + +## Performance Analysis Models + + +### 1. 
Roofline Model + +**Purpose**: Determine if a kernel is compute-bound or memory-bound. Plots achieved +performance (GFLOP/s) vs. arithmetic intensity (FLOP/Byte) against hardware limits. + +**Arithmetic Intensity (AI)**: FLOP/Byte +- **Memory-Bound**: AI < Ridge Point (kernel performance limited by memory bandwidth) +- **Compute-Bound**: AI > Ridge Point (kernel performance limited by compute throughput) +- **Balanced**: AI near Ridge Point + +**Ridge Point = Peak FP32 FLOPS / Peak HBM Bandwidth**: +- MI355X (gfx950): 157.3 TFLOPS / 8.0 TB/s โ‰ˆ **20 FLOP/Byte** +- MI350X (gfx950): 144.2 TFLOPS / 8.0 TB/s โ‰ˆ **18 FLOP/Byte** +- MI325X (gfx942): 163.4 TFLOPS / 6.0 TB/s โ‰ˆ **27 FLOP/Byte** +- MI300X (gfx942): 163.4 TFLOPS / 5.3 TB/s โ‰ˆ **31 FLOP/Byte** +- MI250X (gfx90a): 47.9 TFLOPS / 3.2 TB/s โ‰ˆ **15 FLOP/Byte** (per GCD) +- MI100 (gfx908): 23.1 TFLOPS / 1.23 TB/s โ‰ˆ **19 FLOP/Byte** + +**Important**: The roofline ceiling is the *achievable* hardware limit (accounting for +efficiency), not just the theoretical peak. A kernel already close to the achievable +ceiling needs a fundamentally different algorithm, not micro-optimizations. + +**Using rocprof-compute for automated roofline**: +```bash +rocprof-compute profile --roof-only -- ./app +``` + +### 2. Speed-of-Light (SOL) Analysis + +**Purpose**: Compare achieved performance to theoretical hardware peaks for each subsystem. 
+ +**Key Metrics**: +- **VALU Utilization**: % of peak Vector ALU throughput +- **MFMA Utilization**: % of peak Matrix FMA throughput (for matrix ops) +- **HBM Utilization**: % of peak memory bandwidth (from FETCH_SIZE + WRITE_SIZE) +- **L2 Cache Hit Rate**: % of memory accesses served by L2 (from TCP/TCC counters) +- **Wave Occupancy**: % of maximum active waves per CU + +**Interpretation**: +- **> 80% utilization**: Near optimal, very limited optimization headroom +- **50โ€“80% utilization**: Good, but improvements possible +- **< 50% utilization**: Significant optimization opportunity + +### 3. Top-Down Analysis + +**Purpose**: Break down where execution time is spent at the application level. + +**Time Breakdown**: +- **Kernel Execution**: GPU compute work โ€” should be the dominant category +- **Memory Copies**: H2D, D2H, D2D transfers โ€” check if data can be kept on GPU +- **API Overhead**: CPU time in HIP/HSA calls and kernel launch โ€” check for launch storms +- **GPU Idle**: GPU waiting for work โ€” indicates CPU-GPU synchronization issues + +**Red Flags**: +- Memory copies > 20% of total time โ†’ reduce H2D/D2H transfers; keep data on GPU +- API overhead > 10% โ†’ reduce number of small kernel launches or API call frequency +- GPU idle > 10% โ†’ overlap CPU work with GPU using streams and asynchronous operations + +--- + +## Common Bottleneck Types and Signatures + + +### Compute-Bound + +**Indicators**: +- High arithmetic intensity (> Ridge Point FLOP/Byte for the GPU) +- VALU or MFMA utilization > 70% +- Memory bandwidth utilization < 50% +- Kernel duration scales with problem size, not data size + +**Root causes**: Insufficient parallelism, serial dependency chains, division operations + +**Optimizations**: +- Use MFMA instructions for matrix operations (rocBLAS, MIOpen, Composable Kernel) +- Increase instruction-level parallelism (ILP): unroll loops, break dependency chains +- Ensure high wave occupancy to hide latency +- Replace expensive operations 
(division โ†’ reciprocal multiply, transcendentals โ†’ approximations) + +--- + +### Memory-Bound (HBM Bandwidth) + +**Indicators**: +- Low arithmetic intensity (< Ridge Point FLOP/Byte) +- HBM bandwidth utilization > 70% +- VALU/MFMA utilization < 50% +- High FETCH_SIZE or WRITE_SIZE per byte of useful work + +**Root causes**: Low data reuse, poor tiling, no LDS usage, cold cache working set + +**Optimizations**: +- Tile data into LDS to increase reuse within workgroup +- Coalesce global memory accesses (adjacent threads access adjacent addresses) +- Increase arithmetic intensity: do more work per byte loaded +- Fuse kernels to avoid redundant loads/stores between successive operations +- Consider data compression or mixed precision to reduce bytes transferred + +--- + +### Latency-Bound (Low Occupancy) + +**Indicators**: +- Low wave occupancy (< 50% = < 16 waves per CU) +- High VGPR usage (> 128 VGPRs per wave) +- Low GPU utilization despite kernels being dispatched +- Neither compute nor memory subsystem is saturated + +**Root causes**: Too many VGPRs per wave (limits waves per CU), too much LDS per +workgroup, or workgroup size too small + +**Optimizations**: +- Reduce VGPR usage: limit local variable count, avoid large temporary arrays +- Add `__launch_bounds__(block_size, min_waves_per_eu)` to give compiler occupancy hint +- Recompile with `-O3` and check VGPR count in compiler output (`--save-temps`) +- If LDS is the bottleneck: reduce LDS allocation or split into two kernels +- Increase workgroup size to expose more parallelism to the scheduler + +--- + +### Memory Copy Overhead + +**Indicators**: +- H2D/D2H time > 20% of total execution +- Small, frequent transfers (many copies of < 1 MB) +- Achieved bandwidth << PCIe or xGMI peak bandwidth + +**Root causes**: Data transferred to/from host every iteration, non-pinned host memory, +synchronous blocking copies + +**Optimizations**: +- Keep data on GPU between kernel launches; only transfer at start and end 
+- Use pinned (page-locked) host memory: `hipHostMalloc()` or `hipMallocHost()` +- Batch small transfers into one large transfer +- Use asynchronous transfers with `hipMemcpyAsync()` and HIP streams to overlap with kernels +- For multi-GPU: use peer-to-peer (D2D) transfers instead of routing through host + +--- + +### API and Launch Overhead + +**Indicators**: +- High HIP/HSA API time (> 10% of total) +- Many kernel dispatches with durations < 10 ฮผs each +- Large count of hipLaunchKernel or hipMemcpy calls + +**Root causes**: Excessive synchronization, fine-grained kernel launches, unnecessary +host-device round trips + +**Optimizations**: +- Fuse short consecutive kernels into one larger kernel +- Use HIP graphs (`hipGraph`) to batch kernel launches with reduced CPU overhead +- Eliminate unnecessary `hipDeviceSynchronize()` calls +- Use persistent kernels for iterative workloads +- Increase work per kernel launch (increase grid size) + +--- + +## AMD-Specific Optimization Techniques + + +### 1. Wave Occupancy Optimization + +**Target**: โ‰ฅ 75% occupancy (โ‰ฅ 24 waves per CU) for most kernels. +**Critical**: Low occupancy means fewer waves to hide memory latency (~80โ€“200 cycles for HBM loads). + +**VGPR Usage Guidelines** (CDNA3 โ€” see VGPRโ†’Occupancy table above): +- VGPRs are allocated in **blocks of 16** โ€” reducing from 33 to 32 VGPRs doubles occupancy +- Target: โ‰ค 32 VGPRs per work-item for maximum occupancy (16 waves/EU on MI300X) +- Concern: > 64 VGPRs โ†’ only 4 waves per EU (12.5% of max) +- Critical: > 128 VGPRs โ†’ only 3 waves per EU โ€” strong candidate for VGPR reduction + +**Occupancy target for MI300X**: ensure at least **1,024 workgroups** in the launch grid +to saturate all 304 CUs. With fewer workgroups, some CUs will be idle. 
+ +**Techniques**: +- Use `__launch_bounds__(threads_per_block, min_waves_per_eu)` to hint the compiler +- Check compiler output for VGPR count: `hipcc --save-temps` then inspect `.s` file +- Reduce register spilling (spills go to scratch memory โ€” very expensive) +- Smaller workgroup sizes if register-limited (reduces per-wave resource usage) +- Split large monolithic kernels into multiple passes + +### 2. LDS (Local Data Share) Usage + +**Capacity**: 64 KB per CU (shared across all concurrent workgroups on that CU) + +**Best Practices**: +- Use for data shared within a workgroup (e.g., partial sums in reductions) +- Avoid 32-way bank conflicts: ensure stride-1 access patterns where possible +- Prefetch data from global memory into LDS before the compute phase +- Balance LDS allocation with occupancy: > 32 KB LDS per workgroup โ†’ at most 2 workgroups/CU + +**LDS vs Global Memory**: LDS is ~100ร— faster than uncached global (HBM) access. +Every byte that can be reused from LDS instead of HBM is a win. + +### 3. Memory Coalescing + +**Requirement**: Adjacent threads (in the same wavefront) access adjacent memory addresses. + +**Pattern**: +```c +// Good: Coalesced โ€” thread i reads element i +output[threadIdx.x] = input[threadIdx.x]; + +// Bad: Strided โ€” thread i reads element i*N (generates N separate cache lines) +output[threadIdx.x] = input[threadIdx.x * stride]; + +// Bad: Random โ€” thread i reads element permutation[i] (impossible to coalesce) +output[threadIdx.x] = input[permutation[threadIdx.x]]; +``` + +Coalesced access maps a 64-thread wavefront to a small number of 64-byte cache lines. +Non-coalesced access can require up to 64ร— more cache-line fetches for the same data. + +### 4. 
MFMA Instructions (Matrix Operations) + +**When**: Matrix multiplication, convolutions, attention, any O(nยณ) computation + +**Benefits**: +- MFMA throughput is 4โ€“16ร— higher than equivalent VALU operations +- Used automatically by rocBLAS, MIOpen, Composable Kernel, hipBLAS +- Verify MFMA utilization with: `rocprofv3 --pmc SQ_INSTS_VALU SQ_INSTS_MFMA -- ./app` + +**Check**: MFMA utilization low despite matrix-heavy workload โ†’ likely using non-MFMA +path; switch to rocBLAS or use Composable Kernel MFMA tiles directly. + +**Tile Size Recommendation (MI300X/MI325X)**: +- **Prefer `16ร—16` over `32ร—32` MFMA tiles** on MI300X +- Reason: `v_mfma_f32_16x16x16f16` consumes less power per cycle, allowing higher sustained clock + frequency, which more than compensates for the higher software overhead of smaller tiles +- The net result is higher actual FLOP throughput with 16ร—16 tiles despite their smaller size +- Counter to check: `SQ_INSTS_MFMA` (isolated MFMA instruction count) vs `SQ_INSTS_VALU` (all VALU) + +**AccVGPR (Accumulation Registers)**: +- MFMA output (the C/D matrix) is stored in AccVGPRs โ€” a separate register file from ArchVGPRs +- A wavefront can have up to 256 ArchVGPRs + 256 AccVGPRs (512 total) +- Both pools have the same 16-VGPR allocation granularity +- `v_mfma_f32_16x16x16f16` occupies 16 AccVGPRs per wave for the output tile + +### 4b. 
Memory Access Pattern Optimization
+
+**Stride-512 HBM Hotspotting** (MI300 series):
+- If a matrix leading dimension is an **exact multiple of 512 bytes**, it causes HBM channel
+  hotspotting ("Tagram conflict") — requests concentrate in a few channels instead of spreading evenly
+- This can significantly reduce effective HBM bandwidth even when aggregate utilization seems low
+- Common trigger: GEMM with `lda` or `ldb` that is a multiple of 512 bytes
+- **Fix**: Add a small padding offset to break alignment:
+  ```
+  # For FP16 matrices where K % 256 == 0:
+  lda = K + 128   # adds 256 bytes of padding (128 FP16 elements)
+  ```
+- Ensure no matrix leading dimension is an exact multiple of 512 bytes
+
+### 5. Instruction-Level Parallelism (ILP)
+
+**Purpose**: Overlap independent instructions to hide execution latency (~4 cycles for
+VALU, ~80–200 cycles for global memory loads).
+
+**Techniques**:
+- Unroll loops manually or with `#pragma unroll`
+- Ensure independent instructions between dependent ones
+- Use software pipelining: initiate next load while computing current result
+
+### 6. HIP Streams for Overlap
+
+**Purpose**: Execute kernel computation and memory transfers simultaneously.
+
+```cpp
+hipStream_t stream;
+hipStreamCreate(&stream);
+myKernel<<<gridSize, blockSize, 0, stream>>>(d_in, d_out, n);
+// dst first, then src: copy the kernel's result back to the host
+hipMemcpyAsync(h_out, d_out, size, hipMemcpyDeviceToHost, stream);
+hipStreamSynchronize(stream);
+```
+
+---
+
+## Recommendation Quality Standards
+
+
+### Every Recommendation Must Include:
+
+1. **Title**: Short, actionable statement (e.g., "Reduce VGPR usage for kernel X")
+
+2. **Priority**: High, Medium, or Low
+   - **High**: Impacts > 10% of total execution time
+   - **Medium**: Impacts 3–10% of execution time
+   - **Low**: Impacts < 3% but still worthwhile
+
+3. **Description**: Explain what the issue is and why it matters
+   - Current state (measured values)
+   - Target state (what good looks like)
+   - Expected impact
+
+4. 
**Actionable Steps**: Specific instructions, not generic advice + - Concrete code changes or compiler flags + - Profiling commands to verify improvement + - Expected counters to check + +### Good Recommendation Example: +``` +Title: Reduce VGPR usage for 'conv2d_forward' kernel + +Priority: High + +Description: The conv2d_forward kernel uses 128 VGPRs per wave, limiting +occupancy to 50% (16 waves/CU vs 32 maximum). This kernel accounts for +30% of total execution time; improving occupancy could yield 1.5โ€“2ร— speedup +by better hiding memory latency. + +Actionable Steps: +1. Add __launch_bounds__ hint: + __global__ void __launch_bounds__(256, 4) conv2d_forward(...) {} +2. Reduce local variable usage: move temporary arrays to LDS +3. Recompile with: hipcc -O3 --gpu-max-threads-per-block=256 +4. Check new VGPR count: hipcc --save-temps (inspect .s file for v_vgpr_count) +5. Verify occupancy improved: rocprofv3 --pmc SQ_WAVES -- ./app + +Expected Impact: 1.5โ€“2ร— kernel speedup (~20% total application speedup) +``` + +### Bad Recommendation Example: +``` +Recommendation: Optimize the kernel +``` +**(Too vague, not actionable)** + +--- + +## Analysis Guidelines + + +### 1. Start with the Big Picture (Amdahl's Law First) +- Identify the top 3โ€“5 kernels by execution time (apply Pareto principle) +- Kernels < 5% of total time rarely worth deep optimization +- Check memory copy and API overhead percentages +- Note overall GPU utilization from GRBM_GUI_ACTIVE / GRBM_COUNT + +### 2. Apply Performance Models +- Use Top-Down to identify overhead sources (kernel vs memcpy vs API vs idle) +- Use Roofline to classify each hot kernel (compute vs memory-bound) +- Use SOL to find the specific bottleneck (VALU, MFMA, HBM, L2, LDS) + +### 3. 
Classify Each Hot Kernel +- **Compute-bound**: high AI, high VALU/MFMA utilization, low HBM utilization +- **Memory-bound**: low AI, high FETCH_SIZE/WRITE_SIZE, low VALU utilization +- **Latency-bound**: low occupancy, neither compute nor memory saturated +- **Launch-bound**: many tiny kernels with duration < 10 ฮผs + +### 4. Prioritize Recommendations +- High priority: kernels > 10% of total time or data > 20% memcpy overhead +- Only recommend rocprof-compute deep dive for the top 1โ€“2 kernels +- Match recommendation to bottleneck type (do not suggest MFMA for memory-bound kernel) + +### 5. Be Specific and Actionable +- Reference specific kernel names from the data +- Cite actual counter values and computed metrics +- Provide exact commands to verify the improvement after applying the fix + +### 6. Acknowledge Limitations +- If counter data is missing, state exactly which counters are needed and why +- If GPU architecture is unknown, note that hardware-peak comparisons are unavailable +- If bottleneck classification has low confidence, say so and recommend Step 2 counters + +### 7. Provide Incremental Profiling Guidance +- Use `profiling_info.profiling_mode` and `hardware_counters.*` to determine what step + the user is on, then recommend only the next incremental step +- Do NOT suggest re-collecting data that is already present +- Provide the exact command for the next profiling step + +--- + +## Output Format Requirements + + +### Structure: +1. **Executive Summary** (2โ€“3 sentences) + - Overall assessment + - Primary bottleneck + - Key finding + +2. **Execution Breakdown** + - Time spent in kernels, memory copies, API overhead, idle + +3. **Top Bottlenecks** (Top 3โ€“5 kernels by time) + - Kernel name and % of total time + - Bottleneck classification with confidence level + - Key issues (counter values, occupancy, bandwidth) + +4. **Prioritized Recommendations** (High โ†’ Medium โ†’ Low) + - Follow recommendation quality standards above + +5. 
**Next Profiling Steps** (only if more data is needed) + - What data to collect and why + - Exact profiling command using rocprofv3, rocprof-compute, or rocprof-sys + - What new insight it will provide + +### Tone: +- Clear and direct +- Technical but accessible +- Focus on "what", "why", and "how to fix" +- Avoid jargon where plain English works +- Use bullet points and tables for readability + +--- + +## Context-Aware Profiling Recommendations + + +**CRITICAL**: Before recommending any profiling command, determine what was already +collected in the current run and only suggest the **incremental next step**. + +Use the tool documentation in this guide โ€” specifically the tracing modes, flag +descriptions, and use-cases for `rocprofv3`, `rocprof-sys`, and `rocprof-compute` โ€” +to understand which flags and tools produce equivalent or overlapping data. If a +recommended command would collect data already present in the database, do not suggest +it. + +**To identify what was already collected**, use `profiling_info.profiling_mode` from +the JSON data, and check `hardware_counters.has_counters` and +`hardware_counters.counters` for which specific PMC counters are already present. + +**When all needed data is already present**, say so explicitly and skip the profiling +command โ€” do not pad the output with redundant re-collection steps. + +--- + +## Compiler Optimization Flags and Options + + +Compiler-level changes are often the **highest-leverage, zero-source-change** optimization path. +Before suggesting algorithmic rewrites, always consider whether a compiler flag can solve the +same problem. Use this section to identify applicable flags based on profiling evidence. + +--- + +### Target Selection: `--offload-arch` / `-mcpu` + +The most important compiler flag. Specifying the exact GPU target enables the compiler to use +all architecture-specific instructions (MFMA, packed math, etc.) and avoids generating generic +fallback code. 
+ +**Usage (HIPCC/clang++):** +```bash +# Single target +hipcc --offload-arch=gfx942 -O3 kernel.hip -o app + +# Multiple targets (fat binary) +hipcc --offload-arch=gfx942 --offload-arch=gfx90a -O3 kernel.hip -o app + +# With ISA feature qualifiers (see Target Feature Flags below) +hipcc --offload-arch=gfx942:sramecc+:xnack- -O3 kernel.hip -o app +``` + +**Recommendation trigger**: If `rocprof-compute` shows low MFMA utilization on MI300X despite +matrix workloads, confirm the binary was compiled with `--offload-arch=gfx942`. Generic builds +(`--offload-arch=gfx900`) disable MFMA instructions entirely. + +--- + +### Target Feature Flags (`-mattr` / target qualifiers) + +These flags control optional ISA features that affect **correctness and performance**. They are +appended to `--offload-arch` as qualifiers or passed via `-mattr`. + +| Feature | Flag | Default | Performance Impact | +|---------|------|---------|-------------------| +| XNACK (page-fault retry) | `xnack+` / `xnack-` | GPU-dependent | **Disabling saves 5โ€“15% overhead** on MI300X/gfx942 | +| SRAMECC (ECC on SRAM) | `sramecc+` / `sramecc-` | GPU-dependent | **Disabling saves 2โ€“8% overhead** if ECC not needed | +| 64-wave mode | `wavefrontsize64` / no flag | 64 on CDNA, 32 on RDNA | Affects occupancy calculations significantly | +| CU mode (vs WGP mode) | `cumode` / no flag | WGP on RDNA | CU mode restores RDNA2 shared-memory semantics | +| Thread-group split | `tgsplit` | off | Enables LDS split across CU pairs (advanced use) | + +**XNACK โ€” Key decision:** +- `xnack+`: enables Unified Memory / page migration (required for `hipMallocManaged`). Has hardware + retry overhead on TLB miss. +- `xnack-`: disables page-fault retry. **Faster for HPC workloads that don't use Unified Memory.** +- **Recommendation**: If the application uses `hipMalloc` + explicit `hipMemcpy` (not `hipMallocManaged`), + compile with `--offload-arch=gfx942:xnack-` for a measurable throughput gain. 
+ +**SRAMECC โ€” Key decision:** +- `sramecc+`: enables hardware ECC on L1/LDS SRAM. Adds correction overhead. +- `sramecc-`: disables SRAM ECC. Appropriate for non-critical compute workloads. +- **Recommendation**: Benchmark with and without `sramecc-` on MI300X. If the workload is not + safety-critical, `sramecc-` can reduce LDS and cache latency. + +**Wavefront size:** +- CDNA GPUs (MI100, MI200, MI300 series) are always 64-wide. `wavefrontsize64` is implied. +- RDNA GPUs (RX 6xxx / RX 7xxx) default to 32-wide. 64-wide mode (`wavefrontsize64`) is + available but doubles VGPR pressure per wave. +- **Recommendation trigger**: If a kernel compiled for RDNA shows unexpected occupancy, confirm + the wavefront size matches the LDS/VGPR budget assumptions. + +--- + +### Optimization Levels + +HIPCC/clang++ defaults to `-O0` in debug builds and `-O3` when no flag is given on the device +side. Always verify the optimization level is appropriate. + +| Flag | Effect | When to Use | +|------|--------|-------------| +| `-O0` | No optimization | Debug builds only | +| `-O1` | Basic optimizations, fast compile | Rarely appropriate for GPU | +| `-O2` | Most optimizations, no vectorization hints | General use | +| `-O3` | Full optimization + vectorization + inlining | **Default recommendation for GPU** | +| `-Ofast` | `-O3` + aggressive fast-math (implies `-ffast-math`) | When math accuracy is not critical | + +**Recommendation**: If the binary was compiled without explicit `-O3` (e.g., CMake Debug mode), +rebuilding in Release (`-O3`) is the single highest-ROI change. A Release build can be 2โ€“10ร— +faster than Debug for GPU kernels. + +--- + +### Fast-Math Flags + +Control floating-point operation reordering and denormal handling. Can significantly improve +throughput for FP32-heavy compute workloads. 
+ +| Flag | Effect | Performance Gain | +|------|--------|-----------------| +| `-ffast-math` | Allows reassociation, assumes no NaN/Inf, enables FMA fusion | 10โ€“40% on FP32 VALU-bound kernels | +| `-fgpu-flush-denormals-to-zero` | Flushes FP32/FP16 denormals to zero in GPU code | 2โ€“15% on kernels processing near-zero values | +| `-fno-math-errno` | Removes errno-setting overhead from math calls | Minor; usually included in `-ffast-math` | +| `-fassociative-math` | Allows reordering of FP additions for vectorization | Enables auto-vectorization of reductions | + +**`-fgpu-flush-denormals-to-zero` โ€” Key recommendation:** +Denormal (subnormal) FP values incur a hardware performance penalty on AMD GPUs. If a kernel +processes values that may underflow to denormals (e.g., gradients in ML training, values close +to zero), enabling this flag can eliminate the denormal-handling overhead. Unlike `-ffast-math`, +it only changes behavior for subnormal inputs โ€” normal FP values are unaffected. + +**Safety caveat**: `-ffast-math` is not IEEE-754 compliant. Do not use for financial calculations, +iterative solvers requiring strict convergence, or any code that explicitly checks for NaN/Inf. + +--- + +### Register and Occupancy Control + +When profiling shows VGPR pressure is limiting occupancy, the compiler can be directed to use +fewer registers at the cost of potential spilling to scratch memory. + +#### Via `__attribute__` / `__launch_bounds__` (source annotation โ€” preferred): +```cpp +// Tell compiler max 256 threads/workgroup, min 2 blocks/CU +__global__ void __launch_bounds__(256, 2) my_kernel(...) { ... } +``` + +`__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)` is the standard HIP way to +constrain register allocation. The compiler will spill registers to scratch memory to meet the +occupancy target. 
+ +#### Via function attributes (IR-level control): +```cpp +__attribute__((amdgpu_num_vgpr(64))) // Force 64 VGPRs maximum +__attribute__((amdgpu_num_sgpr(32))) // Force 32 SGPRs maximum +__attribute__((amdgpu_waves_per_eu(2, 4))) // Request 2โ€“4 waves/CU +__attribute__((amdgpu_flat_work_group_size(64, 256))) // Valid workgroup range +``` + +These are lower-level than `__launch_bounds__` and should only be used when profiling confirms +the exact VGPR count needed. + +#### Via `-mllvm` passthrough (compilation flag): +```bash +# Global VGPR limit for the entire translation unit +hipcc -mllvm -amdgpu-num-vgpr=64 ... + +# Enable alloca promotion to registers (often auto-enabled at -O3) +hipcc -mllvm -amdgpu-enable-promote-alloca ... +``` + +**Recommendation trigger**: If `rocprof-compute` reports `vgpr_count > 128` and occupancy is +below target: +1. First try `__launch_bounds__(blockSize, targetWaves)` โ€” non-intrusive +2. If still failing, use `amdgpu_waves_per_eu(minWaves, maxWaves)` to narrow the range +3. As a last resort, use `-mllvm -amdgpu-num-vgpr=` globally โ€” watch for spill traffic + +**VGPR โ†’ occupancy table (CDNA3/gfx942, 512 VGPRs per SIMD):** +| VGPRs per wave | Allocated VGPRs (16-block) | Max waves/EU | Occupancy (of 32 max) | +|---------------|---------------------------|-------------|----------------------| +| 1โ€“16 | 16 | 32 | 100% | +| 17โ€“32 | 32 | 16 | 50% | +| 33โ€“48 | 48 | 10 | ~31% | +| 49โ€“64 | 64 | 8 | 25% | +| 65โ€“80 | 80 | 6 | ~19% | +| 81โ€“96 | 96 | 5 | ~16% | +| 97โ€“128 | 112โ€“128 | 4 | ~13% | +| 129โ€“176 | 144โ€“176 | 3 | ~9% | +| 177โ€“256 | 192โ€“256 | 2 | ~6% | +| 257โ€“512 | 272โ€“512 | 1 | ~3% | + +CDNA4 (gfx950): same VGPR pool per SIMD; doubled LDS (160 KB/CU) can allow larger workgroups. + +--- + +### Environment Variables (HIPCC / HIP Runtime) + +These affect compilation and runtime behavior without code or CMake changes. 
+
+| Variable | Value | Effect |
+|----------|-------|--------|
+| `HIPCC_COMPILE_FLAGS_APPEND` | `-O3 -ffast-math` | Appends flags to every `hipcc` invocation |
+| `HIP_FORCE_DEV_KERNARG=1` | `1` | Forces kernel arguments to device memory (avoids host-pinned buffer contention). **Recommended for MI300X** when many short-running kernels launch repeatedly. |
+| `HIPCC_VERBOSE=1` | `1` | Prints full clang++ command lines — use to verify flags are actually applied |
+| `ROCPD_LLM_LOCAL` | `ollama` | (rocpd-specific) Use local LLM for stage-1 summarization |
+
+**`HIP_FORCE_DEV_KERNARG=1` — Recommendation trigger**: If Tier 1 analysis shows API overhead
+> 15% and many short kernels (avg duration < 10 µs), enabling this env var can reduce
+host-device argument setup latency at no code cost.
+
+---
+
+### Compiler Flags for CMake Projects
+
+Most HIP/ROCm projects use CMake. The correct way to set GPU-level flags is:
+
+```cmake
+# Set target GPU(s)
+set(CMAKE_HIP_ARCHITECTURES "gfx942")
+# or for multiple targets:
+set(CMAKE_HIP_ARCHITECTURES "gfx942;gfx90a")
+
+# Add optimization flags for GPU code
+target_compile_options(my_target PRIVATE
+    $<$<COMPILE_LANGUAGE:HIP>:-O3 -ffast-math -fgpu-flush-denormals-to-zero>
+)
+
+# Add to all GPU targets in a directory
+add_compile_options($<$<COMPILE_LANGUAGE:HIP>:--offload-arch=gfx942:xnack->)
+```
+
+**Recommendation**: When suggesting compiler changes, always phrase them as CMake
+`target_compile_options` changes, not raw shell flags, unless the user's build system is
+confirmed to be non-CMake. 
+
+---
+
+### Compiler Optimization Decision Tree
+
+Use this decision tree when profiling evidence suggests a compiler flag may help:
+
+```
+Profiling evidence → Recommended compiler action
+───────────────────────────────────────────────────────
+MFMA utilization = 0 on MI300X → Recompile with --offload-arch=gfx942
+Binary compiled -O0 or Debug mode → Recompile with -O3 (highest ROI)
+API overhead > 15%, many short kernels → Set HIP_FORCE_DEV_KERNARG=1
+Denormal flush warnings in perf data → Add -fgpu-flush-denormals-to-zero
+VALU bound + FP32 heavy → Try -ffast-math (verify numerical correctness)
+VGPR count > 64, low occupancy → Add __launch_bounds__ or amdgpu_waves_per_eu
+Using hipMallocManaged? No → Recompile with --offload-arch=gfxXXX:xnack-
+ECC not required? → Recompile with --offload-arch=gfxXXX:sramecc-
+```
+
+---
+
+### Compiler Recommendation Format
+
+When recommending compiler changes in analysis output, use this structure:
+
+**Title**: [Descriptive title, e.g., "Enable Architecture-Specific Compilation"]
+**Priority**: HIGH / MEDIUM / LOW
+**Evidence**: [Specific counter or trace observation that triggered this recommendation]
+**Change**:
+```cmake
+# Before
+set(CMAKE_HIP_ARCHITECTURES "gfx900") # generic
+
+# After
+set(CMAKE_HIP_ARCHITECTURES "gfx942")
+target_compile_options(... PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-O3 -ffast-math>)
+```
+**Expected Impact**: [Estimated improvement, e.g., "10–40% VALU throughput improvement for FP32-heavy kernels"]
+**Verification**: [How to confirm the change worked, e.g., "Rerun Tier 2 analysis; check VALU SOL%"]
+
+---
+
+## What NOT to Do
+
+
+❌ **Do Not Recommend Already-Collected Data**
+- Check `profiling_info.profiling_mode` and `hardware_counters.counters` before suggesting
+  any `--pmc` counter or tracing flag. If it was already collected, do not suggest it again. 
+ +โŒ **Do Not Fabricate Metrics** +- If a metric is not in the data, say "Unknown โ€” counter data not collected" +- Do not estimate or guess performance numbers; base everything on the provided data + +โŒ **Do Not Suggest Deep Analysis for Minor Kernels** +- Apply Amdahl's Law: do not recommend rocprof-compute deep dive for kernels < 5% of time + +โŒ **Do Not Suggest Unsupported Architectures** +- Stick to known GPU specs in this guide; state limitations for unknown GPUs +- Supported: MI100 (gfx908), MI250X (gfx90a), MI300A/MI300X/MI325X (gfx942), MI350X/MI355X (gfx950), RX 6900 XT (gfx1030), RX 7900 XTX (gfx1100) + +โŒ **Do Not Give Generic Advice** +- "Optimize memory access" is not actionable +- Always provide specific, measurable, step-by-step guidance + +โŒ **Do Not Reference External Resources** +- No "check the AMD documentation at..." +- No "search online for examples" +- Provide self-contained guidance + +โš ๏ธ **Code Analysis Guidelines** +- **By default**: Focus on performance metrics only โ€” you do not have access to source code +- **Exception**: If the user's custom prompt explicitly mentions code analysis AND provides + file paths, then you MAY analyze code logic and suggest algorithmic changes +- **Rule**: Only suggest algorithmic changes when you can see the actual algorithm + +โŒ **Do Not Use Other Vendors' Terminology** +- Do not mention names of other companies or their products +- Use AMD-specific terminology: + - "LDS" (Local Data Share), not shared memory + - "waves", not warps or threads + - "VALU" or "stream processors", not CUDA cores + - "workgroup", not thread block + +โŒ **Do Not Make Unsupported Claims** +- Use "estimated" or "expected" for predictions +- Base estimates on actual counter values or similar profiling patterns + +โŒ **Never Fabricate Hardware Counter Names** +- Only reference counter names that appear in the provided profiling data or the Hardware Counter Reference section of this guide +- Do NOT invent 
counters like `TCP_L1_HIT_RATE`, `GRBM_COMPUTE_BUSY`, `SQ_VALU_EFFICIENCY`, etc.
+- If a metric you want to reference was not collected, say "this counter was not collected in this run" and recommend adding it via `--pmc <COUNTER_NAME>`
+- Use `rocprofv3 --list-avail` to discover available counters for the target GPU
+
+❌ **Never Recommend CUDA/NVIDIA-Specific Optimizations**
+- Do not suggest NVIDIA-specific tools (`nvprof`, `Nsight`, `nvcc` flags)
+- Do not suggest CUDA-only APIs that have no HIP equivalent, or NVIDIA architecture-specific tuning (e.g., SM count, CUDA core optimization)
+- All recommendations must use AMD tools (`rocprofv3`, `rocprof-compute`, `amdclang++`, HIP APIs) and reference AMD architecture concepts
+
+❌ **Always Flag Implausible Metric Values — Never Silently Accept Them**
+- If profiling data shows GPU utilization > 100%, memory bandwidth exceeding the GPU's theoretical peak (see Hardware Specifications), negative durations, or wave occupancy > 32 waves/CU (CDNA3), flag this explicitly as a likely measurement artifact or data issue
+- Example: "The reported bandwidth of 12 TB/s exceeds MI300X's peak of 5.3 TB/s; this value appears to be a measurement artifact and should not be used for bottleneck classification." 
+- Do not base recommendations on implausible values + +โŒ **Never Double-Count MFMA Instructions in Instruction Mix Analysis** +- `SQ_INSTS_MFMA` is a subset of `SQ_INSTS_VALU` โ€” every MFMA instruction is also counted in VALU +- When computing instruction mix percentages, use `SQ_INSTS_VALU - SQ_INSTS_MFMA` for "non-MFMA VALU" and report `SQ_INSTS_MFMA` separately +- Correct total: `(SQ_INSTS_VALU - SQ_INSTS_MFMA) + SQ_INSTS_MFMA + SQ_INSTS_SALU + SQ_INSTS_SMEM + ...` +- Incorrect total: `SQ_INSTS_VALU + SQ_INSTS_MFMA + ...` (this double-counts all MFMA instructions) + +--- + +## Example Analysis Flow + + +### Input Data: +- Kernel: `matmul_kernel` +- Duration: 500 ms (60% of total time) +- Grid: 256ร—256, Workgroup: 256ร—1ร—1 +- GPU utilization: 82% (GRBM_GUI_ACTIVE / GRBM_COUNT) +- SQ_WAVES: implies 8 waves/CU โ†’ 25% occupancy +- VGPR: 128 per wave + +### Analysis Steps: + +1. **Identify Importance**: 60% of total time โ†’ High priority (Amdahl: max 2.5ร— total speedup) + +2. **Classify Bottleneck** (requires FETCH_SIZE/WRITE_SIZE counters): + - If VALU util (45%) < HBM util (75%) โ†’ Memory-bound + - Occupancy 25% โ†’ also latency-bound (128 VGPRs โ†’ max 16 waves/CU) + +3. **Identify Root Causes**: + - Memory-bound: low arithmetic intensity or poor data reuse + - Low occupancy: 128 VGPRs limit to 16 waves/CU (target: โ‰ค 64 for 32 waves/CU) + +4. **Generate Recommendations**: + - **High Priority**: Reduce VGPR usage to โ‰ค 64 to enable 32 waves/CU + - **High Priority**: Tile data into LDS to increase arithmetic intensity + - **Medium Priority**: Coalesce global memory accesses + +5. 
**Suggest Next Step** (if counters missing):
+   - Collect L2 hit rate and instruction mix:
+     `rocprofv3 --pmc TCP_TCC_HIT_sum TCP_TCC_MISS_sum SQ_INSTS_VALU SQ_INSTS_VMEM -- ./app`
+   - If bottleneck still unclear: `rocprof-compute profile --kernel "matmul_kernel" -- ./app`
+
+---
+
+## Confidence Levels
+
+
+When classifying bottlenecks, indicate confidence:
+
+- **High Confidence (> 90%)**: Counter data present, clear bottleneck signature
+  - Example: "Memory-bound (High Confidence — HBM utilization 82%, VALU utilization 35%)"
+- **Medium Confidence (60–90%)**: Some counters, bottleneck likely but not definitive
+  - Example: "Likely memory-bound (Medium Confidence — low AI inferred from FETCH_SIZE,
+    no VALU counter available for cross-check)"
+- **Low Confidence (< 60%)**: Trace-only data, no counters
+  - Example: "Bottleneck unknown (Low Confidence — no hardware counters; collect
+    GRBM_COUNT, SQ_WAVES, FETCH_SIZE, WRITE_SIZE to classify)"
+
+---
+
+## Handling Missing Data
+
+
+### If No Hardware Counters (Tier 1 only):
+```
+Limited Analysis: No hardware counters detected.
+Cannot determine compute vs memory-bound classification.
+Cannot calculate GPU utilization, wave occupancy, or HBM bandwidth.
+
+Recommended next step (Step 2) — THREE passes required (each TCC-derived counter needs its own pass):
+  # Pass 1: GPU utilization + wave occupancy
+  rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass1 -- ./app
+  # Pass 2: HBM read bandwidth (FETCH_SIZE alone — 3 TCC hardware counters)
+  rocprofv3 --sys-trace --pmc FETCH_SIZE \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass2 -- ./app
+  # Pass 3: HBM write bandwidth (WRITE_SIZE alone — 2 TCC hardware counters)
+  rocprofv3 --sys-trace --pmc WRITE_SIZE \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass3 -- ./app
+
+This will enable: GPU utilization, occupancy, and HBM bandwidth analysis. 
+For full roofline model, follow with: rocprof-compute profile -- ./app +``` + +### If Partial Counters (Tier 2, some counters missing): +``` +Partial Counter Data: [list which counters are present and which are missing] +- GPU utilization: [available/not available] +- Wave occupancy: [available/not available] +- HBM bandwidth: [available/not available โ€” need FETCH_SIZE + WRITE_SIZE] +- L2 hit rate: [available/not available โ€” need TCP_TCC_HIT_sum + TCP_TCC_MISS_sum] + +Recommended: Collect missing counters for complete bottleneck classification. +``` + +### If Unknown GPU Architecture: +``` +Unknown GPU Architecture: [gfx_arch] +Using generic analysis (trace data only). +Cannot compare to hardware peaks or calculate Speed-of-Light metrics. +Supported GPUs: MI100 (gfx908), MI250X/MI210/MI250 (gfx90a), + MI300A/MI300X/MI325X (gfx942), MI350X/MI355X (gfx950), + RX 6900 XT (gfx1030), RX 7900 XTX (gfx1100) +``` + +--- + +## Custom Prompt Handling + + +If the user provides a custom prompt (e.g., `--prompt "Why is kernel X slow?"`), use it to: + +1. **Focus Analysis**: Prioritize the specific kernel/aspect mentioned +2. **Tailor Output**: Structure response to directly answer the question +3. **Provide Targeted Recommendations**: Focus on the area of interest + +**Examples**: +- Prompt: "Focus on memory bottlenecks" โ†’ Emphasize FETCH_SIZE, WRITE_SIZE, L2 hit rates, memcpy overhead +- Prompt: "Why is matmul slow?" โ†’ Lead with matmul kernel analysis, occupancy, MFMA utilization +- Prompt: "What should I optimize first?" โ†’ Apply Amdahl's Law, rank by time ร— potential speedup + +--- + +## vLLM on ROCm โ€” Known API Pitfalls and Correct Patterns + + +When suggesting code optimizations for applications that use **vLLM**, you MUST follow these +rules precisely. vLLM has a well-defined public API; incorrect parameter names will cause +immediate `TypeError` at runtime. 
+ +### CRITICAL: `pin_memory` / `use_pinned_memory` are NOT `LLM()` constructor parameters + +**NEVER suggest passing `pin_memory=True` or `use_pinned_memory=True` to `LLM()`.** +These parameters do not exist in the public `LLM()` / `EngineArgs` interface. Suggesting +them will cause a `TypeError: LLM.__init__() got an unexpected keyword argument`. + +**How pinned memory actually works in vLLM:** +- Pinned (page-locked) CPU memory is an **internal implementation detail** managed automatically by `vllm/worker/cache_engine.py` and `vllm/utils/__init__.py`. +- vLLM calls `is_pin_memory_available()` internally at startup โ€” the user never sets it. +- On AMD ROCm GPUs (CUDA/ROCm platform): pinned memory is **automatically enabled** โ€” no flag needed. +- Pinned memory is automatically **disabled** on: CPU backend (`--device cpu`), TPU, WSL (Windows Subsystem for Linux). + +**The correct public parameters for CPU memory management in `LLM()`:** + +| Parameter | Type | Default | Effect | +|---|---|---|---| +| `swap_space` | `float` | `4` | GiB of CPU RAM per GPU for KV cache swapping (preempted sequences paged out to pinned CPU memory automatically) | +| `cpu_offload_gb` | `float` | `0` | GiB of CPU RAM per GPU for **model weight** offloading (not KV cache) | + +**Example โ€” correct way to increase CPU KV cache swap:** +```python +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + swap_space=8, # 8 GiB of pinned CPU RAM for KV cache swap per GPU + gpu_memory_utilization=0.90, + tensor_parallel_size=tp_size, +) +``` +vLLM will automatically use pinned memory for the swap buffer on CUDA/ROCm. You do not need any additional flag. 
+ +**If you need to check availability in custom torch code (NOT for LLM() args):** +```python +from vllm.utils import is_pin_memory_available + +pin_memory = is_pin_memory_available() # True on CUDA/ROCm, False on CPU backend/WSL/TPU +cpu_buffer = torch.zeros(shape, dtype=dtype, pin_memory=pin_memory, device="cpu") +``` + +### Other vLLM LLM() Parameters Relevant to ROCm Performance + +| Parameter | Recommended | Notes | +|---|---|---| +| `enforce_eager=False` | Yes | Enables CUDA/HIP graph capture and kernel fusion. Set `True` only to debug correctness. | +| `tensor_parallel_size` | `โ‰ฅ 1` | Should match available GPU count. Use `torch.cuda.device_count()`. | +| `gpu_memory_utilization` | `0.90โ€“0.95` | Higher values reduce KV cache evictions but risk OOM. | +| `enable_chunked_prefill` | `True` | Overlaps prefill and decode phases; improves GPU occupancy. | +| `max_num_seqs` | `128โ€“512` | Larger batches amortize launch overhead. | +| `dtype` | `"auto"` | Selects bfloat16 on MI300X; do not force float32. | + +### Multiprocessing Warning for rocprofv3 + +vLLM uses Python `multiprocessing` with `spawn` start method. When profiling with `rocprofv3`, +GPU kernels run in **worker subprocesses**, NOT the main process. The `.db` file from the main +process will show `total_runtime_ns == 0` (empty). To profile vLLM: +- Use `VLLM_ENABLE_V1_MULTIPROCESSING=0` to force single-process mode for tracing +- Or profile the worker process directly with `rocprofv3 --pid ` +- Or use `rocprof-sys --trace` which can follow forks/spawns + +--- + +## Summary + + +Your goal is to transform raw profiling data into **clear, actionable insights** that help developers optimize their GPU code. 
Always: + +โœ… Follow the AMD 3-step profiling methodology and recommend only the next incremental step +โœ… Apply Amdahl's Law โ€” focus on the hottest kernels first +โœ… Classify bottlenecks (compute / memory / latency / launch) before recommending fixes +โœ… Be specific: cite actual counter values, compute derived metrics, give exact commands +โœ… Prioritize high-impact optimizations (> 10% of total time) +โœ… Acknowledge when data is missing and explain exactly what to collect next +โœ… Use AMD GPU terminology (waves, LDS, VALU, MFMA, workgroup) +โœ… Never recommend collecting data that is already present in the database +โœ… Consider compiler flags **before** recommending algorithmic rewrites โ€” check target arch, optimization level, fast-math, XNACK/SRAMECC, and VGPR limits first + +Follow this guide closely to ensure high-quality, trustworthy analysis. + +--- + +## TraceLens-Derived Metrics + + +These fields are derived using set-theoretic interval arithmetic (matching AMD TraceLens methodology). +They are more accurate than simple duration sums because overlapping GPU operations are not double-counted. + +### `interval_timeline` +- `true_compute_pct`: % of wall time the GPU is executing kernels (overlapping kernels merged โ€” more accurate than `execution_breakdown.kernel_time_pct`) +- `exposed_memcpy_pct`: % of wall time spent on memory copies that do NOT overlap any kernel (truly serialized transfers) +- `idle_pct`: % of wall time where the GPU is idle (no kernel or memcpy). **If idle_pct > 20%, this is a HIGH priority issue** โ€” the GPU is waiting for CPU to dispatch work. + +### `kernel_categories` +Each entry covers one of: GEMM, CONV, SDPA, NCCL, Elementwise, Normalization, Reduction, Other. 
+- `pct_of_kernel_time`: how dominant this category is among all GPU kernels +- Use this to classify workloads: high GEMM% โ†’ compute-bound candidate; high NCCL% โ†’ communication-bound; high Other% โ†’ custom/unclassified kernels +- A workload that is 60%+ GEMM is a strong candidate for MFMA/rocBLAS optimization + +### `short_kernels` +- `wasted_pct_of_kernel_time`: % of kernel time consumed by kernels below the `threshold_us` (default 10ฮผs) +- **If wasted_pct > 5%**, recommend kernel fusion or hipGraph batching +- Common cause: many small elementwise ops that could be fused; excessive hipDeviceSynchronize() calls between tiny kernels +- Top offenders list (kernel names sanitized) shows which kernels to target first + +### How to use these fields +When answering a `--prompt` question about bottlenecks, prioritize: +1. If `idle_pct > 20` โ†’ lead with GPU IDLE recommendation +2. If `wasted_pct > 5` AND short kernels are the dominant category โ†’ recommend fusion +3. If NCCL category dominates โ†’ mention communication bottleneck even if not yet Tier 2 diagnosed +4. Cross-reference `interval_timeline.true_compute_pct` with `execution_breakdown.kernel_time_pct` โ€” a large gap indicates significant kernel overlap (good for throughput but may hide serial stalls) diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md new file mode 100644 index 00000000000..8f7032a9e08 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md @@ -0,0 +1,642 @@ +# ROCpd AI Analysis Output - JSON Schema Changelog + +This document tracks all changes to the JSON output schema for `rocpd analyze --format json` +and the `rocpd.ai_analysis` Python API. 
+ +## Versioning Policy + +The schema follows **Semantic Versioning** (`MAJOR.MINOR.PATCH`): + +| Change type | Version bump | Example | +|---|---|---| +| New required field, renamed field, type change, removed field | **MAJOR** | `0.x.x` โ†’ `1.0.0` | +| New optional field added | **MINOR** | `0.1.x` โ†’ `0.2.0` | +| Description/example correction, no structural change | **PATCH** | `0.1.0` โ†’ `0.1.1` | + +> **Beta notice**: While `MAJOR` is `0` the schema is in beta. Minor versions may include +> breaking changes without a MAJOR bump. Consumers should pin to an exact version during beta. + +**Compatibility rule**: A consumer written for schema version `0.x.x` MUST continue to work +on any `0.y.z` output where `y >= x` (except during MAJOR=0 beta where minor may break). +MAJOR version changes always require consumer updates. + +## How to Check the Schema Version + +Every JSON output document contains a top-level `schema_version` field: + +```json +{ + "schema_version": "0.1.0", + ... +} +``` + +**Recommended consumer pattern**: + +```python +import json + +with open("analysis.json") as f: + data = json.load(f) + +ver = data["schema_version"] +major, minor, _ = (int(x) for x in ver.split(".")) +if major != 0 or minor < 1: + raise RuntimeError( + f"Unsupported schema version {ver!r}. " + "Expected 0.1.x. See SCHEMA_CHANGELOG.md for migration guidance." 
+    )
+```
+
+## Schema File Naming
+
+A single schema file covers all emitted versions via its `schema_version` enum:
+
+```
+rocpd/ai_analysis/docs/
+├── analysis-output.schema.json   ← single schema; schema_version enum lists all valid values
+│     Tier 1/2 output emits: "0.1.0"
+│     Tier 0 (source-only) output emits: "0.2.0"
+│     Tier 1/2 with TraceLens fields emits: "0.3.0"
+│     All valid values: ["0.1.0", "0.2.0", "0.3.0"]
+│     New versions are added to the enum without breaking consumers
+├── SCHEMA_CHANGELOG.md           ← this file
+├── AI_ANALYSIS_API.md            ← Python API documentation
+└── LLM_REFERENCE_GUIDE.md        ← copy of share/llm-reference-guide.md (for reference)
+```
+
+The current schema can always be located programmatically:
+
+```python
+import importlib.resources as pkg_resources
+schema_path = pkg_resources.files("rocpd.ai_analysis") / "docs" / "analysis-output.schema.json"
+```
+
+---
+
+## Version History
+
+---
+
+## v0.3.1 — 2026-03-12
+
+**No schema changes.** Schema file validator corrections, Python 3.6 compatibility fixes,
+and LLM hardening only.
+
+**Schema file corrections (v0.2.0 spec was already correct; JSON file had bugs):**
+
+The `analysis-output.schema.json` file was corrected to match the already-documented
+v0.2.0 specification.
The emitted JSON format was never wrong; only the validator was: + +| Schema file bug | Fix | +|---|---| +| `profiling_info.profiling_mode` enum missing `"source_only"` | Added `"source_only"` as first enum value | +| `profiling_info.analysis_tier` `minimum` was `1` | Lowered to `0` to allow Tier 0 documents | +| `execution_breakdown` type was `"object"` only | Changed to `["object", "null"]` so source-only documents validate | +| `tier0` property not declared in `properties` object | Added full `tier0` property definition with all 14 sub-fields | +| `$id` embedded a version string (`"rocpd-ai-analysis-output-v0.1.0"`) | Changed to `"rocpd-ai-analysis-output"` (stable; version is in `schema_version` field) | + +Tier 0 JSON output (schema_version `"0.2.0"`) now passes `jsonschema.validate()` against +the schema file. 28 JSON schema conformance tests added (was 17): 11 new tests cover +Tier 0 source-only output and combined (Tier 0 + Tier 1/2) output validation. + +**Python 3.6 compatibility (`re.Pattern` annotation):** + +`tracelens_port.py` used `re.Pattern` in a module-level type annotation +(`_CATEGORY_PATTERNS: List[Tuple[str, re.Pattern]]`). Python 3.6 evaluates these +annotations eagerly at import time; `re.Pattern` was added in Python 3.7. This caused +an `AttributeError` on RHEL 8.8 (Python 3.6.8) that cascaded into all tests importing +`analyze.py` or `llm_analyzer.py`. Fixed by changing the annotation to `Any` (already +imported from `typing`). + +`test_analyze_schema.py` used `import importlib.resources` which also requires Python 3.7. +Fixed with a `try/except ImportError` shim that falls back to `pkgutil.get_data()`. + +**`ROCPD_LLM_PRIVATE_HEADERS` dict validation:** + +After `json.loads()`, the parsed result is now validated to be a `dict` before +`headers.update()` is called. A non-dict JSON value (e.g. `"[1,2,3]"`) previously +raised an opaque `TypeError`; it now raises a `ValueError` with a clear message +showing the expected format. 
+
+**Stream chunk accumulation (`LLMConversation`):**
+
+Both `_stream_anthropic` and `_stream_openai` now accumulate response chunks with
+`chunks.append(text)` + `"".join(chunks)` instead of `result += chunk` string
+concatenation, avoiding O(n²) memory allocation for long responses.
+
+---
+
+## v0.3.0 (2026-03-11)
+
+### New Fields (additive — old consumers should ignore unknown top-level keys)
+
+- `interval_timeline` (object): GPU wall-time breakdown using set-theoretic interval arithmetic
+  (TraceLens methodology). More accurate than `execution_breakdown` which sums raw durations.
+  Fields: `total_wall_ns`, `true_compute_ns/pct`, `exposed_memcpy_ns/pct`, `idle_ns/pct`.
+
+- `kernel_categories` (array): Kernel execution time aggregated by TraceLens op category
+  (GEMM, CONV, SDPA, NCCL, Elementwise, Normalization, Reduction, Other).
+  Fields per entry: `category`, `count`, `total_ns`, `pct_of_kernel_time`, `avg_duration_ns`, `pct_of_total_time`.
+
+- `short_kernels` (object): Short kernel analysis — kernels below 10μs threshold.
+  Fields: `threshold_us`, `total_kernels`, `short_kernel_count`, `short_kernel_pct`,
+  `wasted_ns`, `wasted_pct_of_kernel_time`, `histogram`, `top_offenders`.
+
+### Versioning Policy
+Tier 1/2 runs now emit `schema_version: "0.3.0"` when tracelens fields are present.
+Tier 0 source-only runs remain at `schema_version: "0.2.0"`.
+Prior `"0.1.0"` documents are unaffected.
+
+---
+
+### v0.2.1 — 2026-03-10
+
+**No schema changes.** Security, correctness, and LLM-layer bug fixes only.
+
+This release documents behavioral changes that affect output values and API
+consumers without altering the JSON document structure or field names.
+
+**Output value guarantees (metadata field):**
+- `analysis_version` in `metadata` now always reflects the schema version string
+  (e.g. `"0.1.0"` for Tier 1/2 documents, `"0.2.0"` for Tier 0 source-only
+  documents).
The value was already correct in practice but is now explicitly
+  documented as schema-tied. Consumer code should continue to read
+  `schema_version` (not `analysis_version`) for compatibility checks.
+
+**`execution_breakdown.api_overhead_pct` is now guaranteed ≥ 0:**
+- `compute_time_breakdown()` now applies `max(0.0, ...)` to the raw `overhead_percent`
+  before returning. In some traces where kernel + memcpy time marginally exceeded the
+  computed total runtime (timestamp rounding), this field could previously be a small
+  negative value. It is now always non-negative in both CLI JSON output and the
+  Python API `ExecutionBreakdown.api_overhead_pct` field.
+
+**`memory_analysis[direction].bandwidth_bytes_per_sec` and `bandwidth_gbps` now use actual sizes:**
+- `analyze_memory_copies()` now reads the `size` column from `memory_copies` rows.
+  Previously `total_bytes` was always 0 and bandwidth was not computed. Consumers
+  that previously saw `bandwidth_gbps: 0` for all directions may now see non-zero
+  values, and the "Low memory bandwidth" recommendation (< 10 GB/s) can now fire
+  based on real measurements.
+
+**`recommendations[].commands[].full_command` kernel names are now shell-safe:**
+- In the "Compute Bottleneck" recommendation, `--kernel-names` arguments in
+  `full_command` strings are now wrapped with `shlex.quote()`. Kernel names
+  containing shell metacharacters (single quotes, semicolons, spaces) are properly
+  escaped. The `args[].value` field is unchanged (stores the raw kernel name for
+  display purposes).
+
+**LLM API calls now include `timeout=120`:**
+- All Anthropic and OpenAI API calls include an explicit 120-second timeout.
+  Previously calls could hang indefinitely. A timed-out call is caught and recorded
+  as a non-fatal warning; local analysis results are still returned.
+
+**Tier 0 webview XSS protection:**
+- `</script>` sequences in the embedded JSON payload of `_format_tier0_webview()`
+  are now escaped to `<\/script>`.
This prevents a crafted kernel name or LLM + explanation from breaking out of the `", r"<\/script>").replace(" +
+
+
+ + AI Performance Analysis +
+
{header_badges_html}
+
+ +
+
+
+
Runtime:{total_ms:,.2f} ms
+
Kernels:{len(hotspots or [])}
+
Tier:{_h(tier_label)}
+
Generated:{analysis_date}
+ {_db_pill_html} +
+
+ +
+ + +
+
+ 📊 +

Overview

+ Tier {tier} +
+
+

{_h(assessment)}

+
+
+
{_bn_icon}Bottleneck
+
Primary Bottleneck
+
{_h(bn_display)}
+
Confidence: {confidence}%
+
+
+
Duration
+
Total Runtime
+
{total_ms:,.2f}
+
milliseconds • {len(hotspots or [])} kernels
+
+
+
💻{_kpi_kernel_lbl}
+
Kernel Execution
+
{kernel_pct:.1f}%
+
{kernel_ms:,.2f} ms active compute
+
+
+
{_tier_icon}{_tier_status_lbl}
+
Analysis Tier
+
{tier}
+
{'Hardware counters available' if has_counters else 'Trace-level only'}
+
+
+ {findings_html} +
+
+ + +
+
+ +

Execution Breakdown

+
+
+
+
+
+
+
+
+
+
Kernel  {kernel_pct:.1f}%
+
Memory Copies  {memcpy_pct:.1f}%
+
API Overhead  {overhead_pct:.1f}%
+
GPU Idle  {idle_pct:.1f}%
+
+
+
+
Kernel Execution
+
+
{kernel_pct:.1f}% {kernel_ms:,.2f} ms
+
+
+
Memory Copies
+
+
{memcpy_pct:.1f}% {memcpy_ms:,.2f} ms
+
+
+
API Overhead
+
+
{overhead_pct:.1f}% {overhead_ms:,.2f} ms
+
+
+
GPU Idle
+
+
{idle_pct:.1f}% {idle_ms:,.2f} ms
+
+
+
+
+ + +
+
+ 💡 +

Optimization Recommendations

+ {_recs_badge_html} +
+
+ {recs_html} +
+
+ +{hotspots_html} +{mem_html} + + +
+
+ 🔬 +

Hardware Counters

+ {_hw_badge_html} +
+
+ {hw_inner} +
+
+ +
+ +
+

Generated by rocpd analyze — AMD ROCm GPU Performance Analysis • {analysis_date}

+
+ + + + + + +""" + + # --- Kernel category breakdown card (TraceLens) --- + if kernel_categories: + cat_rows_html = "" + for cat in kernel_categories: + avg_us = cat["avg_duration_ns"] / 1_000 + pct = cat["pct_of_kernel_time"] + bar_w = max(2, int(pct * 2)) # scale to max 200px + cat_rows_html += ( + f'{cat["category"]}' + f'{cat["count"]}' + f'
' + f" {pct:.1f}%" + f"{avg_us:.1f}μs" + ) + category_card = ( + '\n
' + '\n
' + '\n Kernel Category Breakdown (TraceLens)' + '\n ' + "\n
" + '\n
' + '\n ' + "\n " + "\n " + cat_rows_html + "" + "\n
CategoryKernels% of Kernel TimeAvg Duration
" + "\n
" + "\n
" + ) + html = html.replace("", category_card + "\n") + + return html + + +# --------------------------------------------------------------------------- +# Tier 0 format helpers +# --------------------------------------------------------------------------- + + +def _tier0_recommendations_text( + recommendations: List[Dict[str, Any]], width: int = 80 +) -> List[str]: + """Render Tier 0 recommendations as text lines (same format as Tier 1/2).""" + lines = [] + for rec in recommendations: + pri = rec.get("priority", "INFO") + cat = rec.get("category", "") + issue = rec.get("issue", "") + suggestion = rec.get("suggestion", "") + impact = rec.get("estimated_impact", "") + actions = rec.get("actions", []) + commands = rec.get("commands", []) + + lines.append(f"[{pri}] {cat}") + lines.append("โ”€" * width) + lines.append(f" Issue: {issue}") + lines.append("") + if suggestion: + lines.append(f" Suggestion: {suggestion}") + for action in actions: + lines.append(f" {action}") + lines.append("") + if impact: + lines.append(f" Estimated Impact: {impact}") + lines.append("") + if commands: + lines.append(" Recommended Commands:") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f" [{tool}] {desc}") + if flags: + lines.append(f" Flags: {' '.join(flags)}") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f" Args: {' '.join(arg_strs)}") + if full_command: + lines.append(f" $ {full_command}") + lines.append("") + lines.append("") + return lines + + +def _format_tier0_text(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as plain text.""" + width = 80 + lines = [] + lines.append("=" * width) + lines.append("ROCPD AI PROFILING PLAN (TIER 0: SOURCE CODE ANALYSIS)".center(width)) + 
lines.append("=" * width) + lines.append(f"Source Directory: {tier0_result.source_dir}") + lines.append(f"Analysis Date: {tier0_result.analysis_timestamp}") + lines.append(f"Programming Model: {tier0_result.programming_model}") + lines.append( + f"Files Scanned: {tier0_result.files_scanned} " + f"(skipped: {tier0_result.files_skipped})" + ) + lines.append("") + + # Kernels + lines.append("โ”" * width) + lines.append("DETECTED GPU KERNELS".center(width)) + lines.append("โ”" * width) + lines.append(f" Total kernels found: {tier0_result.kernel_count}") + if tier0_result.detected_kernels: + for k in tier0_result.detected_kernels[:20]: + lines.append( + f" โ€ข {k['name']} ({k.get('launch_type', '')}) " + f"{k.get('file', '').split('/')[-1]}:{k.get('line', '')}" + ) + if len(tier0_result.detected_kernels) > 20: + lines.append(f" ... and {len(tier0_result.detected_kernels) - 20} more") + else: + lines.append(" No GPU kernels detected in source.") + lines.append("") + + # Patterns by severity + lines.append("โ”" * width) + lines.append("DETECTED PATTERNS".center(width)) + lines.append("โ”" * width) + if tier0_result.detected_patterns: + for p in tier0_result.detected_patterns: + sev = p.get("severity", "info").upper() + cat = p.get("category", "") + desc = p.get("description", "") + count = p.get("count", 0) + lines.append(f" [{sev}] {cat} โ€” {desc} (ร—{count})") + else: + lines.append(" No significant patterns detected.") + lines.append("") + + # Risk areas + if tier0_result.risk_areas: + lines.append("โ”" * width) + lines.append("RISK AREAS".center(width)) + lines.append("โ”" * width) + for risk in tier0_result.risk_areas: + lines.append(f" โš  {risk}") + lines.append("") + + # ROCTx + if tier0_result.already_instrumented: + lines.append( + f" โœ“ ROCTx markers detected ({tier0_result.roctx_marker_count} markers)" + ) + lines.append("") + + # Recommended counters + if tier0_result.suggested_counters: + lines.append("โ”" * width) + lines.append("SUGGESTED HARDWARE 
COUNTERS".center(width)) + lines.append("โ”" * width) + lines.append(" " + " ".join(tier0_result.suggested_counters)) + lines.append("") + + # Recommendations + lines.append("โ”" * width) + lines.append("PROFILING RECOMMENDATIONS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.extend(_tier0_recommendations_text(tier0_result.recommendations, width)) + + # Suggested first command + if tier0_result.suggested_first_command: + lines.append("โ”" * width) + lines.append("START HERE โ€” SUGGESTED FIRST COMMAND".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(f" $ {tier0_result.suggested_first_command}") + lines.append("") + + # LLM explanation + if tier0_result.llm_explanation: + lines.append("โ”" * width) + lines.append("AI-ENHANCED INSIGHTS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(tier0_result.llm_explanation) + lines.append("") + + lines.append("=" * width) + lines.append("Analysis complete.".center(width)) + lines.append("=" * width) + + return "\n".join(lines) + + +def _tier0_to_dict(tier0_result: Any) -> Dict[str, Any]: + """Convert SourceAnalysisResult to a JSON-serializable dict for the tier0 field.""" + return { + "source_dir": tier0_result.source_dir, + "analysis_timestamp": tier0_result.analysis_timestamp, + "programming_model": tier0_result.programming_model, + "files_scanned": tier0_result.files_scanned, + "files_skipped": tier0_result.files_skipped, + "kernel_count": tier0_result.kernel_count, + "detected_kernels": tier0_result.detected_kernels, + "detected_patterns": tier0_result.detected_patterns, + "risk_areas": tier0_result.risk_areas, + "already_instrumented": tier0_result.already_instrumented, + "roctx_marker_count": tier0_result.roctx_marker_count, + "recommendations": _build_recommendations_json(tier0_result.recommendations), + "suggested_counters": tier0_result.suggested_counters, + "suggested_first_command": tier0_result.suggested_first_command, + 
"llm_explanation": tier0_result.llm_explanation, + } + + +def _format_tier0_json(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as schema v0.2.0 JSON.""" + import json as _json + + doc: Dict[str, Any] = { + "schema_version": "0.2.0", + "metadata": { + "rocpd_version": _ROCPD_VERSION, + "analysis_version": "0.2.0", # schema version, not module version + "database_file": None, + "analysis_timestamp": tier0_result.analysis_timestamp, + "analysis_duration_ms": 0, + "custom_prompt": None, + }, + "profiling_info": { + "total_duration_ns": 0, + "profiling_mode": "source_only", + "analysis_tier": 0, + "gpus": [], + }, + "summary": { + "overall_assessment": ( + f"Static analysis of {tier0_result.files_scanned} source files found " + f"{tier0_result.kernel_count} GPU kernels. " + f"Programming model: {tier0_result.programming_model}. " + f"See recommendations for next profiling steps." + ), + "primary_bottleneck": "unknown", + "confidence": 0.0, + "key_findings": tier0_result.risk_areas, + }, + "tier0": _tier0_to_dict(tier0_result), + "execution_breakdown": None, + "hotspots": [], + "memory_analysis": {}, + "hardware_counters": {"has_counters": False, "metrics": None, "counters": None}, + "recommendations": _build_recommendations_json(tier0_result.recommendations), + "warnings": [], + "errors": [], + "llm_enhanced_explanation": tier0_result.llm_explanation, + } + return _json.dumps(doc, indent=2) + + +def _format_tier0_markdown(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as Markdown.""" + lines = [] + lines.append("# ROCpd AI Profiling Plan โ€” Tier 0: Source Code Analysis") + lines.append("") + lines.append(f"**Source Directory:** `{tier0_result.source_dir}`") + lines.append(f"**Analysis Date:** {tier0_result.analysis_timestamp}") + lines.append(f"**Programming Model:** {tier0_result.programming_model}") + lines.append("**Analysis Tier:** 0 (Source Code Analysis)") + lines.append("") + + lines.append("## Detected Kernels") + 
lines.append("") + lines.append(f"**Total GPU kernels found:** {tier0_result.kernel_count}") + lines.append("") + if tier0_result.detected_kernels: + lines.append("| Kernel | Launch Type | File | Line |") + lines.append("|--------|-------------|------|------|") + for k in tier0_result.detected_kernels[:20]: + fname = k.get("file", "").split("/")[-1] + lines.append( + f"| `{k['name']}` | {k.get('launch_type', '')} | {fname} | {k.get('line', '')} |" + ) + if len(tier0_result.detected_kernels) > 20: + lines.append( + f"\n*... and {len(tier0_result.detected_kernels) - 20} more kernels*" + ) + else: + lines.append("*No GPU kernels detected in source.*") + lines.append("") + + lines.append("## Detected Patterns") + lines.append("") + if tier0_result.detected_patterns: + lines.append("| Severity | Category | Description | Count |") + lines.append("|----------|----------|-------------|-------|") + for p in tier0_result.detected_patterns: + sev = p.get("severity", "info") + lines.append( + f"| **{sev.upper()}** | {p.get('category', '')} | {p.get('description', '')} | {p.get('count', 0)} |" + ) + else: + lines.append("*No significant patterns detected.*") + lines.append("") + + if tier0_result.risk_areas: + lines.append("## Risk Areas") + lines.append("") + for risk in tier0_result.risk_areas: + lines.append(f"- โš  {risk}") + lines.append("") + + if tier0_result.suggested_counters: + lines.append("## Suggested Hardware Counters") + lines.append("") + lines.append("```") + lines.append(" ".join(tier0_result.suggested_counters)) + lines.append("```") + lines.append("") + + lines.append("## Profiling Recommendations") + lines.append("") + priority_emoji = {"HIGH": "๐Ÿ”ด", "MEDIUM": "๐ŸŸก", "LOW": "๐ŸŸข", "INFO": "๐Ÿ”ต"} + for rec in tier0_result.recommendations: + pri = rec.get("priority", "INFO") + cat = rec.get("category", "") + emoji = priority_emoji.get(pri, "โ€ข") + lines.append(f"### {emoji} [{pri}] {cat}") + lines.append("") + lines.append(f"**Issue:** {rec.get('issue', 
'')}") + lines.append("") + lines.append(f"**Suggestion:** {rec.get('suggestion', '')}") + actions = rec.get("actions", []) + if actions: + lines.append("") + for action in actions: + lines.append(f"{action}") + impact = rec.get("estimated_impact", "") + if impact: + lines.append("") + lines.append(f"**Estimated Impact:** {impact}") + commands = rec.get("commands", []) + if commands: + lines.append("") + lines.append("**Recommended Commands:**") + lines.append("") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f"*{tool}* โ€” {desc}") + if flags: + lines.append(f"- Flags: `{' '.join(flags)}`") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f"- Args: `{' '.join(arg_strs)}`") + if full_command: + lines.append(f"```bash\n{full_command}\n```") + lines.append("") + lines.append("") + + if tier0_result.suggested_first_command: + lines.append("## Start Here โ€” Suggested First Command") + lines.append("") + lines.append("```bash") + lines.append(tier0_result.suggested_first_command) + lines.append("```") + lines.append("") + + if tier0_result.llm_explanation: + lines.append("## AI-Enhanced Insights") + lines.append("") + lines.append(tier0_result.llm_explanation) + lines.append("") + + lines.append("---") + lines.append( + f"*Generated by rocpd analyze (Tier 0) \u2022 {tier0_result.analysis_timestamp}*" + ) + return "\n".join(lines) + + +def _format_tier0_webview(tier0_result: Any) -> str: + """Generate a self-contained AMD-themed HTML Tier 0 report (identical design system as Tier 1/2).""" + import html as _html + import json as _json + + def _h(v: Any) -> str: + return _html.escape(str(v), quote=True) + + SEV_FG = { + "high": "#e84040", + "medium": "#f08432", + "low": "#caa828", + 
"info": "#4d8ef2", + } + SEV_BG = { + "high": "rgba(232,64,64,.13)", + "medium": "rgba(240,132,50,.13)", + "low": "rgba(202,168,40,.13)", + "info": "rgba(77,142,242,.13)", + } + PRIORITY = { + "HIGH": ("#e84040", "#2a0808"), + "MEDIUM": ("#f08432", "#2a1600"), + "LOW": ("#caa828", "#241e08"), + "INFO": ("#4d8ef2", "#081428"), + } + PRIORITY_ICON = { + "HIGH": "🔴", + "MEDIUM": "🟠", + "LOW": "🟡", + "INFO": "ℹ", + } + + analysis_date = tier0_result.analysis_timestamp + src_dir = str(tier0_result.source_dir) + src_display = src_dir[-45:] if len(src_dir) > 45 else src_dir + + # โ”€โ”€ Counts โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + recs = tier0_result.recommendations or [] + n_high = sum(1 for r in recs if r.get("priority") == "HIGH") + n_medium = sum(1 for r in recs if r.get("priority") == "MEDIUM") + n_low = sum(1 for r in recs if r.get("priority") == "LOW") + n_info = sum(1 for r in recs if r.get("priority") == "INFO") + + _badge_parts = [] + if n_high: + _badge_parts.append( + f'● {n_high} Critical' + ) + if n_medium: + _badge_parts.append( + f'● {n_medium} Warning' + ) + if n_low: + _badge_parts.append(f'● {n_low} Low') + if n_info: + _badge_parts.append( + f'● {n_info} Info' + ) + header_badges_html = " ".join(_badge_parts) + + _recs_badge_html = "" + if n_high: + _recs_badge_html += ( + f'{n_high} Critical ' + ) + if n_medium: + _recs_badge_html += ( + f'{n_medium} Warning' + ) + + # โ”€โ”€ Recommendations HTML (same .r-card format as Tier 1/2) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + recs_parts = [] + for ri, rec in enumerate(recs): + p = rec.get("priority", "INFO") + cat = rec.get("category", "") + fg, _ = PRIORITY.get(p, ("#888", "#1a1a2a")) + picon = PRIORITY_ICON.get(p, "ℹ") + actions_li = "".join(f"
  • {_h(a)}
  • " for a in rec.get("actions", [])) + actions_html = f'
      {actions_li}
    ' if actions_li else "" + impact = rec.get("estimated_impact", "") + impact_html = ( + f'

    ⚡ Expected impact: {_h(impact)}

    ' + if impact + else "" + ) + cmds_parts = [] + for ci, cmd in enumerate(rec.get("commands", [])): + fc = cmd.get("full_command", "") + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + if not fc: + continue + cid = f"c{ri}_{ci}" + cmds_parts.append( + f'
    ' + f'{_h(tool)}' + f'{_h(desc)}' + f'
    ' + f"{_h(fc)}" + f'' + f"
    " + ) + cmds_html = "".join(cmds_parts) + issue_txt = rec.get("issue", "") + suggest = rec.get("suggestion", "") + recs_parts.append( + f'
    ' + f'
    ' + f'{picon}' + f'{_h(p)}' + f'{_h(cat)}' + f'' + f"
    " + f'
    ' + f'

    Issue: {_h(issue_txt)}

    ' + f'

    What to do: {_h(suggest)}

    ' + f"{actions_html}{impact_html}{cmds_html}" + f"
    " + ) + recs_html = ( + "".join(recs_parts) + or '

    No recommendations โ€” workload looks well-optimized.

    ' + ) + + # โ”€โ”€ Kernels table โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + kernel_rows = [] + for i, k in enumerate(tier0_result.detected_kernels[:50]): + fname = _h(k.get("file", "").split("/")[-1]) + kernel_rows.append( + f"" + f"{i + 1}" + f'{_h(k.get("name", ""))}' + f'{_h(k.get("launch_type", ""))}' + f"{fname}" + f'{_h(str(k.get("line", "")))}' + f"" + ) + if kernel_rows: + kernels_section = ( + '
    ' + '
    ' + '💻' + "

    Detected GPU Kernels

    " + f'{tier0_result.kernel_count} found' + "
    " + '
    ' + '' + "" + "" + "" + "" + "" + "" + "" + "" + "".join(kernel_rows) + "" + "
    #Kernel NameLaunch TypeFileLine ⇅
    " + ) + else: + kernels_section = ( + '
    ' + '
    💻' + "

    Detected GPU Kernels

    " + '

    No GPU kernels detected in the source directory.

    ' + "
    " + ) + + # โ”€โ”€ Patterns table โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + pattern_rows = [] + for pat in tier0_result.detected_patterns: + sev = pat.get("severity", "info").lower() + sfg = SEV_FG.get(sev, "#6b7280") + sbg = SEV_BG.get(sev, "rgba(107,114,128,.13)") + pattern_rows.append( + f"" + f'{_h(sev.upper())}' + f'{_h(pat.get("category", ""))}' + f'{_h(pat.get("description", ""))}' + f'{pat.get("count", 0)}' + f"" + ) + if pattern_rows: + patterns_section = ( + '
    ' + '
    ' + '📊' + "

    Detected Performance Patterns

    " + f'{len(tier0_result.detected_patterns)} found' + "
    " + '
    ' + '' + "" + "" + "" + "" + "" + "" + "" + "".join(pattern_rows) + "" + "
    SeverityCategoryDescriptionCount ⇅
    " + ) + else: + patterns_section = "" + + # โ”€โ”€ Risk areas โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + risk_li = "".join(f"
  • {_h(r)}
  • " for r in tier0_result.risk_areas) + risk_section = "" + if risk_li: + risk_section = ( + '
    ' + '
    ' + '' + "

    Risk Areas

    " + f'{len(tier0_result.risk_areas)}' + "
    " + '
    ' + f'
      {risk_li}
    ' + "
    " + ) + + # โ”€โ”€ Suggested counters โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + ctr_badges = " ".join( + f'{_h(c)}' + for c in tier0_result.suggested_counters + ) + counters_section = "" + if tier0_result.suggested_counters: + collect_cmd = ( + "rocprofv3 --sys-trace --pmc " + + " ".join(tier0_result.suggested_counters) + + " -- ./your_app" + ) + counters_section = ( + '
    ' + '
    ' + '🔬' + "

    Suggested Hardware Counters

    " + f'{len(tier0_result.suggested_counters)} counters' + "
    " + '
    ' + '

    ' + "Collect these counters to enable Tier 2 (hardware-level) analysis:

    " + f'

    {ctr_badges}

    ' + f'
    ' + f"{_h(collect_cmd)}" + f'' + "
    " + "
    " + ) + + # โ”€โ”€ Start Here โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + start_here_section = "" + if tier0_result.suggested_first_command: + fc = tier0_result.suggested_first_command + start_here_section = ( + '
    ' + '
    ' + '' + "

    Start Here

    " + 'Recommended First Step' + "
    " + '
    ' + '

    ' + "Run this command to collect profiling data for Tier 1/2 analysis:

    " + f'
    ' + f"{_h(fc)}" + f'' + "
    " + "
    " + ) + + # โ”€โ”€ LLM section โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + llm_section = "" + if tier0_result.llm_explanation: + llm_section = ( + '
    ' + '
    ' + '🤖' + "

    AI-Enhanced Insights

    " + 'LLM' + "
    " + '
    ' + f'
    {_h(tier0_result.llm_explanation)}
    ' + "
    " + ) + + # โ”€โ”€ KPI grid โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + n_risks = len(tier0_result.risk_areas) + risk_kpi_cls = "kpi-warn" if n_risks > 0 else "kpi-ok" + risk_kpi_label = "Needs Attention" if n_risks > 0 else "None Found" + model_upper = _h(tier0_result.programming_model.upper()) + assessment_txt = ( + f"Static source analysis of {tier0_result.files_scanned} file(s) found " + f"{tier0_result.kernel_count} GPU kernel(s). " + f"Programming model: {tier0_result.programming_model}. " + "See recommendations below for the suggested profiling workflow." + ) + n_patterns = len(tier0_result.detected_patterns) + + payload = _json.dumps(_tier0_to_dict(tier0_result)) + payload = payload.replace("", r"<\/script>").replace(" +
    +
    +
    + + AI Profiling Plan +
    +
    {header_badges_html}
    +
    + +
    +
    +
    +
    Source:{_h(src_display)}
    +
    Kernels:{tier0_result.kernel_count}
    +
    Tier:0 (Source)
    +
    Generated:{_h(analysis_date)}
    +
    Model:{_h(tier0_result.programming_model)}
    +
    +
    + +
    + + +
    +
    + 📊 +

    Overview

    + Tier 0 +
    +
    +

    {_h(assessment_txt)}

    +
    +
    +
    💻Detected
    +
    GPU Kernels
    +
    {tier0_result.kernel_count}
    +
    {tier0_result.files_scanned} file(s) scanned
    +
    +
    +
    🧰Model
    +
    Programming Model
    +
    {model_upper}
    +
    {tier0_result.files_scanned} files • {tier0_result.files_skipped} skipped
    +
    +
    +
    📊Found
    +
    Patterns Detected
    +
    {n_patterns}
    +
    potential issues identified
    +
    +
    +
    {risk_kpi_label}
    +
    Risk Areas
    +
    {n_risks}
    +
    {"requires profiling to confirm" if n_risks > 0 else "no obvious risk areas"}
    +
    +
    +
    +
    + + +
    +
    + 💡 +

    Profiling Recommendations

    + {_recs_badge_html} +
    +
    + {recs_html} +
    +
    + +{kernels_section} +{patterns_section} +{risk_section} +{counters_section} +{start_here_section} +{llm_section} + +
    + +
    +

    Generated by rocpd analyze (Tier 0) — AMD ROCm GPU Performance Analysis • {_h(analysis_date)}

    +
    + + + + + + +""" + + +def format_analysis_output( + time_breakdown: Dict[str, Any], + hotspots: List[Dict[str, Any]], + memory_analysis: Dict[str, Dict[str, Any]], + recommendations: List[Dict[str, Any]], + hardware_counters: Optional[Dict[str, Any]] = None, + database_path: str = "", + output_format: str = "text", + tier0_result: Optional[Any] = None, + source_only: bool = False, + interval_timeline: Optional[ + Dict[str, Any] + ] = None, # NEW (TraceLens) โ€” logic in Task 4 + kernel_categories: Optional[List[Any]] = None, # NEW (TraceLens) โ€” logic in Task 4 + short_kernels: Optional[Dict[str, Any]] = None, # NEW (TraceLens) โ€” logic in Task 4 + custom_prompt: Optional[str] = None, +) -> str: + """ + Format analysis results for display. + + Args: + time_breakdown: Time distribution metrics + hotspots: Top kernel hotspots + memory_analysis: Memory copy analysis + recommendations: Performance recommendations + database_path: Path to analyzed database + output_format: Output format (text, json, markdown, webview) + tier0_result: Optional Tier 0 source analysis result + source_only: True when no database was provided (Tier 0 only) + + Returns: + Formatted string output + """ + # Source-only mode: dispatch entirely to Tier 0 formatters + if source_only and tier0_result is not None: + if output_format == "json": + return _format_tier0_json(tier0_result) + if output_format == "markdown": + return _format_tier0_markdown(tier0_result) + if output_format == "webview": + return _format_tier0_webview(tier0_result) + return _format_tier0_text(tier0_result) + + if output_format == "json": + output = _format_as_json( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + custom_prompt=custom_prompt, + ) + # Combined mode: 
embed tier0 into JSON document + if tier0_result is not None: + import json as _json + + try: + doc = _json.loads(output) + doc["tier0"] = _tier0_to_dict(tier0_result) + output = _json.dumps(doc, indent=2) + except Exception: + pass # Tier0 embedding into combined JSON is non-fatal; return Tier1/2 output unchanged + return output + + if output_format == "markdown": + output = _format_as_markdown( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + ) + if tier0_result is not None: + output += "\n\n---\n\n## Tier 0: Source Code Analysis\n\n" + output += _format_tier0_markdown(tier0_result) + return output + + if output_format == "webview": + return _format_as_webview( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + ) + + # Default: text + lines = [] + width = 80 + + # Header + lines.append("=" * width) + lines.append("ROCPD AI PERFORMANCE ANALYSIS".center(width)) + lines.append("=" * width) + if database_path: + lines.append(f"Database: {database_path}") + lines.append(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + total_runtime_ms = time_breakdown.get("total_runtime", 0) / 1e6 + lines.append(f"Total Runtime: {total_runtime_ms:,.2f} ms") + lines.append("") + + # Time Breakdown + lines.append("โ”" * width) + lines.append("TIME BREAKDOWN".center(width)) + lines.append("โ”" * width) + lines.append("") + + def make_bar(percent: float, bar_width: int = 30) -> str: + """Create a visual percentage bar.""" + filled = int(percent / 100.0 * bar_width) 
+ return "โ–ˆ" * filled + + kernel_pct = time_breakdown.get("kernel_percent", 0) + memcpy_pct = time_breakdown.get("memcpy_percent", 0) + overhead_pct = time_breakdown.get("overhead_percent", 0) + + kernel_time_ms = time_breakdown.get("total_kernel_time", 0) / 1e6 + memcpy_time_ms = time_breakdown.get("total_memcpy_time", 0) / 1e6 + overhead_time_ms = ( + max(0.0, total_runtime_ms - kernel_time_ms - memcpy_time_ms) + if total_runtime_ms > 0 + else 0 + ) + + lines.append( + f" Kernel Execution: {kernel_time_ms:10,.2f} ms ({kernel_pct:5.1f}%) {make_bar(kernel_pct)}" + ) + lines.append( + f" Memory Copies: {memcpy_time_ms:10,.2f} ms ({memcpy_pct:5.1f}%) {make_bar(memcpy_pct)}" + ) + lines.append( + f" API Overhead: {overhead_time_ms:10,.2f} ms ({overhead_pct:5.1f}%) {make_bar(overhead_pct)}" + ) + lines.append("") + + # Hotspots + if hotspots: + lines.append("โ”" * width) + lines.append("HOTSPOTS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(f"Top {len(hotspots)} Kernels by Duration:") + lines.append("") + + # Table header + lines.append( + f" # {'Kernel Name':<30} {'Calls':>6} {'Total (ms)':>10} {'Avg (ฮผs)':>9} {'% Total':>7}" + ) + lines.append("โ”€" * width) + + # Table rows + for i, kernel in enumerate(hotspots, 1): + name = kernel.get("name", "unknown") + if len(name) > 30: + name = name[:27] + "..." 
+ + calls = kernel.get("calls", 0) + total_ms = kernel.get("total_duration", 0) / 1e6 + avg_us = kernel.get("avg_duration", 0) / 1e3 + percent = kernel.get("percent_of_total", 0) + + lines.append( + f"{i:2} {name:<30} {calls:6} {total_ms:10,.2f} {avg_us:9,.1f} {percent:6.1f}%" + ) + + lines.append("") + + # Memory Analysis + if memory_analysis: + lines.append("โ”" * width) + lines.append("MEMORY COPY ANALYSIS".center(width)) + lines.append("โ”" * width) + lines.append("") + + # Table header + lines.append( + f"{'Direction':<20} {'Count':>6} {'Total Size':>12} {'Duration':>10} {'Bandwidth':>10}" + ) + lines.append("โ”€" * width) + + # Table rows + for direction, stats in memory_analysis.items(): + count = stats.get("count", 0) + total_bytes = stats.get("total_bytes", 0) + duration_ms = stats.get("total_duration", 0) / 1e6 + bandwidth_gbps = stats.get("bandwidth_bytes_per_sec", 0) / 1e9 + + # Format size + if total_bytes >= 1e9: + size_str = f"{total_bytes / 1e9:.1f} GB" + elif total_bytes >= 1e6: + size_str = f"{total_bytes / 1e6:.1f} MB" + elif total_bytes >= 1e3: + size_str = f"{total_bytes / 1e3:.1f} KB" + else: + size_str = f"{total_bytes:.0f} B" + + lines.append( + f"{direction:<20} {count:6} {size_str:>12} {duration_ms:9,.2f} ms {bandwidth_gbps:8.2f} GB/s" + ) + + lines.append("") + + # Hardware Counters (Tier 2) + if hardware_counters and hardware_counters.get("has_counters"): + lines.append("โ”" * width) + lines.append("HARDWARE COUNTERS (Tier 2)".center(width)) + lines.append("โ”" * width) + lines.append("") + + metrics = hardware_counters.get("metrics", {}) + counters = hardware_counters.get("counters", {}) + + # Display derived metrics + if metrics: + lines.append("Derived Metrics:") + lines.append("") + + if "gpu_utilization_percent" in metrics: + util_pct = metrics["gpu_utilization_percent"] + lines.append( + f" GPU Utilization: {util_pct:6.1f}% {make_bar(util_pct)}" + ) + + if "avg_waves" in metrics: + avg_waves = metrics["avg_waves"] + max_waves = 
metrics.get("max_waves", 0) + lines.append(f" Avg Wave Occupancy: {avg_waves:6.1f} waves") + lines.append(f" Max Wave Occupancy: {max_waves:6.1f} waves") + + lines.append("") + + # Display raw counters + if counters: + lines.append("Collected Counters:") + lines.append("") + lines.append( + f"{'Counter Name':<25} {'Avg Value':>15} {'Min Value':>15} {'Max Value':>15}" + ) + lines.append("โ”€" * width) + + for counter_name, stats in counters.items(): + avg = stats.get("avg_value", 0) + min_val = stats.get("min_value", 0) + max_val = stats.get("max_value", 0) + + lines.append( + f"{counter_name:<25} {avg:15,.1f} {min_val:15,.1f} {max_val:15,.1f}" + ) + + lines.append("") + + # TraceLens: Kernel Category Breakdown + if kernel_categories: + lines.append("") + lines.append("โ”" * width) + lines.append("KERNEL CATEGORY BREAKDOWN (TraceLens)".center(width)) + lines.append("โ”" * width) + lines.append("") + max_pct = max((c["pct_of_kernel_time"] for c in kernel_categories), default=1) + bar_width = 30 + for cat in kernel_categories: + pct = cat["pct_of_kernel_time"] + bar = "โ–ˆ" * int(bar_width * pct / max(max_pct, 1)) + cnt = cat["count"] + avg_us = cat["avg_duration_ns"] / 1_000 + lines.append( + f" {cat['category']:<15} {bar:<30} {pct:5.1f}% ({cnt} kernels, avg {avg_us:.1f}ฮผs)" + ) + lines.append("") + + # TraceLens: Short Kernel Analysis + if short_kernels and short_kernels.get("short_kernel_count", 0) > 0: + lines.append("โ”" * width) + lines.append("SHORT KERNEL ANALYSIS (TraceLens)".center(width)) + lines.append("โ”" * width) + lines.append("") + thresh = short_kernels.get("threshold_us", 10) + count = short_kernels["short_kernel_count"] + wasted = short_kernels["wasted_pct_of_kernel_time"] + lines.append( + f" {count} kernels below {thresh}ฮผs threshold โ€” {wasted:.1f}% of kernel time wasted" + ) + if short_kernels.get("histogram"): + hist_str = " Histogram: " + " ".join( + f"[{b['bucket_label']}]: {b['count']}" for b in short_kernels["histogram"] + ) + 
lines.append(hist_str) + if short_kernels.get("top_offenders"): + lines.append(" Top offenders:") + for off in short_kernels["top_offenders"][:5]: + lines.append( + f" {off['name'][:50]:<52} ร—{off['count']} avg {off['avg_us']:.1f}ฮผs" + ) + lines.append("") + + # Recommendations + lines.append("โ”" * width) + lines.append("RECOMMENDATIONS".center(width)) + lines.append("โ”" * width) + lines.append("") + + for rec in recommendations: + priority = rec.get("priority", "INFO") + category = rec.get("category", "") + issue = rec.get("issue", "") + suggestion = rec.get("suggestion", "") + actions = rec.get("actions", []) + commands = rec.get("commands", []) + estimated_impact = rec.get("estimated_impact", "") + + lines.append(f"[{priority}] {category}") + lines.append("โ”€" * width) + lines.append(f" Issue: {issue}") + lines.append("") + if suggestion: + lines.append(f" Suggestion: {suggestion}") + if actions: + for action in actions: + lines.append(f" {action}") + lines.append("") + if estimated_impact: + lines.append(f" Estimated Impact: {estimated_impact}") + lines.append("") + if commands: + lines.append(" Recommended Commands:") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f" [{tool}] {desc}") + if flags: + lines.append(f" Flags: {' '.join(flags)}") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f" Args: {' '.join(arg_strs)}") + if full_command: + lines.append(f" $ {full_command}") + lines.append("") + lines.append("") + + # Footer + lines.append("=" * width) + lines.append("Analysis complete.".center(width)) + lines.append("=" * width) + + return "\n".join(lines) + + +def analyze_source_code( + source_dir: str, + prompt: Optional[str] = None, + llm: Optional[str] = None, + 
llm_api_key: Optional[str] = None, + llm_model: Optional[str] = None, + verbose: bool = False, +) -> Any: + """ + Run Tier 0 static source code analysis. + + Args: + source_dir: Path to source directory + prompt: Optional user question to guide analysis + llm: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider + llm_model: Override LLM model name + verbose: Enable verbose logging + + Returns: + SourceAnalysisResult from ai_analysis.api + """ + from pathlib import Path as _Path + from .ai_analysis.source_analyzer import SourceAnalyzer + from .ai_analysis.api import _plan_to_source_result + + _src_path = _Path(source_dir) + if not _src_path.exists() or not _src_path.is_dir(): + from .ai_analysis.exceptions import SourceDirectoryNotFoundError + + raise SourceDirectoryNotFoundError( + f"Source directory not found or not a directory: {source_dir}" + ) + + if verbose: + print(f"[Tier0] Scanning source directory: {source_dir}") + + scanner = SourceAnalyzer(_src_path, verbose=verbose) + plan = scanner.analyze() + + if verbose: + print( + f"[Tier0] Scanned {plan.files_scanned} files, " + f"{plan.kernel_count} kernels, model: {plan.programming_model}" + ) + + # Convert ProfilingPlan โ†’ SourceAnalysisResult dataclass + result = _plan_to_source_result(plan) + + if llm: + _prev = os.environ.get("ROCPD_LLM_MODEL") + try: + from .ai_analysis.llm_analyzer import LLMAnalyzer + + if llm_model: + os.environ["ROCPD_LLM_MODEL"] = llm_model + try: + analyzer = LLMAnalyzer(provider=llm, api_key=llm_api_key, verbose=verbose) + from .ai_analysis.llm_analyzer import ( + AnalysisContext as _AnalysisContext, + ) + + _llm_ctx = _AnalysisContext(tier=0, custom_prompt=prompt) + _mdl = llm_model or os.environ.get("ROCPD_LLM_MODEL", "") + _mdl_str = f" ({_mdl})" if _mdl else "" + print( + f" Contacting {llm}{_mdl_str} for source analysis โ€” please wait...", + file=sys.stderr, + flush=True, + ) + result.llm_explanation = analyzer.analyze_source_with_llm( + result, 
custom_prompt=prompt, context=_llm_ctx + ) + finally: + if llm_model: + if _prev is None: + os.environ.pop("ROCPD_LLM_MODEL", None) + else: + os.environ["ROCPD_LLM_MODEL"] = _prev + except Exception as e: + print(f"โš ๏ธ Tier 0 LLM enhancement failed: {e}", file=sys.stderr) + + return result + + +def analyze_performance( + connection: Optional[RocpdImportData], + prompt: Optional[str] = None, + top_kernels: int = 10, + min_duration: float = 0.0, + output_format: str = "text", + database_path: str = "", + llm: Optional[str] = None, + llm_api_key: Optional[str] = None, + llm_model: Optional[str] = None, + llm_thinking: Optional[int] = None, + verbose: bool = False, + source_dir: Optional[str] = None, + _collect_result: Optional[Dict[str, Any]] = None, + **kwargs: Any, +) -> str: + """ + Main analysis orchestrator that runs all analyses and formats output. + + Args: + connection: RocpdImportData database connection + prompt: Optional custom analysis prompt + top_kernels: Number of top kernels to analyze + min_duration: Minimum kernel duration threshold + output_format: Output format (text, json, markdown) + database_path: Path to database file + llm: LLM provider (anthropic or openai) + llm_api_key: API key for LLM provider + verbose: Enable verbose logging + **kwargs: Additional arguments + + Returns: + Formatted analysis output string + """ + # ------------------------------------------------------------------ + # Tier 0 โ€” static source code analysis (optional) + # ------------------------------------------------------------------ + tier0_result = None + if source_dir: + tier0_result = analyze_source_code( + source_dir=source_dir, + prompt=prompt, + llm=llm, + llm_api_key=llm_api_key, + llm_model=llm_model, + verbose=verbose, + ) + + # ------------------------------------------------------------------ + # Tier 1/2 โ€” database analysis (only when a connection is provided) + # ------------------------------------------------------------------ + source_only = 
connection is None + if not source_only: + time_breakdown = compute_time_breakdown(connection) + hotspots = identify_hotspots( + connection, top_n=top_kernels, min_duration=min_duration + ) + memory_analysis = analyze_memory_copies(connection) + hardware_counters = analyze_hardware_counters(connection) # Tier 2 + already_collected = _detect_already_collected(connection) + # TraceLens-derived analysis (Phase 1) + interval_timeline = compute_interval_timeline(connection) + kernel_categories = analyze_kernels_by_category( + connection, interval_timeline["total_wall_ns"] + ) + short_kernels_data = analyze_short_kernels(connection) + # Generate recommendations (redundant re-collection commands are filtered out) + recommendations = generate_recommendations( + time_breakdown, + hotspots, + memory_analysis, + hardware_counters, + already_collected=already_collected, + short_kernels=short_kernels_data, # NEW + interval_timeline=interval_timeline, # NEW + ) + else: + time_breakdown = {} + hotspots = [] + memory_analysis = {} + hardware_counters = {} + already_collected = frozenset() + interval_timeline = {} + kernel_categories = [] + short_kernels_data = {} + recommendations = tier0_result.recommendations if tier0_result else [] + + # Format output + output = format_analysis_output( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + output_format=output_format, + tier0_result=tier0_result, + source_only=source_only, + interval_timeline=interval_timeline, # NEW (TraceLens) + kernel_categories=kernel_categories, # NEW (TraceLens) + short_kernels=short_kernels_data, # NEW (TraceLens) + custom_prompt=prompt, + ) + + # Expose structured results to caller (used by interactive mode) + if _collect_result is not None: + _collect_result["recommendations"] = recommendations + _collect_result["tier0_result"] = tier0_result + 
_collect_result["database_path"] = database_path + + # LLM enhancement (if enabled) โ€” only for Tier 1/2; Tier 0 LLM runs in analyze_source_code() + if llm and not source_only: + # Initialize before try so the finally block can always reference these names safely. + _prev_model_env = os.environ.get("ROCPD_LLM_MODEL") + try: + if verbose: + print(f"[LLM] Enabling {llm} enhancement...") + + from .ai_analysis.llm_analyzer import LLMAnalyzer + + # If caller provided --llm-model, set it in the environment so + # LLMAnalyzer._call_anthropic/_call_openai can pick it up. + # We restore the original value afterwards. + if llm_model: + os.environ["ROCPD_LLM_MODEL"] = llm_model + + _mdl = llm_model or os.environ.get("ROCPD_LLM_MODEL", "") + _mdl_str = f" ({_mdl})" if _mdl else "" + print( + f" Contacting {llm}{_mdl_str} for trace analysis โ€” please wait...", + file=sys.stderr, + flush=True, + ) + + # Initialize LLM analyzer + analyzer = LLMAnalyzer( + provider=llm, + api_key=llm_api_key, + verbose=verbose, + thinking_budget_tokens=llm_thinking, + ) + + # Prepare data for LLM + analysis_data = { + "gpu": {"name": "AMD GPU", "arch": "unknown"}, # TODO: Extract from DB + "execution_breakdown": { + "kernel_time_pct": time_breakdown.get("kernel_percent", 0), + "memcpy_time_pct": time_breakdown.get("memcpy_percent", 0), + "api_overhead_pct": time_breakdown.get("overhead_percent", 0), + }, + "kernels": [ + { + "name": h.get("name", "unknown"), + "dispatch_count": h.get("calls", 0), + "pct_total_time": h.get("percent_of_total", 0), + "avg_duration_ns": h.get("avg_duration", 0), + } + for h in hotspots[:5] # Top 5 kernels + ], + "memory_ops": { + direction: { + "count": data.get("count", 0), + "total_bytes": data.get("total_bytes", 0), + "bandwidth_gbps": data.get("bandwidth_bytes_per_sec", 0) / 1e9, + } + for direction, data in memory_analysis.items() + }, + "has_counters": bool(hardware_counters), + "has_pc_sampling": False, + } + + # Build analysis context for guide filtering + 
from .ai_analysis.llm_analyzer import AnalysisContext as _AnalysisContext + + _has_ctr = bool(hardware_counters and hardware_counters.get("has_counters")) + _summary = _build_summary(time_breakdown, hotspots, _has_ctr) + _llm_ctx = _AnalysisContext( + tier=2 if _has_ctr else 1, + has_counters=_has_ctr, + bottleneck_type=_summary.get("primary_bottleneck"), + gpu_arch=None, # reserved for future per-GPU filtering + custom_prompt=prompt, + ) + + # Get LLM enhancement + llm_explanation = analyzer.analyze_with_llm( + analysis_data=analysis_data, + custom_prompt=prompt, + context=_llm_ctx, + ) + + # Append LLM explanation to output + if output_format == "text": + output += "\n\n" + "=" * 80 + "\n" + output += ( + "AI-ENHANCED EXPLANATION (powered by {})".format(llm.upper()).center( + 80 + ) + + "\n" + ) + output += "=" * 80 + "\n\n" + output += llm_explanation + output += "\n\n" + "=" * 80 + "\n" + elif output_format == "json": + # Parse JSON and add LLM explanation + import json + + try: + output_dict = json.loads(output) + output_dict["llm_enhanced_explanation"] = llm_explanation + output = json.dumps(output_dict, indent=2) + except (json.JSONDecodeError, ValueError, KeyError) as _je: + print( + f"Warning: Could not embed LLM explanation in JSON output: {_je}", + file=sys.stderr, + ) + + if verbose: + print("[LLM] Enhancement complete") + + except Exception as e: + # Always show LLM failures on console (even without --verbose) + import sys + + error_msg = f"โš ๏ธ LLM enhancement failed: {e}" + print(error_msg, file=sys.stderr) + + # Also add to output file + warning_msg = ( + f"\n\n{error_msg}\n(Analysis completed with local results only)\n" + ) + if output_format == "text": + output += warning_msg + + # Show full traceback only in verbose mode + if verbose: + import traceback + + traceback.print_exc() + + finally: + # Restore the ROCPD_LLM_MODEL env var to its previous state + if llm_model: + if _prev_model_env is None: + os.environ.pop("ROCPD_LLM_MODEL", None) + 
def _is_code_change_rec(rec: Dict[str, Any]) -> bool:
    """Return True if this recommendation suggests source-code modifications.

    A recommendation is considered a code-change candidate when any of its
    action strings contains (case-insensitively) a keyword associated with
    HIP API usage, kernel-launch tuning, or memory-access restructuring.
    """
    # Keyword table is runtime data: kept verbatim. Trailing spaces on some
    # entries ("add ", "batch ", ...) deliberately avoid substring false
    # positives such as "additional" or "batched" prefixing longer words.
    keywords = (
        "replace ",
        "convert ",
        "add ",
        "insert ",
        "remove ",
        "delete ",
        "change ",
        "modify ",
        "update ",
        "use hip",
        "hipstream",
        "hipmemcpy",
        "hiplaunchkernel",
        "block size",
        "blockdim",
        "thread block",
        "merge kernel",
        "fuse kernel",
        "combine kernel",
        "async",
        "hipstreamcreate",
        "batch ",
        "coalesce",
        "stride",
        "unroll",
        "pragma ",
        "#pragma",
        "__launch_bounds__",
        "wave32",
        "wave64",
    )
    return any(
        marker in action.lower()
        for action in rec.get("actions", [])
        for marker in keywords
    )
def _call_llm_for_code(
    provider: str,
    api_key: Optional[str],
    model: Optional[str],
    prompt: str,
) -> str:
    """Call Anthropic or OpenAI to generate code-change suggestions.

    Args:
        provider: "anthropic", "openai", or "gpt" (alias for openai).
        api_key: Explicit API key; falls back to the provider's env variable.
        model: Explicit model name; falls back to ROCPD_LLM_MODEL, then a
            provider-specific default.
        prompt: Full prompt text sent as a single user message.

    Returns:
        The raw text of the model's reply.

    Raises:
        ImportError: The provider's SDK package is not installed.
        ValueError: No API key is available, or the provider is unknown.
    """
    # Both providers take the same single-turn chat payload.
    chat = [{"role": "user", "content": prompt}]

    if provider == "anthropic":
        try:
            import anthropic
        except ImportError:
            raise ImportError(
                "anthropic package not installed. Run: pip install anthropic"
            )
        token = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not token:
            raise ValueError(
                "No Anthropic API key. Set ANTHROPIC_API_KEY or pass --llm-api-key."
            )
        chosen = model or os.environ.get("ROCPD_LLM_MODEL", "claude-sonnet-4-20250514")
        reply = anthropic.Anthropic(api_key=token).messages.create(
            model=chosen,
            max_tokens=4096,
            messages=chat,
        )
        return reply.content[0].text

    if provider in ("openai", "gpt"):
        try:
            import openai
        except ImportError:
            raise ImportError("openai package not installed. Run: pip install openai")
        token = api_key or os.environ.get("OPENAI_API_KEY")
        if not token:
            raise ValueError(
                "No OpenAI API key. Set OPENAI_API_KEY or pass --llm-api-key."
            )
        chosen = model or os.environ.get("ROCPD_LLM_MODEL", "gpt-4-turbo-preview")
        client = openai.OpenAI(api_key=token)
        # Newer OpenAI models accept only max_completion_tokens; older ones
        # only max_tokens. Try the new parameter first, then fall back.
        try:
            reply = client.chat.completions.create(
                model=chosen,
                messages=chat,
                max_completion_tokens=4096,
            )
        except Exception:
            reply = client.chat.completions.create(
                model=chosen,
                messages=chat,
                max_tokens=4096,
            )
        return reply.choices[0].message.content

    raise ValueError(f"Unknown LLM provider: {provider!r}")
{action}") + if impact: + print(f"\n {Y}Estimated Impact:{N} {impact}") + print() + + if not source_dir: + print(f" {DIM}Tip: run with --source-dir to enable AI code editing.{N}\n") + return + + # โ”€โ”€ Find GPU source files โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + source_files: List[str] = [] + for ext in ("*.hip", "*.cpp", "*.cu", "*.cuh", "*.h"): + source_files.extend( + _glob.glob(_os.path.join(source_dir, "**", ext), recursive=True) + ) + source_files = [f for f in source_files if _os.path.isfile(f)] + + if not source_files: + print(f" {DIM}No GPU source files found in {source_dir}/{N}\n") + return + + # โ”€โ”€ Auto-detect LLM provider from environment if not explicitly set โ”€โ”€โ”€โ”€โ”€ + if not llm_provider: + if os.environ.get("ANTHROPIC_API_KEY"): + llm_provider = "anthropic" + elif os.environ.get("OPENAI_API_KEY"): + llm_provider = "openai" + + # โ”€โ”€ No LLM configured: show manual steps and offer $EDITOR โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if not llm_provider: + print( + f" {DIM}To enable AI code editing, set ANTHROPIC_API_KEY (or OPENAI_API_KEY) in your" + f" environment, or pass --llm anthropic to rocpd analyze.{N}" + ) + print(f"\n {Y}Manual steps:{N}") + for i, action in enumerate(actions, 1): + print(f" {i}. {action}") + editor = _os.environ.get("EDITOR", "") + if editor and source_files: + try: + ans = input(f"\n Open source files in {editor}? [y/N]: ").strip().lower() + except (EOFError, KeyboardInterrupt): + ans = "n" + if ans in ("y", "yes"): + import subprocess + + subprocess.run([editor] + source_files[:3]) + print() + return + + # โ”€โ”€ Ask user before invoking LLM โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + try: + ans = ( + input( + f" {Y}Would you like the AI to apply this change to your source code? 
[y/N]: {N}" + ) + .strip() + .lower() + ) + except (EOFError, KeyboardInterrupt): + print() + return + if ans not in ("y", "yes"): + print() + return + + # โ”€โ”€ Read source files โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + MAX_FILES = 5 + MAX_FILE_SIZE = 50_000 # bytes per file + + print(f"\n {DIM}Reading source files...{N}") + file_contents: Dict[str, str] = {} + for fpath in source_files[:MAX_FILES]: + try: + with open(fpath, "r", encoding="utf-8", errors="replace") as fh: + file_contents[fpath] = fh.read(MAX_FILE_SIZE) + except OSError: + pass + + if not file_contents: + print(f" {R}Could not read source files.{N}\n") + return + + # โ”€โ”€ Build LLM prompt โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + files_text = "\n\n".join( + f"=== {_os.path.relpath(fp, source_dir)} ===\n{content}" + for fp, content in file_contents.items() + ) + changes_text = "\n".join(f"- {a}" for a in actions) + + llm_prompt = ( + "You are a GPU performance optimization expert. The following GPU source files " + "have a performance issue that needs to be fixed.\n\n" + f"ISSUE: {issue}\n" + f"SUGGESTION: {suggestion}\n" + f"REQUIRED CHANGES:\n{changes_text}\n\n" + f"SOURCE FILES:\n{files_text}\n\n" + "OUTPUT INSTRUCTIONS:\n" + "For each file that needs modification, output EXACTLY this format:\n" + "MODIFY_FILE: \n" + "<<\n" + "ORIGINAL\n" + "<<\n" + "REPLACEMENT\n\n" + "Only output sections that need to change. Be precise โ€” the ORIGINAL block must " + "match exactly what appears in the file (used for find-and-replace). 
" + "If no changes are needed, output: NO_CHANGES_NEEDED" + ) + + print(f" {DIM}Calling {llm_provider} for code change suggestions...{N}") + + try: + llm_response = _call_llm_for_code( + provider=llm_provider, + api_key=llm_api_key, + model=llm_model, + prompt=llm_prompt, + ) + except Exception as exc: + print(f" {R}LLM error: {exc}{N}\n") + return + + if "NO_CHANGES_NEEDED" in llm_response: + print(f" {G}AI analysis: no code changes are needed for this issue.{N}\n") + return + + # โ”€โ”€ Parse MODIFY_FILE blocks โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + patches: List[tuple] = [] + pattern = re.compile( + r"MODIFY_FILE:\s*(\S+)\s*<< 80: + print(f" {DIM} ... ({len(diff) - 80} more lines){N}") + valid_patches.append((abs_path, rel_path, orig_content, new_content)) + + if not valid_patches: + print() + return + + print() + try: + ans = input(f" {Y}Apply these changes? [y/N]: {N}").strip().lower() + except (EOFError, KeyboardInterrupt): + print() + return + + if ans not in ("y", "yes"): + print(f" {DIM}Changes not applied.{N}\n") + return + + # โ”€โ”€ Apply with backup โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + applied = 0 + for abs_path, rel_path, orig_content, new_content in valid_patches: + backup_path = abs_path + ".rocpd.bak" + try: + shutil.copy2(abs_path, backup_path) + with open(abs_path, "w", encoding="utf-8") as fh: + fh.write(new_content) + print( + f" {G}โœ“ Applied: {rel_path} (backup: {_os.path.basename(backup_path)}){N}" + ) + applied += 1 + except OSError as exc: + print(f" {R}โœ— Failed to write {rel_path}: {exc}{N}") + + if applied: + print( + f"\n {G}โœ“ {applied} file(s) modified. 
def _get_app_path_from_db(database_path: str) -> str:
    """
    Extract the profiled application's executable path from a rocpd database.

    rocprofv3 writes the process command into the ``command`` column of the
    per-process ``rocpd_info_process_*`` tables.

    Args:
        database_path: Path to the rocpd SQLite database ("" is tolerated).

    Returns:
        The (stripped) command string, or "" if the database cannot be read
        or has no entry. This is deliberately best-effort: any error is
        swallowed and reported as "".
    """
    if not database_path:
        return ""
    try:
        import sqlite3 as _sqlite3

        con = _sqlite3.connect(database_path)
        # BUGFIX: the previous version only closed the connection when no row
        # was found — the early `return` inside the loop (and any exception)
        # leaked the SQLite handle. try/finally guarantees closure.
        try:
            # Find all rocpd_info_process_* tables
            tables = con.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'rocpd_info_process_%'"
            ).fetchall()
            for (tname,) in tables:
                row = con.execute(
                    f'SELECT command FROM "{tname}" WHERE command IS NOT NULL LIMIT 1'
                ).fetchone()
                if row and row[0]:
                    return row[0].strip()
        finally:
            con.close()
    except Exception:
        # Best-effort lookup: an unreadable/corrupt database yields "".
        pass
    return ""
arguments for AI analysis. + + Args: + parser: Argument parser to add arguments to + + Returns: + Function to process parsed arguments + """ + analysis_options = parser.add_argument_group("Analysis options") + + analysis_options.add_argument( + "--source-dir", + type=str, + default=None, + dest="source_dir", + help=( + "Path to GPU application source directory for Tier 0 static analysis. " + "Scans .hip/.cpp/.cu files and generates a profiling plan. " + "Can be used alone (no -i required) or alongside -i for combined analysis." + ), + ) + + analysis_options.add_argument( + "--prompt", + type=str, + default=None, + help="Custom analysis prompt/question to guide analysis (e.g., 'Why is my matmul kernel slow?')", + ) + + analysis_options.add_argument( + "--top-kernels", + type=int, + default=10, + help="Number of top kernels to analyze (default: 10)", + ) + + analysis_options.add_argument( + "--format", + type=str, + choices=["text", "json", "markdown", "webview"], + default="text", + help="Output format: text, json, markdown, or webview (default: text). " + "File extension is set automatically: .txt, .json, .md, .html", + ) + + analysis_options.add_argument( + "--min-duration", + type=float, + default=0.0, + help="Minimum kernel duration threshold in microseconds (filter out short kernels)", + ) + + # LLM Enhancement Options + llm_options = parser.add_argument_group( + "LLM enhancement options (optional)", + "Enable natural language explanations via Anthropic Claude or OpenAI GPT. " + "Requires API key - see https://console.anthropic.com/ or https://platform.openai.com/api-keys", + ) + + llm_options.add_argument( + "--llm", + type=str, + choices=["anthropic", "openai", "private"], + default=None, + help="Enable LLM-powered analysis enhancement. Choices: 'anthropic' (Claude), 'openai' (GPT), " + "or 'private' (any OpenAI-compatible private/enterprise server). " + "Requires API key set via environment variable or --llm-api-key option. 
" + "For 'private': set ROCPD_LLM_PRIVATE_URL, ROCPD_LLM_PRIVATE_MODEL, and optionally " + "ROCPD_LLM_PRIVATE_HEADERS (JSON). " + "Local analysis always runs first; LLM provides additional natural language insights.", + ) + + llm_options.add_argument( + "--llm-api-key", + type=str, + default=None, + help="API key for LLM provider. Alternatively, set environment variable: " + "ANTHROPIC_API_KEY for Anthropic Claude, or OPENAI_API_KEY for OpenAI GPT. " + "Example: --llm anthropic --llm-api-key sk-ant-... " + "Or: export ANTHROPIC_API_KEY='sk-ant-...' && rocpd analyze --llm anthropic", + ) + + llm_options.add_argument( + "--llm-model", + type=str, + default=None, + help="Override the LLM model name. Defaults to claude-sonnet-4-20250514 for Anthropic " + "and gpt-4-turbo-preview for OpenAI. Can also be set via ROCPD_LLM_MODEL environment " + "variable (--llm-model takes precedence). " + "Examples: --llm-model claude-opus-4-6, --llm-model gpt-4o", + ) + + llm_options.add_argument( + "--verbose", + action="store_true", + default=False, + help="Enable verbose logging (shows LLM API calls, reference guide loading, etc.)", + ) + + analysis_options.add_argument( + "--interactive", + "-I", + metavar="RUN_COMMAND", + type=str, + default=None, + dest="interactive", + help=( + "Launch the 7-phase interactive profiling + optimization workflow. " + "RUN_COMMAND is the full command used to run your GPU application. " + 'Example: --interactive "./my_gpu_app --batch-size 64". ' + "The workflow automatically wraps your command with rocprofv3, collects " + "a trace, analyzes bottlenecks with AI, and offers to apply optimizations." + ), + ) + + analysis_options.add_argument( + "--resume-session", + type=str, + default=None, + dest="resume_session", + help=( + "Resume a previous interactive session by session ID or file path. 
" + "Example: --resume-session 2026-03-10_14-23-01_myapp" + ), + ) + + llm_options.add_argument( + "--llm-thinking", + metavar="TOKENS", + type=int, + default=None, + dest="llm_thinking", + help=( + "Enable extended thinking for deeper LLM analysis. Specify the thinking " + "budget in tokens (e.g. --llm-thinking 8000). Only available with the " + "Anthropic provider and compatible models (claude-opus-4, " + "claude-sonnet-4-5, claude-3-7-sonnet). Adds latency but improves " + "analysis quality for complex traces with multiple interacting " + "bottlenecks. Requires --llm anthropic. Also configurable via the " + "ROCPD_LLM_THINKING environment variable (set to token count)." + ), + ) + + llm_options.add_argument( + "--llm-compact-every", + metavar="N", + type=int, + default=10, + dest="llm_compact_every", + help=( + "Compact the LLM conversation context every N assistant turns by summarizing " + "older messages (default: 10). Lower values use less memory; higher values " + "preserve more context. Only applies to --interactive sessions." + ), + ) + + llm_options.add_argument( + "--llm-local", + type=str, + choices=["ollama"], + default=None, + dest="llm_local", + help=( + "Local LLM provider for Stage 1 source summarization (before online LLM). " + "Choices: 'ollama'. Requires Ollama running at localhost:11434. " + "Set ROCPD_LLM_LOCAL_URL to override endpoint." + ), + ) + + llm_options.add_argument( + "--llm-local-model", + type=str, + default=None, + dest="llm_local_model", + help=( + "Model name for local LLM (default: codellama:13b). " + "Can also be set via ROCPD_LLM_LOCAL_MODEL environment variable." + ), + ) + + llm_options.add_argument( + "--llm-private-url", + type=str, + default=None, + dest="llm_private_url", + help=( + "Base URL for a private/enterprise OpenAI-compatible LLM server " + "(used with --llm private). E.g. https://my-apim.example.com/openai/deployments/gpt4. " + "Can also be set via ROCPD_LLM_PRIVATE_URL environment variable." 
def execute(
    input: Optional[RocpdImportData],
    config: Optional[output_config.output_config] = None,
    **kwargs: Any,
) -> Optional[RocpdImportData]:
    """
    Execute AI analysis on rocpd database and/or source directory.

    Dispatches between three modes:
      1. 7-phase workflow mode (``--interactive RUN_COMMAND``): delegates
         entirely to WorkflowSession and returns early.
      2. Plain analysis: runs analyze_performance and writes/prints output.
      3. Post-analysis interactive session (``interactive`` truthy):
         analysis runs first (LLM suppressed), then _run_interactive_session
         takes over using the saved credentials.

    Args:
        input: RocpdImportData object with database connection, or None for source-only mode
        config: Optional output configuration
        **kwargs: Analysis parameters (may include source_dir for Tier 0)

    Returns:
        The input RocpdImportData object (for chaining), or None in source-only mode
    """
    # Update config if provided
    if config is not None:
        config = config.update(**kwargs)
    else:
        config = output_config.output_config(**kwargs)

    # Get database path for display
    # NOTE(review): relies on the private RocpdImportData._paths attribute;
    # hasattr-guarded so older importer versions degrade to "".
    database_path = ""
    if input is not None and hasattr(input, "_paths") and input._paths:
        database_path = (
            input._paths[0] if isinstance(input._paths, list) else str(input._paths)
        )

    # Pop interactive before passing to analyze_performance (it doesn't accept it)
    interactive = kwargs.pop("interactive", None)

    # 7-phase workflow mode: triggered when --interactive is provided with a RUN_COMMAND
    if interactive and isinstance(interactive, str):
        from rocpd.ai_analysis.interactive import WorkflowSession  # type: ignore[import]

        source_paths: list = []
        source_dir = kwargs.get("source_dir")
        if source_dir:
            source_paths.append(source_dir)
        ws = WorkflowSession(
            app_command=interactive,
            source_paths=source_paths,
            llm_provider=kwargs.get("llm"),
            llm_api_key=kwargs.get("llm_api_key")
            or os.environ.get("ANTHROPIC_API_KEY")
            or os.environ.get("OPENAI_API_KEY"),
            llm_model=kwargs.get("llm_model"),
        )
        ws.run()
        # Workflow mode owns the whole lifecycle; skip normal analysis.
        return input

    # Map 'format' CLI key -> 'output_format' parameter expected by analyze_performance
    if "format" in kwargs:
        kwargs["output_format"] = kwargs.pop("format")

    # Inject private-server CLI args into env so downstream code picks them up.
    # setdefault: an already-exported env var wins over the CLI flag.
    if kwargs.get("llm_private_url"):
        os.environ.setdefault("ROCPD_LLM_PRIVATE_URL", kwargs["llm_private_url"])
    if kwargs.get("llm_private_model"):
        os.environ.setdefault("ROCPD_LLM_PRIVATE_MODEL", kwargs["llm_private_model"])

    # In interactive mode: skip the upfront LLM call entirely - the user will
    # trigger LLM requests explicitly via [p] and [o] inside the session.
    # Save credentials first so _run_interactive_session can still use them.
    _interactive_llm_provider = kwargs.get("llm")
    _interactive_llm_api_key = kwargs.get("llm_api_key")
    _interactive_llm_model = kwargs.get("llm_model")
    if interactive:
        kwargs.pop("llm", None)
        kwargs.pop("llm_model", None)
        kwargs.pop("llm_api_key", None)
        kwargs.pop("llm_thinking", None)

    # Collect structured results so interactive mode can build its command menu
    result_store: Dict[str, Any] = {}

    # Run analysis
    # NOTE(review): leftover CLI keys (resume_session, llm_local*,
    # llm_private_*, llm_compact_every) are still forwarded here — presumably
    # analyze_performance accepts **kwargs; confirm against its signature.
    output = analyze_performance(
        connection=input,
        database_path=database_path,
        _collect_result=result_store,
        **kwargs,
    )

    # Determine file extension based on output format
    _ext_map = {"json": ".json", "markdown": ".md", "webview": ".html", "text": ".txt"}
    _fmt = kwargs.get("output_format", "text")
    _ext = _ext_map.get(_fmt, ".txt")

    # Handle output
    if config and config.output_file and config.output_path:
        base = config.output_file
        # Append the format extension if the base name doesn't already have it
        if not base.endswith(_ext):
            base = base + _ext
        output_file = os.path.join(config.output_path, base)
        os.makedirs(config.output_path, exist_ok=True)
        with open(output_file, "w") as f:
            f.write(output)
        print(f"Analysis written to: {output_file}")
        if _fmt == "text":
            print(
                "Tip: use --format webview for an interactive HTML report, "
                "--format json for machine-readable output, "
                "or --format markdown for Markdown."
            )
    else:
        print(output)

    # -- Interactive mode ----------------------------------------------------
    if interactive:
        _run_interactive_session(
            recommendations=result_store.get("recommendations", []),
            tier0_result=result_store.get("tier0_result"),
            database_path=result_store.get("database_path", database_path),
            source_dir=kwargs.get("source_dir", ""),
            llm_provider=_interactive_llm_provider,
            llm_api_key=_interactive_llm_api_key,
            llm_model=_interactive_llm_model,
            llm_local=kwargs.get("llm_local"),
            llm_local_model=kwargs.get("llm_local_model"),
            resume_session=kwargs.get("resume_session"),
            compact_every=kwargs.get("llm_compact_every", 10),
        )

    return input
def main(argv=None) -> int:
    """
    Command-line entry point for standalone execution.

    Args:
        argv: Optional argument list; argparse falls back to sys.argv when None.

    Returns:
        Exit code: 0 on success, 1 on any failure.
    """
    cli = argparse.ArgumentParser(
        prog="rocpd.analyze",
        description="AI-powered performance analysis for GPU traces",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    cli.add_argument(
        "-i",
        "--input",
        nargs="+",
        type=str,
        required=True,
        help="Input rocpd database file(s)",
    )

    # Shared output options, then the analysis-specific flags. add_args hands
    # back a callable that turns the parsed namespace into execute() kwargs.
    output_config.add_args(cli)
    to_kwargs = add_args(cli)

    opts = cli.parse_args(argv)

    try:
        # Open the database(s) and run the analysis pipeline.
        db = RocpdImportData(opts.input)
        analysis_kwargs = to_kwargs(db, opts)
        execute(db, **analysis_kwargs)
        return 0

    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
b/projects/rocprofiler-sdk/source/lib/python/rocpd/tracelens_port.py new file mode 100644 index 00000000000..39f00453713 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/tracelens_port.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +TraceLens-derived analysis algorithms for rocpd. + +Ports interval arithmetic, kernel categorization, and short kernel detection +from AMD TraceLens (https://github.com/AMD-AGI/TraceLens). + +All functions read from an existing RocpdImportData connection and return +plain dict / list structures. No output formatting. No ai_analysis imports. + +Call order dependency: + timeline = compute_interval_timeline(conn) + categories = analyze_kernels_by_category(conn, timeline["total_wall_ns"]) + short = analyze_short_kernels(conn) +""" + +import re +from typing import Any, Dict, List, Tuple + +from .importer import RocpdImportData, execute_statement + +__all__ = [ + "compute_interval_timeline", + "categorize_kernel_name", + "analyze_kernels_by_category", + "analyze_short_kernels", +] + +# --------------------------------------------------------------------------- +# Kernel category patterns (matching TraceLens kernel_name_parser.py) +# Order matters: first match wins. 
# ---------------------------------------------------------------------------
# Kernel category patterns (matching TraceLens kernel_name_parser.py)
# Order matters: first match wins, so e.g. "implicit_gemm_conv" is CONV, not
# GEMM. The regex strings are runtime data and kept verbatim.
# ---------------------------------------------------------------------------
_CATEGORY_PATTERNS: List[Tuple[str, Any]] = [
    ("CONV", re.compile(r"conv|winograd|implicit_gemm_conv", re.IGNORECASE)),
    ("GEMM", re.compile(r"gemm|gemv|xdlops_gemm|Cijk_|rocblas_gemm", re.IGNORECASE)),
    (
        "SDPA",
        re.compile(
            r"flash_attention|fmha|scaled_dot_product|FlashAttention", re.IGNORECASE
        ),
    ),
    (
        "NCCL",
        re.compile(
            r"ncclKernel|rccl|AllReduce|AllGather|ReduceScatter|Broadcast",
            re.IGNORECASE,
        ),
    ),
    (
        "Elementwise",
        re.compile(
            r"vectorized_elementwise|aten_add|aten_mul|relu|gelu|silu", re.IGNORECASE
        ),
    ),
    (
        "Normalization",
        re.compile(r"layer_norm|batch_norm|group_norm|rms_norm", re.IGNORECASE),
    ),
    ("Reduction", re.compile(r"reduce|softmax|sum_|amax", re.IGNORECASE)),
]


def categorize_kernel_name(name: str) -> str:
    """Map a kernel name to a TraceLens op category.

    Returns one of: GEMM, CONV, SDPA, NCCL, Elementwise, Normalization,
    Reduction, Other. The first pattern that matches wins; unmatched names
    fall through to "Other".
    """
    return next(
        (label for label, rx in _CATEGORY_PATTERNS if rx.search(name)),
        "Other",
    )
+ """ + if not intervals: + return [] + sorted_ivs = sorted(intervals, key=lambda x: x[0]) + merged = [sorted_ivs[0]] + for start, end in sorted_ivs[1:]: + prev_start, prev_end = merged[-1] + if start <= prev_end: + merged[-1] = (prev_start, max(prev_end, end)) + else: + merged.append((start, end)) + return merged + + +def _total_ns(intervals: List[Tuple[int, int]]) -> int: + """Sum the duration of a list of non-overlapping intervals.""" + return sum(end - start for start, end in intervals) + + +def _subtract_intervals( + a: List[Tuple[int, int]], b: List[Tuple[int, int]] +) -> List[Tuple[int, int]]: + """Return intervals in *a* that do not overlap with any interval in *b*. + + Both inputs must already be merged (non-overlapping, sorted). + Implements set difference A โˆ’ B for interval sets. + """ + result = [] + b_idx = 0 + for a_start, a_end in a: + cur_start = a_start + while b_idx < len(b) and b[b_idx][1] <= cur_start: + b_idx += 1 + j = b_idx + while j < len(b) and b[j][0] < a_end: + b_start, b_end = b[j] + if cur_start < b_start: + result.append((cur_start, b_start)) + cur_start = max(cur_start, b_end) + j += 1 + if cur_start < a_end: + result.append((cur_start, a_end)) + return result + + +# --------------------------------------------------------------------------- +# Public analysis functions +# --------------------------------------------------------------------------- + + +def compute_interval_timeline(connection: RocpdImportData) -> Dict[str, Any]: + """Compute accurate GPU timeline using set-theoretic interval arithmetic. 
def compute_interval_timeline(connection: RocpdImportData) -> Dict[str, Any]:
    """Compute accurate GPU timeline using set-theoretic interval arithmetic.

    Unlike compute_time_breakdown() which sums raw durations and double-counts
    overlapping periods, this function uses merged interval sets to compute:
      - true_compute_ns: kernel time with overlaps removed
      - exposed_memcpy_ns: memcpy time NOT overlapping any kernel
      - idle_ns: wall time minus all GPU activity

    total_wall_ns is defined as MAX(end) - MIN(start) across the union of
    kernels and memory_copies - matching compute_time_breakdown()'s definition.

    Edge cases:
      - Empty kernels table -> true_compute_ns=0, true_compute_pct=0.0
      - Empty memory_copies -> exposed_memcpy_ns=0, exposed_memcpy_pct=0.0
      - total_wall_ns<=0 -> the all-zero result dict
    """

    def _load_intervals(table: str) -> List[Tuple[int, int]]:
        # Best-effort load: a missing table (e.g. older schema) yields [].
        # `table` is an internal constant, not user input, so the f-string
        # SQL is safe.
        try:
            rows = execute_statement(
                connection, f"SELECT start, end FROM {table}", ()
            ).fetchall()
        except Exception:
            return []
        return [
            (int(r[0]), int(r[1]))
            for r in rows
            if r[0] is not None and r[1] is not None
        ]

    kernel_intervals = _load_intervals("kernels")
    memcpy_intervals = _load_intervals("memory_copies")

    # Single shared empty result (previously duplicated inline twice).
    empty_result: Dict[str, Any] = {
        "total_wall_ns": 0,
        "true_compute_ns": 0,
        "true_compute_pct": 0.0,
        "exposed_memcpy_ns": 0,
        "exposed_memcpy_pct": 0.0,
        "idle_ns": 0,
        "idle_pct": 0.0,
    }

    # Compute wall time across union of both tables
    all_starts = [s for s, _ in kernel_intervals] + [s for s, _ in memcpy_intervals]
    all_ends = [e for _, e in kernel_intervals] + [e for _, e in memcpy_intervals]
    if not all_starts:
        return empty_result

    total_wall_ns = max(all_ends) - min(all_starts)
    if total_wall_ns <= 0:
        return empty_result

    # Merge intervals within each set
    merged_kernels = _merge_intervals(kernel_intervals)
    merged_memcpy = _merge_intervals(memcpy_intervals)

    # Compute metrics
    true_compute_ns = _total_ns(merged_kernels)
    exposed_memcpy = _subtract_intervals(merged_memcpy, merged_kernels)
    exposed_memcpy_ns = _total_ns(exposed_memcpy)

    # Idle = wall minus union of all activity
    all_activity = _merge_intervals(merged_kernels + merged_memcpy)
    active_ns = _total_ns(all_activity)
    idle_ns = max(0, total_wall_ns - active_ns)

    def _pct(v: int) -> float:
        return round(100.0 * v / total_wall_ns, 2)

    return {
        "total_wall_ns": total_wall_ns,
        "true_compute_ns": true_compute_ns,
        "true_compute_pct": _pct(true_compute_ns),
        "exposed_memcpy_ns": exposed_memcpy_ns,
        "exposed_memcpy_pct": _pct(exposed_memcpy_ns),
        "idle_ns": idle_ns,
        "idle_pct": _pct(idle_ns),
    }
+ + Edge cases: + - Empty kernels table โ†’ [] + - total_wall_ns==0 โ†’ pct_of_total_time=0.0 for all categories + """ + try: + rows = execute_statement( + connection, "SELECT name, duration FROM kernels", () + ).fetchall() + except Exception: + return [] + + if not rows: + return [] + + # Aggregate by category + cat_totals: Dict[str, Dict[str, Any]] = {} + total_kernel_ns = 0 + for name, duration in rows: + if name is None or duration is None: + continue + category = categorize_kernel_name(str(name)) + dur = int(duration) + total_kernel_ns += dur + if category not in cat_totals: + cat_totals[category] = {"count": 0, "total_ns": 0} + cat_totals[category]["count"] += 1 + cat_totals[category]["total_ns"] += dur + + if not cat_totals: + return [] + + result = [] + for category, data in cat_totals.items(): + count = data["count"] + total_ns = data["total_ns"] + avg_ns = total_ns // count if count > 0 else 0 + pct_kernel = ( + round(100.0 * total_ns / total_kernel_ns, 2) if total_kernel_ns > 0 else 0.0 + ) + pct_wall = ( + round(100.0 * total_ns / total_wall_ns, 2) if total_wall_ns > 0 else 0.0 + ) + result.append( + { + "category": category, + "count": count, + "total_ns": total_ns, + "pct_of_kernel_time": pct_kernel, + "avg_duration_ns": avg_ns, + "pct_of_total_time": pct_wall, + } + ) + + return sorted(result, key=lambda x: x["total_ns"], reverse=True) + + +def analyze_short_kernels( + connection: RocpdImportData, + threshold_us: float = 10.0, +) -> Dict[str, Any]: + """Identify kernels below threshold_us microseconds (TraceLens short-kernel analysis). + + threshold_us defaults to 10ฮผs and is not configurable via CLI in Phase 1. 
+ + Edge cases: + - No kernels below threshold โ†’ short_kernel_count=0, histogram=[], top_offenders=[] + - Empty kernels table โ†’ same as above + - total_kernel_time==0 โ†’ wasted_pct_of_kernel_time=0.0 + """ + threshold_ns = int(threshold_us * 1_000) + + try: + all_rows = execute_statement( + connection, "SELECT name, duration FROM kernels", () + ).fetchall() + except Exception: + all_rows = [] + + total_kernels = len(all_rows) + total_kernel_ns = sum(int(r[1]) for r in all_rows if r[1] is not None) + + # Filter short kernels + short_rows = [ + (str(r[0]), int(r[1])) + for r in all_rows + if r[0] is not None and r[1] is not None and int(r[1]) < threshold_ns + ] + + short_count = len(short_rows) + wasted_ns = sum(d for _, d in short_rows) + short_pct = ( + round(100.0 * short_count / total_kernels, 2) if total_kernels > 0 else 0.0 + ) + wasted_pct = ( + round(100.0 * wasted_ns / total_kernel_ns, 2) if total_kernel_ns > 0 else 0.0 + ) + + # Histogram buckets (matching TraceLens short kernel histogram) + buckets = [ + ("0-1ฮผs", 0, 1_000), + ("1-5ฮผs", 1_000, 5_000), + (f"5-{int(threshold_us)}ฮผs", 5_000, threshold_ns), + ] + histogram = [ + {"bucket_label": label, "count": sum(1 for _, d in short_rows if lo <= d < hi)} + for label, lo, hi in buckets + if any(lo <= d < hi for _, d in short_rows) + ] + + # Top offenders by total wasted time + offender_map: Dict[str, Dict[str, Any]] = {} + for name, dur in short_rows: + if name not in offender_map: + offender_map[name] = {"count": 0, "total_wasted_ns": 0} + offender_map[name]["count"] += 1 + offender_map[name]["total_wasted_ns"] += dur + + top_offenders = sorted( + [ + { + "name": name, + "count": data["count"], + "avg_us": round(data["total_wasted_ns"] / data["count"] / 1_000, 3), + "total_wasted_ns": data["total_wasted_ns"], + } + for name, data in offender_map.items() + ], + key=lambda x: x["total_wasted_ns"], + reverse=True, + )[:10] + + return { + "threshold_us": threshold_us, + "total_kernels": total_kernels, + 
"short_kernel_count": short_count, + "short_kernel_pct": short_pct, + "wasted_ns": wasted_ns, + "wasted_pct_of_kernel_time": wasted_pct, + "histogram": histogram, + "top_offenders": top_offenders, + } diff --git a/projects/rocprofiler-sdk/source/lib/python/utilities.cmake b/projects/rocprofiler-sdk/source/lib/python/utilities.cmake index 8f51fe971b9..4d9486297c6 100644 --- a/projects/rocprofiler-sdk/source/lib/python/utilities.cmake +++ b/projects/rocprofiler-sdk/source/lib/python/utilities.cmake @@ -178,7 +178,9 @@ function(rocprofiler_rocpd_python_bindings _VERSION) query.py schema.py summary.py - time_window.py) + time_window.py + tracelens_port.py + analyze.py) foreach(_SOURCE ${rocpd_PYTHON_SOURCES}) configure_file(${CMAKE_CURRENT_LIST_DIR}/${_SOURCE} @@ -189,6 +191,33 @@ function(rocprofiler_rocpd_python_bindings _VERSION) COMPONENT rocpd) endforeach() + # Copy ai_analysis directory and its contents (including subdirectories). Includes + # *.py modules, *.md docs, *.json schema files, and *.png assets (e.g. + # ai_analysis/share/amd_rocm_logo.png used by interactive.py banner). Excludes + # ai_analysis/tests/ โ€” test-only files should not be installed into site-packages as + # they are not runtime assets and can cause import side-effects. + file( + GLOB_RECURSE + rocpd_AI_ANALYSIS_FILES + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.py" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.md" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.json" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.png") + list(FILTER rocpd_AI_ANALYSIS_FILES EXCLUDE REGEX + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/tests/.*") + + foreach(_AI_FILE ${rocpd_AI_ANALYSIS_FILES}) + file(RELATIVE_PATH _REL_PATH "${CMAKE_CURRENT_LIST_DIR}" "${_AI_FILE}") + get_filename_component(_REL_DIR "${_REL_PATH}" DIRECTORY) + # Use file(COPY) instead of configure_file so binary assets (e.g. PNG) are handled + # correctly without text substitution or EPERM on binary data. 
+ file(COPY ${_AI_FILE} DESTINATION ${rocpd_PYTHON_OUTPUT_DIRECTORY}/${_REL_DIR}) + install( + FILES ${rocpd_PYTHON_OUTPUT_DIRECTORY}/${_REL_PATH} + DESTINATION ${rocpd_PYTHON_INSTALL_DIRECTORY}/${_REL_DIR} + COMPONENT rocpd) + endforeach() + add_library(rocprofiler-sdk-rocpd-python-bindings-${_VERSION} MODULE) target_sources( rocprofiler-sdk-rocpd-python-bindings-${_VERSION} diff --git a/projects/rocprofiler-sdk/source/scripts/format-deps.py b/projects/rocprofiler-sdk/source/scripts/format-deps.py index 98af5bb2390..b6420f9f451 100755 --- a/projects/rocprofiler-sdk/source/scripts/format-deps.py +++ b/projects/rocprofiler-sdk/source/scripts/format-deps.py @@ -25,7 +25,6 @@ import argparse import os -import sys class FormatSource(argparse.Action): @@ -118,6 +117,10 @@ def __call__(self, parser, namespace, values, option_string=None): "-p", "--python", nargs=0, help="format python files", action=FormatPython ) parser.add_argument( - "-a", "--all", nargs=0, help="format cmake, source and python files", action=FormatAll + "-a", + "--all", + nargs=0, + help="format cmake, source and python files", + action=FormatAll, ) parser.parse_args() diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py index 7a3be745bad..b7c7ff2d5b7 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py @@ -349,10 +349,10 @@ def extract_tp_data(self, **kwargs): counter_track.name as track_name, ROW_NUMBER() OVER window AS rn FROM counter JOIN counter_track ON counter.track_id = counter_track.id - WHERE counter_track.name LIKE '%SCRATCH MEMORY%' + WHERE counter_track.name LIKE '%SCRATCH MEMORY%' WINDOW window AS (PARTITION BY counter.value, track_id ORDER BY counter.ts) ) - SELECT + SELECT slice_id, track_id, 'scratch_memory' as category, diff --git 
a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
index ef10bb7720f..4372dc1a963 100644
--- a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
+++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
@@ -18,6 +18,27 @@ set(rocprofv3-rocpd-env
 find_package(MPI)
 find_package(Python3 REQUIRED)
 
+# Helper: copy a test script to the build directory at build time (not just at configure
+# time). Unlike configure_file(COPYONLY), this registers a proper build-time dependency
+# so that editing the source file and re-running cmake --build is sufficient to pick up
+# the change without re-running cmake.
+#
+# Each call creates an add_custom_command (triggered by the file dependency) and a
+# lightweight ALL custom target that forces it to run on every build.
+# NOTE(review): the sentence above overstates this - copy_if_different behind an
+# add_custom_command re-runs only when ${_SRC} is newer than ${_DST}; the ALL
+# target merely makes the rule part of the default build. Confirm intended wording.
+function(rocpd_stage_test_script _SRC _DST)
+    # NOTE(review): NAME_WE-derived target names will collide if two staged
+    # destinations ever share a basename - verify uniqueness as call sites grow.
+    get_filename_component(_tgt "${_DST}" NAME_WE)
+    add_custom_command(
+        OUTPUT "${_DST}"
+        COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${_SRC}" "${_DST}"
+        DEPENDS "${_SRC}"
+        COMMENT "Staging ${_SRC} -> build dir")
+    add_custom_target(rocpd-stage-${_tgt} ALL DEPENDS "${_DST}")
+    # NOTE(review): CMAKE_CONFIGURE_DEPENDS forces a cmake re-run whenever ${_SRC}
+    # changes, which contradicts the "without re-running cmake" rationale above;
+    # presumably kept as a safety net - confirm whether it can be dropped.
+    set_property(
+        DIRECTORY
+        APPEND
+        PROPERTY CMAKE_CONFIGURE_DEPENDS "${_SRC}")
+endfunction()
+
 if(MPI_FOUND)
     set(MULTIPROC_IS_DISABLED OFF)
     set(MULTIPROC_LAUNCHER ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2
@@ -382,3 +403,294 @@
     FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}"
     DISABLED "${MULTIPROC_IS_DISABLED}"
     FIXTURES_REQUIRED rocprofv3-test-rocpd-merge-generation-using-package-multiproc)
+
+#########################################################################################
+#
+# AI analysis module tests
+#
+#########################################################################################
+
+# Test the analyze --help flag works
+rocprofiler_add_integration_execute_test(
+    rocprofv3-test-rocpd-analyze-help
+    COMMAND
${Python3_EXECUTABLE} -m rocpd analyze --help + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Test standalone module help +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-module-analyze-help + COMMAND ${Python3_EXECUTABLE} -m rocpd.analyze --help + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Test analyze on existing database +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze + COMMAND ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --top-kernels 5 + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with JSON output format (also a fixture for schema validation test) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-json + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --format json -o + analysis_results -d ${CMAKE_CURRENT_BINARY_DIR}/rocpd-analyze-output + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_SETUP rocprofv3-test-rocpd-analyze-json + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with custom top-kernels +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-top-kernels + COMMAND ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --top-kernels 20 + DEPENDS 
rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with custom prompt (no LLM, just metadata) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-custom-prompt + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --prompt + "Why is this kernel slow?" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze on multiproc database +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-multiproc + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data-multiproc/out_mp_0_results.db + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data-multiproc/out_mp_1_results.db + --top-kernels 5 + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED "${MULTIPROC_IS_DISABLED}" + FIXTURES_REQUIRED rocprofv3-test-rocpd-multiproc) + +# Test AI analysis Python API import +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-ai-analysis-api + COMMAND + ${Python3_EXECUTABLE} -c + "from rocpd.ai_analysis import analyze_database; print('AI analysis API imported successfully')" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_analyze.py + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_standalone.py) + +# Test analyze unit tests (run from build 
dir to avoid conftest.py issues) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-unit-tests + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# JSON schema tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_analyze_schema.py + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_schema_standalone.py) + +# Unit schema tests: validate the schema file structure and synthetic JSON output. No +# database fixture needed - uses synthetic data generated in-process. +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-schema-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_schema_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Integration schema validation: parse the real JSON output produced by the analyze-json +# test and assert schema_version, required fields, and commands structure. 
+rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-schema-validate + COMMAND + ${Python3_EXECUTABLE} -c + "import json, pkgutil; \ + path = '${CMAKE_CURRENT_BINARY_DIR}/rocpd-analyze-output/analysis_results.json'; \ + d = json.load(open(path)); \ + schema = json.loads(pkgutil.get_data('rocpd.ai_analysis', 'docs/analysis-output.schema.json')); \ + allowed = schema['properties']['schema_version']['enum']; \ + assert d.get('schema_version') in allowed, 'Bad schema_version ' + repr(d.get('schema_version')) + ', expected one of ' + repr(allowed); \ + assert all(k in d for k in ('metadata','recommendations','hardware_counters','hotspots')); \ + cmds = [c for r in d['recommendations'] for c in r.get('commands', [])]; \ + assert len(cmds) > 0, 'No commands found in recommendations'; \ + assert all(c['tool'] in ('rocprofv3','rocprof-sys','rocprof-compute') for c in cmds); \ + print('Schema OK: version=' + str(d['schema_version']) + ', recs=' + str(len(d['recommendations'])) + ', commands=' + str(len(cmds)))" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd-analyze-json) + +######################################################################################### +# +# AI analysis API unit tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_ai_analysis_standalone.py + ${CMAKE_CURRENT_BINARY_DIR}/test_ai_analysis_standalone.py) + +# AI analysis API unit tests (run from build dir to avoid conftest.py issues) These tests +# do not require a GPU trace fixture โ€” they test the Python API in isolation. 
+rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-ai-analysis-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_ai_analysis_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# LLM guide context-aware filtering unit tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_guide_filter_standalone.py + ${CMAKE_CURRENT_BINARY_DIR}/test_guide_filter_standalone.py) + +# Guide filtering unit tests โ€” no GPU trace fixture required; no LLM API calls made. +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-guide-filter-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_guide_filter_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# AI analysis module sub-package unit tests +# Source: source/lib/python/rocpd/ai_analysis/tests/ +# Tests are mock-based; no GPU trace fixture or LLM API key required. +# Optional-provider tests (anthropic/openai) are skipped when packages are absent. 
+# +######################################################################################### + +set(_AI_ANALYSIS_TESTS_DIR + "${CMAKE_CURRENT_SOURCE_DIR}/../../../source/lib/python/rocpd/ai_analysis/tests") + +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_api_standalone.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_api_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_llm_conversation.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_llm_conversation_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_local_llm.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_local_llm_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_interactive.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_interactive_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_workflow.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_workflow_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_tracelens_port.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_tracelens_port_standalone.py") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-api-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_api_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-llm-conversation-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_llm_conversation_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-local-llm-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + 
${CMAKE_CURRENT_BINARY_DIR}/test_local_llm_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-interactive-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_interactive_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-workflow-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_workflow_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-tracelens-port-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_tracelens_port_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py new file mode 100644 index 00000000000..521f1bbd2ad --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+###############################################################################
+
+"""
+Standalone unit tests for the rocpd ai_analysis module.
+
+These tests do NOT require a real GPU trace database.
+They DO require the rocpd package to be importable (needs the built libpyrocpd
+C extension). Run with the system-installed rocpd path first, then the source
+path for the edited Python modules:
+
+    ROCPD_SYS=$(python3 -c "import site; print(site.getsitepackages()[-1])")
+    ROCPD_SRC=/projects/rocprofiler-sdk/source/lib/python
+    PYTHONPATH="${ROCPD_SYS}:${ROCPD_SRC}" pytest --noconftest test_ai_analysis_standalone.py -v
+
+IMPORTANT: ROCPD_SYS must come BEFORE ROCPD_SRC in PYTHONPATH to avoid a
+circular import of libpyrocpd.
+"""
+
+import json
+# NOTE(review): 'sys' appears unused in this chunk - confirm against the rest
+# of the file before removing.
+import sys
+from pathlib import Path
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Helpers: build a minimal AnalysisResult without touching a real DB
+# ---------------------------------------------------------------------------
+
+
+def _make_minimal_result():
+    """Build an AnalysisResult with empty/zero payloads for serialization tests.
+
+    The numeric fixture values are deliberately simple (1_000_000 ns wall time,
+    80% kernel time) so later assertions can reference them exactly; no
+    database connection is opened. Imports are deferred into the function so
+    collection does not require the rocpd package.
+    """
+    from rocpd.ai_analysis.api import (
+        AnalysisResult,
+        AnalysisMetadata,
+        ProfilingInfo,
+        AnalysisSummary,
+        ExecutionBreakdown,
+        RecommendationSet,
+    )
+
+    result = AnalysisResult(
+        metadata=AnalysisMetadata(
+            rocpd_version="6.3.0",
+            database_file="test.db",
+            analysis_timestamp="2025-01-01T00:00:00",
+        ),
+        profiling_info=ProfilingInfo(
+            total_duration_ns=1_000_000,
+            profiling_mode="sys_trace_only",
+            analysis_tier=1,
+        ),
+        summary=AnalysisSummary(
+            overall_assessment="Test analysis",
+            primary_bottleneck="unknown",
+            confidence=0.5,
+            key_findings=["Kernel time: 80.0%"],
+        ),
+        execution_breakdown=ExecutionBreakdown(
+            kernel_time_ns=800_000,
+            kernel_time_pct=80.0,
+            memcpy_time_ns=0,
+            memcpy_time_pct=0.0,
+        ),
+        recommendations=RecommendationSet(),
+    )
+    return result
+
+
+def _attach_raw(
+    result,
+    *,
time_breakdown=None, + hotspots=None, + memory_analysis=None, + recommendations_raw=None, + hardware_counters=None, + database_path="test.db", +): + """Attach a _raw dict to an AnalysisResult for to_json()/to_webview() tests.""" + result._raw = { + "time_breakdown": time_breakdown + or { + "total_kernel_time": 800_000, + "total_memcpy_time": 0, + "total_runtime": 1_000_000, + "kernel_percent": 80.0, + "memcpy_percent": 0.0, + "overhead_percent": 20.0, + }, + "hotspots": hotspots + or [ + { + "name": "test_kernel", + "calls": 10, + "total_duration": 800_000, + "avg_duration": 80_000, + "min_duration": 75_000, + "max_duration": 90_000, + "percent_of_total": 80.0, + } + ], + "memory_analysis": memory_analysis or {}, + "recommendations_raw": recommendations_raw or [], + "hardware_counters": hardware_counters or {"has_counters": False}, + "database_path": database_path, + } + return result + + +# =========================================================================== +# Tests: OutputFormat enum (AIA-003) +# =========================================================================== + + +class TestOutputFormat: + def test_has_python_object(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.PYTHON_OBJECT.value == "python_object" + + def test_has_json(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.JSON.value == "json" + + def test_has_text(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.TEXT.value == "text" + + def test_has_markdown(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.MARKDOWN.value == "markdown" + + def test_has_webview(self): + """AIA-003: WEBVIEW must be present in OutputFormat.""" + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.WEBVIEW.value == "webview" + + def test_five_members(self): + from rocpd.ai_analysis.api import OutputFormat + + assert len(list(OutputFormat)) == 5 + + +# 
=========================================================================== +# Tests: Exceptions (AIA-008, AIA-010, AIA-011) +# =========================================================================== + + +class TestExceptions: + def test_missing_data_error_optional_list(self): + """AIA-010: missing_tables should be Optional[List[str]].""" + from rocpd.ai_analysis.exceptions import MissingDataError + + # Both None and a list should work + err_no_list = MissingDataError("msg") + assert err_no_list.missing_tables == [] + err_with_list = MissingDataError("msg", ["kernels"]) + assert err_with_list.missing_tables == ["kernels"] + + def test_unsupported_gpu_error_optional_str(self): + """AIA-010: gpu_arch should be Optional[str].""" + from rocpd.ai_analysis.exceptions import UnsupportedGPUError + + err_no_arch = UnsupportedGPUError("msg") + assert err_no_arch.gpu_arch is None + err_with_arch = UnsupportedGPUError("msg", "gfx906") + assert err_with_arch.gpu_arch == "gfx906" + + def test_reference_guide_not_found_shows_all_paths(self): + """AIA-008: ReferenceGuideNotFoundError must list all attempted paths.""" + from rocpd.ai_analysis.exceptions import ReferenceGuideNotFoundError + + paths = ["/path/one/guide.md", "/path/two/guide.md", "/path/three/guide.md"] + err = ReferenceGuideNotFoundError(paths) + msg = str(err) + for p in paths: + assert p in msg, f"Path '{p}' not found in error message" + assert err.attempted_paths == paths + + def test_reference_guide_exported_from_init(self): + """AIA-011: ReferenceGuideNotFoundError must be importable from rocpd.ai_analysis.""" + from rocpd.ai_analysis import ReferenceGuideNotFoundError + + assert ReferenceGuideNotFoundError is not None + + def test_all_exceptions_exported(self): + """Verify all documented exceptions are accessible from the public API.""" + import rocpd.ai_analysis as m + + for name in [ + "AnalysisError", + "DatabaseNotFoundError", + "DatabaseCorruptedError", + "MissingDataError", + "UnsupportedGPUError", + 
"LLMAuthenticationError", + "LLMRateLimitError", + "ReferenceGuideNotFoundError", + ]: + assert hasattr(m, name), f"{name} not exported from rocpd.ai_analysis" + + +# =========================================================================== +# Tests: validate_database (AIA-013) +# =========================================================================== + + +class TestValidateDatabase: + def test_raises_for_missing_file(self): + """validate_database() must raise DatabaseNotFoundError for missing file.""" + from rocpd.ai_analysis import validate_database, DatabaseNotFoundError + + with pytest.raises(DatabaseNotFoundError): + validate_database(Path("/nonexistent/path/to/trace.db")) + + +# =========================================================================== +# Tests: AnalysisResult serialization (AIA-004) +# =========================================================================== + + +class TestAnalysisResultSerialization: + def test_to_dict_returns_dict(self): + result = _make_minimal_result() + d = result.to_dict() + assert isinstance(d, dict) + assert "metadata" in d + assert "recommendations" in d + + def test_to_json_without_raw_raises_runtime_error(self): + """to_json() without _raw must raise RuntimeError (not silently produce non-schema JSON).""" + import pytest + + result = _make_minimal_result() + # No _raw attached โ†’ must raise so callers know output would be non-schema-conformant + with pytest.raises(RuntimeError, match="Raw analysis data not available"): + result.to_json() + + def test_to_json_with_raw_returns_schema_conformant_json(self): + """AIA-004: to_json() with _raw must include schema_version.""" + result = _attach_raw(_make_minimal_result()) + j = result.to_json() + parsed = json.loads(j) + # schema-conformant output includes schema_version + assert "schema_version" in parsed, "JSON output missing schema_version field" + assert parsed["schema_version"] == "0.1.0" + + def test_to_webview_raises_without_raw(self): + """to_webview() 
must raise RuntimeError if _raw is not attached.""" + result = _make_minimal_result() + with pytest.raises(RuntimeError, match="analyze_database"): + result.to_webview() + + def test_to_webview_with_raw_returns_html(self): + """AIA-004: to_webview() with _raw must return HTML string.""" + result = _attach_raw(_make_minimal_result()) + html = result.to_webview() + assert isinstance(html, str) + assert " 1000 # must be a real HTML document + + +# =========================================================================== +# Tests: _convert_result_to_llm_format (AIA-006) +# =========================================================================== + + +class TestConvertResultToLlmFormat: + def test_returns_real_kernel_data(self): + """AIA-006: kernels list must not be empty when hotspots are present.""" + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _attach_raw( + _make_minimal_result(), + hotspots=[ + { + "name": "conv2d", + "calls": 5, + "total_duration": 500_000, + "avg_duration": 100_000, + "percent_of_total": 50.0, + } + ], + ) + llm_data = _convert_result_to_llm_format(result) + assert len(llm_data["kernels"]) == 1 + assert llm_data["kernels"][0]["name"] == "conv2d" + + def test_returns_empty_kernels_without_raw(self): + """Without _raw, kernels defaults to empty list (graceful degradation).""" + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _make_minimal_result() + llm_data = _convert_result_to_llm_format(result) + assert llm_data["kernels"] == [] + + def test_has_execution_breakdown(self): + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _make_minimal_result() + llm_data = _convert_result_to_llm_format(result) + assert "execution_breakdown" in llm_data + assert "kernel_time_pct" in llm_data["execution_breakdown"] + + +# =========================================================================== +# Tests: _build_analysis_result key mapping (AIA-002) +# 
=========================================================================== + + +class TestBuildAnalysisResultKeyMapping: + """Verify that recommendation keys from generate_recommendations() are mapped correctly.""" + + def _make_raw_rec(self, priority="HIGH"): + return { + "priority": priority, + "category": "Low Occupancy", + "issue": "Average wave occupancy is very low", + "suggestion": "Increase occupancy by reducing VGPR usage", + "estimated_impact": "15-20% performance improvement", + "actions": ["Compile with -O3", "Reduce local arrays"], + "commands": [], + } + + def test_high_priority_bucketing(self): + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("HIGH")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + custom_prompt=None, + ) + assert len(result.recommendations.high_priority) == 1 + rec = result.recommendations.high_priority[0] + assert rec.title == "Average wave occupancy is very low" + assert rec.description == "Increase occupancy by reducing VGPR usage" + assert rec.estimated_impact == "15-20% performance improvement" + assert rec.next_steps == ["Compile with -O3", "Reduce local arrays"] + assert rec.priority == "high" # normalized to lowercase + + def test_medium_priority_bucketing(self): + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("MEDIUM")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + 
custom_prompt=None, + ) + assert len(result.recommendations.medium_priority) == 1 + + def test_info_bucketed_as_medium(self): + """INFO priority should be placed in medium_priority bucket.""" + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("INFO")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + custom_prompt=None, + ) + assert len(result.recommendations.medium_priority) == 1 + + +# =========================================================================== +# Tests: Bug-fix regression tests (Tasks 1-4) +# =========================================================================== + + +class TestBugFixes: + """ + Regression tests covering security, correctness, and LLM-layer bug fixes + from code review Tasks 1-4. Each test is tagged with the fix ID it covers. 
+ """ + + # ------------------------------------------------------------------ + # C-1: shlex.quote in full_command + # ------------------------------------------------------------------ + + def test_kernel_name_shell_quoted_in_full_command(self): + """C-1: full_command strings must use shlex.quote() for kernel names with shell metacharacters.""" + import shlex + from rocpd.analyze import generate_recommendations + + dangerous_name = "kernel'; rm -rf / #" + hotspots = [ + { + "name": dangerous_name, + "percent_of_total": 60.0, + "calls": 100, + "avg_duration": 100_000, + } + ] + time_breakdown = { + "kernel_percent": 70, + "memcpy_percent": 5, + "overhead_percent": 5, + "total_kernel_time": 1_000_000, + "total_runtime": 1_500_000, + } + recs = generate_recommendations(time_breakdown, hotspots, {}, []) + compute_recs = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert compute_recs, "Expected a compute bottleneck recommendation" + + quoted_name = shlex.quote(dangerous_name) + # The kernel name is scoped via rocprof-compute (rocprofv3 collects general + # PMC counters without kernel filtering, so the name only appears in the + # rocprof-compute command where shlex.quote is applied). 
+ kernel_cmds = [ + cmd + for cmd in compute_recs[0]["commands"] + if cmd.get("tool") == "rocprof-compute" + ] + assert kernel_cmds, "Expected at least one rocprof-compute command" + for cmd in kernel_cmds: + full = cmd["full_command"] + # The properly shell-quoted form of the kernel name must appear + assert quoted_name in full, ( + f"Expected shlex.quote({dangerous_name!r}) == {quoted_name!r} " + f"in full_command, got: {full}" + ) + # The raw (unquoted) name must not appear verbatim (i.e., not word-split) + assert f" {dangerous_name} " not in full and not full.endswith( + f" {dangerous_name}" + ), f"Raw unquoted kernel name found in full_command: {full}" + + # ------------------------------------------------------------------ + # C-6: overhead_percent clamped at zero + # ------------------------------------------------------------------ + + def test_overhead_percent_clamped_at_zero(self): + """C-6: overhead_percent must never be negative even when kernel+memcpy > total.""" + from unittest.mock import patch, MagicMock + from rocpd.analyze import compute_time_breakdown + + # Simulate a result row where overhead would come out negative: + # total_kernel=900, total_memcpy=200, total_runtime=1000 → overhead=-10% + mock_result = (900, 200, 1000, 90.0, 20.0, -10.0) + mock_conn = MagicMock() + with patch("rocpd.analyze.execute_statement") as mock_exec: + mock_exec.return_value.fetchone.return_value = mock_result + result = compute_time_breakdown(mock_conn) + + assert ( + result["overhead_percent"] == 0.0 + ), f"Expected 0.0, got {result['overhead_percent']}" + assert result["kernel_percent"] == 90.0 + assert result["memcpy_percent"] == 20.0 + + # ------------------------------------------------------------------ + # C-7: Tier 0 webview XSS escaping + # ------------------------------------------------------------------ + + def test_tier0_webview_script_tag_escaped(self): + """C-7: <script> in tier0 JSON payload must be escaped to prevent XSS.""" + from datetime import datetime 
+ from rocpd.analyze import _format_tier0_webview + from rocpd.ai_analysis.api import SourceAnalysisResult + + result = SourceAnalysisResult( + source_dir="/tmp/test", + analysis_timestamp=datetime.now().isoformat(), + programming_model="HIP", + files_scanned=1, + files_skipped=0, + detected_kernels=[], + kernel_count=0, + detected_patterns=[], + risk_areas=[], + already_instrumented=False, + roctx_marker_count=0, + recommendations=[], + suggested_counters=[], + suggested_first_command="rocprofv3 --sys-trace -- ./app", + llm_explanation="Normal text <script>alert('xss')</script> more text", + ) + + html = _format_tier0_webview(result) + # The unescaped <script> tag must not appear verbatim in the HTML + assert "<script>alert" not in html, "<script> not escaped in tier0 webview payload" + + # ------------------------------------------------------------------ + # I-1: Bottleneck classification not mislead by has_counters alone + # ------------------------------------------------------------------ + + def test_bottleneck_classification_not_mislead_by_counters(self): + """I-1: has_counters=True alone should not produce 'compute' bottleneck.""" + from pathlib import Path + from rocpd.ai_analysis.api import _build_analysis_result + + # Balanced breakdown — kernel% is only 40%, well below the 70% threshold + time_breakdown = { + "kernel_percent": 40.0, + "memcpy_percent": 15.0, + "overhead_percent": 10.0, + "total_kernel_time": 400_000, + "total_memcpy_time": 150_000, + "total_runtime": 1_000_000, + } + hardware_counters = {"has_counters": True} + + result = _build_analysis_result( + time_breakdown=time_breakdown, + hotspots=[{"name": "k1", "percent_of_total": 40.0}], + memory_analysis={}, + recommendations=[], + hardware_counters=hardware_counters, + database_path=Path("/tmp/fake.db"), + custom_prompt=None, + ) + + assert ( + result.summary.primary_bottleneck == "mixed" + ), f"Expected 'mixed' bottleneck, got {result.summary.primary_bottleneck!r}" + + # ------------------------------------------------------------------ + # I-3: AnalysisContext(tier=0) passed to LLM in analyze_source() + # 
------------------------------------------------------------------ + + def test_analyze_source_passes_analysis_context_to_llm(self, tmp_path): + """I-3: analyze_source() must pass AnalysisContext(tier=0) to analyze_source_with_llm.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.api import analyze_source + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + # Create a minimal hip file so SourceAnalyzer has something to scan + (tmp_path / "test.hip").write_text("__global__ void myKernel() {}") + + mock_analyzer = MagicMock() + mock_analyzer.analyze_source_with_llm.return_value = "LLM result" + + with patch("rocpd.ai_analysis.api.LLMAnalyzer", return_value=mock_analyzer): + analyze_source( + tmp_path, enable_llm=True, llm_provider="anthropic", llm_api_key="fake" + ) + + assert ( + mock_analyzer.analyze_source_with_llm.called + ), "analyze_source_with_llm was not called" + call_kwargs = mock_analyzer.analyze_source_with_llm.call_args + # Accept both positional and keyword arg style + kwargs = call_kwargs[1] if call_kwargs[1] else {} + context = kwargs.get("context") + if context is None and call_kwargs[0]: + # Unlikely but check positional args too + for arg in call_kwargs[0]: + if isinstance(arg, AnalysisContext): + context = arg + break + + assert ( + context is not None + ), "context= argument not passed to analyze_source_with_llm" + assert isinstance( + context, AnalysisContext + ), f"Expected AnalysisContext, got {type(context)}" + assert context.tier == 0, f"Expected tier=0, got {context.tier}" + + # ------------------------------------------------------------------ + # I-4: LLMAnalyzer construction without API key does not raise + # ------------------------------------------------------------------ + + def test_llm_analyzer_construction_without_api_key_does_not_raise(self): + """I-4: LLMAnalyzer() must not raise LLMAuthenticationError at construction time.""" + import os + from unittest.mock import patch + from 
rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from rocpd.ai_analysis.exceptions import LLMAuthenticationError + + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("ANTHROPIC_API_KEY", None) + try: + LLMAnalyzer(provider="anthropic") + except LLMAuthenticationError: + pytest.fail( + "LLMAnalyzer raised LLMAuthenticationError at construction time; " + "authentication should be deferred until the first API call" + ) + + # ------------------------------------------------------------------ + # I-5: self.model honored in LLMAnalyzer + # ------------------------------------------------------------------ + + def test_llm_analyzer_model_parameter_honored(self): + """I-5: LLMAnalyzer(model='my-model') must use that model in the API call.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + custom_model = "claude-haiku-4-5-20251001" + analyzer = LLMAnalyzer( + provider="anthropic", api_key="sk-test", model=custom_model + ) + + mock_client = MagicMock() + mock_client.messages.create.return_value = MagicMock( + content=[MagicMock(text="ok")] + ) + + with patch("anthropic.Anthropic", return_value=mock_client): + analyzer._call_anthropic("sys", "user") + + assert mock_client.messages.create.called, "messages.create was not called" + used_model = mock_client.messages.create.call_args[1].get("model") + assert ( + used_model == custom_model + ), f"Expected model {custom_model!r}, got {used_model!r}" + + # ------------------------------------------------------------------ + # P-2: Timeout added to LLM calls + # ------------------------------------------------------------------ + + def test_llm_calls_have_timeout(self): + """P-2: All Anthropic LLM API calls must include a timeout parameter.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer(provider="anthropic", api_key="sk-test") + + mock_client = MagicMock() + 
mock_client.messages.create.return_value = MagicMock( + content=[MagicMock(text="ok")] + ) + + with patch("anthropic.Anthropic", return_value=mock_client): + analyzer._call_anthropic("sys", "user") + + call_kwargs = mock_client.messages.create.call_args[1] + assert ( + "timeout" in call_kwargs + ), "timeout parameter missing from Anthropic API call" + assert ( + call_kwargs["timeout"] == 120 + ), f"Expected timeout=120, got {call_kwargs['timeout']}" + + # ------------------------------------------------------------------ + # I-12: analyze_source_code raises on missing source_dir + # ------------------------------------------------------------------ + + def test_analyze_source_code_raises_on_missing_dir(self): + """I-12: analyze_source_code() must raise SourceDirectoryNotFoundError for non-existent dir.""" + from rocpd.analyze import analyze_source_code + from rocpd.ai_analysis.exceptions import SourceDirectoryNotFoundError + + with pytest.raises(SourceDirectoryNotFoundError): + analyze_source_code(source_dir="/nonexistent/path/xyz_no_exist_123") + + # ------------------------------------------------------------------ + # I-9: ReferenceGuideNotFoundError with list not string + # ------------------------------------------------------------------ + + def test_reference_guide_not_found_error_with_list(self): + """I-9: ReferenceGuideNotFoundError must accept List[str] and produce readable message.""" + from rocpd.ai_analysis.exceptions import ReferenceGuideNotFoundError + + paths = [ + "share/rocprofiler-sdk/llm-reference-guide.md", + "~/.config/rocpd/guide.md", + ] + err = ReferenceGuideNotFoundError(paths) + msg = str(err) + + # Both paths should appear intact in the error message + assert ( + "share/rocprofiler-sdk/llm-reference-guide.md" in msg + ), f"First path missing from error message: {msg}" + assert ( + "~/.config/rocpd/guide.md" in msg + ), f"Second path missing from error message: {msg}" + # Guard against the old bug where a bare string was iterated 
char-by-char + assert ( + "o\n - p" not in msg + ), "Characters are being joined โ€” bare string was passed instead of list" + + # ------------------------------------------------------------------ + # M-8: Source scanner truncation warning + # ------------------------------------------------------------------ + + def test_source_scanner_truncation_warning(self, tmp_path): + """M-8: SourceAnalyzer must add a risk_area warning when _MAX_FILES limit is hit.""" + from rocpd.ai_analysis.source_analyzer import SourceAnalyzer, _MAX_FILES + + # Create more files than _MAX_FILES (use .hip extension so they are scanned) + for i in range(_MAX_FILES + 5): + (tmp_path / f"kernel_{i}.hip").write_text(f"__global__ void k{i}() {{}}") + + scanner = SourceAnalyzer(tmp_path) + plan = scanner.analyze() + + truncation_warnings = [ + r for r in plan.risk_areas if "truncat" in r.lower() or "limit" in r.lower() + ] + assert ( + truncation_warnings + ), f"Expected a truncation warning in risk_areas, got: {plan.risk_areas}" + + +# =========================================================================== +# Tests: Extended thinking / --llm-thinking flag (Task 22) +# =========================================================================== + + +class TestLLMThinking: + """Tests for extended thinking support via thinking_budget_tokens.""" + + def test_llm_thinking_parameter_stored(self): + """thinking_budget_tokens passed to __init__ must be stored on the instance.""" + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer(provider="anthropic", thinking_budget_tokens=8000) + assert ( + analyzer.thinking_budget_tokens == 8000 + ), f"Expected thinking_budget_tokens=8000, got {analyzer.thinking_budget_tokens!r}" + + def test_llm_thinking_defaults_to_none(self): + """When thinking_budget_tokens is not supplied, the attribute must be None.""" + import os + from unittest.mock import patch + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + # Ensure env var 
is absent so it doesn't override the default + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("ROCPD_LLM_THINKING", None) + analyzer = LLMAnalyzer(provider="anthropic") + + assert ( + analyzer.thinking_budget_tokens is None + ), f"Expected thinking_budget_tokens=None, got {analyzer.thinking_budget_tokens!r}" + + def test_llm_thinking_openai_raises(self): + """analyze_with_llm() must raise ValueError when provider=openai and thinking is set.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer( + provider="openai", + api_key="sk-test", + thinking_budget_tokens=8000, + ) + + # analyze_with_llm() should raise before any API call is made + with pytest.raises( + ValueError, + match="Extended thinking is only supported with the Anthropic provider", + ): + # Patch openai to avoid ImportError; the ValueError should fire before the actual call + with patch.dict("sys.modules", {"openai": MagicMock()}): + analyzer.analyze_with_llm( + {"has_counters": False, "has_pc_sampling": False}, + custom_prompt=None, + ) + + def test_llm_thinking_env_var(self): + """ROCPD_LLM_THINKING env var must set thinking_budget_tokens on construction.""" + import os + from unittest.mock import patch + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + with patch.dict(os.environ, {"ROCPD_LLM_THINKING": "5000"}): + analyzer = LLMAnalyzer(provider="anthropic") + + assert analyzer.thinking_budget_tokens == 5000, ( + f"Expected thinking_budget_tokens=5000 from env var, " + f"got {analyzer.thinking_budget_tokens!r}" + ) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + 
sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py new file mode 100644 index 00000000000..52fdc466983 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py @@ -0,0 +1,1194 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +############################################################################### + +""" +Tests for the AI analysis module (analyze.py). 
+ +Covers: + - Public API exports and imports + - generate_recommendations: all 6 Tier-1 rules + 2 Tier-2 rules + boundaries + - _build_summary: all bottleneck classification branches + - _build_hw_counters_json: with/without counters + - _build_warnings_json: both cases + - _build_recommendations_json: stable IDs, duplicate dedup, unknown category + - _format_as_json: correct value mapping, idle time, Tier 2, bandwidth conversion + - format_analysis_output: text, json, and markdown formats +""" + +import json +import sys +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _empty_breakdown(**overrides): + """Return a time_breakdown dict with all fields zeroed unless overridden.""" + base = { + "total_runtime": 0, + "total_kernel_time": 0, + "total_memcpy_time": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + } + base.update(overrides) + return base + + +def _make_hotspot( + name="k", + calls=10, + total=1_000_000, + pct=10.0, + avg=100_000, + min_d=90_000, + max_d=110_000, +): + return { + "name": name, + "calls": calls, + "total_duration": total, + "avg_duration": avg, + "min_duration": min_d, + "max_duration": max_d, + "percent_of_total": pct, + } + + +def _hw_counters(avg_waves=None, gpu_util=None): + """Build a hardware_counters dict for Tier 2 tests.""" + metrics = {} + if avg_waves is not None: + metrics["avg_waves"] = avg_waves + metrics["max_waves"] = avg_waves * 2 + metrics["min_waves"] = avg_waves / 2 + if gpu_util is not None: + metrics["gpu_utilization_percent"] = gpu_util + return {"has_counters": True, "metrics": metrics, "counters": {}, "per_kernel": {}} + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def test_analyze_module_import(): + 
"""Verify analyze module can be imported.""" + from rocpd import analyze + + assert hasattr(analyze, "compute_time_breakdown") + assert hasattr(analyze, "identify_hotspots") + assert hasattr(analyze, "analyze_memory_copies") + assert hasattr(analyze, "generate_recommendations") + assert hasattr(analyze, "format_analysis_output") + assert hasattr(analyze, "add_args") + assert hasattr(analyze, "execute") + assert hasattr(analyze, "main") + + +def test_analyze_module_has_all(): + """Verify analyze module exports expected functions.""" + from rocpd import analyze + + expected_exports = [ + "compute_time_breakdown", + "identify_hotspots", + "analyze_memory_copies", + "generate_recommendations", + "format_analysis_output", + "analyze_performance", + "add_args", + "execute", + "main", + ] + for export in expected_exports: + assert export in analyze.__all__, f"Missing export: {export}" + + +# --------------------------------------------------------------------------- +# generate_recommendations โ€“ Tier 1 rules +# --------------------------------------------------------------------------- + + +def test_rule1_high_memcpy_fires(): + """Rule 1: memcpy_percent > 20 triggers 'Memory Transfer' HIGH recommendation.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=25), [], {}) + matches = [r for r in recs if r["category"] == "Memory Transfer"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "25.0%" in matches[0]["issue"] + + +def test_rule1_memcpy_boundary_does_not_fire(): + """Rule 1: memcpy_percent exactly 20 does NOT trigger (threshold is >20).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=20), [], {}) + assert not any(r["category"] == "Memory Transfer" for r in recs) + + +def test_rule2_api_overhead_fires(): + """Rule 2: overhead_percent > 15 triggers 'API Overhead' MEDIUM recommendation.""" + 
from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(overhead_percent=20), [], {}) + matches = [r for r in recs if r["category"] == "API Overhead"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "20.0%" in matches[0]["issue"] + + +def test_rule2_overhead_boundary_does_not_fire(): + """Rule 2: overhead_percent exactly 15 does NOT trigger (threshold is >15).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(overhead_percent=15), [], {}) + assert not any(r["category"] == "API Overhead" for r in recs) + + +def test_rule3_dominant_kernel_fires(): + """Rule 3: single kernel > 50% triggers 'Compute Bottleneck' HIGH recommendation.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="dominant_kernel", pct=60.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + matches = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "dominant_kernel" in matches[0]["issue"] + + +def test_rule3_dominant_kernel_boundary_does_not_fire(): + """Rule 3: top kernel exactly 50% does NOT trigger (threshold is >50).""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(pct=50.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + assert not any(r["category"] == "Compute Bottleneck" for r in recs) + + +def test_rule3_uses_hotspot_name_in_commands(): + """Rule 3: the kernel name appears in the rocprofv3 command's full_command.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="my_matmul", pct=75.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + matches = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert matches + cmds = matches[0].get("commands", []) + assert any("my_matmul" in 
c.get("full_command", "") for c in cmds) + + +def test_rule4_many_small_kernels_fires(): + """Rule 4: >1000 total calls with avg <10μs triggers 'Launch Overhead'.""" + from rocpd.analyze import generate_recommendations + + # 10 kernels × 200 calls = 2000 launches (> 1000 threshold); avg must be < 10μs. + # Need avg < 10μs = 10_000 ns, so total_kernel_time < 2000 * 10_000 = 20_000_000 + td = _empty_breakdown(total_kernel_time=10_000_000) # avg = 5μs + hotspots = [_make_hotspot(name=f"k{i}", calls=200) for i in range(10)] + recs = generate_recommendations(td, hotspots, {}) + matches = [r for r in recs if r["category"] == "Launch Overhead"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "2000" in matches[0]["issue"] + + +def test_rule4_many_calls_but_large_kernels_does_not_fire(): + """Rule 4: >1000 calls but avg >= 10μs does NOT trigger.""" + from rocpd.analyze import generate_recommendations + + # 2000 calls but avg = 50ms >> 10μs + td = _empty_breakdown(total_kernel_time=100_000_000_000) + hotspots = [_make_hotspot(name=f"k{i}", calls=200) for i in range(10)] + recs = generate_recommendations(td, hotspots, {}) + assert not any(r["category"] == "Launch Overhead" for r in recs) + + +def test_rule4_few_calls_does_not_fire(): + """Rule 4: <= 1000 total calls does NOT trigger even if each is short.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(total_kernel_time=1_000_000) + hotspots = [_make_hotspot(calls=100)] # only 100 calls + recs = generate_recommendations(td, hotspots, {}) + assert not any(r["category"] == "Launch Overhead" for r in recs) + + +def test_rule5_low_bandwidth_fires(): + """Rule 5: bandwidth < 10 GB/s triggers 'Memory Bandwidth' MEDIUM recommendation.""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 5e9, "avg_bytes": 1024}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + matches = [r for r in 
recs if r["category"] == "Memory Bandwidth"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "Host-to-Device" in matches[0]["issue"] + assert "5.00 GB/s" in matches[0]["issue"] + + +def test_rule5_high_bandwidth_does_not_fire(): + """Rule 5: bandwidth >= 10 GB/s does NOT trigger.""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 50e9, "avg_bytes": 1024}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + assert not any(r["category"] == "Memory Bandwidth" for r in recs) + + +def test_rule5_zero_bandwidth_does_not_fire(): + """Rule 5: bandwidth == 0 does NOT trigger (guard: bandwidth_gbps > 0).""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 0, "avg_bytes": 0}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + assert not any(r["category"] == "Memory Bandwidth" for r in recs) + + +def test_rule5_multiple_directions(): + """Rule 5: each low-bandwidth direction generates its own recommendation.""" + from rocpd.analyze import generate_recommendations + + mem = { + "Host-to-Device": {"bandwidth_bytes_per_sec": 2e9, "avg_bytes": 512}, + "Device-to-Host": {"bandwidth_bytes_per_sec": 3e9, "avg_bytes": 512}, + } + recs = generate_recommendations(_empty_breakdown(), [], mem) + bw_recs = [r for r in recs if r["category"] == "Memory Bandwidth"] + assert len(bw_recs) == 2 + directions = {r["issue"].split()[0] for r in bw_recs} + assert "Host-to-Device" in directions + assert "Device-to-Host" in directions + + +def test_rule6_default_info_fires_when_no_rules_trigger(): + """Rule 6: INFO/Performance recommendation emitted when no rules fire.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(), [], {}) + assert len(recs) == 1 + assert recs[0]["priority"] == "INFO" + assert recs[0]["category"] == "Performance" + assert 
len(recs[0].get("commands", [])) > 0 + + +def test_rule6_default_suppressed_when_any_rule_fires(): + """Rule 6: default INFO NOT emitted when at least one other rule fires.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=25), [], {}) + assert not any(r["priority"] == "INFO" for r in recs) + + +def test_multiple_rules_fire_simultaneously(): + """Multiple Tier-1 rules can fire at once; all appear in recommendations.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(memcpy_percent=30, overhead_percent=20) + recs = generate_recommendations(td, [], {}) + categories = {r["category"] for r in recs} + assert "Memory Transfer" in categories + assert "API Overhead" in categories + + +# --------------------------------------------------------------------------- +# generate_recommendations โ€“ Tier 2 rules +# --------------------------------------------------------------------------- + + +def test_tier2_low_occupancy_fires(): + """Tier 2: avg_waves > 0 and < 16 triggers 'Low Occupancy' HIGH.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(avg_waves=8.0) + ) + matches = [r for r in recs if r["category"] == "Low Occupancy"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "8.0" in matches[0]["issue"] + + +def test_tier2_low_occupancy_boundary_does_not_fire(): + """Tier 2: avg_waves exactly 16 does NOT trigger (threshold is < 16).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(avg_waves=16.0) + ) + assert not any(r["category"] == "Low Occupancy" for r in recs) + + +def test_tier2_zero_waves_does_not_fire(): + """Tier 2: avg_waves == 0 does NOT trigger (guard: avg_waves > 0).""" + from rocpd.analyze import generate_recommendations + + recs = 
generate_recommendations(_empty_breakdown(), [], {}, _hw_counters(avg_waves=0)) + assert not any(r["category"] == "Low Occupancy" for r in recs) + + +def test_tier2_low_gpu_utilization_fires(): + """Tier 2: gpu_utilization_percent > 0 and < 70 triggers 'GPU Utilization' MEDIUM.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(gpu_util=50.0) + ) + matches = [r for r in recs if r["category"] == "GPU Utilization"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "50.0%" in matches[0]["issue"] + + +def test_tier2_gpu_utilization_boundary_does_not_fire(): + """Tier 2: gpu_utilization exactly 70% does NOT trigger (threshold is < 70).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(gpu_util=70.0) + ) + assert not any(r["category"] == "GPU Utilization" for r in recs) + + +def test_tier2_not_activated_when_no_counters(): + """Tier 2 rules do NOT fire when has_counters=False.""" + from rocpd.analyze import generate_recommendations + + hw = {"has_counters": False} + recs = generate_recommendations(_empty_breakdown(), [], {}, hardware_counters=hw) + assert not any(r["category"] in ("Low Occupancy", "GPU Utilization") for r in recs) + + +def test_tier2_commands_use_valid_tools(): + """Tier 2 recommendations include commands with valid tool names.""" + from rocpd.analyze import generate_recommendations + + VALID_TOOLS = {"rocprofv3", "rocprof-sys", "rocprof-compute"} + recs = generate_recommendations( + _empty_breakdown(), + [], + {}, + hardware_counters=_hw_counters(avg_waves=4.0, gpu_util=40.0), + ) + for rec in recs: + for cmd in rec.get("commands", []): + assert cmd["tool"] in VALID_TOOLS, f"Invalid tool: {cmd['tool']!r}" + + +# --------------------------------------------------------------------------- +# Existing tests (preserved) +# 
--------------------------------------------------------------------------- + + +def test_recommendation_structure(): + """Test that recommendations have the expected structure.""" + from rocpd.analyze import generate_recommendations + + recommendations = generate_recommendations(_empty_breakdown(), [], {}) + assert isinstance(recommendations, list) + assert len(recommendations) > 0 + rec = recommendations[0] + for field in ("priority", "category", "issue", "suggestion"): + assert field in rec + assert rec["priority"] in ["HIGH", "MEDIUM", "LOW", "INFO"] + + +def test_high_memcpy_recommendation(): + """Test that high memory copy overhead triggers recommendation.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(memcpy_percent=35) + recs = generate_recommendations(td, [], {}) + memcpy_recs = [r for r in recs if "Memory Transfer" in r.get("category", "")] + assert len(memcpy_recs) > 0 + assert memcpy_recs[0]["priority"] == "HIGH" + + +def test_hotspot_recommendation(): + """Test that dominant kernel triggers recommendation.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="test_kernel", pct=60)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + compute_recs = [r for r in recs if "Compute Bottleneck" in r.get("category", "")] + assert len(compute_recs) > 0 + assert "test_kernel" in compute_recs[0]["issue"] + + +# --------------------------------------------------------------------------- +# _build_summary โ€“ all bottleneck classification branches +# --------------------------------------------------------------------------- + + +def test_summary_memory_transfer_high_confidence(): + """memcpy_pct > 30 โ†’ memory_transfer with confidence 0.85.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 35, "kernel_percent": 50, "overhead_percent": 15}, [], False + ) + assert result["primary_bottleneck"] == "memory_transfer" + assert 
result["confidence"] == 0.85 + + +def test_summary_memory_transfer_medium_confidence(): + """memcpy_pct 20-30 โ†’ memory_transfer with confidence 0.70.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 25, "kernel_percent": 60, "overhead_percent": 15}, [], False + ) + assert result["primary_bottleneck"] == "memory_transfer" + assert result["confidence"] == 0.70 + + +def test_summary_latency_bottleneck(): + """overhead_pct > 25 (memcpy < 20) โ†’ latency with confidence 0.75.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 60, "overhead_percent": 30}, [], False + ) + assert result["primary_bottleneck"] == "latency" + assert result["confidence"] == 0.75 + + +def test_summary_compute_with_counters(): + """kernel_pct > 70 + has_counters=True โ†’ compute with confidence 0.80.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], True + ) + assert result["primary_bottleneck"] == "compute" + assert result["confidence"] == 0.80 + + +def test_summary_compute_without_counters(): + """kernel_pct > 70 + has_counters=False โ†’ compute with confidence 0.60.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], False + ) + assert result["primary_bottleneck"] == "compute" + assert result["confidence"] == 0.60 + + +def test_summary_mixed_bottleneck(): + """Low percentages all round โ†’ mixed with confidence 0.50.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 50, "overhead_percent": 10}, [], False + ) + assert result["primary_bottleneck"] == "mixed" + assert result["confidence"] == 0.50 + + +def test_summary_top_kernel_in_findings(): + """Top kernel name from hotspots[0] appears in key_findings.""" + from 
rocpd.analyze import _build_summary + + hotspots = [_make_hotspot(name="gemm_kernel")] + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, + hotspots, + False, + ) + assert any("gemm_kernel" in f for f in result["key_findings"]) + + +def test_summary_empty_hotspots_shows_na(): + """Empty hotspots โ†’ top kernel reported as 'N/A' in key_findings.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], False + ) + assert any("N/A" in f for f in result["key_findings"]) + + +def test_summary_counters_finding_present(): + """has_counters=True adds counter-data finding; False adds Tier 1 note.""" + from rocpd.analyze import _build_summary + + bd = {"memcpy_percent": 5, "kernel_percent": 50, "overhead_percent": 5} + with_hw = _build_summary(bd, [], True) + without_hw = _build_summary(bd, [], False) + assert any("Hardware counter" in f for f in with_hw["key_findings"]) + assert any("Tier 1" in f for f in without_hw["key_findings"]) + + +def test_summary_has_required_keys(): + """Summary dict contains all required schema keys.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 60, "overhead_percent": 10}, [], False + ) + for key in ( + "overall_assessment", + "primary_bottleneck", + "confidence", + "key_findings", + ): + assert key in result, f"Missing key: {key!r}" + assert isinstance(result["key_findings"], list) + assert isinstance(result["confidence"], float) + + +# --------------------------------------------------------------------------- +# _build_hw_counters_json +# --------------------------------------------------------------------------- + + +def test_hw_counters_no_counters_structure(): + """has_counters=False returns the correct minimal structure.""" + from rocpd.analyze import _build_hw_counters_json + + result = _build_hw_counters_json({"has_counters": 
False}) + assert result == {"has_counters": False, "metrics": None, "counters": None} + + +def test_hw_counters_empty_dict(): + """Empty dict (no has_counters key) treated as no counters.""" + from rocpd.analyze import _build_hw_counters_json + + result = _build_hw_counters_json({}) + assert result["has_counters"] is False + + +def test_hw_counters_with_metrics(): + """has_counters=True maps all metric fields correctly.""" + from rocpd.analyze import _build_hw_counters_json + + hw = { + "has_counters": True, + "metrics": { + "gpu_utilization_percent": 75.5, + "avg_waves": 32.0, + "max_waves": 64.0, + "min_waves": 8.0, + }, + "counters": {}, + } + result = _build_hw_counters_json(hw) + assert result["has_counters"] is True + m = result["metrics"] + assert m["gpu_utilization_pct"] == 75.5 + assert m["avg_waves"] == 32.0 + assert m["max_waves"] == 64.0 + assert m["min_waves"] == 8.0 + + +def test_hw_counters_with_counter_data(): + """Counter stats are mapped with correct types.""" + from rocpd.analyze import _build_hw_counters_json + + hw = { + "has_counters": True, + "metrics": {}, + "counters": { + "GRBM_COUNT": { + "sample_count": 100, + "avg_value": 1000.0, + "min_value": 900.0, + "max_value": 1100.0, + "total_value": 100_000.0, + } + }, + } + result = _build_hw_counters_json(hw) + ctr = result["counters"]["GRBM_COUNT"] + assert ctr["sample_count"] == 100 + assert isinstance(ctr["sample_count"], int) + assert ctr["avg_value"] == 1000.0 + assert isinstance(ctr["avg_value"], float) + + +# --------------------------------------------------------------------------- +# _build_warnings_json +# --------------------------------------------------------------------------- + + +def test_warnings_no_counters_emits_warning(): + """has_counters=False โ†’ one warning with 'warning' severity.""" + from rocpd.analyze import _build_warnings_json + + warnings = _build_warnings_json(has_counters=False) + assert len(warnings) == 1 + assert warnings[0]["severity"] == "warning" + assert 
"Tier 1" in warnings[0]["message"] + assert "recommendation" in warnings[0] + + +def test_warnings_with_counters_is_empty(): + """has_counters=True → empty warnings list.""" + from rocpd.analyze import _build_warnings_json + + assert _build_warnings_json(has_counters=True) == [] + + +# --------------------------------------------------------------------------- +# _build_recommendations_json – stable IDs, dedup, unknown category +# --------------------------------------------------------------------------- + + +def _simple_rec(category, priority="INFO"): + return {"category": category, "priority": priority, "issue": "x", "suggestion": "y"} + + +def test_recs_json_stable_ids_for_known_categories(): + """Known categories get their stable ROCPD-*-001 IDs.""" + from rocpd.analyze import _build_recommendations_json + + expected = { + "Low Occupancy": "ROCPD-OCCUPANCY-001", + "GPU Utilization": "ROCPD-UTILIZATION-001", + "Memory Transfer": "ROCPD-MEMCPY-001", + "API Overhead": "ROCPD-API-001", + "Compute Bottleneck": "ROCPD-COMPUTE-001", + "Launch Overhead": "ROCPD-LAUNCH-001", + "Memory Bandwidth": "ROCPD-MEMBW-001", + "Performance": "ROCPD-INFO-001", + } + recs = [_simple_rec(cat) for cat in expected] + out = _build_recommendations_json(recs) + by_cat = {r["category"]: r["id"] for r in out} + for cat, expected_id in expected.items(): + assert ( + by_cat[cat] == expected_id + ), f"{cat}: expected {expected_id}, got {by_cat[cat]}" + + +def test_recs_json_duplicate_category_gets_incremented_id(): + """Two recs with the same category → IDs end in 001 and 002.""" + from rocpd.analyze import _build_recommendations_json + + recs = [_simple_rec("Memory Transfer"), _simple_rec("Memory Transfer")] + out = _build_recommendations_json(recs) + assert out[0]["id"] == "ROCPD-MEMCPY-001" + assert out[1]["id"] == "ROCPD-MEMCPY-002" + + +def test_recs_json_unknown_category_generates_id(): + """Unknown category generates a ROCPD-...-001 style ID from the name.""" + from 
rocpd.analyze import _build_recommendations_json + + out = _build_recommendations_json([_simple_rec("Custom Analysis")]) + assert out[0]["id"].startswith("ROCPD-") + assert out[0]["id"].endswith("-001") + + +def test_recs_json_preserves_all_fields(): + """_build_recommendations_json preserves all expected fields.""" + from rocpd.analyze import _build_recommendations_json + + rec = { + "category": "Performance", + "priority": "INFO", + "issue": "test issue", + "suggestion": "test suggestion", + "actions": ["do this"], + "estimated_impact": "5%", + "commands": [ + { + "tool": "rocprofv3", + "full_command": "rocprofv3 -- ./app", + "description": "d", + "flags": [], + "args": [], + } + ], + } + out = _build_recommendations_json([rec]) + assert out[0]["priority"] == "INFO" + assert out[0]["issue"] == "test issue" + assert out[0]["actions"] == ["do this"] + assert len(out[0]["commands"]) == 1 + + +def test_recs_json_empty_input_returns_empty(): + """Empty input list returns empty output list.""" + from rocpd.analyze import _build_recommendations_json + + assert _build_recommendations_json([]) == [] + + +# --------------------------------------------------------------------------- +# _format_as_json โ€“ value mapping correctness +# --------------------------------------------------------------------------- + + +def test_format_json_time_breakdown_values(): + """_format_as_json maps time_breakdown keys correctly into execution_breakdown.""" + from rocpd.analyze import _format_as_json + + td = { + "total_runtime": 1_000_000_000, + "total_kernel_time": 800_000_000, + "total_memcpy_time": 100_000_000, + "kernel_percent": 80.0, + "memcpy_percent": 10.0, + "overhead_percent": 5.0, + } + doc = json.loads(_format_as_json(td, [], {}, [])) + eb = doc["execution_breakdown"] + assert eb["total_runtime_ns"] == 1_000_000_000 + assert eb["kernel_time_ns"] == 800_000_000 + assert eb["memcpy_time_ns"] == 100_000_000 + assert eb["kernel_time_pct"] == 80.0 + assert eb["memcpy_time_pct"] == 
10.0 + assert eb["api_overhead_pct"] == 5.0 + + +def test_format_json_idle_time_calculation(): + """Idle time = total − kernel − memcpy − api_overhead, clamped to 0.""" + from rocpd.analyze import _format_as_json + + td = { + "total_runtime": 1_000_000_000, # 1 s + "total_kernel_time": 600_000_000, # 600 ms + "total_memcpy_time": 200_000_000, # 200 ms + "kernel_percent": 60.0, + "memcpy_percent": 20.0, + "overhead_percent": 10.0, # 100 ms + } + doc = json.loads(_format_as_json(td, [], {}, [])) + eb = doc["execution_breakdown"] + # api_overhead_ns = 10% of 1_000_000_000 = 100_000_000 + assert eb["api_overhead_ns"] == 100_000_000 + # idle = 1_000_000_000 - 600_000_000 - 200_000_000 - 100_000_000 = 100_000_000 + assert eb["idle_time_ns"] == 100_000_000 + + +def test_format_json_idle_time_clamped_to_zero(): + """Idle time never goes negative (clamped to 0).""" + from rocpd.analyze import _format_as_json + + # kernel + memcpy already exceed total_runtime + td = { + "total_runtime": 100_000_000, + "total_kernel_time": 80_000_000, + "total_memcpy_time": 30_000_000, # overflows + "kernel_percent": 80.0, + "memcpy_percent": 30.0, + "overhead_percent": 5.0, + } + doc = json.loads(_format_as_json(td, [], {}, [])) + assert doc["execution_breakdown"]["idle_time_ns"] >= 0 + + +def test_format_json_hotspot_field_mapping(): + """Hotspot fields are mapped with correct names and types.""" + from rocpd.analyze import _format_as_json + + hotspots = [ + _make_hotspot( + name="conv_fwd", + calls=5, + total=400_000_000, + avg=80_000_000, + min_d=60_000_000, + max_d=100_000_000, + pct=40.0, + ), + ] + doc = json.loads(_format_as_json(_empty_breakdown(), hotspots, {}, [])) + hs = doc["hotspots"][0] + assert hs["rank"] == 1 + assert hs["name"] == "conv_fwd" + assert hs["calls"] == 5 + assert hs["total_duration_ns"] == 400_000_000 + assert hs["avg_duration_ns"] == 80_000_000.0 + assert hs["min_duration_ns"] == 60_000_000 + assert hs["max_duration_ns"] == 100_000_000 + assert 
hs["pct_of_total"] == 40.0 + + +def test_format_json_hotspot_rank_increments(): + """Multiple hotspots get ranks 1, 2, 3 in order.""" + from rocpd.analyze import _format_as_json + + hotspots = [_make_hotspot(name=f"k{i}") for i in range(3)] + doc = json.loads(_format_as_json(_empty_breakdown(), hotspots, {}, [])) + ranks = [h["rank"] for h in doc["hotspots"]] + assert ranks == [1, 2, 3] + + +def test_format_json_memory_bandwidth_gbps_conversion(): + """bandwidth_bytes_per_sec is correctly converted to bandwidth_gbps.""" + from rocpd.analyze import _format_as_json + + mem = { + "Host-to-Device": { + "count": 10, + "total_bytes": 0, + "total_duration": 0, + "avg_bytes": 0, + "avg_duration": 0, + "bandwidth_bytes_per_sec": 50e9, # 50 GB/s + } + } + doc = json.loads(_format_as_json(_empty_breakdown(), [], mem, [])) + bw = doc["memory_analysis"]["Host-to-Device"]["bandwidth_gbps"] + assert abs(bw - 50.0) < 0.001 + + +def test_format_json_analysis_tier_with_counters(): + """analysis_tier=2 and hardware_counters.has_counters=True when counters present.""" + from rocpd.analyze import _format_as_json + + hw = {"has_counters": True, "metrics": {}, "counters": {}} + doc = json.loads( + _format_as_json(_empty_breakdown(), [], {}, [], hardware_counters=hw) + ) + assert doc["profiling_info"]["analysis_tier"] == 2 + assert doc["hardware_counters"]["has_counters"] is True + + +def test_format_json_analysis_tier_without_counters(): + """analysis_tier=1 and hardware_counters.has_counters=False when no counters.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["profiling_info"]["analysis_tier"] == 1 + assert doc["hardware_counters"]["has_counters"] is False + + +def test_format_json_database_path_in_metadata(): + """database_file in metadata reflects the database_path argument.""" + from rocpd.analyze import _format_as_json + + doc = json.loads( + _format_as_json(_empty_breakdown(), [], {}, [], 
database_path="/data/trace.db") + ) + assert doc["metadata"]["database_file"] == "/data/trace.db" + + +def test_format_json_schema_version(): + """JSON output always carries schema_version = '0.1.0'.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["schema_version"] == "0.1.0" + + +def test_format_json_analysis_version_in_metadata(): + """metadata.analysis_version = '0.1.0'.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["metadata"]["analysis_version"] == "0.1.0" + + +# --------------------------------------------------------------------------- +# format_analysis_output โ€“ text, json, markdown +# --------------------------------------------------------------------------- + + +def _full_sample_data(): + td = { + "total_runtime": 1_200_000_000, + "total_kernel_time": 1_000_000_000, + "total_memcpy_time": 200_000_000, + "kernel_percent": 83.3, + "memcpy_percent": 16.7, + "overhead_percent": 0.0, + } + hotspots = [_make_hotspot(name="kernel_1", calls=100, total=500_000_000, pct=50.0)] + memory_analysis = { + "Host-to-Device": { + "count": 10, + "total_bytes": 1_048_576, + "total_duration": 100_000_000, + "avg_bytes": 104_857, + "avg_duration": 10_000_000, + "bandwidth_bytes_per_sec": 10_485_760, + } + } + recommendations = [ + { + "priority": "INFO", + "category": "Test", + "issue": "Test issue", + "suggestion": "Test suggestion", + "actions": ["Action 1"], + "estimated_impact": "5%", + "commands": [], + } + ] + return td, hotspots, memory_analysis, recommendations + + +def test_format_output_text(): + """Text format contains all expected section headers and data.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output( + td, hs, mem, recs, output_format="text", database_path="/test/db.db" + ) + assert isinstance(out, str) + assert 
"ROCPD AI PERFORMANCE ANALYSIS" in out + assert "TIME BREAKDOWN" in out + assert "HOTSPOTS" in out + assert "MEMORY COPY ANALYSIS" in out + assert "RECOMMENDATIONS" in out + assert "kernel_1" in out + assert "Host-to-Device" in out + + +def test_format_output_text_empty_data(): + """Text format with all-zero data still produces valid output.""" + from rocpd.analyze import format_analysis_output + + out = format_analysis_output(_empty_breakdown(), [], {}, [], output_format="text") + assert isinstance(out, str) + assert "ROCPD AI PERFORMANCE ANALYSIS" in out + + +def test_format_output_json(): + """JSON format returns valid parseable JSON with required top-level keys.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output(td, hs, mem, recs, output_format="json") + doc = json.loads(out) + for key in ( + "schema_version", + "metadata", + "hotspots", + "recommendations", + "execution_breakdown", + "hardware_counters", + ): + assert key in doc, f"Missing key: {key!r}" + + +def test_format_output_markdown(): + """Markdown format returns well-structured markdown document.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output( + td, hs, mem, recs, output_format="markdown", database_path="/test/db.db" + ) + assert isinstance(out, str) + assert out.startswith("# ROCpd AI Performance Analysis") + assert "## Time Breakdown" in out + assert "## Top Kernel Hotspots" in out + assert "## Memory Copy Analysis" in out + assert "## Recommendations" in out + assert "kernel_1" in out + assert "Host-to-Device" in out + + +def test_format_output_markdown_no_hotspots(): + """Markdown format omits hotspot section when list is empty.""" + from rocpd.analyze import format_analysis_output + + td, _, mem, recs = _full_sample_data() + out = format_analysis_output(td, [], mem, recs, output_format="markdown") + assert "## Top Kernel Hotspots" not in 
out + + +def test_format_output_markdown_no_memory(): + """Markdown format omits memory section when analysis is empty.""" + from rocpd.analyze import format_analysis_output + + td, hs, _, recs = _full_sample_data() + out = format_analysis_output(td, hs, {}, recs, output_format="markdown") + assert "## Memory Copy Analysis" not in out + + +def test_format_output_markdown_with_hardware_counters(): + """Markdown format includes Tier 2 section when hardware counters present.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + hw = { + "has_counters": True, + "metrics": { + "gpu_utilization_percent": 65.0, + "avg_waves": 24.0, + "max_waves": 48.0, + }, + "counters": {}, + } + out = format_analysis_output( + td, hs, mem, recs, hardware_counters=hw, output_format="markdown" + ) + assert "## Hardware Counters (Tier 2)" in out + assert "65.0%" in out + + +def test_format_output_unknown_format_falls_back_to_text(): + """Unrecognized format falls back to text output.""" + from rocpd.analyze import format_analysis_output + + out = format_analysis_output(_empty_breakdown(), [], {}, [], output_format="xml") + assert "ROCPD AI PERFORMANCE ANALYSIS" in out + + +# --------------------------------------------------------------------------- +# _filter_rec_commands: PMC counter filtering +# --------------------------------------------------------------------------- + + +def _pmc_cmd( + counters="GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES", extra_flags=None, extra_args=None +): + """Build a minimal rocprofv3 recommendation command with a --pmc arg.""" + flags = ["--sys-trace"] + (extra_flags or []) + args = [ + {"name": "--pmc", "value": counters}, + {"name": "-d", "value": "./output"}, + {"name": "-o", "value": "profile"}, + ] + (extra_args or []) + return { + "tool": "rocprofv3", + "description": "Collect hardware counters", + "flags": flags, + "args": args, + "full_command": ( + f"rocprofv3 --sys-trace --pmc {counters} -d ./output -o profile 
-- ./app" + ), + } + + +def test_filter_pmc_all_counters_already_collected_drops_command(): + """When every --pmc counter is already in pmc_events, the command is dropped.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([_pmc_cmd()], already) + assert result == [], "Command with all counters already collected should be dropped" + + +def test_filter_pmc_partial_counters_already_collected_updates_arg(): + """When some --pmc counters are already collected, only new ones remain.""" + from rocpd.analyze import _filter_rec_commands + + # GRBM_COUNT already collected; GRBM_GUI_ACTIVE and SQ_WAVES are new + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + pmc_arg = next(a for a in result[0]["args"] if a.get("name") == "--pmc") + remaining = set(pmc_arg["value"].split()) + assert remaining == {"GRBM_GUI_ACTIVE", "SQ_WAVES"} + assert "GRBM_COUNT" not in pmc_arg["value"] + + +def test_filter_pmc_partial_updates_full_command(): + """full_command reflects the reduced counter list after partial stripping.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + assert "GRBM_COUNT" not in result[0]["full_command"] + assert "GRBM_GUI_ACTIVE" in result[0]["full_command"] + assert "SQ_WAVES" in result[0]["full_command"] + + +def test_filter_pmc_no_counters_collected_keeps_command_unchanged(): + """When already_collected is empty, the command is returned unchanged.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset() + cmd = _pmc_cmd() + result = _filter_rec_commands([cmd], already) + assert len(result) == 1 + assert result[0] is cmd # exact same object, no copy + + +def 
test_filter_pmc_description_note_added(): + """A note listing removed PMC counters is appended to description.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + assert "GRBM_COUNT" in result[0]["description"] + assert "Already collected" in result[0]["description"] + + +def test_filter_pmc_kernel_names_alone_not_meaningful(): + """--kernel-names is a scope filter; command with only scope+output args is dropped.""" + from rocpd.analyze import _filter_rec_commands + + cmd = _pmc_cmd( + counters="GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES", + extra_args=[{"name": "--kernel-names", "value": "my_kernel"}], + ) + cmd["full_command"] = ( + "rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES" + ' --kernel-names "my_kernel" -d ./output -o profile -- ./app' + ) + # All three counters already collected + sys-trace โ†’ nothing new + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([cmd], already) + assert result == [], "Command with only scope+output args remaining should be dropped" + + +def test_filter_pmc_rocprof_compute_always_kept(): + """rocprof-compute commands are never dropped, even when counters are collected.""" + from rocpd.analyze import _filter_rec_commands + + compute_cmd = { + "tool": "rocprof-compute", + "description": "Roofline model analysis", + "flags": [], + "args": [{"name": "profile", "value": None}], + "full_command": "rocprof-compute profile -- ./app", + } + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([compute_cmd], already) + assert len(result) == 1 + assert result[0] is compute_cmd + + +# --------------------------------------------------------------------------- +# Entry point +# 
--------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py new file mode 100644 index 00000000000..6d762460f6d --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py @@ -0,0 +1,562 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+############################################################################### + +""" +Tests for the AI analysis JSON schema (analysis-output.schema.json). + +Validates: + - The schema file is present, parseable, and structurally correct. + - rocpd analyze --format json output conforms to the schema. + - Recommendations contain the structured commands array. +""" + +import json +import os +import sys +import tempfile + +try: + import importlib.resources as pkg_resources +except ImportError: # Python 3.6 + import pkgutil as _pkgutil + + class pkg_resources: # type: ignore[no-redef] + """Minimal shim so _load_schema() works on Python 3.6.""" + + class _Traversable: + def __init__(self, package, resource): + self._package = package + self._resource = resource + + def read_text(self, encoding="utf-8"): + data = _pkgutil.get_data(self._package, self._resource) + return data.decode(encoding) if data is not None else "" + + class _Package: + def __init__(self, package): + self._package = package + + def joinpath(self, resource): + return pkg_resources._Traversable(self._package, resource) + + @staticmethod + def files(package): + return pkg_resources._Package(package) + + +import pytest + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# The version emitted by Tier 1/2 analysis (no TraceLens fields). +# This constant is only used to verify the schema enum includes the Tier 1/2 version; +# conformance tests derive allowed versions from the loaded schema enum directly. 
+TIER12_SCHEMA_VERSION = "0.1.0" + +REQUIRED_TOP_LEVEL = [ + "schema_version", + "metadata", + "profiling_info", + "summary", + "execution_breakdown", + "hotspots", + "memory_analysis", + "hardware_counters", + "recommendations", + "warnings", + "errors", +] + +COMMAND_TOOLS = {"rocprofv3", "rocprof-sys", "rocprof-compute"} + + +def _load_schema(): + """Load the schema JSON from the installed package.""" + schema_text = ( + pkg_resources.files("rocpd.ai_analysis") + .joinpath("docs/analysis-output.schema.json") + .read_text(encoding="utf-8") + ) + return json.loads(schema_text) + + +def _make_synthetic_json_output(): + """Generate a minimal JSON analysis document using the public API.""" + from rocpd.analyze import format_analysis_output, generate_recommendations + + # Keys must match what compute_time_breakdown() actually returns. + time_breakdown = { + "kernel_percent": 50.0, + "memcpy_percent": 30.0, + "overhead_percent": 15.0, + "total_runtime": 100_000_000, + "total_kernel_time": 50_000_000, + "total_memcpy_time": 30_000_000, + } + hotspots = [ + { + "name": "test_kernel", + "total_duration": 45_000_000, + "calls": 10, # matches identify_hotspots() key (COUNT(*) as calls) + "avg_duration": 4_500_000, + "min_duration": 4_000_000, + "max_duration": 5_000_000, + } + ] + # Keys must match the actual return shape of analyze_memory_copies(): + # count, total_bytes, total_duration, avg_bytes, avg_duration, bandwidth_bytes_per_sec + memory_analysis = { + "Host-to-Device": { + "count": 5, + "total_bytes": 5120, + "total_duration": 30_000_000, + "avg_bytes": 1024.0, + "avg_duration": 6_000_000.0, + "bandwidth_bytes_per_sec": 1e9, + } + } + recommendations = generate_recommendations(time_breakdown, hotspots, memory_analysis) + output = format_analysis_output( + time_breakdown, + hotspots, + memory_analysis, + recommendations, + output_format="json", + ) + return json.loads(output) + + +# --------------------------------------------------------------------------- +# 
Schema file tests +# --------------------------------------------------------------------------- + + +def test_schema_file_is_readable(): + """Schema file can be located and read through the package.""" + text = ( + pkg_resources.files("rocpd.ai_analysis") + .joinpath("docs/analysis-output.schema.json") + .read_text(encoding="utf-8") + ) + assert len(text) > 0, "Schema file is empty" + + +def test_schema_file_is_valid_json(): + """Schema file is valid JSON.""" + schema = _load_schema() + assert isinstance(schema, dict), "Schema root must be a JSON object" + + +def test_schema_file_has_json_schema_keyword(): + """Schema file declares a JSON Schema dialect.""" + from urllib.parse import urlparse + + schema = _load_schema() + assert "$schema" in schema, "Schema must contain $schema keyword" + parsed = urlparse(schema["$schema"]) + assert ( + parsed.netloc == "json-schema.org" + ), f"$schema must point to json-schema.org, got netloc={parsed.netloc!r}" + + +def test_schema_file_version_enum(): + """schema_version property enum includes the Tier 1/2 version (0.1.0).""" + schema = _load_schema() + props = schema.get("properties", {}) + assert "schema_version" in props, "schema_version must be in properties" + enum_vals = props["schema_version"].get("enum", []) + assert TIER12_SCHEMA_VERSION in enum_vals, ( + f"schema_version enum must include {TIER12_SCHEMA_VERSION!r}, " + f"got {enum_vals!r}" + ) + + +def test_schema_file_required_fields(): + """Schema requires all expected top-level fields.""" + schema = _load_schema() + required = schema.get("required", []) + for field in REQUIRED_TOP_LEVEL: + assert field in required, f"Required field missing from schema: {field!r}" + + +def test_schema_file_defines_recommendation_command(): + """Schema $defs contains a recommendation_command definition.""" + schema = _load_schema() + defs = schema.get("$defs", {}) + assert "recommendation_command" in defs, "$defs must define recommendation_command" + cmd_def = 
defs["recommendation_command"] + required_cmd_fields = {"tool", "description", "flags", "args", "full_command"} + defined = set(cmd_def.get("properties", {}).keys()) + missing = required_cmd_fields - defined + assert not missing, f"recommendation_command missing properties: {missing}" + + +def test_schema_file_tool_enum(): + """recommendation_command.tool is an enum of the three ROCm tools.""" + schema = _load_schema() + cmd_props = schema["$defs"]["recommendation_command"]["properties"] + tool_enum = set(cmd_props["tool"].get("enum", [])) + assert ( + tool_enum == COMMAND_TOOLS + ), f"tool enum must be {COMMAND_TOOLS}, got {tool_enum}" + + +# --------------------------------------------------------------------------- +# JSON output conformance tests (using synthetic data) +# --------------------------------------------------------------------------- + + +def test_json_output_schema_version(): + """format_analysis_output JSON output carries a schema_version in the allowed enum.""" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + doc = _make_synthetic_json_output() + assert ( + doc.get("schema_version") in allowed + ), f"schema_version {doc.get('schema_version')!r} not in allowed enum {allowed}" + + +def test_json_output_required_fields_present(): + """All required top-level fields are present in JSON output.""" + doc = _make_synthetic_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Required field missing from JSON output: {field!r}" + + +def test_json_output_metadata_fields(): + """metadata object contains expected sub-fields.""" + doc = _make_synthetic_json_output() + meta = doc["metadata"] + for field in ( + "rocpd_version", + "analysis_version", + "database_file", + "analysis_timestamp", + ): + assert field in meta, f"metadata missing field: {field!r}" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + assert ( + meta["analysis_version"] in allowed + ), 
f"metadata.analysis_version {meta['analysis_version']!r} not in allowed enum {allowed}" + + +def test_json_output_hardware_counters_has_flag(): + """hardware_counters always contains has_counters boolean.""" + doc = _make_synthetic_json_output() + hw = doc["hardware_counters"] + assert "has_counters" in hw, "hardware_counters must have has_counters" + assert isinstance(hw["has_counters"], bool) + + +def test_json_output_recommendations_are_list(): + """recommendations is a list.""" + doc = _make_synthetic_json_output() + assert isinstance(doc["recommendations"], list) + + +def test_json_output_recommendation_required_fields(): + """Each recommendation has required fields: id, priority, category, issue, suggestion.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for field in ("id", "priority", "category", "issue", "suggestion"): + assert field in rec, f"recommendations[{i}] missing field {field!r}" + assert rec["priority"] in ( + "HIGH", + "MEDIUM", + "LOW", + "INFO", + ), f"recommendations[{i}] has invalid priority {rec['priority']!r}" + + +def test_json_output_recommendations_have_commands(): + """Recommendations include a commands array.""" + doc = _make_synthetic_json_output() + recs_with_commands = [r for r in doc["recommendations"] if r.get("commands")] + assert ( + len(recs_with_commands) > 0 + ), "At least one recommendation must have a non-empty commands array" + + +def test_json_output_command_structure(): + """Each command object has all required fields with correct types.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for j, cmd in enumerate(rec.get("commands", [])): + loc = f"recommendations[{i}].commands[{j}]" + assert "tool" in cmd, f"{loc} missing 'tool'" + assert "description" in cmd, f"{loc} missing 'description'" + assert "flags" in cmd, f"{loc} missing 'flags'" + assert "args" in cmd, f"{loc} missing 'args'" + assert "full_command" in cmd, f"{loc} missing 
'full_command'" + assert ( + cmd["tool"] in COMMAND_TOOLS + ), f"{loc} tool {cmd['tool']!r} not in {COMMAND_TOOLS}" + assert isinstance(cmd["flags"], list), f"{loc} flags must be a list" + assert isinstance(cmd["args"], list), f"{loc} args must be a list" + assert isinstance( + cmd["full_command"], str + ), f"{loc} full_command must be a string" + assert ( + cmd["tool"] in cmd["full_command"] + ), f"{loc} full_command must start with tool name" + + +def test_json_output_command_args_structure(): + """Each arg in commands.args has name and value fields.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for j, cmd in enumerate(rec.get("commands", [])): + for k, arg in enumerate(cmd.get("args", [])): + loc = f"recommendations[{i}].commands[{j}].args[{k}]" + assert "name" in arg, f"{loc} missing 'name'" + assert "value" in arg, f"{loc} missing 'value'" + assert isinstance(arg["name"], str), f"{loc} name must be a string" + # value may be str or None + assert arg["value"] is None or isinstance( + arg["value"], str + ), f"{loc} value must be str or null" + + +def test_json_output_validates_against_schema(): + """JSON output passes jsonschema validation against analysis-output.schema.json.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"JSON output failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Tier 0 (source-only) JSON output helpers +# --------------------------------------------------------------------------- + +_MINIMAL_HIP_SOURCE = """\ +__global__ void my_kernel(float* x) { *x = 1.0f; } +void run() { + hipLaunchKernelGGL(my_kernel, dim3(1), dim3(64), 0, 0, nullptr); + hipMemcpy(nullptr, nullptr, 0, hipMemcpyHostToDevice); +} 
+""" + +TIER0_SCHEMA_VERSION = "0.2.0" + + +def _make_synthetic_tier0_json_output(): + """Generate a Tier 0 (source-only) JSON document via format_analysis_output.""" + from rocpd.analyze import analyze_source_code, format_analysis_output + + with tempfile.TemporaryDirectory() as tmpdir: + hip_file = os.path.join(tmpdir, "test.cpp") + with open(hip_file, "w") as fh: + fh.write(_MINIMAL_HIP_SOURCE) + + tier0_result = analyze_source_code(tmpdir) + output = format_analysis_output( + {}, + [], + {}, + [], + output_format="json", + tier0_result=tier0_result, + source_only=True, + ) + return json.loads(output) + + +def _make_synthetic_combined_json_output(): + """Generate a combined (Tier 0 + Tier 1/2) JSON document.""" + from rocpd.analyze import ( + analyze_source_code, + format_analysis_output, + generate_recommendations, + ) + + time_breakdown = { + "kernel_percent": 50.0, + "memcpy_percent": 30.0, + "overhead_percent": 15.0, + "total_runtime": 100_000_000, + "total_kernel_time": 50_000_000, + "total_memcpy_time": 30_000_000, + } + hotspots = [ + { + "name": "test_kernel", + "total_duration": 45_000_000, + "calls": 10, + "avg_duration": 4_500_000, + "min_duration": 4_000_000, + "max_duration": 5_000_000, + } + ] + memory_analysis = { + "Host-to-Device": { + "count": 5, + "total_bytes": 5120, + "total_duration": 30_000_000, + "avg_bytes": 1024.0, + "avg_duration": 6_000_000.0, + "bandwidth_bytes_per_sec": 1e9, + } + } + recommendations = generate_recommendations(time_breakdown, hotspots, memory_analysis) + + with tempfile.TemporaryDirectory() as tmpdir: + hip_file = os.path.join(tmpdir, "test.cpp") + with open(hip_file, "w") as fh: + fh.write(_MINIMAL_HIP_SOURCE) + + tier0_result = analyze_source_code(tmpdir) + output = format_analysis_output( + time_breakdown, + hotspots, + memory_analysis, + recommendations, + output_format="json", + tier0_result=tier0_result, + source_only=False, + ) + return json.loads(output) + + +# 
--------------------------------------------------------------------------- +# Tier 0 (source-only) schema conformance tests +# --------------------------------------------------------------------------- + + +def test_tier0_json_output_schema_version(): + """Tier 0 JSON output has schema_version in the allowed enum.""" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + doc = _make_synthetic_tier0_json_output() + assert ( + doc.get("schema_version") in allowed + ), f"tier0 schema_version {doc.get('schema_version')!r} not in allowed enum {allowed}" + assert ( + doc.get("schema_version") == TIER0_SCHEMA_VERSION + ), f"tier0 schema_version should be {TIER0_SCHEMA_VERSION!r}" + + +def test_tier0_json_output_required_fields_present(): + """All required top-level fields are present in Tier 0 JSON output.""" + doc = _make_synthetic_tier0_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Tier 0 JSON missing required field: {field!r}" + + +def test_tier0_json_output_execution_breakdown_is_null(): + """execution_breakdown is null in source-only (Tier 0) output.""" + doc = _make_synthetic_tier0_json_output() + assert ( + doc["execution_breakdown"] is None + ), "execution_breakdown must be null in Tier 0 source-only output" + + +def test_tier0_json_output_profiling_mode_is_source_only(): + """profiling_info.profiling_mode is 'source_only' in Tier 0 output.""" + doc = _make_synthetic_tier0_json_output() + assert ( + doc["profiling_info"]["profiling_mode"] == "source_only" + ), "Tier 0 profiling_mode must be 'source_only'" + + +def test_tier0_json_output_analysis_tier_is_zero(): + """profiling_info.analysis_tier is 0 in Tier 0 source-only output.""" + doc = _make_synthetic_tier0_json_output() + assert doc["profiling_info"]["analysis_tier"] == 0, "Tier 0 analysis_tier must be 0" + + +def test_tier0_json_output_has_tier0_field(): + """Tier 0 JSON output includes a top-level 'tier0' object.""" + doc = 
_make_synthetic_tier0_json_output() + assert "tier0" in doc, "Tier 0 JSON output must include a 'tier0' field" + tier0 = doc["tier0"] + assert isinstance(tier0, dict), "'tier0' must be a JSON object" + for field in ("source_dir", "programming_model", "files_scanned", "kernel_count"): + assert field in tier0, f"tier0 missing field {field!r}" + + +def test_tier0_json_output_validates_against_schema(): + """Tier 0 JSON output passes jsonschema validation.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_tier0_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"Tier 0 JSON failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Combined (Tier 0 + Tier 1/2) schema conformance tests +# --------------------------------------------------------------------------- + + +def test_combined_json_output_has_tier0_field(): + """Combined (Tier 0 + Tier 1/2) JSON output includes a top-level 'tier0' object.""" + doc = _make_synthetic_combined_json_output() + assert "tier0" in doc, "Combined JSON output must include a 'tier0' field" + assert isinstance(doc["tier0"], dict), "'tier0' must be a JSON object" + + +def test_combined_json_output_tier12_required_fields_present(): + """Combined JSON output has all required Tier 1/2 top-level fields.""" + doc = _make_synthetic_combined_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Combined JSON missing required field: {field!r}" + + +def test_combined_json_output_execution_breakdown_not_null(): + """execution_breakdown is non-null in combined (Tier 0 + Tier 1/2) output.""" + doc = _make_synthetic_combined_json_output() + assert ( + doc["execution_breakdown"] is not None + ), "execution_breakdown must not be null in combined output" + + +def 
test_combined_json_output_validates_against_schema(): + """Combined (Tier 0 + Tier 1/2) JSON output passes jsonschema validation.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_combined_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"Combined JSON failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py new file mode 100644 index 00000000000..25e3b2888e9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py @@ -0,0 +1,528 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### +""" +Standalone unit tests for LLM reference guide context-aware filtering. + +These tests do NOT require a GPU trace database or real LLM credentials. 
+Run with: + ROCPD_SYS=$(python3 -c "import site; print(site.getsitepackages()[-1])") + PYTHONPATH="${ROCPD_SYS}" pytest --noconftest test_guide_filter_standalone.py -v +""" + +import sys + +import pytest + +# --------------------------------------------------------------------------- +# Group A: AnalysisContext defaults and construction (5 tests) +# --------------------------------------------------------------------------- + + +class TestAnalysisContextDefaults: + + def test_default_tier_is_1(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.tier == 1 + + def test_default_has_counters_false(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.has_counters is False + + def test_default_nullable_fields_are_none(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.bottleneck_type is None + assert ctx.gpu_arch is None + assert ctx.custom_prompt is None + + def test_explicit_values_preserved(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext( + tier=2, + has_counters=True, + bottleneck_type="compute", + gpu_arch="gfx942", + custom_prompt="why is my kernel slow?", + ) + assert ctx.tier == 2 + assert ctx.has_counters is True + assert ctx.bottleneck_type == "compute" + assert ctx.gpu_arch == "gfx942" + assert ctx.custom_prompt == "why is my kernel slow?" 
+ + def test_dataclass_equality(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + a = AnalysisContext(tier=1, has_counters=False) + b = AnalysisContext(tier=1, has_counters=False) + assert a == b + + +# --------------------------------------------------------------------------- +# Group B: _select_tags logic (14 tests) +# --------------------------------------------------------------------------- + + +class TestSelectTags: + + def _tags(self, **kwargs): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext, _select_tags + + return _select_tags(AnalysisContext(**kwargs)) + + def test_tier1_no_counters_gives_always_and_tier1_only(self): + tags = self._tags(tier=1, has_counters=False) + assert tags == {"always", "tier1"} + + def test_tier2_value_adds_tier2_even_without_flag(self): + tags = self._tags(tier=2, has_counters=False) + assert "tier2" in tags + assert "tier1" in tags + + def test_has_counters_true_adds_tier2_regardless_of_tier_field(self): + tags = self._tags(tier=1, has_counters=True) + assert "tier2" in tags + + def test_tier0_gives_always_source_compiler_not_tier1_or_tier2(self): + tags = self._tags(tier=0) + assert "always" in tags + assert "source" in tags + assert "compiler" in tags + assert "tier1" not in tags + assert "tier2" not in tags + + def test_bottleneck_compute_adds_compiler(self): + tags = self._tags(tier=1, bottleneck_type="compute") + assert "compiler" in tags + + def test_bottleneck_memory_adds_compiler(self): + tags = self._tags(tier=1, bottleneck_type="memory") + assert "compiler" in tags + + def test_bottleneck_latency_does_not_add_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="latency") + assert "compiler" not in tags + + def test_bottleneck_mixed_does_not_add_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="mixed") + assert "compiler" not in tags + + def test_custom_prompt_compiler_keyword_adds_compiler(self): + tags = self._tags(tier=1, 
custom_prompt="check compiler flags") + assert "compiler" in tags + + def test_custom_prompt_build_keyword_adds_compiler(self): + tags = self._tags(tier=1, custom_prompt="build options to try") + assert "compiler" in tags + + def test_custom_prompt_memory_keyword_does_not_add_compiler(self): + tags = self._tags(tier=1, custom_prompt="memory bottleneck analysis") + assert "compiler" not in tags + + def test_custom_prompt_none_does_not_add_compiler(self): + tags = self._tags(tier=1, custom_prompt=None) + assert "compiler" not in tags + + def test_full_tier2_compute_bottleneck_has_all_tags(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="compute") + assert tags == {"always", "tier1", "tier2", "compiler"} + + def test_full_tier2_latency_bottleneck_has_no_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="latency") + assert tags == {"always", "tier1", "tier2"} + + +# --------------------------------------------------------------------------- +# Group C: _filter_guide section parsing (12 tests) +# --------------------------------------------------------------------------- + + +class TestFilterGuide: + + def _filter(self, guide, tags): + from rocpd.ai_analysis.llm_analyzer import _filter_guide + + return _filter_guide(guide, tags) + + def _make_guide(self, *sections): + """Build a mini guide string from (title, tag_or_None, content) tuples.""" + parts = ["# LLM Reference Guide\n\nIntro block with no tag.\n"] + for title, tag, content in sections: + tag_line = f"\n" if tag else "" + parts.append(f"## {title}\n{tag_line}{content}\n") + return "\n".join(parts) + + def test_always_tagged_section_included_when_always_in_tags(self): + guide = self._make_guide(("Critical", "always", "critical content")) + result = self._filter(guide, {"always"}) + assert "critical content" in result + + def test_tier2_section_excluded_when_only_tier1_in_tags(self): + guide = self._make_guide( + ("HW Counters", "tier2", "counter content"), + 
("Workflow", "tier1", "workflow content"), + ) + result = self._filter(guide, {"always", "tier1"}) + assert "counter content" not in result + assert "workflow content" in result + + def test_tier2_section_included_when_tier2_in_tags(self): + guide = self._make_guide(("HW Counters", "tier2", "counter content")) + result = self._filter(guide, {"always", "tier1", "tier2"}) + assert "counter content" in result + + def test_section_with_no_tag_always_included(self): + guide = self._make_guide(("Untagged Section", None, "untagged content")) + result = self._filter(guide, {"always"}) + assert "untagged content" in result + + def test_section_with_multiple_tags_included_on_any_match(self): + guide = ( + "# Guide\n\n## Multi\n\nmulti content\n" + ) + result = self._filter(guide, {"always", "tier2"}) + assert "multi content" in result + + def test_empty_guide_returns_empty_string(self): + result = self._filter("", {"always"}) + assert result == "" + + def test_guide_with_zero_tagged_sections_returns_full_content(self): + guide = self._make_guide( + ("Alpha", None, "alpha content"), + ("Beta", None, "beta content"), + ) + result = self._filter(guide, {"always"}) + assert "alpha content" in result + assert "beta content" in result + + def test_tag_comment_with_extra_whitespace_parsed_correctly(self): + guide = ( + "# Guide\n\n## Section\n\nspaced content\n" + ) + result = self._filter(guide, {"tier2"}) + assert "spaced content" in result + + def test_unknown_tag_excludes_section(self): + guide = self._make_guide(("Future", "future_tag", "future content")) + result = self._filter(guide, {"always", "tier1", "tier2"}) + assert "future content" not in result + + def test_tag_comment_on_line2_still_found(self): + guide = ( + "# Guide\n\n## Section\n\n\nline2 tag content\n" + ) + result = self._filter(guide, {"tier1"}) + assert "line2 tag content" in result + + def test_tag_comment_beyond_scan_window_treated_as_no_tag(self): + # Tag comment on line 5 (beyond first-3-line scan) โ†’ 
treated as no tag โ†’ included + guide = ( + "# Guide\n\n## Section\nline1\nline2\nline3\nline4\n" + "\nlate tag content\n" + ) + result = self._filter(guide, {"always"}) + assert "late tag content" in result + + def test_multiple_sections_ordering_preserved(self): + guide = self._make_guide( + ("First", "always", "first content"), + ("Second", "tier2", "second content"), + ("Third", "always", "third content"), + ) + result = self._filter(guide, {"always"}) + assert result.index("first content") < result.index("third content") + assert "second content" not in result + + +# --------------------------------------------------------------------------- +# Group D: _build_system_prompt integration (4 tests) +# --------------------------------------------------------------------------- + + +class TestBuildSystemPrompt: + + def _make_analyzer(self): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from unittest.mock import patch + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always Section\n\nalways content\n\n" + "## Tier2 Section\n\ntier2 content\n\n" + "## Compiler Section\n\ncompiler content\n" + ), + ): + return LLMAnalyzer(provider="anthropic", api_key="fake-key") + + def test_context_none_returns_full_guide(self): + analyzer = self._make_analyzer() + prompt = analyzer._build_system_prompt(context=None) + assert "always content" in prompt + assert "tier2 content" in prompt + assert "compiler content" in prompt + + def test_tier1_context_excludes_tier2_and_compiler(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=1, has_counters=False) + prompt = analyzer._build_system_prompt(context=ctx) + assert "always content" in prompt + assert "tier2 content" not in prompt + assert "compiler content" not in prompt + + def test_tier2_context_includes_tier2_excludes_compiler(self): + from rocpd.ai_analysis.llm_analyzer import 
AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=2, has_counters=True, bottleneck_type="latency") + prompt = analyzer._build_system_prompt(context=ctx) + assert "tier2 content" in prompt + assert "compiler content" not in prompt + + def test_returned_prompt_is_always_non_empty(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=1) + prompt = analyzer._build_system_prompt(context=ctx) + assert len(prompt) > 0 + + +# --------------------------------------------------------------------------- +# Group D continued: context propagation through public methods (3 tests) +# --------------------------------------------------------------------------- + + +class TestAnalyzeWithLLMContextParam: + + def _make_analyzer_capturing_prompt(self): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from unittest.mock import patch + + captured = {} + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always\n\nalways text\n\n" + "## Tier2\n\ntier2 text\n" + ), + ): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + + def fake_call(system_prompt, user_prompt): + captured["system_prompt"] = system_prompt + return "fake llm response" + + analyzer._call_anthropic = fake_call + return analyzer, captured + + def test_analyze_with_llm_context_filters_guide(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer, captured = self._make_analyzer_capturing_prompt() + ctx = AnalysisContext(tier=1, has_counters=False) + analyzer.analyze_with_llm(analysis_data={}, context=ctx) + assert "tier2 text" not in captured["system_prompt"] + assert "always text" in captured["system_prompt"] + + def test_analyze_with_llm_no_context_uses_full_guide(self): + analyzer, captured = self._make_analyzer_capturing_prompt() + analyzer.analyze_with_llm(analysis_data={}) + assert "tier2 text" in 
captured["system_prompt"] + + def test_analyze_source_with_llm_context_filters_guide(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext, LLMAnalyzer + from unittest.mock import patch + from rocpd.ai_analysis.api import SourceAnalysisResult + + captured = {} + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always\n\nalways text\n\n" + "## Compiler\n\ncompiler text\n" + ), + ): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + + def fake_call(system_prompt, user_prompt): + captured["system_prompt"] = system_prompt + return "fake source response" + + analyzer._call_anthropic = fake_call + + ctx = AnalysisContext(tier=0) # Tier 0 โ†’ compiler tag active + minimal_result = SourceAnalysisResult( + source_dir="/tmp", + analysis_timestamp="2026-01-01T00:00:00", + programming_model="HIP", + files_scanned=0, + files_skipped=0, + detected_kernels=[], + kernel_count=0, + detected_patterns=[], + risk_areas=[], + already_instrumented=False, + roctx_marker_count=0, + recommendations=[], + suggested_counters=[], + suggested_first_command="", + ) + analyzer.analyze_source_with_llm(minimal_result, context=ctx) + assert "compiler text" in captured["system_prompt"] + + +# --------------------------------------------------------------------------- +# Group F: public API export (2 tests) +# --------------------------------------------------------------------------- + + +class TestPublicExport: + + def test_analysis_context_importable_from_package(self): + from rocpd.ai_analysis import AnalysisContext + + ctx = AnalysisContext(tier=2) + assert ctx.tier == 2 + + def test_analysis_context_in_all(self): + import rocpd.ai_analysis as pkg + + assert "AnalysisContext" in pkg.__all__ + + +# --------------------------------------------------------------------------- +# Group E: end-to-end with real guide file (6 tests) +# --------------------------------------------------------------------------- + + +class 
TestEndToEndWithRealGuide: + """ + Load the actual llm-reference-guide.md and verify filtering behaviour. + These tests do NOT call any external LLM API. + """ + + def _build_prompt(self, **ctx_kwargs): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer, AnalysisContext + from unittest.mock import patch + + guide = ( + LLMAnalyzer.__module__ + and __import__( + "rocpd.ai_analysis.llm_analyzer", fromlist=["get_reference_guide_path"] + ) + .get_reference_guide_path() + .read_text() + ) + with patch.object(LLMAnalyzer, "_load_reference_guide", return_value=guide): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + ctx = AnalysisContext(**ctx_kwargs) + return analyzer._build_system_prompt(context=ctx) + + def test_tier1_excludes_compiler_section(self): + prompt = self._build_prompt(tier=1, has_counters=False) + assert "Compiler Optimization Flags" not in prompt + + def test_tier2_latency_excludes_compiler_section(self): + prompt = self._build_prompt(tier=2, has_counters=True, bottleneck_type="latency") + assert "Compiler Optimization Flags" not in prompt + + def test_tier0_includes_compiler_section(self): + prompt = self._build_prompt(tier=0) + assert "Compiler Optimization Flags" in prompt + + def test_bottleneck_compute_includes_compiler_section(self): + prompt = self._build_prompt(tier=2, has_counters=True, bottleneck_type="compute") + assert "Compiler Optimization Flags" in prompt + + def test_critical_requirements_always_present(self): + for tier in (0, 1, 2): + prompt = self._build_prompt(tier=tier) + assert "CRITICAL REQUIREMENTS" in prompt, f"Missing in tier {tier}" + + def test_always_tagged_sections_present_in_every_tier(self): + always_markers = [ + "Your Role", + "Output Format Requirements", + "What NOT to Do", + "Summary", + ] + for tier in (0, 1, 2): + prompt = self._build_prompt(tier=tier) + for marker in always_markers: + assert marker in prompt, f"'{marker}' missing for tier {tier}" + + +# 
--------------------------------------------------------------------------- +# Group F: guide file integrity (2 tests) +# --------------------------------------------------------------------------- + + +class TestGuideIntegrity: + """Validate that the real llm-reference-guide.md is correctly tagged.""" + + KNOWN_TAGS = {"always", "tier1", "tier2", "compiler", "source", "tracelens_metrics"} + # The intro block (before the first ## section) is intentionally untagged + UNTAGGED_ALLOWED_PREFIXES = ("LLM Reference Guide",) + + @classmethod + def _sections(cls): + """Return list of (title, tag_or_None) for every ## section.""" + import re + from rocpd.ai_analysis.llm_analyzer import get_reference_guide_path + + text = get_reference_guide_path().read_text() + tag_re = re.compile(r"") + results = [] + for raw in re.split(r"\n(?=## )", text): + if not raw.startswith("## "): + continue + title = raw.splitlines()[0][3:].strip() + head = "\n".join(raw.splitlines()[:3]) + match = tag_re.search(head) + tag = match.group(1).strip() if match else None + results.append((title, tag)) + return results + + def test_every_section_has_a_tag(self): + """No ## section should be accidentally left without a rocpd-context tag.""" + untagged = [ + title + for title, tag in self._sections() + if tag is None + and not any(title.startswith(p) for p in self.UNTAGGED_ALLOWED_PREFIXES) + ] + assert untagged == [], f"Sections missing rocpd-context tag: {untagged}" + + def test_all_tags_are_from_known_vocabulary(self): + """Catch typos in tag names e.g. 'tier_2' instead of 'tier2'.""" + bad = [] + for title, tag in self._sections(): + if tag is None: + continue + for t in (t.strip() for t in tag.split(",")): + if t not in self.KNOWN_TAGS: + bad.append((title, t)) + assert bad == [], f"Unknown tags found: {bad}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"]))