diff --git a/.gitignore b/.gitignore index 9e1a5ff76cd..251c8e5e60b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,15 @@ .cline_storage /projects/hip/_build + +# Claude Code session data +.claude/ +**/.claude/ + +# Python +__pycache__/ +**/__pycache__/ +*.pyc +*.pyo + +# Analysis output generated during testing +rocpd-output-data/ diff --git a/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake b/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake index 69573ad447a..cd00f349994 100644 --- a/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake +++ b/projects/rocprofiler-sdk/cmake/Modules/rocprofiler-sdk-utilities.cmake @@ -56,6 +56,17 @@ function(rocprofiler_sdk_pc_sampling_disabled _VAR) set(CMAKE_MESSAGE_INDENT "[${PROJECT_NAME}]${ARG_PREFIX} ") rocprofiler_sdk_get_gfx_architectures(rocprofiler-sdk-tests-gfx-info ECHO) + # Guard against empty GPU list (e.g. build machine without GPUs) + list(LENGTH rocprofiler-sdk-tests-gfx-info _gfx_list_len) + if(_gfx_list_len EQUAL 0) + set(${_VAR} + TRUE + PARENT_SCOPE) + if(ARG_ECHO) + message(STATUS "PC Sampling is disabled (no GPUs detected)") + endif() + return() + endif() list(GET rocprofiler-sdk-tests-gfx-info 0 pc-sampling-gpu-0-gfx-info) if("${pc-sampling-gpu-0-gfx-info}" MATCHES "^gfx90a$" @@ -88,6 +99,17 @@ function(rocprofiler_sdk_pc_sampling_stochastic_disabled _VAR) set(CMAKE_MESSAGE_INDENT "[${PROJECT_NAME}]${ARG_PREFIX} ") rocprofiler_sdk_get_gfx_architectures(rocprofiler-sdk-tests-gfx-info ECHO) + # Guard against empty GPU list (e.g. 
build machine without GPUs) + list(LENGTH rocprofiler-sdk-tests-gfx-info _gfx_list_len) + if(_gfx_list_len EQUAL 0) + set(${_VAR} + TRUE + PARENT_SCOPE) + if(ARG_ECHO) + message(STATUS "Stochastic PC Sampling is disabled (no GPUs detected)") + endif() + return() + endif() list(GET rocprofiler-sdk-tests-gfx-info 0 pc-sampling-gpu-0-gfx-info) if("${pc-sampling-gpu-0-gfx-info}" MATCHES "^gfx94[0-9]$" diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 42158285ac5..4dfd7a38a9d 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -1291,7 +1291,10 @@ def _write_env_value(): args.output_format = ["rocpd"] update_env( - "ROCPROF_OUTPUT_FORMAT", ",".join(args.output_format), append=True, join_char="," + "ROCPROF_OUTPUT_FORMAT", + ",".join(args.output_format), + append=True, + join_char=",", ) if args.kokkos_trace: diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py index 7a932507e82..e92276b356e 100644 --- a/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/__main__.py @@ -38,6 +38,7 @@ def main(argv=None, config=None): """ import argparse + from . import analyze from . import csv from . import merge from . 
import otf2 @@ -123,6 +124,27 @@ def main(argv=None, config=None): Aggregate 2 databases and output all summary files to HTML, only include HIP and MARKER regions, include domain summary $ rocpd summary -i db{0,1}.db --region-categories HIP MARKERS --domain-summary --format html +""" + + analyze_examples = """ + +Example usage: + + Analyze performance of a single database + $ rocpd analyze -i db0.db + + Analyze with output to file + $ rocpd analyze -i db0.db --format text -d ./output/ -o analysis + + Analyze top 20 kernels instead of default 10 + $ rocpd analyze -i db{0..3}.db --top-kernels 20 + + Analyze with a custom prompt (guides local analysis; enhances LLM output when --llm is used) + $ rocpd analyze -i db0.db --prompt "Why is my application slow?" + + Analyze with LLM-enhanced explanation + $ rocpd analyze -i db0.db --llm anthropic + """ input_help_string = "Input path and filename to one or more database(s). Wildcards accepted, as well as .rpdb folders" @@ -193,6 +215,14 @@ def add_required_args(_parser): epilog=summary_examples, ) + analyzer = subparsers.add_parser( + "analyze", + description="Analyze GPU performance traces with AI-powered insights", + allow_abbrev=False, + formatter_class=argparse.RawTextHelpFormatter, + epilog=analyze_examples, + ) + def get_output_type(val): return val.lower().replace("perfetto", "pftrace") @@ -213,6 +243,17 @@ def get_output_type(val): add_required_args(packager) add_required_args(query_reporter) add_required_args(generate_summary) + # analyze: -i is optional (not required when --source-dir is used for Tier 0) + _analyze_input_group = analyzer.add_argument_group("Required options") + _analyze_input_group.add_argument( + "-i", + "--input", + required=False, + default=None, + type=output_config.check_file_exists, + nargs="+", + help=input_help_string, + ) # converter: add args from any sub-modules process_converter_args = [] @@ -243,6 +284,12 @@ def get_output_type(val): 
process_generate_summary_args.append(summary.add_args(generate_summary)) process_generate_summary_args.append(time_window.add_args(generate_summary)) + # analyze: subparser args + process_analyzer_args = [] + process_analyzer_args.append(output_config.add_args(analyzer)) + process_analyzer_args.append(analyze.add_args(analyzer)) + process_analyzer_args.append(time_window.add_args(analyzer)) + # parse the command line arguments args = parser.parse_args(argv) @@ -355,6 +402,43 @@ def get_output_type(val): summary.generate_all_summaries(input, **summary_args) + # if the user requested AI analysis, execute the analyzer + elif args.command == "analyze": + # Validate: at least one of -i, --source-dir, or --interactive must be provided + has_input = bool(getattr(args, "input", None)) + has_source_dir = bool(getattr(args, "source_dir", None)) + has_interactive = bool(getattr(args, "interactive", None)) + if not has_input and not has_source_dir and not has_interactive: + analyzer.error( + "at least one of -i/--input, --source-dir, or --interactive is required.\n" + " Use -i output.db for trace analysis (Tier 1/2).\n" + " Use --source-dir ./src for source code analysis (Tier 0).\n" + ' Use --interactive "./my_app" for the 7-phase workflow.\n' + " Use both -i and --source-dir for combined analysis." + ) + + # construct the rocpd import data object (None if source-only) + if has_input: + input = RocpdImportData( + args.input, + automerge_limit=getattr( + args, "automerge_limit", package.IDEAL_NUMBER_OF_DATABASE_FILES + ), + ) + else: + input = None + + # analyzer subparser args + analyzer_args = {} + for pitr in process_analyzer_args: + analyzer_args.update(pitr(input, args)) + + # Pass source_dir if provided + if has_source_dir: + analyzer_args["source_dir"] = args.source_dir + + analyze.execute(input, **analyzer_args) + print("Done. 
Exiting...") diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md new file mode 100644 index 00000000000..d1c0d66b7b4 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/README.md @@ -0,0 +1,796 @@ +# rocpd AI Analysis Module + +AI-powered GPU performance analysis for AMD ROCm profiling data. + +## Overview + +This module provides both CLI and Python API access to AI-powered analysis of GPU profiling traces. It analyzes rocpd database files and generates human-readable insights, bottleneck identification, and actionable optimization recommendations. + +### Key Features + +- **Local-first analysis** - Works offline, no API calls required by default +- **Tier 0 source analysis** - Scan GPU source code without a trace database (`analyze_source()`) +- **Optional LLM enhancement** - Natural language explanations via Anthropic Claude, OpenAI GPT, any OpenAI-compatible private server, or local Ollama +- **User-modifiable "fence"** - Customize LLM behavior by editing reference guide +- **Privacy-focused** - Data sanitization for LLM mode (kernel names, grid sizes redacted) +- **Multiple output formats** - Python objects, JSON, text, markdown, webview (interactive HTML) +- **Interactive session** - Menu-driven analysis loop with persistent multi-turn LLM conversation and session persistence +- **Type-safe API** - Dataclass-based with type hints + +## Quick Start + +### CLI Usage + +```bash +# Basic analysis (local mode) +rocpd analyze -i output.db + +# With LLM enhancement โ€” Anthropic or OpenAI +export ANTHROPIC_API_KEY="sk-ant-..." 
+rocpd analyze -i output.db --llm anthropic + +# Private/enterprise OpenAI-compatible server +export ROCPD_LLM_PRIVATE_URL="https://llm-api.example.com/OpenAI" +export ROCPD_LLM_PRIVATE_HEADERS='{"Ocp-Apim-Subscription-Key": "abc123", "api-version": "preview"}' +rocpd analyze -i output.db --llm private --llm-private-model gpt-4o + +# Local Ollama model +rocpd analyze -i output.db --llm-local ollama --llm-local-model llama3 + +# With custom prompt +rocpd analyze -i output.db --llm anthropic --prompt "Why is my matmul kernel slow?" + +# JSON output (produces analysis.json) +rocpd analyze -i output.db --format json -d ./output -o analysis + +# Markdown output (produces analysis.md) +rocpd analyze -i output.db --format markdown -d ./output -o analysis + +# Interactive HTML webview (produces analysis.html) +rocpd analyze -i output.db --format webview -d ./output -o analysis + +# Tier 0: source code analysis (no .db required) +rocpd analyze --source-dir ./my_app +rocpd analyze --source-dir ./my_app --format json -d ./output -o plan + +# Combined: Tier 0 + Tier 1/2 +rocpd analyze -i output.db --source-dir ./my_app + +# Interactive menu session (persistent LLM conversation, session-persistent) +rocpd analyze -i output.db --interactive +rocpd analyze -i output.db --interactive --llm anthropic +rocpd analyze --source-dir ./my_app --interactive "./my_app arg1" --llm private + +# Resume a previous interactive session +rocpd analyze -i output.db --interactive --resume-session 2026-03-10_14-23-01_myapp +``` + +### Python API Usage + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Analyze a database +result = analyze_database(Path("output.db")) + +# Access results +print(result.summary.overall_assessment) +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") + +# Get recommendations +for rec in result.recommendations.high_priority: + print(f"๐Ÿ”ด {rec.title}") + print(f" {rec.description}") +``` + +## Module Structure + +``` 
+ai_analysis/ +โ”œโ”€โ”€ __init__.py # Public API exports (incl. LLMConversation, load_reference_guide) +โ”œโ”€โ”€ api.py # Main API functions, AnalysisResult, SourceAnalysisResult +โ”œโ”€โ”€ llm_analyzer.py # Single-shot LLM integration with "fence" implementation +โ”œโ”€โ”€ llm_conversation.py # Persistent multi-turn LLM session (LLMConversation) +โ”œโ”€โ”€ exceptions.py # Exception classes (incl. SourceDirectoryNotFoundError) +โ”œโ”€โ”€ source_analyzer.py # Tier 0: static source code scanner +โ”œโ”€โ”€ interactive.py # Interactive session: InteractiveSession + WorkflowSession +โ”‚ # SessionData, SessionStore dataclasses +โ”œโ”€โ”€ tests/ +โ”‚ โ”œโ”€โ”€ __init__.py +โ”‚ โ”œโ”€โ”€ test_api_standalone.py # 23 AI analysis API unit tests +โ”‚ โ”œโ”€โ”€ test_interactive.py # 22 interactive session unit tests +โ”‚ โ””โ”€โ”€ test_llm_conversation.py # 51 LLMConversation + integration tests +โ”œโ”€โ”€ share/ +โ”‚ โ””โ”€โ”€ llm-reference-guide.md # LLM "fence" - user-modifiable reference guide +โ”œโ”€โ”€ docs/ +โ”‚ โ”œโ”€โ”€ AI_ANALYSIS_API.md # API documentation +โ”‚ โ”œโ”€โ”€ SCHEMA_CHANGELOG.md # JSON schema version history (current: v0.2.0) +โ”‚ โ””โ”€โ”€ LLM_REFERENCE_GUIDE.md # Fence documentation +โ””โ”€โ”€ README.md # This file +``` + +## Architecture: The "Fence" + +The LLM reference guide ("fence") is a **user-modifiable markdown file** that controls LLM behavior: + +**Location:** +- `/opt/rocm/lib/python3.12/site-packages/rocpd/ai_analysis/share/llm-reference-guide.md` (default) +- Can be overridden with `ROCPD_LLM_REFERENCE_GUIDE` environment variable + +**What's in the guide:** +- **ROCm Profiling Tools** - Correct tool names and commands (rocprofv3, rocprof-compute, rocprof-sys) +- **Tool Documentation Links** - Official ROCm documentation references +- **AMD GPU Hardware Specs** - MI100, MI210/MI250/MI250X, MI300A/MI300X/MI325X, MI350X/MI355X, RDNA2/RDNA3 specifications with ridge points +- **Performance Analysis Models** - Roofline, Speed-of-Light, 
Top-Down methodologies +- **Bottleneck Classification** - Rules for identifying compute/memory/latency bottlenecks +- **Optimization Techniques** - AMD-specific optimization strategies +- **Recommendation Standards** - Quality requirements for actionable recommendations +- **Output Format Rules** - Consistent plain text format across all LLM providers + +**Enforced Tool Usage:** +- โœ… `rocprofv3` - Kernel-level profiling, counters, API tracing +- โœ… `rocprof-compute` - Roofline analysis, memory hierarchy metrics +- โœ… `rocprof-sys` (also known as `rocsys`) - System-wide, MPI, call-stack sampling +- โŒ NEVER `rocprof` or `rocprof-v2` (deprecated tools) + +**How it works:** +1. LLMAnalyzer loads the reference guide at initialization +2. Guide is included in every LLM API request as system prompt +3. LLM generates analysis following the guide's rules strictly +4. **To change LLM behavior, just edit the guide - no code changes** +5. All profiling commands are validated against official ROCm documentation + +Example modification: + +```bash +# Edit the reference guide +sudo nano /opt/rocm/lib/python3.12/site-packages/rocpd/ai_analysis/share/llm-reference-guide.md + +# Add new GPU specs, update tool commands, or change priority thresholds +# Save and exit - changes take effect immediately on next analysis +``` + +See [LLM Reference Guide Documentation](docs/LLM_REFERENCE_GUIDE.md) for details. + +## Data Flow + +``` +rocprofv3 --sys-trace --pmc GRBM_COUNT -- ./app + โ†“ +output.db created (SQLite database) + โ†“ +rocpd analyze -i output.db --llm anthropic + โ†“ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ 1. 
Local Analysis (always runs) โ”‚ +โ”‚ - Parse database โ”‚ +โ”‚ - Calculate metrics โ”‚ +โ”‚ - Apply performance models โ”‚ +โ”‚ - Generate recommendations โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ 2. LLM Enhancement (optional) โ”‚ +โ”‚ - Load reference guide ("fence") โ”‚ +โ”‚ - Sanitize data (privacy) โ”‚ +โ”‚ - Call Anthropic/OpenAI API โ”‚ +โ”‚ - Generate natural language output โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ +Analysis results (text/JSON/markdown/webview) +``` + +## Analysis Tiers + +| Tier | Data Required | Analysis Capabilities | +|------|---------------|----------------------| +| **Tier 0** | Source code directory (`--source-dir`) | Kernel detection, pattern scanning, profiling plan, suggested first command | +| **Tier 1** | Trace data (`-i db.db`) | Kernel hotspots, time breakdown, memory copy overhead | +| **Tier 2** | Trace + hardware counters (`--pmc`) | Roofline model, Speed-of-Light metrics, bottleneck classification | +| **Tier 3** | Trace + PC sampling (`--pc-sampling`) | Instruction-level hotspots within kernels | +| **Tier 4** | Trace + thread trace | Full instruction timeline, stall analysis | + +Tiers 0โ€“2 are implemented and production-ready. The interactive session automatically +suggests `ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling` (Tier 3) +once all Tier 1/2 data has been collected. 
+ +## API Reference + +### Main Functions + +```python +# Analyze database and return result object (Tier 1/2) +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult + +# Analyze source code directory and return profiling plan (Tier 0) +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult + +# Analyze and return JSON +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs +) -> str + +# Get filtered recommendations +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs +) -> List[Recommendation] + +# Validate database +def validate_database(database_path: Path) -> Dict[str, Any] +``` + +### Data Classes + +```python +@dataclass +class AnalysisResult: + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] + errors: List[str] + llm_enhanced_explanation: Optional[str] # If LLM enabled + tier0: Optional[SourceAnalysisResult] # If --source-dir also provided + + # Methods + def to_dict() -> Dict[str, Any] + def to_json(indent: int = 2) -> str + def to_text() -> str + def to_markdown() -> str + def to_webview() -> str # Self-contained interactive HTML report + +@dataclass +class SourceAnalysisResult: + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "PyTorch_HIP", etc. 
+ files_scanned: int + files_skipped: int + detected_kernels: List[Dict] # {name, file, line, launch_type} + kernel_count: int + detected_patterns: List[Dict] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + already_instrumented: bool + roctx_marker_count: int + recommendations: List[Dict] # Same shape as generate_recommendations() output + suggested_counters: List[str] + suggested_first_command: str + llm_explanation: Optional[str] +``` + +### Exceptions + +```python +AnalysisError (base) +โ”œโ”€โ”€ DatabaseNotFoundError +โ”œโ”€โ”€ DatabaseCorruptedError +โ”œโ”€โ”€ MissingDataError +โ”œโ”€โ”€ UnsupportedGPUError +โ”œโ”€โ”€ LLMAuthenticationError +โ”œโ”€โ”€ LLMRateLimitError +โ”œโ”€โ”€ ReferenceGuideNotFoundError +โ”œโ”€โ”€ SourceDirectoryNotFoundError # analyze_source(): directory not found +โ””โ”€โ”€ SourceAnalysisError # analyze_source(): scanning error +``` + +## LLM Enhancement + +### Enabling LLM Mode + +**Option 1: Environment variable** + +```bash +export ANTHROPIC_API_KEY="sk-ant-..." +rocpd analyze -i output.db --llm anthropic +``` + +**Option 2: Python API** + +```python +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + llm_api_key="sk-ant-..." +) +``` + +### Supported Providers + +- **Anthropic Claude** (recommended) + - Provider: `"anthropic"` + - Env var: `ANTHROPIC_API_KEY` + - Model: `claude-sonnet-4-20250514` + +- **OpenAI GPT** + - Provider: `"openai"` + - Env var: `OPENAI_API_KEY` + - Model: `gpt-4-turbo-preview` + +- **Private/enterprise server** (any OpenAI-compatible endpoint) + - Provider: `"private"` (`--llm private`) + - Required: `ROCPD_LLM_PRIVATE_URL` โ€” base URL of the server + - Required: `ROCPD_LLM_PRIVATE_MODEL` or `--llm-private-model` + - Optional: `ROCPD_LLM_PRIVATE_API_KEY` (default: `"dummy"` for header-authenticated servers) + - Optional: `ROCPD_LLM_PRIVATE_HEADERS` โ€” JSON or Python-dict of extra request headers + (e.g. 
`{"Ocp-Apim-Subscription-Key": "abc", "api-version": "preview"}`) + The `user` header is auto-set to `os.getlogin()` unless already present in `ROCPD_LLM_PRIVATE_HEADERS` + - Optional: `ROCPD_LLM_PRIVATE_VERIFY_SSL=0` โ€” disable SSL verification (requires `httpx`) + +- **Local Ollama** + - Provider: `--llm-local ollama` + - Env var: `ROCPD_LLM_LOCAL_URL` (default: `http://localhost:11434/v1`) + - Env var: `ROCPD_LLM_LOCAL_MODEL` (default: `codellama:13b`) + +### Data Sanitization + +When LLM mode is enabled, sensitive data is automatically redacted: + +| Original | Sanitized | +|----------|-----------| +| `conv2d_forward_kernel` | `[KERNEL_1]` | +| `[256, 256, 1]` | `[GRID_SIZE]` | +| `/home/user/app.cpp` | `[REDACTED]` | + +Aggregated metrics (time percentages, bottleneck classifications) are preserved. + +## Examples + +### Example 1: Basic Analysis + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) + +print(f"Summary: {result.summary.overall_assessment}") +print(f"Bottleneck: {result.summary.primary_bottleneck}") +print(f"Kernel time: {result.execution_breakdown.kernel_time_pct:.1f}%") +print(f"Memory copy: {result.execution_breakdown.memcpy_time_pct:.1f}%") + +print("\nHigh Priority Recommendations:") +for rec in result.recommendations.high_priority: + print(f" - {rec.title}") +``` + +### Example 2: With LLM Enhancement + +```python +import os +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." 
+ +result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + custom_prompt="Focus on memory bottlenecks" +) + +# LLM-generated natural language explanation +print(result.llm_enhanced_explanation) +``` + +### Example 3: JSON Output + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +json_output = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json") +) + +# JSON is also returned as string +import json +data = json.loads(json_output) +print(f"Analysis tier: {data['profiling_info']['analysis_tier']}") +``` + +### Example 4: Interactive HTML Webview + +```bash +# Generate a self-contained HTML report for browser viewing +# Output file extension is applied automatically (.html for webview) +rocpd analyze -i output.db --format webview -d ./reports -o my_trace +# Produces: ./reports/my_trace.html +``` + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) +html_report = result.to_webview() +Path("analysis.html").write_text(html_report) +``` + +The HTML report is a fully self-contained, offline-capable file with: +- **Light/Dark theme toggle** โ€” persisted in `localStorage`; defaults to AMD dark theme +- **Status summary badges** โ€” Critical/Warning counts visible in the header at a glance +- **Metric pills row** โ€” Runtime, kernel count, tier, timestamp, and DB path in the header +- **Status-colored KPI cards** โ€” Kernel %, bottleneck type, runtime, and tier cards each + have a green/amber/red top border reflecting health status +- **Priority icons on recommendations** โ€” ๐Ÿ”ด HIGH, ๐ŸŸ  MEDIUM, ๐ŸŸก LOW, โ„น INFO +- **FAB scroll-to-top button** โ€” Floating button appears after scrolling +- **Staggered fade-in animations** on section cards +- **Hover tooltips on every visual element** โ€” gauges, bars, table headers, counter rows, + and 
overview stats explain what each metric measures, target thresholds, and how to + act on issues. Hardware counter rows (GRBM_*, SQ_*, TCP/TCC, FETCH_SIZE, etc.) + include educational content about the underlying hardware event being counted. + +### Example 5: roc-optiq Integration + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +def load_trace_for_optiq(trace_path: str): + """Load trace and extract insights for Optiq UI""" + result = analyze_database(Path(trace_path)) + + return { + "summary": result.summary.overall_assessment, + "bottleneck": result.summary.primary_bottleneck, + "recommendations": [ + { + "title": rec.title, + "description": rec.description, + "priority": rec.priority + } + for rec in result.recommendations.high_priority[:3] + ], + "breakdown": { + "kernel_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_pct": result.execution_breakdown.memcpy_time_pct + } + } +``` + +## Interactive Session + +The interactive session (`--interactive`) launches a menu-driven loop for iterative profiling analysis. It maintains a **persistent multi-turn `LLMConversation`** across all calls within the same session โ€” the LLM accumulates full message history and doesn't repeat itself. 
+ +### Session menu + +``` +[p] Profile โ€” run a new rocprofv3 command and analyze the output .db +[a] Analyze โ€” re-analyze the current .db and update recommendations +[o] Optimize โ€” ask the LLM for optimization suggestions +[s] Save โ€” save session to disk +[q] Quit +``` + +### LLM conversation persistence + +`InteractiveSession` holds one `LLMConversation` for the entire session: +- All `[a]`, `[o]`, and code-edit LLM calls share the same conversation object +- The LLM sees the full message history from earlier in the session +- History is automatically compacted to stay within context limits (`--llm-compact-every N`, default 10 turns) +- Source files are tracked: a file sent once is not re-transmitted on repeat calls (only new files are sent); the file set is serialized into the session JSON and restored on `--resume-session` +- On `[s]` save, the conversation state is serialized into the session file +- On `--resume-session`, the conversation is restored so the LLM picks up exactly where it left off + +### Phase 1b: Quick workload analysis (WorkflowSession) + +Before presenting the initial profiling command in Phase 2, `WorkflowSession` runs a +lightweight workload analysis to pick the best starter flags: + +1. **App-command heuristics** โ€” always runs; inspects binary name and arguments: + - `python` + ML keywords (torch, jax, paddleโ€ฆ) โ†’ `python_ml`; adds `--hip-trace` + - `python` + LLM keywords (vllm, llama, gptโ€ฆ) โ†’ `llm_inference`; adds `--hip-trace` + - `python` without ML โ†’ `python_generic`; adds `--hip-trace` + - MPI/Slurm launchers (`mpirun`, `srun`โ€ฆ) โ†’ warns about multi-rank capture limits + - Compiled HIP/ROCm binary โ†’ `hip_compute`; uses default flag set + - Multi-process patterns (torchrun, DDP, DeepSpeed) โ†’ warns about worker capture + +2. 
**Tier 0 source analysis** โ€” if `--source-dir` was provided, runs `SourceAnalyzer` + on the source directory and extracts the recommended flags from its highest-priority + profiling recommendation; overrides the pure-heuristic flag set. + +3. **Fallback** โ€” if neither source analysis nor heuristics yield specific flags, the + safe default is used: `--sys-trace --kernel-trace --memory-copy-trace --stats`. + +The analysis output is printed before the command box so the user can see what was +detected and why specific flags were chosen. The user always confirms or edits the +command in Phase 2. + +**Example output:** +``` +โ”€โ”€ Quick Workload Analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + Detected: Python + ML framework (PyTorch / JAX / TF) + Source scan: 14 files, 3 kernels, model=hip_python + Source analysis suggests: rocprofv3 --sys-trace --hip-trace --kernel-trace --stats ... + Starter command basis: source analysis + +โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ +โ”‚ Profiling Command โ”‚ +โ”‚ rocprofv3 --sys-trace --hip-trace --kernel-trace --stats ... โ”‚ +โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + Would you like the interactive tool to run this command? 
[Y/n] +``` + +### Cycle prevention and going deeper (WorkflowSession) + +The 7-phase `WorkflowSession` (`--interactive ""`) automatically detects and +breaks counter-collection/API-tracing cycles: + +- **Fingerprint all collection flags** โ€” when deciding whether to re-suggest a command, + the session checks `--sys-trace`, `--hip-trace`, `--kernel-trace`, `--memory-copy-trace`, + `--hsa-trace`, `--stats`, and individual `--pmc` counter names. +- **Compares against all prior runs** โ€” the dedup check looks at the union of everything + collected across all previous trace runs, not just the last one. +- **Tier 3 escalation** โ€” once all Tier 1/2 data has been collected, Phase 5 shows a + "go deeper" menu: + - TraceLens interval + kernel-category analysis is already shown in the report. + - `[d]` builds a PC sampling command and wires it into Phase 7 as option `[3]`: + ``` + ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling \ + -d /tmp/rocpd_trace/run_ -o results -- + ``` + - `ENV=VALUE` prefixes in commands are automatically extracted and injected into the + subprocess environment (no `shell=True` needed). + +### AI-edit revert + +When the AI modifies source files (Phase 6), the session backs up each file to `.bak`. +Typing `revert` (or `undo` / `v` / `r`) in the recompile-wait prompt triggers the full revert +flow: + +1. **Ask for error context** โ€” if no errors were pasted yet, the session prompts: + ``` + What went wrong? Paste the error output or briefly describe the issue. + (Press Enter to skip and proceed without error context) + > + ``` +2. **File restored** from `.bak` backup immediately. +3. **LLM analysis** โ€” calls the LLM with the original code, the failed edit, and the + error description. Response is formatted as: + - `ANALYSIS:` root-cause explanation of what went wrong + - `ALTERNATIVE:` a concrete corrected approach with specific code changes +4. **Offer to apply** โ€” `Apply this alternative approach now? 
[y/N]` + - If yes: shows a unified diff and applies on confirmation (with new `.bak`) +5. **What-next menu**: + ``` + What would you like to do next? + [f] Try a different fix โ€” let the AI attempt another approach + [p] Continue to re-profiling (skip code changes this round) + [q] Exit session + ``` + - `[f]` is shown only in Phase 6 context (not after profiling failure in Phase 3) + - `[f]` re-enters the Phase 6 retry loop for a fresh LLM rewrite attempt + - `[p]` falls through to Phase 7 (re-profiling prompt) + - `[q]` saves the session and exits + +**Phase 3 failure revert** โ€” `[v]` also appears in the Phase 3 retry menu when AI edits +exist. In that context, `[f] Try a different fix` is not offered (a new edit can't be +applied until re-profiling has run); instead only `[p]` (continue) and `[q]` (exit) appear. + +### AI-suggested commands + +After the LLM responds to `[o]`, the session scans the response text for `rocprofv3 ...` commands and combines them with structured commands from the current recommendation list. If any are found, the user is offered a numbered menu to run one immediately. If run, the resulting `.db` is auto-analyzed and the LLM is notified. + +### Session persistence + +Both session classes save to `~/.rocpd/sessions/` automatically: + +| Session class | Save triggers | File pattern | Resume | +|---|---|---|---| +| `InteractiveSession` | `[s]` key, `[q]` quit, Ctrl+C | `_.json` | `--resume-session` or auto-detect | +| `WorkflowSession` | after Phase 3 trace run, after Phase 6 edit, on exit/Ctrl+C | `workflow__.json` | not supported (new state each run) | + +The session file path is printed in the session summary so you always know where to find it. + +> **Note:** `--resume-session` applies only to **`InteractiveSession`** (the menu-driven +> `[p]/[a]/[o]/[s]/[q]` mode, triggered by `rocpd analyze -i db.db --interactive` **without** +> a `""` argument). 
`WorkflowSession` (7-phase workflow) starts a fresh state +> each invocation and does not support resume. + +```bash +# Start a new InteractiveSession +rocpd analyze -i output.db --interactive --llm anthropic + +# With private enterprise server +rocpd analyze -i output.db --interactive --llm private + +# Control compaction interval (default 10 turns) +rocpd analyze -i output.db --interactive --llm anthropic --llm-compact-every 5 + +# List available session IDs (files in ~/.rocpd/sessions/) +ls ~/.rocpd/sessions/*.json | xargs -I{} python3 -c \ + "import json,sys; d=json.load(open('{}'));print(d['session_id'],'|',d['source_dir'])" + +# Resume an existing session โ€” restores LLM conversation, sent files, and history +# Session ID format: YYYY-MM-DD_HH-MM-SS_ +rocpd analyze -i output.db --interactive --resume-session 2026-03-10_14-23-01_myapp + +# If the source dir matches a previous session, the tool auto-prompts to resume +# (no --resume-session needed) +rocpd analyze -i output.db --source-dir ./my_app --interactive +``` + +--- + +## Testing + +### Unit Tests + +```bash +# Run from /tmp to avoid circular import of libpyrocpd +ROCPD_SYS=/opt/rocm-7.0.0/lib/python3.12/site-packages +TEST_DIR=/path/to/rocm-systems-dev/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/tests + +# All tests +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR} --noconftest -v + +# Interactive session tests only +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR}/test_interactive.py --noconftest -v + +# LLMConversation + integration tests +cd /tmp && PYTHONPATH="${ROCPD_SYS}" python3 -m pytest ${TEST_DIR}/test_llm_conversation.py --noconftest -v +``` + +### Integration Tests + +```bash +cd rocm-systems-dev/projects/rocprofiler-sdk/build +ctest -R rocpd-ai-analysis +``` + +### Manual Testing + +```bash +# Generate test trace +rocprofv3 --sys-trace --pmc GRBM_COUNT SQ_WAVES -- ./sample_app + +# Analyze +rocpd analyze -i output.db + +# With LLM 
(requires API key) +export ANTHROPIC_API_KEY="sk-ant-..." +rocpd analyze -i output.db --llm anthropic +``` + +## Configuration + +### Environment Variables + +| Variable | Purpose | +|---|---| +| `ANTHROPIC_API_KEY` | Anthropic Claude API key | +| `OPENAI_API_KEY` | OpenAI GPT API key | +| `ROCPD_LLM_MODEL` | Override default model for anthropic or openai provider | +| `ROCPD_LLM_REFERENCE_GUIDE` | Path to custom reference guide (overrides package default) | +| `ROCPD_LLM_PRIVATE_URL` | Base URL for private/enterprise OpenAI-compatible server (required for `--llm private`) | +| `ROCPD_LLM_PRIVATE_MODEL` | Model name for private server | +| `ROCPD_LLM_PRIVATE_API_KEY` | API key for private server (default: `"dummy"`) | +| `ROCPD_LLM_PRIVATE_HEADERS` | JSON or Python-dict of extra HTTP request headers (e.g. `{"Ocp-Apim-Subscription-Key": "..."}`) | +| `ROCPD_LLM_PRIVATE_VERIFY_SSL` | Set to `0` or `false` to disable SSL cert verification (requires `httpx`) | +| `ROCPD_LLM_LOCAL_URL` | Base URL for local Ollama endpoint (default: `http://localhost:11434/v1`) | +| `ROCPD_LLM_LOCAL_MODEL` | Model name for local Ollama (default: `codellama:13b`) | + +### Reference Guide Location + +Default: `/opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md` + +Override: +```bash +export ROCPD_LLM_REFERENCE_GUIDE=/path/to/custom-guide.md +``` + +## Documentation + +- **[AI Analysis API Documentation](../../../docs/AI_ANALYSIS_API.md)** - Complete API reference +- **[LLM Reference Guide Documentation](../../../docs/LLM_REFERENCE_GUIDE.md)** - How to customize LLM behavior +- **[rocpd README](../README.md)** - Main rocpd documentation + +## Development + +### Adding New Analysis Features + +1. Add analysis logic to `analyze.py` (main rocpd module) +2. Update `api.py` to expose new data in `AnalysisResult` +3. Update reference guide if LLM should use new feature +4. 
Add tests + +### Modifying LLM Behavior + +**Don't modify code.** Edit the reference guide instead: + +```bash +sudo nano /opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md +``` + +See [LLM Reference Guide Documentation](../../../docs/LLM_REFERENCE_GUIDE.md) for examples. + +## Troubleshooting + +### Reference Guide Not Found + +```bash +# Check which path is being used +python3 -c "from rocpd.ai_analysis.llm_analyzer import get_reference_guide_path; print(get_reference_guide_path())" + +# Copy from source +sudo cp share/llm-reference-guide.md /opt/rocm/share/rocprofiler-sdk/ + +# Or use environment variable +export ROCPD_LLM_REFERENCE_GUIDE=/path/to/guide.md +``` + +### LLM Authentication Errors + +```bash +# Verify API key is set +echo $ANTHROPIC_API_KEY + +# Test API key directly +python3 << EOF +import anthropic +client = anthropic.Anthropic(api_key="sk-ant-...") +print("API key valid!") +EOF +``` + +### Database Errors + +```bash +# Validate database +python3 << EOF +from rocpd.ai_analysis import validate_database +from pathlib import Path + +validation = validate_database(Path("output.db")) +print(f"Valid: {validation['is_valid']}") +print(f"Tier: {validation['tier']}") +print(f"Tables: {validation['tables']}") +EOF +``` + +## Contributing + +- Follow existing code style (PEP 8) +- Add type hints +- Write docstrings (Google style) +- Add unit tests +- Update documentation + +## License + +MIT License - Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ +## Support + +- File issues on GitHub +- See [rocprofiler-sdk documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/) +- ROCm community: https://rocm.docs.amd.com/ diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py new file mode 100644 index 00000000000..f94f130a27b --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/__init__.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +AI Analysis Module for rocpd + +This module provides AI-powered GPU performance analysis with optional +LLM enhancement. The analysis is guided by a user-modifiable reference +guide (the "fence") that ensures high-quality, actionable insights. 
+ +Key Features: +- Local-first analysis (always available, no internet required) +- Optional LLM enhancement (Anthropic Claude, OpenAI GPT) +- User-modifiable reference guide for customizing LLM behavior +- Data sanitization for privacy in LLM mode +- JSON, text, and markdown output formats + +Usage: + from rocpd.ai_analysis import analyze_database + + result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic" + ) + + print(result.summary.overall_assessment) +""" + +from .api import ( + analyze_database, + analyze_database_to_json, + analyze_source, + get_kernel_analysis, + get_recommendations, + validate_database, + AnalysisResult, + SourceAnalysisResult, + OutputFormat, +) + +from .exceptions import ( + AnalysisError, + DatabaseNotFoundError, + DatabaseCorruptedError, + MissingDataError, + UnsupportedGPUError, + LLMAuthenticationError, + LLMRateLimitError, + ReferenceGuideNotFoundError, + SourceDirectoryNotFoundError, + SourceAnalysisError, +) + +from .llm_analyzer import LLMAnalyzer, AnalysisContext, load_reference_guide +from .llm_conversation import LLMConversation + + +def _get_interactive(): + from .interactive import InteractiveSession, SessionStore, SessionData + + return InteractiveSession, SessionStore, SessionData + + +def __getattr__(name): + if name in ("InteractiveSession", "SessionStore", "SessionData"): + InteractiveSession, SessionStore, SessionData = _get_interactive() + # Cache in module globals to avoid repeated import on subsequent accesses + import sys + + mod = sys.modules[__name__] + mod.InteractiveSession = InteractiveSession + mod.SessionStore = SessionStore + mod.SessionData = SessionData + return getattr(mod, name) + raise AttributeError(f"module 'rocpd.ai_analysis' has no attribute {name!r}") + + +__all__ = [ + # Main API functions + "analyze_database", + "analyze_database_to_json", + "analyze_source", + "get_kernel_analysis", + "get_recommendations", + "validate_database", + # Data 
classes + "AnalysisResult", + "SourceAnalysisResult", + "OutputFormat", + # Exceptions + "AnalysisError", + "DatabaseNotFoundError", + "DatabaseCorruptedError", + "MissingDataError", + "UnsupportedGPUError", + "LLMAuthenticationError", + "LLMRateLimitError", + "ReferenceGuideNotFoundError", + "SourceDirectoryNotFoundError", + "SourceAnalysisError", + # Interactive session + "InteractiveSession", + "SessionStore", + "SessionData", + # LLM integration + "LLMAnalyzer", + "AnalysisContext", + "load_reference_guide", + "LLMConversation", +] + +__version__ = "0.1.0" diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py new file mode 100644 index 00000000000..bc12aac38ed --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/api.py @@ -0,0 +1,1137 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +Public Python API for rocpd AI analysis. + +This module provides a simple function-based API for programmatic access +to AI-powered GPU performance analysis. Designed for integration with +tools like Optiq. 
+ +Example: + from rocpd.ai_analysis import analyze_database + from pathlib import Path + + result = analyze_database(Path("output.db")) + print(result.summary.overall_assessment) + + for rec in result.recommendations.high_priority: + print(f"- {rec.title}") +""" + +from dataclasses import dataclass, field, asdict +from enum import Enum +from pathlib import Path +from typing import List, Optional, Dict, Any + +try: + from importlib.metadata import version as _pkg_version + + _ROCPD_VERSION = _pkg_version("rocpd") +except Exception: + _ROCPD_VERSION = "0.1.0" # fallback if metadata not available (common in dev / ROCm system installs) + +from ..analyze import ( + compute_time_breakdown, + identify_hotspots, + analyze_memory_copies, + analyze_hardware_counters, + generate_recommendations, + format_analysis_output, + _detect_already_collected, +) +from ..tracelens_port import ( + compute_interval_timeline, + analyze_kernels_by_category, + analyze_short_kernels, +) +from .llm_analyzer import AnalysisContext, LLMAnalyzer +from .exceptions import ( + DatabaseNotFoundError, + DatabaseCorruptedError, + LLMAuthenticationError, + LLMRateLimitError, + SourceDirectoryNotFoundError, +) + + +class OutputFormat(Enum): + """Output format options""" + + PYTHON_OBJECT = "python_object" # Returns dataclass + JSON = "json" + TEXT = "text" + MARKDOWN = "markdown" + WEBVIEW = "webview" # Self-contained interactive HTML + + +@dataclass +class AnalysisMetadata: + """Metadata about the analysis""" + + rocpd_version: str + analysis_version: str = "0.1.0" + database_file: str = "" + analysis_timestamp: str = "" + analysis_duration_ms: int = 0 + custom_prompt: Optional[str] = None + + +@dataclass +class GPUInfo: + """GPU device information""" + + name: str + architecture: str + agent_id: int = 0 + + +@dataclass +class ProfilingInfo: + """Profiling session information""" + + total_duration_ns: int + profiling_mode: str # "sys_trace_only", "sys_trace_with_counters", "pc_sampling" + 
analysis_tier: int # 1=trace, 2=counters, 3=pc_sampling + gpus: List[GPUInfo] = field(default_factory=list) + + +@dataclass +class AnalysisSummary: + """High-level summary of analysis""" + + overall_assessment: str + primary_bottleneck: str # "compute", "memory", "latency", "mixed", "unknown" + confidence: float # 0.0 to 1.0 + key_findings: List[str] = field(default_factory=list) + + +@dataclass +class ExecutionBreakdown: + """Time distribution breakdown""" + + kernel_time_ns: int + kernel_time_pct: float + memcpy_time_ns: int + memcpy_time_pct: float + api_overhead_ns: int = 0 + api_overhead_pct: float = 0.0 + idle_time_ns: int = 0 + idle_time_pct: float = 0.0 + + +@dataclass +class Recommendation: + """Single recommendation""" + + id: str + priority: str # "high", "medium", "low" + category: str # "memory", "compute", "occupancy", "memory_transfer", etc. + title: str + description: str + estimated_impact: str + next_steps: List[str] = field(default_factory=list) + + +@dataclass +class RecommendationSet: + """Prioritized recommendations""" + + high_priority: List[Recommendation] = field(default_factory=list) + medium_priority: List[Recommendation] = field(default_factory=list) + low_priority: List[Recommendation] = field(default_factory=list) + + +@dataclass +class AnalysisWarning: + """Warning message""" + + severity: str # "warning", "info" + message: str + recommendation: Optional[str] = None + + +@dataclass +class SourceAnalysisResult: + """ + Tier 0 analysis result from static source code scanning. + + Produced by analyze_source() and attached to AnalysisResult.tier0 + when --source-dir is provided alongside -i. + """ + + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "OpenCL", "PyTorch_HIP", etc. 
+ + files_scanned: int + files_skipped: int + + detected_kernels: List[Dict[str, Any]] # {name, file, line, launch_type} + kernel_count: int + + detected_patterns: List[ + Dict[str, Any] + ] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + + already_instrumented: bool + roctx_marker_count: int + + recommendations: List[Dict[str, Any]] # same structure as generate_recommendations() + suggested_counters: List[str] + suggested_first_command: str + + llm_explanation: Optional[str] = None + + +def _plan_to_source_result(plan) -> "SourceAnalysisResult": + """Convert a ProfilingPlan to a SourceAnalysisResult dataclass. + + Centralizes the conversion logic so both api.py:analyze_source() and + analyze.py:analyze_source_code() produce identical SourceAnalysisResult + objects without duplicating the field-mapping code. + """ + return SourceAnalysisResult( + source_dir=plan.source_dir, + analysis_timestamp=plan.analysis_timestamp, + programming_model=plan.programming_model, + files_scanned=plan.files_scanned, + files_skipped=plan.files_skipped, + detected_kernels=[ + { + "name": k.name, + "file": k.file, + "line": k.line, + "launch_type": k.launch_type, + } + for k in plan.detected_kernels + ], + kernel_count=plan.kernel_count, + detected_patterns=[ + { + "pattern_id": p.pattern_id, + "severity": p.severity, + "category": p.category, + "description": p.description, + "count": p.count, + "locations": p.locations, + } + for p in plan.detected_patterns + ], + risk_areas=plan.risk_areas, + already_instrumented=plan.already_instrumented, + roctx_marker_count=plan.roctx_marker_count, + recommendations=plan.recommendations, + suggested_counters=plan.suggested_counters, + suggested_first_command=plan.suggested_first_command, + ) + + +@dataclass +class AnalysisResult: + """ + Complete analysis result structure. + + This is the main return type for analyze_database(). + Contains all analysis data and can be serialized to JSON/text/markdown. 
+ """ + + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] = field(default_factory=list) + errors: List[str] = field(default_factory=list) + + # Optional LLM-enhanced natural language explanation + llm_enhanced_explanation: Optional[str] = None + + # Tier 0 source code analysis (populated when analyze_source() is also run) + tier0: Optional[SourceAnalysisResult] = None + + # TraceLens-derived analysis (Phase 1) + kernel_categories: List[dict] = field(default_factory=list) + short_kernels: dict = field(default_factory=dict) + interval_timeline: dict = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return asdict(self) + + def to_json(self, indent: int = 2) -> str: + """Serialize to schema-conformant JSON (analysis-output.schema.json v0.1.0). + + Delegates to format_analysis_output() to ensure the output matches the + normative JSON schema. Falls back to dataclass serialization if raw + analysis data is not available. + """ + raw = getattr(self, "_raw", None) + if raw: + return format_analysis_output( + time_breakdown=raw["time_breakdown"], + hotspots=raw["hotspots"], + memory_analysis=raw["memory_analysis"], + recommendations=raw["recommendations_raw"], + hardware_counters=raw["hardware_counters"], + database_path=raw["database_path"], + output_format="json", + interval_timeline=raw.get("interval_timeline"), # NEW + kernel_categories=raw.get("kernel_categories"), # NEW + short_kernels=raw.get("short_kernels"), # NEW + ) + raise RuntimeError( + "Raw analysis data not available. " + "Use analyze_database() to create the result, " + "or use to_dict() for a non-schema-conformant dict." + ) + + def to_webview(self) -> str: + """Generate self-contained interactive HTML report. 
+ + Returns the same AMD-themed webview HTML produced by the rocpd CLI + ``--format webview`` option. Requires that the result was created via + :func:`analyze_database` (which populates the raw data cache). + + Raises: + RuntimeError: If the result was not created via analyze_database(). + """ + raw = getattr(self, "_raw", None) + if not raw: + raise RuntimeError( + "Raw analysis data not available. " + "Use analyze_database() to create the result." + ) + return format_analysis_output( + time_breakdown=raw["time_breakdown"], + hotspots=raw["hotspots"], + memory_analysis=raw["memory_analysis"], + recommendations=raw["recommendations_raw"], + hardware_counters=raw["hardware_counters"], + database_path=raw["database_path"], + output_format="webview", + interval_timeline=raw.get("interval_timeline"), # NEW + kernel_categories=raw.get("kernel_categories"), # NEW + short_kernels=raw.get("short_kernels"), # NEW + ) + + def to_text(self) -> str: + """Generate plain text report. + + Works without ``_raw`` attached; renders from dataclass fields directly. + Does NOT guarantee schema conformance (use ``to_json()`` for that). 
+ """ + lines = [] + + # Header + lines.append("=" * 80) + lines.append("GPU PERFORMANCE ANALYSIS REPORT") + lines.append("=" * 80) + lines.append(f"Database: {self.metadata.database_file}") + lines.append(f"Analysis Date: {self.metadata.analysis_timestamp}") + lines.append(f"Analysis Tier: {self.profiling_info.analysis_tier}") + if self.metadata.custom_prompt: + lines.append(f"Custom Prompt: {self.metadata.custom_prompt}") + lines.append("") + + # Summary + lines.append("SUMMARY") + lines.append("-" * 80) + lines.append(self.summary.overall_assessment) + lines.append(f"Primary Bottleneck: {self.summary.primary_bottleneck}") + lines.append(f"Confidence: {self.summary.confidence:.0%}") + lines.append("") + + # Key findings + if self.summary.key_findings: + lines.append("Key Findings:") + for finding in self.summary.key_findings: + lines.append(f" โ€ข {finding}") + lines.append("") + + # Execution breakdown + lines.append("EXECUTION BREAKDOWN") + lines.append("-" * 80) + lines.append( + f"Kernel Execution: {self.execution_breakdown.kernel_time_pct:6.1f}%" + ) + lines.append( + f"Memory Copies: {self.execution_breakdown.memcpy_time_pct:6.1f}%" + ) + lines.append( + f"API Overhead: {self.execution_breakdown.api_overhead_pct:6.1f}%" + ) + lines.append("") + + # Recommendations + lines.append("RECOMMENDATIONS") + lines.append("-" * 80) + + for priority, recs in [ + ("HIGH PRIORITY", self.recommendations.high_priority), + ("MEDIUM PRIORITY", self.recommendations.medium_priority), + ("LOW PRIORITY", self.recommendations.low_priority), + ]: + if recs: + lines.append(f"\n{priority}:") + for rec in recs: + lines.append(f"\n {rec.title}") + lines.append(f" {rec.description}") + lines.append(f" Estimated Impact: {rec.estimated_impact}") + if rec.next_steps: + lines.append(" Next Steps:") + for step in rec.next_steps: + lines.append(f" - {step}") + + # LLM-enhanced explanation (if available) + if self.llm_enhanced_explanation: + lines.append("\n") + lines.append("=" * 80) + 
lines.append("AI-ENHANCED EXPLANATION") + lines.append("=" * 80) + lines.append(self.llm_enhanced_explanation) + + # Warnings + if self.warnings: + lines.append("\n") + lines.append("WARNINGS") + lines.append("-" * 80) + for warning in self.warnings: + lines.append(f"โš ๏ธ {warning.message}") + if warning.recommendation: + lines.append(f" Recommendation: {warning.recommendation}") + + lines.append("\n" + "=" * 80) + return "\n".join(lines) + + def to_markdown(self) -> str: + """Generate markdown report. + + Works without ``_raw`` attached; renders from dataclass fields directly. + Does NOT guarantee schema conformance (use ``to_json()`` for that). + """ + lines = [] + + # Header + lines.append("# GPU Performance Analysis Report") + lines.append("") + lines.append(f"**Database:** `{self.metadata.database_file}`") + lines.append(f"**Analysis Date:** {self.metadata.analysis_timestamp}") + lines.append(f"**Analysis Tier:** {self.profiling_info.analysis_tier}") + if self.metadata.custom_prompt: + lines.append(f"**Custom Prompt:** _{self.metadata.custom_prompt}_") + lines.append("") + + # Summary + lines.append("## Summary") + lines.append("") + lines.append(self.summary.overall_assessment) + lines.append("") + lines.append(f"- **Primary Bottleneck:** {self.summary.primary_bottleneck}") + lines.append(f"- **Confidence:** {self.summary.confidence:.0%}") + lines.append("") + + # Key findings + if self.summary.key_findings: + lines.append("### Key Findings") + lines.append("") + for finding in self.summary.key_findings: + lines.append(f"- {finding}") + lines.append("") + + # Execution breakdown + lines.append("## Execution Breakdown") + lines.append("") + lines.append("| Category | Percentage |") + lines.append("|----------|------------|") + lines.append( + f"| Kernel Execution | {self.execution_breakdown.kernel_time_pct:.1f}% |" + ) + lines.append( + f"| Memory Copies | {self.execution_breakdown.memcpy_time_pct:.1f}% |" + ) + lines.append( + f"| API Overhead | 
{self.execution_breakdown.api_overhead_pct:.1f}% |" + ) + lines.append("") + + # Recommendations + lines.append("## Recommendations") + lines.append("") + + for priority, recs, emoji in [ + ("High Priority", self.recommendations.high_priority, "๐Ÿ”ด"), + ("Medium Priority", self.recommendations.medium_priority, "๐ŸŸก"), + ("Low Priority", self.recommendations.low_priority, "๐ŸŸข"), + ]: + if recs: + lines.append(f"### {emoji} {priority}") + lines.append("") + for rec in recs: + lines.append(f"#### {rec.title}") + lines.append("") + lines.append(rec.description) + lines.append("") + lines.append(f"**Estimated Impact:** {rec.estimated_impact}") + lines.append("") + if rec.next_steps: + lines.append("**Next Steps:**") + for step in rec.next_steps: + lines.append(f"- {step}") + lines.append("") + + # LLM-enhanced explanation + if self.llm_enhanced_explanation: + lines.append("---") + lines.append("") + lines.append("## AI-Enhanced Explanation") + lines.append("") + lines.append(self.llm_enhanced_explanation) + lines.append("") + + # Warnings + if self.warnings: + lines.append("## Warnings") + lines.append("") + for warning in self.warnings: + lines.append(f"โš ๏ธ **{warning.severity.upper()}:** {warning.message}") + if warning.recommendation: + lines.append(f" - Recommendation: {warning.recommendation}") + lines.append("") + + return "\n".join(lines) + + +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + llm_thinking_tokens: Optional[int] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult: + """ + Analyze a rocpd database file and return AI-powered insights. + + This is the main entry point for programmatic analysis. + Performs local analysis (always) and optional LLM enhancement. 
+ + Args: + database_path: Path to .rpd or .db file + custom_prompt: Optional user question to guide analysis + enable_llm: Enable LLM-powered natural language enhancement + llm_provider: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider (or set env var) + llm_thinking_tokens: Enable extended thinking with this token budget. + Only supported with the Anthropic provider and compatible models + (claude-opus-4, claude-sonnet-4-5, claude-3-7-sonnet). + output_format: Desired output format + verbose: Enable verbose logging + top_kernels: Number of top kernels to analyze + + Returns: + AnalysisResult object with complete analysis + + Raises: + DatabaseNotFoundError: Database file doesn't exist + DatabaseCorruptedError: Database schema is invalid + MissingDataError: Required tables are missing + + Example: + >>> from rocpd.ai_analysis import analyze_database + >>> from pathlib import Path + >>> + >>> result = analyze_database(Path("output.db")) + >>> print(result.summary.overall_assessment) + >>> for rec in result.recommendations.high_priority: + ... print(f"- {rec.title}") + """ + # Validate database exists + if not database_path.exists(): + raise DatabaseNotFoundError(f"Database file not found: {database_path}") + + if verbose: + print(f"[Analysis] Analyzing database: {database_path}") + print(f"[Analysis] Enable LLM: {enable_llm}") + if custom_prompt: + print(f"[Analysis] Custom prompt: {custom_prompt}") + + # Perform local analysis by calling individual analysis functions directly. + # NOTE: We do NOT call analyze_performance() โ€” it returns a formatted str, + # not a dict. We need raw data to build the AnalysisResult dataclass. + try: + from ..importer import RocpdImportData + + # RocpdImportData's internal sanitize_input_list() iterates over its + # argument. Passing a plain str would iterate over characters. Pass a + # list with the single path string to ensure correct behavior. 
+ connection = RocpdImportData([str(database_path)]) + + time_breakdown = compute_time_breakdown(connection) + hotspots = identify_hotspots(connection, top_n=top_kernels) + memory_analysis = analyze_memory_copies(connection) + hardware_counters = analyze_hardware_counters(connection) + already_collected = _detect_already_collected(connection) + + # TraceLens-derived analysis + interval_timeline = compute_interval_timeline(connection) + kernel_categories = analyze_kernels_by_category( + connection, interval_timeline["total_wall_ns"] + ) + short_kernels_data = analyze_short_kernels(connection) + + recommendations = generate_recommendations( + time_breakdown, + hotspots, + memory_analysis, + hardware_counters, + already_collected, + short_kernels=short_kernels_data, + interval_timeline=interval_timeline, + ) + + if verbose: + print("[Analysis] Local analysis complete") + + except Exception as e: + raise DatabaseCorruptedError(f"Failed to analyze database: {e}") + + # Build AnalysisResult from raw analysis payloads + result = _build_analysis_result( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + custom_prompt=custom_prompt, + ) + + result.kernel_categories = kernel_categories + result.short_kernels = short_kernels_data + result.interval_timeline = interval_timeline + + # Also write into _raw so to_json() / to_webview() include them + result._raw["interval_timeline"] = interval_timeline + result._raw["kernel_categories"] = kernel_categories + result._raw["short_kernels"] = short_kernels_data + + # Optional LLM enhancement + if enable_llm and llm_provider: + try: + if verbose: + print(f"[Analysis] Enhancing with {llm_provider} LLM...") + + analyzer = LLMAnalyzer( + provider=llm_provider, + api_key=llm_api_key, + verbose=verbose, + thinking_budget_tokens=llm_thinking_tokens, + ) + + # Convert result to dict for LLM + 
analysis_data = _convert_result_to_llm_format(result) + + # Build AnalysisContext so _select_tags() gates reference guide sections + # (including tracelens_metrics when TraceLens data is present) + has_counters = hardware_counters.get("has_counters", False) + analysis_tier = 2 if has_counters else 1 + context = AnalysisContext( + tier=analysis_tier, + has_counters=has_counters, + custom_prompt=custom_prompt, + kernel_categories=result.kernel_categories or [], + interval_timeline={ + k: v + for k, v in result.interval_timeline.items() + if k.endswith("_pct") + }, + short_kernel_summary=( + { + "threshold_us": result.short_kernels.get("threshold_us", 10), + "short_kernel_count": result.short_kernels.get( + "short_kernel_count", 0 + ), + "wasted_pct_of_kernel_time": result.short_kernels.get( + "wasted_pct_of_kernel_time", 0 + ), + } + if result.short_kernels + else None + ), + ) + + # Get LLM enhancement + llm_explanation = analyzer.analyze_with_llm( + analysis_data, + custom_prompt=custom_prompt, + context=context, + ) + + result.llm_enhanced_explanation = llm_explanation + + if verbose: + print("[Analysis] LLM enhancement complete") + + except (LLMAuthenticationError, LLMRateLimitError): + # Auth and rate-limit errors must propagate โ€” the caller needs to + # know their credentials are invalid or exhausted. + raise + except Exception as e: + # Other LLM errors are non-critical: add a warning and continue + # with local-only results. 
+ result.warnings.append( + AnalysisWarning( + severity="warning", + message=f"LLM enhancement failed: {e}", + recommendation="Analysis continues with local-only results", + ) + ) + + if verbose: + print(f"[Analysis] LLM enhancement failed: {e}") + + return result + + +def _build_analysis_result( + time_breakdown: Dict[str, Any], + hotspots: List[Dict[str, Any]], + memory_analysis: Dict[str, Any], + recommendations: List[Dict[str, Any]], + hardware_counters: Dict[str, Any], + database_path: Path, + custom_prompt: Optional[str], +) -> AnalysisResult: + """Build AnalysisResult from raw analysis payloads returned by analyze.py functions. + + Key mapping from generate_recommendations() output: + rec["issue"] โ†’ Recommendation.title + rec["suggestion"] โ†’ Recommendation.description + rec["estimated_impact"] โ†’ Recommendation.estimated_impact + rec["actions"] โ†’ Recommendation.next_steps + rec["priority"] โ†’ "HIGH"/"MEDIUM"/"INFO" (uppercase) โ†’ normalized to lowercase + """ + from datetime import datetime + + # Build metadata + metadata = AnalysisMetadata( + rocpd_version=_ROCPD_VERSION, + analysis_version="0.1.0", # schema version, not module version + database_file=str(database_path), + analysis_timestamp=datetime.now().isoformat(), + custom_prompt=custom_prompt, + ) + + # Build profiling info + has_counters = hardware_counters.get("has_counters", False) + profiling_mode = "sys_trace_with_counters" if has_counters else "sys_trace_only" + analysis_tier = 2 if has_counters else 1 + + profiling_info = ProfilingInfo( + total_duration_ns=int(time_breakdown.get("total_runtime", 0)), + profiling_mode=profiling_mode, + analysis_tier=analysis_tier, + gpus=[], + ) + + # Build summary โ€” mirrors _build_summary() logic in analyze.py + primary_bottleneck = "mixed" + confidence = 0.50 + + memcpy_pct = time_breakdown.get("memcpy_percent", 0) + kernel_pct = time_breakdown.get("kernel_percent", 0) + overhead_pct = time_breakdown.get("overhead_percent", 0) + if memcpy_pct > 30: 
+ primary_bottleneck = "memory_transfer" + confidence = 0.85 + elif memcpy_pct > 20: + primary_bottleneck = "memory_transfer" + confidence = 0.70 + elif overhead_pct > 25: + primary_bottleneck = "latency" + confidence = 0.75 + elif kernel_pct > 70 and has_counters: + primary_bottleneck = "compute" + confidence = 0.80 + elif kernel_pct > 70: + primary_bottleneck = "compute" + confidence = 0.60 + + summary = AnalysisSummary( + overall_assessment=f"Analysis complete. {len(hotspots)} kernels analyzed.", + primary_bottleneck=primary_bottleneck, + confidence=confidence, + key_findings=[ + f"Total kernel execution time: {kernel_pct:.1f}%", + f"Memory copy overhead: {memcpy_pct:.1f}%", + f"Top kernel: {hotspots[0]['name'] if hotspots else 'N/A'}", + ], + ) + + # Build execution breakdown + execution_breakdown = ExecutionBreakdown( + kernel_time_ns=int(time_breakdown.get("total_kernel_time", 0)), + kernel_time_pct=kernel_pct, + memcpy_time_ns=int(time_breakdown.get("total_memcpy_time", 0)), + memcpy_time_pct=memcpy_pct, + api_overhead_pct=time_breakdown.get("overhead_percent", 0.0), + ) + + # Build recommendations โ€” map keys from generate_recommendations() output. + # generate_recommendations() uses: issue, suggestion, estimated_impact, actions, + # priority (uppercase: "HIGH"/"MEDIUM"/"INFO"), category, commands. 
+ rec_set = RecommendationSet() + for i, rec in enumerate(recommendations, 1): + priority_upper = rec.get("priority", "MEDIUM").upper() + recommendation = Recommendation( + id=f"rec_{i:03d}", + priority=priority_upper.lower(), + category=rec.get("category", "general"), + title=rec.get("issue", "Optimization opportunity"), + description=rec.get("suggestion", ""), + estimated_impact=rec.get("estimated_impact", "Unknown"), + next_steps=rec.get("actions", []), + ) + + if priority_upper == "HIGH": + rec_set.high_priority.append(recommendation) + elif priority_upper in ("MEDIUM", "INFO"): + rec_set.medium_priority.append(recommendation) + else: + rec_set.low_priority.append(recommendation) + + # Build warnings + warnings = [] + if not has_counters: + warnings.append( + AnalysisWarning( + severity="warning", + message="No hardware counters collected. Analysis limited to Tier 1 (trace data only).", + recommendation="Collect counters with: rocprofv3 --pmc GRBM_COUNT SQ_WAVES -- ./app", + ) + ) + + result = AnalysisResult( + metadata=metadata, + profiling_info=profiling_info, + summary=summary, + execution_breakdown=execution_breakdown, + recommendations=rec_set, + warnings=warnings, + ) + + # Attach raw payloads as a dynamic attribute so to_json()/to_webview() can + # delegate serialization to format_analysis_output() for schema conformance. + result._raw = { + "time_breakdown": time_breakdown, + "hotspots": hotspots, + "memory_analysis": memory_analysis, + "recommendations_raw": recommendations, + "hardware_counters": hardware_counters, + "database_path": str(database_path), + } + + return result + + +def _convert_result_to_llm_format(result: AnalysisResult) -> Dict[str, Any]: + """Convert AnalysisResult to the format expected by LLMAnalyzer._sanitize_data(). + + Populates all sections from the raw analysis payloads stored on the result + so the LLM receives real profiling data rather than empty placeholders. 
+ """ + raw = getattr(result, "_raw", {}) + hotspots = raw.get("hotspots", []) + memory_analysis = raw.get("memory_analysis", {}) + hardware_counters = raw.get("hardware_counters", {}) + + return { + # GPU info โ€” arch not currently stored in the DB views; keep as generic + "gpu": {"name": "AMD GPU", "arch": "unknown"}, + "execution_breakdown": { + "kernel_time_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_time_pct": result.execution_breakdown.memcpy_time_pct, + "api_overhead_pct": result.execution_breakdown.api_overhead_pct, + }, + # Real kernel hotspot data + "kernels": [ + { + "name": k.get("name"), + "calls": k.get("calls"), + "total_duration_ns": k.get("total_duration"), + "avg_duration_ns": k.get("avg_duration"), + "percent_of_total": k.get("percent_of_total"), + } + for k in hotspots + ], + # Real memory transfer data keyed by direction + "memory_ops": { + direction: { + "count": info.get("count"), + "total_bytes": info.get("total_bytes"), + "avg_duration_ns": info.get("avg_duration"), + } + for direction, info in memory_analysis.items() + }, + "has_counters": hardware_counters.get("has_counters", False), + # Derived hardware metrics (gpu_utilization_percent, avg_waves, etc.) 
+ "hardware_metrics": hardware_counters.get("metrics", {}), + "has_pc_sampling": result.profiling_info.analysis_tier >= 3, + "interval_timeline": { + k: v + for k, v in result.interval_timeline.items() + if k.endswith("_pct") # pct fields only โ€” omit _ns fields to reduce tokens + }, + "kernel_categories": [ + {k: v for k, v in c.items() if k != "total_ns" and k != "avg_duration_ns"} + for c in result.kernel_categories + ], + "short_kernel_summary": { + "threshold_us": result.short_kernels.get("threshold_us", 10), + "short_kernel_count": result.short_kernels.get("short_kernel_count", 0), + "wasted_pct_of_kernel_time": result.short_kernels.get( + "wasted_pct_of_kernel_time", 0 + ), + }, + } + + +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs, +) -> str: + """ + Analyze database and return/save JSON output. + + Args: + database_path: Path to .rpd or .db file + output_json_path: Optional path to save JSON file + **kwargs: Additional arguments passed to analyze_database() + + Returns: + JSON string + + Example: + >>> json_output = analyze_database_to_json( + ... Path("output.db"), + ... output_json_path=Path("analysis.json") + ... ) + """ + result = analyze_database(database_path, **kwargs) + json_output = result.to_json() + + if output_json_path: + output_json_path.write_text(json_output) + + return json_output + + +def get_kernel_analysis(database_path: Path, kernel_name: str, **kwargs) -> Dict: + """ + Get analysis for a specific kernel. 
+ + Args: + database_path: Path to .rpd or .db file + kernel_name: Exact kernel name or pattern + **kwargs: Additional arguments + + Returns: + Kernel analysis data + """ + # TODO: Implement kernel-specific analysis + raise NotImplementedError("Kernel-specific analysis not yet implemented") + + +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs, +) -> List[Recommendation]: + """ + Get filtered recommendations from analysis. + + Args: + database_path: Path to .rpd or .db file + priority_filter: Filter by priority ("high", "medium", "low") + category_filter: Filter by category + **kwargs: Additional arguments + + Returns: + List of Recommendation objects + """ + result = analyze_database(database_path, **kwargs) + + recommendations = [] + if priority_filter == "high" or priority_filter is None: + recommendations.extend(result.recommendations.high_priority) + if priority_filter == "medium" or priority_filter is None: + recommendations.extend(result.recommendations.medium_priority) + if priority_filter == "low" or priority_filter is None: + recommendations.extend(result.recommendations.low_priority) + + if category_filter: + recommendations = [ + rec for rec in recommendations if rec.category == category_filter + ] + + return recommendations + + +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult: + """ + Analyze a source code directory and return a Tier 0 profiling plan. + + No database file is required. Scans .hip, .cpp, .cu, .cl, .py, .h, + .hpp files for GPU programming patterns and generates structured + recommendations for what to profile and with which commands. 
+ + Args: + source_dir: Path to source code directory + custom_prompt: Optional user question to guide LLM analysis + enable_llm: Enable LLM-powered explanation of the profiling plan + llm_provider: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider (or set env var) + verbose: Enable verbose logging + + Returns: + SourceAnalysisResult with profiling plan + + Raises: + SourceDirectoryNotFoundError: Source directory doesn't exist + SourceAnalysisError: Error during source scanning + + Example: + >>> from rocpd.ai_analysis import analyze_source + >>> from pathlib import Path + >>> + >>> result = analyze_source(Path("./my_app/src")) + >>> print(result.programming_model) + >>> print(result.suggested_first_command) + >>> for rec in result.recommendations: + ... print(f"[{rec['priority']}] {rec['category']}: {rec['issue']}") + """ + if not source_dir.exists() or not source_dir.is_dir(): + raise SourceDirectoryNotFoundError( + f"Source directory not found or not a directory: {source_dir}" + ) + + if verbose: + print(f"[Tier0] Scanning source directory: {source_dir}") + + from .source_analyzer import SourceAnalyzer + + scanner = SourceAnalyzer(source_dir, verbose=verbose) + plan = scanner.analyze() + + if verbose: + print( + f"[Tier0] Scanned {plan.files_scanned} files, " + f"found {plan.kernel_count} kernels, " + f"programming model: {plan.programming_model}" + ) + + # Convert ProfilingPlan to SourceAnalysisResult dataclass + result = _plan_to_source_result(plan) + + # Optional LLM enhancement + if enable_llm and llm_provider: + try: + if verbose: + print(f"[Tier0] Enhancing with {llm_provider} LLM...") + + analyzer = LLMAnalyzer( + provider=llm_provider, + api_key=llm_api_key, + verbose=verbose, + ) + context = AnalysisContext(tier=0, custom_prompt=custom_prompt) + result.llm_explanation = analyzer.analyze_source_with_llm( + result, custom_prompt=custom_prompt, context=context + ) + + if verbose: + print("[Tier0] LLM enhancement complete") + + 
except (LLMAuthenticationError, LLMRateLimitError): + raise + except Exception as e: + if verbose: + print(f"[Tier0] LLM enhancement failed: {e}") + + return result + + +def validate_database(database_path: Path) -> Dict[str, Any]: + """ + Validate database schema and contents without performing analysis. + + Args: + database_path: Path to .rpd or .db file + + Returns: + Validation result dictionary + + Example: + >>> validation = validate_database(Path("output.db")) + >>> print(f"Valid: {validation['is_valid']}") + >>> print(f"Analysis tier: {validation['tier']}") + """ + if not database_path.exists(): + raise DatabaseNotFoundError(f"Database not found: {database_path}") + + try: + from ..importer import RocpdImportData, execute_statement + + connection = RocpdImportData([str(database_path)]) + + # Check for required tables AND views (kernels/memory_copies are views, + # not raw tables, in rocprofv3 databases created by the rocpd importer) + tables_query = "SELECT name FROM sqlite_master WHERE type IN ('table','view')" + tables = [ + row[0] for row in execute_statement(connection, tables_query).fetchall() + ] + + has_kernels = "kernels" in tables + has_memory_copies = "memory_copies" in tables + has_counters = "pmc_events" in tables + has_pc_sampling = "pc_sampling" in tables + + # Determine tier + tier = 1 + if has_counters: + tier = 2 + if has_pc_sampling: + tier = 3 + + return { + "is_valid": has_kernels, + "tier": tier, + "has_kernels": has_kernels, + "has_memory_copies": has_memory_copies, + "has_counters": has_counters, + "has_pc_sampling": has_pc_sampling, + "tables": tables, + } + + except Exception as e: + raise DatabaseCorruptedError(f"Database validation failed: {e}") diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md new file mode 100644 index 00000000000..41ff6e09ded --- /dev/null +++ 
b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/AI_ANALYSIS_API.md
@@ -0,0 +1,1645 @@
+# rocpd AI Analysis Python API Documentation
+
+**Version:** 0.2.0
+**Module:** `rocpd.ai_analysis`
+
+---
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Installation](#installation)
+3. [Quick Start](#quick-start)
+4. [API Reference](#api-reference)
+5. [Data Classes](#data-classes)
+6. [Output Formats](#output-formats)
+7. [LLM Enhancement](#llm-enhancement)
+8. [Error Handling](#error-handling)
+9. [Integration Examples](#integration-examples)
+10. [Bug Fixes & Behavioral Changes](#bug-fixes--behavioral-changes)
+
+---
+
+## Overview
+
+The rocpd AI Analysis API provides programmatic access to AI-powered GPU performance analysis. It's designed for integration with visualization tools (like Optiq), automated analysis pipelines, and custom workflows.
+
+**Key Features:**
+
+- ✅ **Local-first analysis** - Works offline, no API calls required
+- ✅ **Tier 0 source analysis** - Scan source code without a trace database (`analyze_source()`)
+- ✅ **Optional LLM enhancement** - Natural language explanations via Anthropic Claude, OpenAI GPT, any OpenAI-compatible private server, or local Ollama
+- ✅ **Multiple output formats** - Python objects, JSON, text, markdown, webview (interactive HTML)
+- ✅ **Privacy-focused** - Data sanitization for LLM mode
+- ✅ **User-modifiable** - Customize LLM behavior via reference guide
+- ✅ **Persistent conversations** - `LLMConversation` class for multi-turn streaming sessions
+- ✅ **Type-safe** - Dataclass-based API with type hints
+
+---
+
+## Installation
+
+The AI analysis module is included with rocprofiler-sdk 6.3.0 or later.
+ +```bash +# rocprofiler-sdk is typically installed at: +/opt/rocm/lib/python3.12/site-packages/rocpd/ + +# No additional installation needed for local-only analysis + +# For LLM enhancement, install provider SDKs: +pip install anthropic # For Anthropic Claude +pip install openai # For OpenAI GPT +``` + +--- + +## Quick Start + +### Basic Analysis (Local Mode) + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Analyze a database file +result = analyze_database(Path("output.db")) + +# Access results +print(result.summary.overall_assessment) +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") +print(f"Confidence: {result.summary.confidence:.0%}") + +# Get recommendations +for rec in result.recommendations.high_priority: + print(f"๐Ÿ”ด {rec.title}") + print(f" {rec.description}") + print(f" Impact: {rec.estimated_impact}") +``` + +### With LLM Enhancement + +```python +import os +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +# Set API key +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." + +# Analyze with LLM enhancement +result = analyze_database( + database_path=Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + custom_prompt="Why is my matmul kernel slow?" 
+) + +# LLM-enhanced natural language explanation +print(result.llm_enhanced_explanation) +``` + +### JSON Output + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +# Generate JSON output +json_output = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json") # Optional: save to file +) + +# JSON string is also returned +print(json_output) +``` + +### Webview (Interactive HTML) + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +result = analyze_database(Path("output.db")) +Path("analysis.html").write_text(result.to_webview()) +# Open analysis.html in any browser - no server required +``` + +Or via CLI (file extension applied automatically): + +```bash +rocpd analyze -i output.db --format webview -d ./output -o analysis +# Produces: ./output/analysis.html +``` + +--- + +## API Reference + +### Main Functions + +#### `analyze_database()` + +Main entry point for performance analysis. 
+ +```python +def analyze_database( + database_path: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + output_format: OutputFormat = OutputFormat.PYTHON_OBJECT, + verbose: bool = False, + top_kernels: int = 10, +) -> AnalysisResult: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file (.rpd or .db) +- `custom_prompt` (str, optional): Natural language question to guide analysis + - Example: `"Why is kernel X slow?"` +- `enable_llm` (bool): Enable LLM-powered enhancements (default: False) +- `llm_provider` (str, optional): LLM provider ("anthropic" or "openai") +- `llm_api_key` (str, optional): API key (or use environment variable) +- `output_format` (OutputFormat): Output format (default: PYTHON_OBJECT) +- `verbose` (bool): Enable verbose logging (default: False) +- `top_kernels` (int): Number of top kernels to analyze (default: 10) + +**Returns:** + +- `AnalysisResult`: Complete analysis results object + +**Raises:** + +- `DatabaseNotFoundError`: Database file doesn't exist +- `DatabaseCorruptedError`: Database schema is invalid +- `MissingDataError`: Required tables missing +- `LLMAuthenticationError`: LLM API key invalid (if enable_llm=True) + +**Example:** + +```python +from rocpd.ai_analysis import analyze_database, OutputFormat +from pathlib import Path + +result = analyze_database( + database_path=Path("output.db"), + custom_prompt="Focus on memory bottlenecks", + enable_llm=True, + llm_provider="anthropic", + verbose=True, + top_kernels=20 +) +``` + +--- + +#### `analyze_database_to_json()` + +Analyze database and return JSON output. 
+ +```python +def analyze_database_to_json( + database_path: Path, + output_json_path: Optional[Path] = None, + **kwargs +) -> str: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file +- `output_json_path` (Path, optional): Save JSON to this file +- `**kwargs`: Additional arguments passed to `analyze_database()` + +**Returns:** + +- `str`: JSON string + +**Example:** + +```python +from rocpd.ai_analysis import analyze_database_to_json +from pathlib import Path + +json_str = analyze_database_to_json( + database_path=Path("output.db"), + output_json_path=Path("analysis.json"), + enable_llm=True, + llm_provider="anthropic" +) +``` + +--- + +#### `get_recommendations()` + +Get filtered recommendations from analysis. + +```python +def get_recommendations( + database_path: Path, + priority_filter: Optional[str] = None, + category_filter: Optional[str] = None, + **kwargs +) -> List[Recommendation]: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file +- `priority_filter` (str, optional): Filter by priority ("high", "medium", "low") +- `category_filter` (str, optional): Filter by category ("memory", "compute", etc.) +- `**kwargs`: Additional arguments passed to `analyze_database()` + +**Returns:** + +- `List[Recommendation]`: Filtered recommendations + +**Example:** + +```python +from rocpd.ai_analysis import get_recommendations +from pathlib import Path + +# Get only high-priority recommendations +high_priority_recs = get_recommendations( + database_path=Path("output.db"), + priority_filter="high" +) + +for rec in high_priority_recs: + print(f"{rec.title}: {rec.estimated_impact}") +``` + +--- + +#### `validate_database()` + +Validate database without performing full analysis. 
+ +```python +def validate_database(database_path: Path) -> Dict[str, Any]: +``` + +**Parameters:** + +- `database_path` (Path): Path to rocpd database file + +**Returns:** + +- `Dict`: Validation results with keys: + - `is_valid` (bool): Database is valid + - `tier` (int): Analysis tier (1=trace, 2=counters, 3=pc_sampling) + - `has_kernels` (bool): Has kernel data + - `has_memory_copies` (bool): Has memory copy data + - `has_counters` (bool): Has hardware counters + - `has_pc_sampling` (bool): Has PC sampling data + - `tables` (List[str]): List of table names + +**Example:** + +```python +from rocpd.ai_analysis import validate_database +from pathlib import Path + +validation = validate_database(Path("output.db")) + +print(f"Valid: {validation['is_valid']}") +print(f"Analysis tier: {validation['tier']}") +print(f"Has counters: {validation['has_counters']}") +``` + +--- + +#### `analyze_source()` + +Analyze source code directory (Tier 0) and return a profiling plan. No database required. 
+ +```python +def analyze_source( + source_dir: Path, + *, + custom_prompt: Optional[str] = None, + enable_llm: bool = False, + llm_provider: Optional[str] = None, + llm_api_key: Optional[str] = None, + verbose: bool = False, +) -> SourceAnalysisResult: +``` + +**Parameters:** + +- `source_dir` (Path): Directory containing GPU source code (`.hip`, `.cpp`, `.cu`, `.cl`, `.py`, `.h`, `.hpp`) +- `custom_prompt` (str, optional): Natural language question to guide LLM analysis +- `enable_llm` (bool): Enable LLM-powered explanation of the profiling plan (default: False) +- `llm_provider` (str, optional): LLM provider ("anthropic" or "openai") +- `llm_api_key` (str, optional): API key (or use environment variable) +- `verbose` (bool): Enable verbose logging (default: False) + +**Returns:** + +- `SourceAnalysisResult`: Profiling plan with detected kernels, patterns, risk areas, and suggested commands + +**Raises:** + +- `SourceDirectoryNotFoundError`: Source directory doesn't exist +- `SourceAnalysisError`: Error during source scanning + +**Example:** + +```python +from rocpd.ai_analysis import analyze_source +from pathlib import Path + +result = analyze_source(Path("./my_app/src")) +print(f"Programming model: {result.programming_model}") +print(f"Kernels found: {result.kernel_count}") +print(f"Suggested first command:\n {result.suggested_first_command}") + +for rec in result.recommendations: + print(f"[{rec['priority']}] {rec['category']}: {rec['issue']}") +``` + +**CLI equivalent:** + +```bash +rocpd analyze --source-dir ./my_app/src +rocpd analyze --source-dir ./my_app/src --format json -d ./out -o plan # โ†’ plan.json + +# Combined with trace database +rocpd analyze -i output.db --source-dir ./my_app/src +``` + +--- + +### Recommendation Deduplication + +The engine automatically detects what was already collected in the profiled run and +suppresses redundant suggestions: + +| Already in database | Commands suppressed | +|---|---| +| `kernels` rows | `rocprofv3 
--kernel-trace` | +| `memory_copies` rows | `rocprofv3 --memory-copy-trace` | +| `kernels` + `regions` rows | All `--sys-trace`-equivalent flags | +| `pmc_events` counter `X` | `--pmc X` in any `rocprofv3` command | + +**PMC counter example**: if the trace was collected with +`--pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES`, a "Low occupancy" recommendation that +would have suggested `--pmc SQ_WAVES SQ_WAVE_CYCLES TA_TA_BUSY` will be trimmed to +`--pmc SQ_WAVE_CYCLES TA_TA_BUSY` (only the uncollected counters). If *all* suggested +counters are already present the entire `rocprofv3` command is dropped. + +`rocprof-compute` commands are **never** dropped โ€” they always represent new deep +hardware counter analysis beyond what `rocprofv3` captures. + +--- + +## Data Classes + +### `AnalysisResult` + +Main result object containing all analysis data. + +**Attributes:** + +```python +@dataclass +class AnalysisResult: + metadata: AnalysisMetadata + profiling_info: ProfilingInfo + summary: AnalysisSummary + execution_breakdown: ExecutionBreakdown + recommendations: RecommendationSet + warnings: List[AnalysisWarning] + errors: List[str] + llm_enhanced_explanation: Optional[str] # Only if enable_llm=True +``` + +**Methods:** + +- `to_dict() -> Dict[str, Any]`: Convert to dictionary +- `to_json(indent: int = 2) -> str`: Serialize to JSON +- `to_text() -> str`: Generate plain text report +- `to_markdown() -> str`: Generate markdown report +- `to_webview() -> str`: Generate self-contained interactive HTML report + +**Example:** + +```python +result = analyze_database(Path("output.db")) + +# Convert to different formats +json_str = result.to_json() +text_report = result.to_text() +markdown_report = result.to_markdown() + +# Access structured data +print(f"Kernel time: {result.execution_breakdown.kernel_time_pct:.1f}%") +print(f"Primary bottleneck: {result.summary.primary_bottleneck}") +``` + +--- + +### `Recommendation` + +Single optimization recommendation. 
+ +```python +@dataclass +class Recommendation: + id: str + priority: str # "high", "medium", "low" + category: str # "memory", "compute", "occupancy", etc. + title: str + description: str + estimated_impact: str + next_steps: List[str] +``` + +**Example:** + +```python +for rec in result.recommendations.high_priority: + print(f"ID: {rec.id}") + print(f"Title: {rec.title}") + print(f"Category: {rec.category}") + print(f"Impact: {rec.estimated_impact}") + print("Next steps:") + for step in rec.next_steps: + print(f" - {step}") +``` + +--- + +### `SourceAnalysisResult` + +Tier 0 analysis result from static source code scanning (returned by `analyze_source()`). + +**Attributes:** + +```python +@dataclass +class SourceAnalysisResult: + source_dir: str + analysis_timestamp: str + programming_model: str # "HIP", "HIP+ROCm_Libraries", "OpenCL", "PyTorch_HIP", etc. + + files_scanned: int + files_skipped: int + + detected_kernels: List[Dict] # {name, file, line, launch_type} + kernel_count: int + + detected_patterns: List[Dict] # {pattern_id, severity, category, description, count, locations} + risk_areas: List[str] + + already_instrumented: bool # True if ROCTx markers detected + roctx_marker_count: int + + recommendations: List[Dict] # Same structure as generate_recommendations() output + suggested_counters: List[str] # Recommended --pmc counters for this codebase + suggested_first_command: str # First rocprofv3 command to run + + llm_explanation: Optional[str] # Only if enable_llm=True +``` + +**Example:** + +```python +result = analyze_source(Path("./my_app")) + +# Programming model detection +print(result.programming_model) # "HIP+ROCm_Libraries" + +# Discovered kernels +for k in result.detected_kernels: + print(f" {k['name']} in {k['file']}:{k['line']}") + +# Risk patterns +for p in result.detected_patterns: + print(f"[{p['severity'].upper()}] {p['category']}: {p['description']}") + +# Suggested profiling workflow +print(result.suggested_first_command) +# e.g.: 
rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -- ./app +``` + +--- + +### Other Data Classes + +- `AnalysisMetadata`: Metadata about analysis (timestamps, versions, etc.) +- `ProfilingInfo`: Profiling session info (duration, mode, GPUs) +- `AnalysisSummary`: High-level summary (assessment, bottleneck, findings) +- `ExecutionBreakdown`: Time distribution (kernel, memcpy, API overhead) +- `RecommendationSet`: Prioritized recommendations (high/medium/low) +- `AnalysisWarning`: Warning messages + +See inline docstrings for complete documentation. + +--- + +## Output Formats + +### Python Object (Default) + +Returns `AnalysisResult` dataclass with full type safety. + +```python +result = analyze_database(Path("output.db")) +print(result.summary.overall_assessment) +``` + +### JSON + +Machine-readable structured data. Output file extension: `.json`. + +```python +from rocpd.ai_analysis import analyze_database, OutputFormat + +result = analyze_database( + Path("output.db"), + output_format=OutputFormat.JSON +) + +json_str = result.to_json(indent=2) +``` + +**JSON Output conforms to `analysis-output.schema.json` (v0.1.0):** + +```json +{ + "schema_version": "0.1.0", + "metadata": { + "rocpd_version": "6.3.0", + "analysis_version": "0.1.0", + "database_file": "/path/to/output.db", + "analysis_timestamp": "2026-02-07T14:30:00Z" + }, + "execution_breakdown": { + "kernel_time_pct": 40.0, + "memcpy_time_pct": 55.0, + "api_overhead_pct": 5.0, + "idle_time_pct": 0.0, + "total_runtime_ns": 5000000000 + }, + "hotspots": [ + { + "rank": 1, + "name": "conv2d_kernel", + "calls": 100, + "total_duration_ns": 2000000000, + "avg_duration_ns": 20000000, + "pct_of_total": 40.0 + } + ], + "memory_analysis": { ... }, + "hardware_counters": { ... 
}, + "recommendations": [ + { + "priority": "HIGH", + "category": "Low Occupancy", + "issue": "Average wave occupancy is low", + "suggestion": "Increase occupancy by reducing VGPR usage", + "estimated_impact": "15-20% performance improvement", + "actions": ["Use rocprof-compute to measure occupancy", ...], + "commands": [...] + } + ], + "warnings": [...] +} +``` + +> See `docs/analysis-output.schema.json` for the normative schema definition and +> `docs/SCHEMA_CHANGELOG.md` for version history. + +### Text + +Human-readable plain text report. Output file extension: `.txt`. + +```python +result = analyze_database(Path("output.db")) +text_report = result.to_text() +print(text_report) +``` + +### Markdown + +Markdown-formatted report with syntax highlighting. Output file extension: `.md`. + +```python +result = analyze_database(Path("output.db")) +markdown_report = result.to_markdown() +Path("report.md").write_text(markdown_report) +``` + +### Webview (Interactive HTML) + +Self-contained single-file HTML report with light/dark theme, sortable tables, interactive +recommendation cards, status-colored KPI cards, and SVG performance gauges. No external +dependencies โ€” works fully offline. Output file extension: `.html`. + +```python +result = analyze_database(Path("output.db")) +html_report = result.to_webview() +Path("report.html").write_text(html_report) +``` + +**CLI usage:** + +```bash +# Produces output/analysis.html automatically +rocpd analyze -i output.db --format webview -d ./output -o analysis +``` + +**Features of the HTML report:** + +- **Light/Dark theme toggle**: Persisted in `localStorage`; defaults to AMD dark. Header + always uses AMD gradient branding regardless of active theme. +- **Status summary badges**: Critical/Warning/Low/Info recommendation counts shown in the + sticky header โ€” key issues visible without scrolling. 
+- **Metric pills row**: Runtime (ms), kernel dispatch count, analysis tier, generation
+  timestamp, and DB file path in a compact row below the header.
+- **Status-colored KPI cards**: Kernel %, bottleneck type, total runtime, and tier cards
+  with colored top border (green/amber/red) reflecting health status.
+- **Priority icons on recommendations**: 🔴 HIGH, 🟠 MEDIUM, 🟡 LOW, ℹ INFO icons on each card.
+- **Overview panel**: Assessment text (blockquote style), status KPI grid, key findings list.
+- **Execution breakdown**: Gradient segment bars + grid-aligned legend rows.
+- **Recommendations**: Collapsible cards color-coded by priority (HIGH auto-expanded);
+  one-click copy of profiling commands; section-level Critical/Warning count badges.
+- **Hotspot table**: Sortable by any column; rows with >20% of total time highlighted.
+- **Memory transfers**: Per-direction table (H2D, D2H, D2D, P2P).
+- **Hardware counters**: GPU utilization and wave occupancy gauges (Tier 2); gauges have
+  background fill and hover border effect.
+- **FAB scroll-to-top**: Floating action button appears after scrolling 250 px.
+- **Staggered animations**: Section cards fade in with `@keyframes fadeInUp` on load.
+- **Embedded data**: Full JSON payload included for programmatic inspection.
+- **Hover tooltips**: Every graph, gauge, bar, table column, and counter row shows a
+  floating tooltip on hover explaining what the metric means, why it matters, good/bad
+  thresholds, and how to address issues. Coverage includes:
+  - *Gauges*: counter formula (e.g.
`GRBM_GUI_ACTIVE รท GRBM_COUNT`), target thresholds, + current status assessment + - *Breakdown bars*: what each category measures, optimization guidance + - *Overview stats*: per-bottleneck type explanation with specific fix advice, + Tier 1 vs Tier 2 distinction with upgrade command + - *Hotspot columns*: semantics of Calls, Total/Avg/Min time, % Total + - *Memory directions*: H2D/D2H/D2D/P2P with PCIe vs HBM bandwidth context + - *Counter rows*: educational content for 20+ known AMD GPU counters + (GRBM_*, SQ_*, TCP/TCC cache, FETCH_SIZE, WRITE_SIZE, etc.); + unknown counters receive a generic fallback message + +--- + +## LLM Enhancement + +### Overview + +LLM enhancement provides natural language explanations of performance data. It's **optional** and **privacy-focused**. + +### How It Works + +1. **Local analysis runs first** (always) +2. **Data is sanitized** (kernel names โ†’ [KERNEL_1], grid sizes โ†’ [REDACTED]) +3. **Reference guide loaded** (the "fence" - defines analysis rules) +4. **LLM called with sanitized data + reference guide** +5. **Natural language explanation returned** + +### Enabling LLM Enhancement + +**Option 1: Environment Variable** + +```bash +export ANTHROPIC_API_KEY="sk-ant-..." +``` + +```python +from rocpd.ai_analysis import analyze_database + +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic" +) +``` + +**Option 2: Pass API Key Directly** + +```python +result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic", + llm_api_key="sk-ant-..." 
+)
+```
+
+### Supported Providers
+
+- **Anthropic Claude** (recommended)
+  - Provider: `"anthropic"`
+  - Environment variable: `ANTHROPIC_API_KEY`
+  - Default model: `claude-sonnet-4-20250514`
+
+- **OpenAI GPT**
+  - Provider: `"openai"`
+  - Environment variable: `OPENAI_API_KEY`
+  - Default model: `gpt-4-turbo-preview`
+  - **Model compatibility**: newer models (gpt-5, o1, o3, gpt-4o-2024-11-20+) require
+    `max_completion_tokens` instead of `max_tokens`. This is handled automatically —
+    `max_completion_tokens` is tried first and falls back to `max_tokens` if needed.
+
+- **Private/enterprise server** (any OpenAI-compatible endpoint)
+  - Provider: `"private"` (`--llm private`)
+  - Required env var: `ROCPD_LLM_PRIVATE_URL` — base URL (e.g. `https://llm-api.example.com/OpenAI`)
+  - Required: `ROCPD_LLM_PRIVATE_MODEL` or `--llm-private-model`
+  - Optional: `ROCPD_LLM_PRIVATE_API_KEY` (default: `"dummy"` for header-authenticated servers)
+  - Optional: `ROCPD_LLM_PRIVATE_HEADERS` — JSON object of extra request headers;
+    must be a JSON object (`{...}`), not an array or scalar — a `ValueError` is raised
+    if the parsed value is not a dict; the `user` header is auto-set to `os.getlogin()`
+    unless already provided
+  - Optional: `ROCPD_LLM_PRIVATE_VERIFY_SSL=0` — disable SSL certificate verification (requires `httpx`)
+
+  ```bash
+  export ROCPD_LLM_PRIVATE_URL="https://llm-api.example.com/OpenAI"
+  export ROCPD_LLM_PRIVATE_HEADERS='{"Ocp-Apim-Subscription-Key": "abc123", "api-version": "preview"}'
+  rocpd analyze -i output.db --llm private --llm-private-model gpt-4o
+  ```
+
+- **Local Ollama**
+  - Provider: `--llm-local ollama`
+  - Env var: `ROCPD_LLM_LOCAL_URL` (default: `http://localhost:11434/v1`)
+  - Env var: `ROCPD_LLM_LOCAL_MODEL` (default: `codellama:13b`)
+
+**Override the model at runtime** (anthropic/openai providers):
+
+```bash
+export ROCPD_LLM_MODEL="claude-opus-4-6"  # Use a different Anthropic model
+export ROCPD_LLM_MODEL="gpt-4o"           # Use a
different OpenAI model
+```
+
+### Custom Prompts
+
+Guide the LLM with specific questions:
+
+```python
+result = analyze_database(
+    Path("output.db"),
+    enable_llm=True,
+    llm_provider="anthropic",
+    custom_prompt="Why is my convolution kernel slow? Focus on memory access patterns."
+)
+
+print(result.llm_enhanced_explanation)
+```
+
+### Data Sanitization
+
+When LLM mode is enabled, sensitive data is automatically redacted:
+
+| Data Type | Original | Sanitized |
+|-----------|----------|-----------|
+| Kernel names | `conv2d_forward_kernel` | `[KERNEL_1]` |
+| Grid sizes | `[256, 256, 1]` | `[GRID_SIZE]` |
+| Workgroup sizes | `[256, 1, 1]` | `[WORKGROUP_SIZE]` |
+| File paths | `/home/user/app.cpp` | `[REDACTED]` |
+
+**Preserved Data** (aggregated/classified):
+- Bottleneck classifications (compute-bound, memory-bound)
+- Aggregated metrics (time percentages, utilization %)
+- GPU architecture (gfx908, gfx90a, gfx942, gfx950, gfx1030, gfx1100)
+
+---
+
+## Error Handling
+
+### Exception Hierarchy
+
+```python
+AnalysisError (base)
+├── DatabaseNotFoundError
+├── DatabaseCorruptedError
+├── MissingDataError
+├── UnsupportedGPUError
+├── LLMAuthenticationError
+├── LLMRateLimitError
+├── ReferenceGuideNotFoundError
+├── SourceDirectoryNotFoundError  # analyze_source(): directory doesn't exist
+└── SourceAnalysisError           # analyze_source(): error during scanning
+```
+
+### Example Error Handling
+
+```python
+from rocpd.ai_analysis import (
+    analyze_database,
+    DatabaseNotFoundError,
+    MissingDataError,
+    LLMAuthenticationError
+)
+from pathlib import Path
+
+try:
+    result = analyze_database(
+        Path("output.db"),
+        enable_llm=True,
+        llm_provider="anthropic"
+    )
+
+except DatabaseNotFoundError as e:
+    print(f"Database not found: {e}")
+
+except MissingDataError as e:
+    print(f"Missing data: {e}")
+    print(f"Missing tables: {e.missing_tables}")
+    print("Suggestion: Collect additional profiling data")
+
+except
LLMAuthenticationError as e: + print(f"LLM authentication failed: {e}") + print("Check your API key and environment variables") + +except Exception as e: + print(f"Unexpected error: {e}") +``` + +### Graceful Degradation + +**Authentication and rate-limit errors propagate** โ€” if `enable_llm=True` and your key is +invalid or exhausted, `LLMAuthenticationError` / `LLMRateLimitError` will be raised so you +know immediately rather than silently getting local-only results. + +Other transient LLM failures (network timeouts, unexpected API errors) produce a warning +and fall back to local-only results without raising: + +```python +try: + result = analyze_database( + Path("output.db"), + enable_llm=True, + llm_provider="anthropic" + ) +except LLMAuthenticationError: + print("Invalid API key โ€” check ANTHROPIC_API_KEY") + raise + +# If a transient error occurred, llm_enhanced_explanation will be None +if result.llm_enhanced_explanation: + print("LLM enhancement available") +else: + print("Local-only analysis (LLM enhancement failed or disabled)") + +# Check warnings for details on any transient failure +for warning in result.warnings: + print(f"โš ๏ธ {warning.message}") +``` + +--- + +## Integration Examples + +### Optiq Integration + +```python +# Optiq UI integration example +from rocpd.ai_analysis import analyze_database +from pathlib import Path + +def load_trace_with_ai_insights(trace_file_path: str): + """ + Optiq function to load trace and get AI insights. 
+ """ + result = analyze_database(Path(trace_file_path)) + + # Extract insights for UI + insights = { + "summary": result.summary.overall_assessment, + "bottleneck": result.summary.primary_bottleneck, + "confidence": result.summary.confidence, + "top_recommendations": [ + { + "title": rec.title, + "description": rec.description, + "impact": rec.estimated_impact, + "priority": rec.priority + } + for rec in result.recommendations.high_priority[:3] + ], + "execution_breakdown": { + "kernel_pct": result.execution_breakdown.kernel_time_pct, + "memcpy_pct": result.execution_breakdown.memcpy_time_pct, + "overhead_pct": result.execution_breakdown.api_overhead_pct + } + } + + return insights + +# Usage in Optiq +insights = load_trace_with_ai_insights("/path/to/output.db") +display_ai_panel(insights) +``` + +### Automated Analysis Pipeline + +```python +from rocpd.ai_analysis import analyze_database, get_recommendations +from pathlib import Path +import sys + +def automated_analysis_pipeline(trace_files: list[Path]): + """ + Analyze multiple trace files and generate reports. 
+ """ + for trace_file in trace_files: + print(f"Analyzing {trace_file}...") + + try: + # Analyze + result = analyze_database( + trace_file, + enable_llm=True, + llm_provider="anthropic" + ) + + # Generate markdown report + report_path = trace_file.with_suffix(".md") + report_path.write_text(result.to_markdown()) + print(f" โœ… Report saved: {report_path}") + + # Check for high-priority issues + high_priority = result.recommendations.high_priority + if high_priority: + print(f" ๐Ÿ”ด {len(high_priority)} high-priority issues found") + for rec in high_priority: + print(f" - {rec.title}") + + except Exception as e: + print(f" โŒ Analysis failed: {e}") + +# Run pipeline +trace_files = list(Path("./traces").glob("*.db")) +automated_analysis_pipeline(trace_files) +``` + +### Batch Comparison + +```python +from rocpd.ai_analysis import analyze_database +from pathlib import Path +import pandas as pd + +def compare_traces(baseline_path: Path, optimized_path: Path): + """ + Compare baseline vs optimized traces. 
+ """ + baseline = analyze_database(baseline_path) + optimized = analyze_database(optimized_path) + + # Build comparison dataframe + comparison = pd.DataFrame({ + "Metric": [ + "Kernel Time %", + "Memory Copy %", + "API Overhead %", + "Primary Bottleneck", + "Confidence" + ], + "Baseline": [ + f"{baseline.execution_breakdown.kernel_time_pct:.1f}%", + f"{baseline.execution_breakdown.memcpy_time_pct:.1f}%", + f"{baseline.execution_breakdown.api_overhead_pct:.1f}%", + baseline.summary.primary_bottleneck, + f"{baseline.summary.confidence:.0%}" + ], + "Optimized": [ + f"{optimized.execution_breakdown.kernel_time_pct:.1f}%", + f"{optimized.execution_breakdown.memcpy_time_pct:.1f}%", + f"{optimized.execution_breakdown.api_overhead_pct:.1f}%", + optimized.summary.primary_bottleneck, + f"{optimized.summary.confidence:.0%}" + ] + }) + + print(comparison.to_markdown(index=False)) + +# Usage +compare_traces(Path("baseline.db"), Path("optimized.db")) +``` + +--- + +## See Also + +- [LLM Reference Guide Documentation](LLM_REFERENCE_GUIDE.md) - How to customize LLM behavior +- [CLI Documentation](../README.md) - Using `rocpd analyze` command +- [rocprofiler-sdk Documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/) + +--- + +### `LLMConversation` โ€” Persistent Multi-Turn Streaming Session + +`LLMConversation` provides a stateful multi-turn LLM session with streaming output, +automatic compaction, and disk archiving. It is used internally by `InteractiveSession` +and is also available as a public API for custom workflows. 
+ +```python +from rocpd.ai_analysis import LLMConversation + +conv = LLMConversation( + provider="anthropic", # "anthropic" | "openai" | "private" | "local" + api_key=None, # or pass directly; falls back to env vars + model=None, # or override default model + compact_every=10, # compact history every N turns (default 10) + keep_recent_turns=6, # keep this many turns after compaction + history_path=None, # optional Path for JSONL disk archive +) + +# Set the system prompt once (include the reference guide / "fence" here) +from rocpd.ai_analysis import load_reference_guide +conv.initialize("You are an AMD GPU expert.\n\n" + load_reference_guide()) + +# Stream a response token-by-token +response = conv.send( + "What is the bottleneck in this trace?", + on_token=lambda t: print(t, end="", flush=True), +) + +# Serialize / restore across sessions +state = conv.to_dict() # does NOT include api_key +conv2 = LLMConversation.from_dict(state, api_key="sk-ant-...") +``` + +**Constructor parameters:** + +| Parameter | Default | Description | +|---|---|---| +| `provider` | โ€” | `"anthropic"`, `"openai"`, `"private"`, or `"local"` | +| `api_key` | `None` | API key; falls back to `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` / `ROCPD_LLM_PRIVATE_API_KEY` | +| `model` | `None` | Model override; falls back to `ROCPD_LLM_MODEL` then built-in default | +| `compact_every` | `10` | Trigger LLM-based history compaction every N turns | +| `keep_recent_turns` | `6` | Number of recent turns preserved verbatim after compaction | +| `history_path` | `None` | JSONL file path for append-only message archive | + +**Methods:** + +- `initialize(system_prompt: str)` โ€” Set system prompt (call once before `send()`) +- `send(user_message, *, max_tokens=4096, on_token=None) -> str` โ€” Append user turn, stream response +- `to_dict() -> dict` โ€” Serialize state (api_key excluded) +- `from_dict(d, *, api_key=None, model=None) -> LLMConversation` โ€” Restore from serialized state + +**Properties:** 
`turn_count: int`, `messages: List[dict]` + +--- + +### `load_reference_guide()` โ€” Load the LLM Fence + +Returns the full content of the LLM reference guide (the "fence") as a string. +Useful when building a custom system prompt for `LLMConversation.initialize()`. + +```python +from rocpd.ai_analysis import load_reference_guide + +guide = load_reference_guide() +# guide is the full markdown text of share/llm-reference-guide.md + +conv.initialize("You are an expert AMD GPU engineer.\n\n" + guide) +``` + +The guide is loaded from (in order): +1. `ROCPD_LLM_REFERENCE_GUIDE` environment variable path +2. Module-relative `share/llm-reference-guide.md` +3. `/opt/rocm/share/rocprofiler-sdk/llm-reference-guide.md` + +--- + +### Context-Aware LLM Guide Loading + +`LLMAnalyzer` accepts an optional `AnalysisContext` to reduce the reference guide +tokens sent per call. Build the context from already-computed analysis results: + +```python +from rocpd.ai_analysis import AnalysisContext +from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + +ctx = AnalysisContext( + tier=2, # 0=source-only, 1=trace, 2=counters + has_counters=True, + bottleneck_type="compute", # triggers compiler section + custom_prompt="why is my kernel slow?", +) + +analyzer = LLMAnalyzer(provider="anthropic", api_key="...", verbose=True) +result = analyzer.analyze_with_llm(data, context=ctx) +``` + +When `context=None` (default), the full guide is used โ€” backward compatible. + +Token savings by scenario: +- Tier 1 trace-only: ~47% fewer tokens +- Tier 0 source-only: ~51% fewer tokens +- Tier 2 with latency bottleneck: ~18% fewer tokens + +See `docs/LLM_GUIDE_SECTIONS.md` for the full tag vocabulary and how to add +new sections or tags. 
+ +--- + +## Support + +For issues, questions, or feature requests: +- File an issue on GitHub +- See [CONTRIBUTING.md](../CONTRIBUTING.md) +- ROCm documentation: https://rocm.docs.amd.com/ + +--- + +## Bug Fixes & Behavioral Changes + +This section documents behavioral changes made during code review that affect +how callers interact with the API. Changes are grouped by category. + +### LLM Layer + +**`LLMAnalyzer()` construction no longer raises `LLMAuthenticationError`** + +Previously, constructing `LLMAnalyzer(provider="anthropic")` without setting +`ANTHROPIC_API_KEY` would raise `LLMAuthenticationError` immediately. This blocked +use cases where the analyzer is constructed ahead of time and the API key is +supplied later (e.g., via a configuration reload). + +The key validation is now **deferred** โ€” `LLMAuthenticationError` is raised only +when an actual API call is made (`analyze_with_llm()`, `_call_anthropic()`, etc.). +Construction always succeeds as long as `provider` is valid. + +```python +# This now works even without an API key set +from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer +analyzer = LLMAnalyzer(provider="anthropic") # no longer raises + +# The error fires here instead, when the call is actually made +import os +os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." # set key before calling +result = analyzer.analyze_with_llm(data) +``` + +**`LLMAnalyzer(model=...)` is now honored** + +Previously, the `model` parameter was stored but the `ROCPD_LLM_MODEL` environment +variable was checked first at call time, silently overriding any explicit `model=` +argument. The priority is now: + +1. `model=` constructor argument (highest priority) +2. `ROCPD_LLM_MODEL` environment variable +3. 
Built-in default (`DEFAULT_ANTHROPIC_MODEL` or `DEFAULT_OPENAI_MODEL`) + +**`analyze_source()` now passes `AnalysisContext(tier=0)` to the LLM automatically** + +When `enable_llm=True`, `analyze_source()` constructs an `AnalysisContext(tier=0, +custom_prompt=...)` and passes it to `analyze_source_with_llm()`. This ensures the +LLM reference guide is filtered to Tier 0-relevant sections (reducing token cost by +~51%) and that compiler optimization guidance is included. + +Callers who create `LLMAnalyzer` directly and call `analyze_source_with_llm()` +should also pass `context=AnalysisContext(tier=0)` for the same benefit. + +**Timeout parameter added to all LLM API calls** + +All Anthropic and OpenAI API calls now include `timeout=120` (seconds). Previously, +LLM calls could hang indefinitely on slow or unavailable network connections. If the +call takes longer than 120 seconds a network timeout exception is raised and wrapped +as a non-fatal warning (local analysis continues). + +### Output & Serialization + +**`AnalysisResult.to_json()` now raises `RuntimeError` when `_raw` is absent** + +Previously, calling `to_json()` on an `AnalysisResult` constructed manually (not via +`analyze_database()`) would silently return non-schema-conformant JSON โ€” a plain +`asdict()` serialization missing `schema_version`, `hotspots`, and other required +fields. + +It now raises `RuntimeError("Raw analysis data not available. ...")` immediately, +making the problem visible. Use `to_dict()` for non-schema-conformant dict output, +or use `analyze_database()` (which populates `_raw`) to get schema-conformant JSON. + +```python +# Manual construction โ€” to_json() now raises: +result = AnalysisResult(...) 
+result.to_json() # raises RuntimeError โ€” use to_dict() instead +result.to_dict() # works โ€” returns plain asdict() dict + +# Via analyze_database() โ€” to_json() works: +result = analyze_database(Path("output.db")) +result.to_json() # works โ€” schema-conformant, schema_version="0.1.0" +``` + +**`analyze_memory_copies()` bandwidth now uses actual transfer sizes** + +Previously the `size` column in the `memory_copies` table was not reliably +populated and bandwidth calculations returned 0. The column is now read and +`bandwidth_bytes_per_sec` (and `bandwidth_gbps`) are computed from real transfer +sizes when available. The "Low memory bandwidth" recommendation (< 10 GB/s threshold) +can now fire based on actual measurements. + +### Analysis Correctness + +**`overhead_percent` is now guaranteed to be โ‰ฅ 0** + +In some trace databases where kernel + memcpy time slightly exceeds the computed +total runtime (due to timestamp rounding), `overhead_percent` could be negative. +`compute_time_breakdown()` now applies `max(0.0, raw_overhead_pct)` before +returning the result. The field is always non-negative in the output. + +**Bottleneck classification no longer triggers `compute` from `has_counters` alone** + +Previously, the `_build_summary()` bottleneck classifier in `api.py` could produce +`primary_bottleneck="compute"` based on `kernel_pct > 70 AND has_counters=True`, +even when `kernel_pct` was well below 70%. The condition now uses the correct +threshold check: `kernel_pct > 70` is evaluated first, then `has_counters` is used +only to raise the confidence from 0.60 to 0.80 โ€” not to change the bottleneck type. + +**`analyze_source_code()` raises `SourceDirectoryNotFoundError` for missing directories** + +The `analyze_source_code()` function in `analyze.py` (CLI path) now raises +`SourceDirectoryNotFoundError` (not a generic `Exception`) when the `source_dir` +argument does not exist or is not a directory. 
This matches the behavior of the +Python API's `analyze_source()`. + +### Interactive Session (LLM Providers) + +**`"private"` provider now correctly routed in `_apply_suggestions_via_llm` and `_llm_rewrite_file`** + +Previously, both `InteractiveSession._apply_suggestions_via_llm` and +`WorkflowSession._llm_rewrite_file` dispatched any unrecognized provider to +`_call_local()` (Ollama). This caused the `"private"` provider to attempt a connection +to `http://localhost:11434/v1` and fail with a connection error instead of calling the +configured enterprise server. + +Both methods now explicitly handle `"private"` by routing to `_call_private()`. + +**`InteractiveSession` uses `LLMConversation` for persistent multi-turn context** + +The previous `SessionContext` dataclass (compact per-session summary: analyses, suggestions, commands) +has been replaced by a persistent `LLMConversation` object that holds the full message history. +All LLM calls within a session (`[o]`, `[a]` annotations, code rewrites) share the same conversation +so the LLM accumulates full context rather than receiving a condensed summary block. + +Key behavioral changes: +- History is compacted via `--llm-compact-every N` (default 10 turns) using an LLM-generated summary, not a rule-based snippet +- Source files are tracked in `_sent_source_files`; a file already sent in this session is not re-transmitted +- Conversation state (`conv.to_dict()`) is serialized into the session JSON on `[s]` save +- On `--resume-session`, the conversation is restored with `LLMConversation.from_dict()` + +### Source Scanner + +**`SourceAnalyzer` adds a truncation warning to `risk_areas` when `_MAX_FILES` is hit** + +When the number of source files in the scanned directory exceeds `_MAX_FILES` (500), +scanning stops early. The scanner now appends a human-readable warning to +`plan.risk_areas` noting how many files were skipped and suggesting a more targeted +`--source-dir` path. Previously the truncation was silent. 
+ +```python +plan = SourceAnalyzer(Path("./huge_repo")).analyze() +# If > 500 files found: +assert any("truncat" in r.lower() for r in plan.risk_areas) +``` + +### WorkflowSession โ€” Cycle Prevention and Tier 3 Escalation + +**Collection fingerprint expanded to all trace flags** + +The PMC-dedup logic that prevents infinite `[r] โ†’ re-profile โ†’ same INFO` loops now +fingerprints **all named trace collection flags** in addition to individual `--pmc` +counter names: + +``` +--sys-trace --hip-trace --kernel-trace --memory-copy-trace --hsa-trace --stats +``` + +Previously only `--pmc` counters were tracked, causing the session to cycle between +sys-trace and counter-collection runs indefinitely. + +**All-history comparison (not just last run)** + +The dedup check now compares the suggested command's fingerprint against the **union** +of everything collected across all previous trace runs: + +```python +already_fp = frozenset().union(*( + _collection_fingerprint(t.command) for t in self._state.trace_history +)) +if suggested_fp and suggested_fp.issubset(already_fp): + ai_rec_cmd = None # every suggested collection already performed +``` + +**Tier 3 escalation when Tier 1/2 exhausted** + +When all Tier 1/2 data has been collected and there is nothing new to suggest, Phase 5 +now shows a "go deeper" menu instead of just printing "stuck": + +- TraceLens interval + kernel-category analysis: already embedded in the Phase 4 report. 
+- `[d]` builds a PC sampling command and sets it as the Phase 7 option `[3]`: + ``` + ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling \ + -d /tmp/rocpd_trace/run_ -o results -- + ``` + +**ENV=VALUE command prefix support in Phase 3** + +`_phase3_run_profiler` now strips leading `KEY=VALUE` tokens from the command string +and injects them into the subprocess environment via `env=` rather than `shell=True`: + +``` +# This works directly โ€” ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 is extracted +# and added to the child process env before rocprofv3 is exec'd. +ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 rocprofv3 --pc-sampling ... +``` + +### WorkflowSession โ€” AI Edit Revert + +**`_revert_last_edit(failure_reason="")` helper** + +Restores the most recently AI-modified file from its `.bak` backup and removes the +`_EditRecord` from `edit_history`. Accepts an optional `failure_reason` string. + +When `failure_reason` is non-empty and an `LLMConversation` is active, two messages +are injected directly into the conversation history (a `user` message describing the +failure and an `assistant` acknowledgement): + +```python +feedback = ( + f"IMPORTANT: The previous code edit to {file} was reverted " + f"because it caused errors.\n\nFailure details:\n{failure_reason}\n\n" + f"Do NOT suggest the same pattern again..." +) +conv._messages.append({"role": "user", "content": feedback}) +conv._messages.append({"role": "assistant", "content": "Understood. ..."}) +``` + +This teaches the LLM what failed without requiring a separate API call. + +**Phase 3 (run profiler) โ€” `[v]` revert on profiling failure** + +When the profiling command exits non-zero and `edit_history` is non-empty, the retry +menu now includes `[v] Revert last AI edit and retry`. The exit code is passed as the +failure reason so the LLM conversation records it. 
+ +**Phase 6 (recompile wait) โ€” accumulate and pass error text** + +The recompile-wait loop accumulates all lines the user types as potential compilation +errors. When the user types `revert`/`undo`/`v`, the accumulated error lines are passed +to `_revert_last_edit(failure_reason=...)` so the LLM conversation receives the exact +compiler output. Example: + +``` +Changes applied. Please recompile your application. +Type 'done' when compiled, 'revert' to undo the AI edit, +'abort' to exit, or paste compilation errors. +> error: use of undeclared identifier '__builtin_amdgcn_sin' + Error noted. Type 'done' when fixed or 'revert' to undo the edit. +> revert + โœ“ Reverted: inefficient_demo.cpp (backup kept at inefficient_demo.cpp.bak) +``` + +### LLM Fence โ€” Invalid HIP Intrinsics + +**`__builtin_amdgcn_sin` / `__builtin_amdgcn_cos` added to the prohibited list** + +The reference guide now explicitly bans these non-existent HIP device functions with a +`โŒ` rule. The `__builtin_amdgcn_*` namespace covers hardware-specific operations +(lane reads, DS swizzle) but **not** transcendental math. Suggesting them causes: + +``` +error: use of undeclared identifier '__builtin_amdgcn_sin' +``` + +The guide documents the correct HIP math API: use `sinf()`, `cosf()`, `sqrtf()`, etc. +โ€” amdclang++ maps these to OCML hardware-optimized implementations automatically. + + +### WorkflowSession โ€” Phase 1b Quick Workload Analysis + +**New pre-Phase-2 step selects the best starter profiling command** + +Before presenting the profiling command to the user in Phase 2, `WorkflowSession` now +runs `_phase1b_quick_workload_analysis()` which combines two analysis paths: + +**1. 
App-command heuristics (`_classify_app_command`)** + +Inspects the binary name and arguments to detect workload type: + +| Detected workload | `workload_type` | Extra flags added | +|---|---|---| +| Python + ML framework (torch/jax/tf/paddle) | `python_ml` | `--hip-trace` | +| Python + LLM inference (vllm/llama/gpt/โ€ฆ) | `llm_inference` | `--hip-trace` | +| Python without ML framework | `python_generic` | `--hip-trace` | +| Compiled HIP/ROCm binary | `hip_compute` | none | +| MPI/Slurm launcher | `mpi_multi` | warning only | + +Multi-process patterns (torchrun, DDP, DeepSpeed, NCCL) trigger a warning about +worker-process GPU kernel capture limitations regardless of workload type. + +**2. Tier 0 source analysis** + +If `--source-dir` paths are provided, `SourceAnalyzer.analyze()` is called on the +first path. The flags from `plan.suggested_first_command` (the highest-priority +recommendation) replace the heuristic flags. The `-d ` and `-o ` components +are updated to a fresh timestamped directory before the command is shown. + +**Precedence and fallback:** + +``` +Source analysis flags > Heuristic extra flags > default set +(if source dir given) (always appended) (--sys-trace --kernel-trace + --memory-copy-trace --stats) +``` + +**Return value:** The method returns the full suggested command string. `run()` falls +back to `_build_profiling_command()` (pure default) only if the method returns `None`, +which only happens if both paths raise exceptions. + +### --resume-session Scope (InteractiveSession only) + +`--resume-session` restores a previously saved `InteractiveSession` by ID. It applies +**only** to the menu-driven `InteractiveSession` (triggered by +`rocpd analyze -i db.db --interactive` **without** a `""` string). + +`WorkflowSession` (7-phase workflow, triggered by `rocpd analyze --interactive ""`) +starts fresh each invocation. It does not support session resume. + +**How resume works:** + +1. 
The session ID (format: `YYYY-MM-DD_HH-MM-SS_<app>`) is passed to + `InteractiveSession(resume_session_id=...)`. +2. `_init_session(resume_id)` loads the session JSON from `~/.rocpd/sessions/`. +3. `_restore_or_create_conv(loaded)` reconstructs the `LLMConversation` from the + serialized `loaded.conversation` dict via `LLMConversation.from_dict()`. +4. `_sent_source_files` is restored from `loaded.sent_source_files`. + +**Auto-detect (no `--resume-session` needed):** `_init_session` also calls +`self._store.find_by_source_dir(self._source_dir)` and, if matching sessions exist, +prompts the user to choose one. This means repeat invocations against the same +`--source-dir` will automatically offer resume without needing the session ID. + +**Session ID discovery:** + +```bash +ls ~/.rocpd/sessions/*.json | xargs -I{} python3 -c \ + "import json; d=json.load(open('{}'));print(d['session_id'],'|',d['source_dir'])" +``` + +--- + +### WorkflowSession — Session Checkpoints + +Each AI source-file edit creates a git-worktree checkpoint so the user can roll back to +any prior state and blacklist approaches that caused regressions. + +#### Overview + +``` +Phase 6 AI edit + └─► git commit all modified files + └─► git update-ref refs/rocpd/<session-id>/cp-N (GC-pinned ref, not a branch) + └─► git worktree add --detach ~/.rocpd/sessions/<session-id>/cp-N + └─► CheckpointRecord appended to WorkflowState.checkpoints + ├── cp_id, commit_hash, ref_name, worktree_path + ├── files_modified, file_snapshots (full file contents for offline restore) + ├── run_index ← set in Phase 3 after profiling succeeds + ├── performance_delta_pct ← set in Phase 4 after analysis history appended + └── blacklisted, blacklist_category, blacklist_description +``` + +When the session exits (normally or via Ctrl+C), `_teardown_checkpoints()` removes all +worktrees. Refs (`refs/rocpd/…`) are kept so the commits survive GC until the user +explicitly runs a cleanup command. 
+ +#### Dataclasses + +**`CheckpointRecord`** (in `interactive.py`): + +| Field | Type | Description | +|---|---|---| +| `cp_id` | `int` | Sequential checkpoint index (0-based) | +| `commit_hash` | `str` | Full git commit SHA | +| `ref_name` | `str` | `refs/rocpd/<session-id>/cp-<N>` | +| `worktree_path` | `str` | Absolute path to the detached worktree | +| `timestamp` | `str` | ISO-8601 timestamp | +| `files_modified` | `List[str]` | Repo-relative paths of files in this edit batch | +| `edit_summary` | `str` | First non-blank line of the LLM suggestion (≤80 chars) | +| `file_snapshots` | `Dict[str, str]` | Full file contents keyed by relative path | +| `run_index` | `Optional[int]` | Which trace run followed this edit (set in Phase 3) | +| `performance_delta_pct` | `Optional[float]` | Runtime change % vs prior run (set in Phase 4) | +| `blacklisted` | `bool` | Whether this approach has been blacklisted | +| `blacklist_category` | `str` | Equal to `edit_summary` (used for deduplication) | +| `blacklist_description` | `str` | Human-readable description injected into LLM prompt | + +**`WorkflowState` additions:** + +| Field | Type | Description | +|---|---|---| +| `repo_root` | `str` | Absolute path to git repo root (empty when no git) | +| `baseline_commit` | `str` | HEAD at session start — rollback target `cp_id=-1` | +| `checkpoints` | `List[CheckpointRecord]` | All checkpoints in this session | +| `active_checkpoint` | `Optional[int]` | Currently restored checkpoint (or `None`) | +| `blacklisted_approaches` | `List[str]` | Persistent list of blacklist descriptions; **not truncated by rollback** | + +#### GitCheckpointManager + +All git operations are isolated in `GitCheckpointManager`: + +```python +gcm = GitCheckpointManager(repo_root="/path/to/repo", session_id="2026-03-13_myapp") + +# Detect repo (static — does not require a known repo_root) +repo_root = GitCheckpointManager.detect_repo(cwd="/path/to/project") + +# Core checkpoint operations +hash_ = 
gcm.commit_files(files=["src/kernel.cpp"], message="rocpd: checkpoint 0") +gcm.tag_checkpoint(commit_hash=hash_, cp_id=0) # creates refs/rocpd/.../cp-0 +gcm.add_worktree(commit_hash=hash_, cp_id=0) # git worktree add --detach +gcm.remove_worktree(worktree_path="/path/to/wt") + +# Introspection +gcm.get_head() # current HEAD SHA +gcm.files_in_commit(commit_hash) # list of relative paths +gcm.list_worktrees() # all registered worktrees +gcm.restore_files_from_commit(commit_hash, files) # git checkout -- +``` + +`commit_files` uses `-c user.email=rocpd@local -c user.name=rocpd` overrides and +`--no-verify` to work in any git environment regardless of hooks or missing config. + +#### Rollback + +Triggered by `[b]` in the Phase 5 recommendations menu (shown only when checkpoints +exist). `_show_checkpoint_picker()` displays a table of all checkpoints with performance +delta and edit summary: + +``` + Checkpoints + โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + [-1] Baseline (no AI edits) + [ 0] Reduce memcpy by using zero-copy buffers Run #1 -12.3% + [ 1] Optimize wave occupancy via LDS padding Run #2 +4.1% โ† regression + [ 2] Unroll inner loop and vectorize memory accesses Run #3 -8.7% +``` + +Regression checkpoints (+delta) are flagged and the user is prompted to blacklist them +before the rollback is applied. The blacklist description is appended to +`WorkflowState.blacklisted_approaches` (never truncated by rollback) so future LLM +calls avoid the same approach. + +**Rollback strategy:** + +1. **git fast path**: `git checkout -- ` for each file in the target + checkpoint. Falls back to snapshot path on any `CheckpointError`. +2. **Snapshot fallback**: Writes `file_snapshots` contents directly. Works in any + environment including those where git is not available post-session-start. +3. 
**Baseline rollback** (`cp_id = -1`): Restores to `baseline_commit` via git, or + writes all accumulated snapshots in reverse order as a last resort. + +After rollback, `WorkflowState.checkpoints` is truncated to `checkpoints[:target+1]` +and `_save_session()` is called unconditionally. + +#### Blacklist Injection + +When `_build_blacklist_block()` returns a non-empty string, it is prepended to the LLM +suggestion prompt in Phase 6 before `_llm_rewrite_file()` is called: + +``` +# Blacklisted approaches (do NOT use these): + +- Reduce memcpy by using zero-copy buffers (caused +4.1% regression on run #2) +- ... +``` + +The blacklist is built from `WorkflowState.blacklisted_approaches` (persistent) so it +survives rollbacks that truncate the `checkpoints` list. Entries are deduplicated by +exact string match. + +#### Session lifecycle + +``` +WorkflowSession.run() + โ”œโ”€ Phase 1: validate sources + โ”œโ”€ _init_checkpoints() โ† detect git, record baseline (dirty tree OK) + โ”œโ”€ _prune_stale_worktrees() โ† remove orphaned worktrees with no session JSON + โ”œโ”€ Phase 1b โ€ฆ Phase 7 loop + โ””โ”€ finally: + _teardown_checkpoints() โ† remove all worktrees (refs kept for GC protection) + _save_session() +``` + +**Dirty working tree**: No issue. `commit_files` stages only the specific files modified +by each AI edit (`git add -- `), so other in-progress changes in the working tree +are never touched or included in checkpoint commits. + +**No-git graceful fallback**: When git is not detected or any checkpoint operation +fails, `self._gcm` is set to `None` and checkpoints are silently skipped. All other +session functionality continues normally. 
diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md new file mode 100644 index 00000000000..dafb96b300d --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_GUIDE_SECTIONS.md @@ -0,0 +1,112 @@ +# LLM Reference Guide โ€” Section Tagging System + +This document explains how `llm-reference-guide.md` is split into context-aware +sections to reduce per-call LLM token cost by 18โ€“51%. + +--- + +## Why Section Filtering Exists + +The full reference guide is ~72 KB / ~18,000 tokens. Sending it with every LLM call +is wasteful: a Tier 1 trace-only analysis does not need the Hardware Counter Reference +(7,979 chars) or the Compiler Optimization section (10,873 chars). + +Context-aware filtering selects only the sections relevant to the current analysis, +saving 18โ€“51% of token cost depending on the scenario: + +| Scenario | Approx. 
token saving | +|----------|---------------------| +| Tier 1 trace-only | ~47% | +| Tier 0 source-only | ~51% | +| Tier 1 + compiler trigger | ~32% | +| Tier 2 full analysis (no compiler) | ~18% | + +--- + +## Tag Vocabulary + +Each `## Section` in `llm-reference-guide.md` carries a tag comment on the line +immediately after the heading: + +```text +## Hardware Counter Reference +<!-- tags: tier2 --> +``` + +| Tag | Meaning | Sections | +|-----|---------|----------| +| `always` | Included in every LLM call | Critical rules, role, output format, what not to do, summary | +| `tier1` | Trace data available (Tier 1+) | Profiling workflow, tool reference, common bottleneck types | +| `tier2` | PMC counter data available | Hardware counters, memory hierarchy, perf models, GPU specs, AMD optimizations | +| `compiler` | Compiler optimization is relevant | Compiler Optimization Flags and Options | +| `source` | Reserved for future Tier 0 guidance | *(empty — no sections use this tag yet)* | + +**Fallback rule:** A section with **no tag comment** is always included. This +ensures user-added sections are never silently dropped. + +--- + +## `AnalysisContext` Fields + +`AnalysisContext` (importable from `rocpd.ai_analysis`) tells the system which tags +to activate: + +| Field | Type | Controls | +|-------|------|---------| +| `tier` | `int` | `0` → source + compiler tags; `1` → tier1; `≥2` → tier1 + tier2 | +| `has_counters` | `bool` | `True` adds `tier2` even when `tier == 1` | +| `bottleneck_type` | `str \| None` | `"compute"` or `"memory"` adds `compiler` tag | +| `gpu_arch` | `str \| None` | Reserved for future per-GPU section filtering | +| `custom_prompt` | `str \| None` | Adds `compiler` tag when it contains compiler/flag/build/compile | + +--- + +## How to Add a New Section + +1. Add the section to `llm-reference-guide.md` with a `## <Section Name>` heading. +2. On the **very next line** (line 1 of the section body), add: + ``` + <!-- tags: TAG --> + ``` + where TAG is one of the known vocabulary values above. 
+3. If unsure which tag to use, use `always` โ€” the section will always be included. +4. Run the integrity tests to confirm no typos: + ```bash + PYTHONPATH=/opt/rocm-7.2.0/lib/python3.12/site-packages \ + pytest --noconftest tests/rocprofv3/rocpd/test_guide_filter_standalone.py \ + -v -k "TestGuideIntegrity" + ``` + +--- + +## How to Add a New Tag + +1. Add the new tag to `_select_tags()` in `source/lib/python/rocpd/ai_analysis/llm_analyzer.py`. +2. Add the tag to `TestGuideIntegrity.KNOWN_TAGS` in `tests/rocprofv3/rocpd/test_guide_filter_standalone.py`. +3. Add a row to the tag vocabulary table above. +4. Update the `AnalysisContext` docstring if the new tag is driven by a new field. + +--- + +## Debugging: Verbose Mode + +Pass `verbose=True` to `LLMAnalyzer` to see which sections were loaded: + +```python +analyzer = LLMAnalyzer(provider="anthropic", api_key="...", verbose=True) +analyzer.analyze_with_llm(data, context=ctx) +# โ†’ [LLM] Guide filtered: 34800 / 72513 chars (48% of full guide) +``` + +--- + +## Tag Selection Logic (for reference) + +``` +tier == 0 โ†’ always + source + compiler +tier >= 1 โ†’ always + tier1 +has_counters == True OR tier >= 2 โ†’ also adds tier2 +bottleneck_type in compute/memory โ†’ also adds compiler +custom_prompt contains + compiler/flag/build/compile โ†’ also adds compiler +``` diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md new file mode 100644 index 00000000000..45ab0d1294f --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/LLM_REFERENCE_GUIDE.md @@ -0,0 +1,2103 @@ +# LLM Reference Guide for GPU Performance Analysis + +**Purpose**: This document is provided to the LLM as context when analyzing GPU profiling data. It defines boundaries, provides reference information, and guides analysis quality. 
+ +--- + +## CRITICAL REQUIREMENTS + + +### Hardware Counter Per-Block Limits โ€” MUST NOT EXCEED + +**THIS IS A HARD HARDWARE CONSTRAINT.** Violating it crashes rocprofv3 (error code 38: "Request exceeds the capabilities of the hardware to collect"). + +AMD GPUs limit how many counters from the **same hardware block** can be collected in one rocprofv3 pass. The block name is the prefix before the first `_` in the counter name (e.g., `SQ_WAVES` โ†’ block `SQ`). + +**Safe per-block limits** (conservative defaults โ€” actual limits vary by GPU): +| Block | Examples | Limit per pass | +|-------|----------|----------------| +| `SQ` | `SQ_WAVES`, `SQ_INSTS_VALU`, `SQ_INSTS_VMEM_RD`, `SQ_INSTS_VMEM_WR`, `SQ_INSTS_LDS` | 4 (up to 8 on gfx942) | +| `GRBM` | `GRBM_COUNT`, `GRBM_GUI_ACTIVE` | 4 | +| `FETCH` | `FETCH_SIZE` | 2 | +| `WRITE` | `WRITE_SIZE` | 2 | +| `TCP`, `TCC`, `TA`, `TD` | Cache counters | 4 | + +**Mandatory rules for `--pmc` commands you generate:** +1. Count counters **per block separately** โ€” do NOT count across different blocks together +2. If any block would exceed its limit โ†’ split into **multiple separate rocprofv3 runs** (pass 1, pass 2, โ€ฆ) each with its own `-d`/`-o` +3. Different blocks CAN coexist in the same pass as long as each block's count stays within its limit +4. `rocprof-compute` is EXEMPT โ€” it handles multi-pass collection internally + +**ADDITIONAL RULE โ€” FETCH_SIZE and WRITE_SIZE are TCC-derived metrics**: +These are NOT raw hardware counters. rocprofv3 expands them internally to TCC hardware counters: +- `FETCH_SIZE` โ†’ `TCC_BUBBLE + TCC_EA0_RDREQ + GRBM_GUI_ACTIVE` (TCC block, 32 instances) +- `WRITE_SIZE` โ†’ `TCC_EA0_WRREQ + TCC_EA0_WRREQ_64B` (TCC block, 32 instances) +**Rules**: +1. FETCH_SIZE and WRITE_SIZE MUST each be in their own dedicated pass. +2. They cannot share a pass with each other (combined 5 TCC hardware counters > limit). +3. They cannot share a pass with SQ counters. 
+ +**Examples:** +```bash +# โœ… SAFE โ€” 3 passes: SQ/GRBM | FETCH_SIZE | WRITE_SIZE +# Pass 1: GPU utilization + occupancy (raw hardware counters) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_VMEM_RD \ + SQ_INSTS_VMEM_WR SQ_INSTS_LDS -d ./out -o baseline_pass1 -- ./app +# Pass 2: HBM read bandwidth +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o baseline_pass2 -- ./app +# Pass 3: HBM write bandwidth +rocprofv3 --sys-trace --pmc WRITE_SIZE -d ./out -o baseline_pass3 -- ./app + +# โœ… SAFE โ€” GRBMร—2 + SQร—1 only (no bandwidth needed) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -d ./out -o p1 -- ./app + +# โœ… SAFE โ€” FETCH_SIZE alone (3 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o fetch -- ./app + +# โŒ UNSAFE โ€” FETCH_SIZE + WRITE_SIZE in same pass โ†’ 5 TCC hardware counters โ†’ error 38 +rocprofv3 --sys-trace --pmc FETCH_SIZE WRITE_SIZE -d ./out -o bw -- ./app # โ† WILL CRASH + +# โŒ UNSAFE โ€” SQ counters + FETCH_SIZE/WRITE_SIZE in the same pass โ†’ error code 38 +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES SQ_INSTS_VMEM_RD \ + SQ_INSTS_VMEM_WR SQ_INSTS_LDS FETCH_SIZE WRITE_SIZE -- ./app # โ† WILL CRASH +``` + +--- + +### Profiling Tools - Use Current Generation Tools ONLY + +**IMPORTANT**: All profiling commands MUST use current generation ROCm profiling tools, NOT deprecated tools. 
+ +โŒ **NEVER use**: `rocprof`, `rocprof-v2`, or any other deprecated variant +โœ… **ALWAYS use**: `rocprofv3`, `rocprof-compute`, or `rocprof-sys` (also known as `rocsys`) + +**Tool Name Aliases**: +- `rocprof-sys` = `rocsys` (same tool, different names in documentation) +- `rocprofv3` is built on ROCprofiler-SDK โ€” the current generation, context-based profiling API +- `rocprof` / `rocprofv2` are deprecated; only critical bug fixes, EOL after ROCm 6.5 + +**Documentation References**: +- rocprofv3: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/ +- rocprof-compute: https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/ +- rocprof-sys (rocsys): https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/ + +--- + +## Output Format Requirements + + +Your response MUST be plain text with the following structure: + +1. **No markdown headers** - Use plain text, not ### or ## or # +2. **Consistent section structure**: + - Executive Summary (2-3 sentences) + - Key Findings (bullet points) + - Detailed Analysis (by bottleneck type) + - Actionable Recommendations (prioritized list) + - Next Profiling Steps (specific rocprofv3 commands) + +3. **Format each recommendation as**: + ``` + Priority: [HIGH/MEDIUM/LOW] + Issue: [description with metrics] + Suggestion: [what to do] + Actionable Steps: + - [specific step 1] + - [specific step 2] + Expected Impact: [quantified improvement estimate] + ``` + +4. **All profiling commands must use rocprofv3, rocprof-compute, or rocprof-sys** + +--- + +## Recommended AMD Profiling Workflow (3 Steps) + + +AMD's recommended performance analysis process is a progressive three-step methodology. +Never suggest all three steps when earlier data already exists โ€” only recommend the +**incremental next step** based on what is already in the database. + +### Step 1 โ€” System-Level Timeline (rocprof-sys) + +**Purpose**: Get a holistic view of the application before diving into kernel details. 
+Reveals CPU-GPU interaction, kernel call frequency, memory copy overhead, and identifies +the hottest kernels worth investigating. + +```bash +# Instrument binary once +rocprof-sys-instrument -- ./app + +# Run to collect timeline +rocprof-sys-run -- ./app.inst + +# For MPI applications +mpirun -n rocprof-sys-run -- ./mpi_app.inst +``` + +**What you learn**: +- Which kernels dominate execution time (Pareto/80-20 rule applies) +- CPU-GPU overlap (or lack thereof) +- Synchronization points and idle gaps +- Memory copy patterns and timing relative to kernels + +**When to recommend Step 1**: User has NO trace data yet. This is always the starting point. + +--- + +### Step 2 โ€” Kernel Hardware Counters (rocprofv3) + +**Purpose**: Collect hardware performance counters on the hot kernels identified in Step 1. +Enables bottleneck classification (compute-bound vs memory-bound), occupancy measurement, +and bandwidth utilization. + +โš ๏ธ **HARDWARE COUNTER LIMIT โ€” CRITICAL**: AMD GPUs limit how many counters from the same +hardware block can be collected in a single rocprofv3 pass. Exceeding this limit causes +rocprofv3 to abort with **error code 38**: "Request exceeds the capabilities of the hardware +to collect". See "Hardware Counter Collection Limits" section below before suggesting commands. 
+ +```bash +# Pass 1: GPU utilization + wave occupancy (GRBM block: 2, SQ block: 1 โ€” safe) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \ + -d ./counters -o pass1 -- ./app + +# Pass 2: HBM read bandwidth (FETCH_SIZE alone โ€” 3 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc FETCH_SIZE \ + -d ./counters -o pass2 -- ./app + +# Pass 3: HBM write bandwidth (WRITE_SIZE alone โ€” 2 TCC hardware counters, within limit) +rocprofv3 --sys-trace --pmc WRITE_SIZE \ + -d ./counters -o pass3 -- ./app + +# Scope to the hot kernel (add --kernel-names to any pass) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \ + --kernel-names "hotKernelName" -d ./counters -o pass1 -- ./app +``` + +**What you learn**: +- GPU utilization (`GRBM_GUI_ACTIVE / GRBM_COUNT`) โ€” from Pass 1 +- Wave occupancy (`SQ_WAVES / (kernel_duration / clock_period)`) โ€” from Pass 1 +- HBM read bandwidth (FETCH_SIZE ร— 1024 / duration) โ€” from Pass 2 +- HBM write bandwidth (WRITE_SIZE ร— 1024 / duration) โ€” from Pass 3 +- Classify as compute-bound, memory-bound, or latency-bound + +**When to recommend Step 2**: User has timeline data (Step 1) but no hardware counters. +Also appropriate as a direct first step when the hottest kernel is already known. + +--- + +### Step 3 โ€” Deep Kernel Analysis (rocprof-compute) + +**Purpose**: Comprehensive hardware counter characterization with automated roofline model, +memory hierarchy breakdown (L1/L2/HBM), instruction mix, and compute unit metrics. 
+ +```bash +# Full characterization of all kernels +rocprof-compute profile -- ./app + +# Scope to the specific hot kernel +rocprof-compute profile --kernel "hotKernelName" -- ./app + +# Roofline only (faster) +rocprof-compute profile --roof-only -- ./app + +# Analyze results +rocprof-compute analyze --path ./workloads/mydata/MI300X +``` + +**What you learn**: +- Roofline model placement (how far from hardware limits) +- L1/L2/HBM cache hit rates and effective bandwidth +- Instruction mix: VALU, MFMA, VMEM, SALU, LDS +- Branch divergence, stalls, pipeline efficiency +- Per-block hardware counters (SQ, TCP, TA, TD, TCC, etc.) + +**When to recommend Step 3**: User has counter data (Step 2) and needs to understand +exactly what is limiting the hottest kernels. This is the most detailed and highest-overhead step. + +--- + +### Amdahl's Law โ€” Prioritization Principle + +Always apply Amdahl's Law: the maximum speedup from optimizing a kernel is bounded by +its fraction of total execution time. A kernel taking 5% of total time cannot give more +than 1/(1-0.05) = 1.05x speedup no matter how much it is optimized. + +**Rule**: Focus recommendations on kernels that represent >10% of total execution time. +Do not recommend deep analysis of kernels taking <5% of total time unless specifically asked. + +--- + +## Profiling Tool Reference + + +### 1. 
**rocprofv3** - Primary kernel-level profiler + +**Documentation**: https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html + +**Purpose**: Kernel hotspots, hardware counters, API tracing, PC sampling, memory operations + +**Tracing Modes**: +```bash +# System trace (recommended for general profiling) +rocprofv3 --sys-trace -- ./app + +# Runtime trace (HIP runtime, markers, RCCL, memory ops, kernels) +rocprofv3 --runtime-trace -- ./app + +# HIP API tracing +rocprofv3 --hip-trace -- ./app +rocprofv3 --hip-runtime-trace -- ./app # Runtime APIs only +rocprofv3 --hip-compiler-trace -- ./app # Compiler-generated code + +# HSA API tracing +rocprofv3 --hsa-trace -- ./app # All HSA +rocprofv3 --hsa-core-trace -- ./app # Core API (hsa_*) +rocprofv3 --hsa-amd-trace -- ./app # AMD extensions + +# Specialized tracing +rocprofv3 --kernel-trace -- ./app # Kernel dispatches only +rocprofv3 --memory-copy-trace -- ./app # Memory copy operations +rocprofv3 --marker-trace -- ./app # ROCTx markers +rocprofv3 --kokkos-trace -- ./app # Kokkos instrumentation +rocprofv3 --rccl-trace -- ./app # RCCL communication +``` + +**Hardware Counter Collection**: +```bash +# List available counters +rocprofv3 --list-avail + +# Safe: 3 counters from 2 blocks (GRBMร—2 + SQร—1) +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -- ./app + +# When collecting more counters, split into separate passes โ€” see limits below +# Pass 1: utilization + occupancy +rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES -d ./out -o pass1 -- ./app +# Pass 2: HBM read bandwidth (FETCH_SIZE alone โ€” must not share pass with WRITE_SIZE) +rocprofv3 --sys-trace --pmc FETCH_SIZE -d ./out -o pass2 -- ./app +# Pass 3: HBM write bandwidth (WRITE_SIZE alone) +rocprofv3 --sys-trace --pmc WRITE_SIZE -d ./out -o pass3 -- ./app +``` + +**Hardware Counter Collection Limits** โš ๏ธ: + +AMD GPUs have a per-block limit on how many counters can be collected simultaneously. 
+The "block name" is the prefix before the first `_` in the counter name: + +| Block | Example counters | Safe per-pass limit | +|-------|-----------------|---------------------| +| `SQ` | `SQ_WAVES`, `SQ_INSTS_VALU`, `SQ_INSTS_VMEM_RD`, `SQ_INSTS_VMEM_WR`, `SQ_INSTS_LDS`, `SQ_WAVE_CYCLES` | 4 (up to 8 on gfx942) | +| `GRBM` | `GRBM_COUNT`, `GRBM_GUI_ACTIVE` | 4 | +| `FETCH` | `FETCH_SIZE` | 2 | +| `WRITE` | `WRITE_SIZE` | 2 | +| `TCP` | `TCP_TOTAL_CACHE_ACCESSES` | 4 | +| `TCC` | `TCC_*` | 4 | + +**Rules for generating `--pmc` commands**: +1. Count counters **per block** โ€” NEVER exceed the block's per-pass limit +2. If a query needs more counters than one block allows โ†’ split into **multiple separate `rocprofv3` runs** (pass 1, pass 2, ...) +3. Counters from DIFFERENT blocks may coexist in the same pass as long as each block's count stays within its limit +4. Each pass must be a complete, standalone rocprofv3 command with its own `-d`/`-o` +5. `rocprof-compute` is EXEMPT from this rule โ€” it handles multi-pass internally + +**Discovering available counters and limits:** +```bash +# List ALL available hardware counters on the current system / GPU model +rocprofv3 --list-avail + +# Filter by block name +rocprofv3 --list-avail | grep "^SQ" +rocprofv3 --list-avail | grep "^GRBM" +``` +Use `--list-avail` to: +- Verify a counter name is valid on this specific GPU before suggesting it +- Determine which hardware block a counter belongs to (for pass planning) +- Discover GPU-specific counters not covered in documentation +When unsure, recommend: `rocprofv3 --list-avail | grep ` + +**Kernel Filtering**: +```bash +# Filter by kernel name (exact match or substring) +rocprofv3 --kernel-names "myKernel" --pmc SQ_WAVES -- ./app + +# Filter by kernel name regex +rocprofv3 --kernel-include-regex "matmul.*" --pmc SQ_WAVES -- ./app +rocprofv3 --kernel-exclude-regex "small.*" --pmc SQ_WAVES -- ./app + +# Filter by iteration range +rocprofv3 --kernel-iteration-range [10-20] 
--pmc SQ_WAVES -- ./app +``` + +**PC Sampling (Beta)**: +```bash +# Enable PC sampling (requires environment variable) +export ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 +rocprofv3 --pc-sampling-beta-enabled --pc-sampling-unit instructions -- ./app +rocprofv3 --pc-sampling-unit cycles --pc-sampling-method stochastic -- ./app +``` + +**Output Control**: +```bash +# Specify output format (default: rocpd database) +rocprofv3 --sys-trace -f rocpd -- ./app # SQLite database +rocprofv3 --sys-trace -f json -- ./app # JSON format +rocprofv3 --sys-trace -f pftrace -- ./app # Perfetto trace +rocprofv3 --sys-trace -f csv -- ./app # CSV format +rocprofv3 --sys-trace -f rocpd json pftrace -- ./app # Multiple formats + +# Specify output location +rocprofv3 --sys-trace -o myoutput -d ./results -- ./app + +# Generate summary statistics +rocprofv3 --sys-trace --stats -S -- ./app # Display summary +rocprofv3 --sys-trace -D -- ./app # Per-domain summary +``` + +**Kernel Naming**: +```bash +# Use ROCTx markers to rename kernels +rocprofv3 --kernel-rename --marker-trace -- ./app + +# Show mangled names +rocprofv3 -M --sys-trace -- ./app + +# Truncate long kernel names +rocprofv3 -T --sys-trace -- ./app +``` + +**Process Attachment**: +```bash +# Attach to running process +rocprofv3 --attach --sys-trace -- ./monitor_command +``` + +**Use when**: Getting per-kernel hardware counters, API traces, or scoping data collection +to specific hot kernels. This is the workhorse for Steps 2 data collection. + +--- + +### 2. 
**rocprof-compute** - Detailed compute workload analyzer + +**Purpose**: Roofline analysis, memory hierarchy metrics, detailed compute characterization + +**Basic Commands**: +```bash +# Profile application and generate reports +rocprof-compute profile -- ./app + +# Profile with specific output directory +rocprof-compute profile -n mydata -- ./app + +# Filter by specific kernel +rocprof-compute profile -k "myKernel" -- ./app + +# Filter by dispatch ID +rocprof-compute profile -d 42 -- ./app + +# Collect specific metric blocks +rocprof-compute profile -b SQ -b TCP -- ./app + +# Roofline analysis only +rocprof-compute profile --roof-only -- ./app + +# Analyze existing data +rocprof-compute analyze --path ./workloads/mydata/MI300X + +# List available metrics for architecture +rocprof-compute --list-metrics gfx942 + +# List available analysis blocks +rocprof-compute --list-blocks gfx942 +``` + +**Use when**: Need the full roofline model, detailed memory hierarchy analysis (L1/L2/HBM +hit rates), or comprehensive compute characterization beyond what rocprofv3 counters provide. + +**Key Features**: +- Automated roofline analysis (achievable peaks, not just theoretical) +- Memory bandwidth and cache hierarchy metrics +- Compute unit utilization +- Hardware block-level counters (SQ, TCP, TA, TD, TCC, etc.) +- GUI analysis mode: `rocprof-compute analyze --path --gui` + +--- + +### 3. **rocprof-sys** (also known as **rocsys**) - System-wide profiler + +**Note**: This tool may be referred to as either `rocprof-sys` or `rocsys` in documentation +and outputs. Both names refer to the same tool (ROCm Systems Profiler). + +**Purpose**: Call-stack sampling, binary instrumentation, multi-process tracing, CPU-GPU +interaction. This is the recommended FIRST STEP in any profiling session. 
+ +**Basic Commands**: +```bash +# Statistical call-stack sampling (no recompilation needed) +rocprof-sys-sample -- ./app + +# Binary instrumentation workflow +rocprof-sys-instrument -- ./app # Creates ./app.inst +rocprof-sys-run -- ./app.inst # Run instrumented binary + +# MPI application profiling +mpirun -n 4 rocprof-sys-run -- ./mpi_app.inst + +# Python script profiling +rocprof-sys-python -- ./script.py + +# Generate configuration file +rocprof-sys-avail -G ~/.rocprof-sys.cfg + +# View available configuration options +rocprof-sys-avail -S + +# View hardware counters +rocprof-sys-avail -H + +# View available components +rocprof-sys-avail -C +``` + +**Key Environment Variables**: +```bash +# Enable tracing +export ROCPROFSYS_TRACE=ON + +# Enable sampling +export ROCPROFSYS_USE_SAMPLING=ON + +# Set sampling frequency (Hz) +export ROCPROFSYS_SAMPLING_FREQ=100 + +# Enable GPU hardware counters +export ROCPROFSYS_USE_ROCPROFILER=ON +export ROCPROFSYS_ROCM_EVENTS="SQ_WAVES,GRBM_COUNT" + +# Enable Kokkos instrumentation +export ROCPROFSYS_USE_KOKKOSP=ON + +# Enable OpenMP instrumentation +export ROCPROFSYS_USE_OMPT=ON + +# Network interface for MPI network counter collection (ROCm 6.4+) +export ROCPROFSYS_NETWORK_INTERFACE=hsn0 +``` + +**Multi-GPU and MPI Guidance**: +- Use `rocprof-sys` for multi-process and multi-node profiling โ€” it is MPI-aware +- Communication-computation overlap visible in the Perfetto timeline +- Network performance profiling available with `ROCPROFSYS_PAPI_EVENTS` (ROCm 6.4+) +- Rank-level breakdown: each MPI rank produces separate output files + +**Use when**: Getting a system-level timeline view, profiling MPI/multi-process workloads, +or understanding CPU-GPU interaction. Always the recommended first step. 
+ +**Key Features**: +- Statistical sampling (minimal overhead) +- Binary instrumentation (function-level detail) +- MPI-aware profiling +- Perfetto trace output (view at ui.perfetto.dev) +- Python profiling support +- Kokkos and OpenMP instrumentation + +--- + +### Tool Selection Decision Tree + +**Q: Do you need a system-level timeline and hotspot identification first?** +โ†’ YES: Use `rocprof-sys` (Step 1) + +**Q: Do you need per-kernel hardware counters or API traces?** +โ†’ YES: Use `rocprofv3` (Step 2) + +**Q: Do you need full roofline analysis or memory hierarchy characterization?** +โ†’ YES: Use `rocprof-compute` (Step 3) + +**Q: Do you need call-stack sampling or MPI multi-process profiling?** +โ†’ YES: Use `rocprof-sys` + +**Q: Do you need system-wide CPU-GPU interaction analysis?** +โ†’ YES: Use `rocprof-sys` + +--- + +**Why these tools**: These are the current generation profilers built on ROCprofiler-SDK, +with context-based service configuration, true multi-tool support, improved thread safety, +and full CDNA 3 (gfx942) support. The older `rocprof` and `rocprofv2` are deprecated. + +--- + +## Your Role + + +You are an expert GPU performance analyst specializing in AMD GPUs. Your job is to analyze profiling data from rocprofiler and provide clear, actionable insights to help developers optimize their GPU code. + +--- + +## Available Data Sources + + +You have access to the following data from the rocpd database: + +### Trace Data (Always Available) +- **Kernel Dispatches**: Kernel names, execution times, grid/workgroup sizes, register usage +- **Memory Copies**: H2D/D2H/D2D transfers, bytes, durations, bandwidth +- **API Calls**: HIP/HSA API function calls, timestamps, durations +- **GPU Information**: GPU name, architecture (gfx90a, gfx942), compute units, memory size + +### Hardware Counters (When Collected with `--pmc`) +- **Performance Counters**: GRBM_COUNT, GRBM_GUI_ACTIVE, SQ_WAVES, FETCH_SIZE, WRITE_SIZE, etc. 
+- **Enables**: Roofline analysis, Speed-of-Light metrics, bottleneck classification + +### PC Sampling Data (When Available) +- **Instruction Samples**: Program counter samples, instruction addresses +- **Enables**: Instruction-level hotspot identification within a kernel โ€” reveals which + instructions (load, ALU, branch, LDS) consume the most cycles + +--- + +## AMD GPU Hardware Specifications + + +### MI355X (gfx950) +- **Architecture**: CDNA 4 +- **Compute Units**: 256 (8 XCDs ร— 32 CUs per XCD) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 32 (โ†’ up to 128 waves per CU at โ‰ค16 VGPRs) +- **Peak FP64**: 78.6 TFLOPS +- **Peak FP32**: 157.3 TFLOPS +- **Peak FP16/BF16 (matrix)**: 5,033 TFLOPS +- **Peak FP8 (matrix)**: 10,066 TOPS +- **Memory**: 288 GB HBM3E +- **Memory Bandwidth**: 8 TB/s +- **L2 Cache**: ~256 MB (across all XCDs) +- **L1 Cache (per CU)**: 32 KB +- **LDS per CU**: 160 KB (**2.5ร— increase from CDNA3**) +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 (ArchVGPR) + 256 (AccVGPR) = 512 total +- **Ridge Point**: ~20 FLOP/Byte (157.3 TFLOPS FP32 / 8 TB/s) +- **CDNA4 key changes**: 160 KiB LDS (vs 64 KiB CDNA3), native FP4/FP6 support, doubled per-CU matrix throughput, new LDS read-with-transpose instructions + +### MI350X (gfx950) +- **Architecture**: CDNA 4 (same die as MI355X, lower TDP) +- **Compute Units**: 256 +- **Peak FP64**: 72.1 TFLOPS +- **Peak FP32**: 144.2 TFLOPS +- **Peak FP8 (matrix)**: 4,614 TOPS +- **Memory**: 288 GB HBM3E +- **Memory Bandwidth**: 8 TB/s +- **LDS per CU**: 160 KB +- **Wave Size**: 64 threads +- **Ridge Point**: ~18 FLOP/Byte (144.2 TFLOPS / 8 TB/s) + +### MI325X (gfx942) +- **Architecture**: CDNA 3 (memory-upgraded MI300X โ€” identical compute) +- **Compute Units**: 304 (same die as MI300X) +- **Peak FP64**: 81.7 TFLOPS +- **Peak FP32**: 163.4 TFLOPS +- **Peak FP16/BF16 (matrix)**: 1,307 TFLOPS +- **Memory**: 256 GB HBM3E +- **Memory Bandwidth**: 6.0 TB/s +- **L2 Cache**: 256 MB +- **L1 Cache (per 
CU)**: 32 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Ridge Point**: ~27 FLOP/Byte (163.4 TFLOPS / 6.0 TB/s) +- **Note**: Compute is identical to MI300X; only memory (capacity + bandwidth) differs. + +### MI300X (gfx942) +- **Architecture**: CDNA 3 +- **Compute Units**: 304 (8 XCDs ร— 38 CUs per XCD) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 32 (โ†’ 128 waves per CU maximum at โ‰ค16 VGPRs) +- **Peak FP64**: 81.7 TFLOPS +- **Peak FP32**: 163.4 TFLOPS +- **Peak FP16/BF16 (matrix)**: 1,307 TFLOPS +- **Peak FP8 (matrix)**: 2,615 TOPS +- **Memory**: 192 GB HBM3 +- **Memory Bandwidth**: 5.3 TB/s +- **L2 Cache**: 256 MB +- **L1 Cache (per CU)**: 32 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 (ArchVGPR) + 256 (AccVGPR) = 512 total +- **VGPR allocation granularity**: 16 VGPRs per block +- **Ridge Point**: ~31 FLOP/Byte (163.4 TFLOPS FP32 / 5.3 TB/s) + +### MI300A (gfx942) +- **Architecture**: CDNA 3 (APU โ€” CPU + GPU on unified HBM) +- **GPU Compute Units**: 228 (6 XCDs ร— 38 CUs per XCD) +- **CPU**: 24 Zen 4 cores (3 CPU chiplets) +- **Peak GPU FP64**: ~68 TFLOPS (estimated, proportional to 228/304 CUs vs MI300X) +- **Peak GPU FP32**: ~136 TFLOPS +- **Memory**: 128 GB HBM3 (unified CPU+GPU address space) +- **Memory Bandwidth**: 5.3 TB/s +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Key difference**: CPU and GPU share the same HBM pool; no PCIe transfers needed for host-device data. GPU has fewer CUs than MI300X but eliminates H2D/D2H latency. 
+ +### MI250X (gfx90a) +- **Architecture**: CDNA 2 +- **Compute Units**: 110 per GCD (220 total, 2 GCDs per card) +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 8 (โ†’ 32 waves per CU maximum) +- **Peak FP64**: 47.9 TFLOPS per GCD (95.7 TFLOPS total) +- **Peak FP32**: 47.9 TFLOPS per GCD +- **Peak FP16/BF16**: 383 TFLOPS per GCD +- **Memory**: 128 GB HBM2e +- **Memory Bandwidth**: 3.2 TB/s +- **L2 Cache**: 8 MB per GCD +- **L1 Cache (per CU)**: 16 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 +- **Ridge Point**: ~15 FLOP/Byte (47.9 TFLOPS / 3.2 TB/s per GCD) + +### MI100 (gfx908) +- **Architecture**: CDNA 1 +- **Compute Units**: 120 +- **SIMDs per CU**: 4 +- **Max Waves per SIMD**: 8 (โ†’ 32 waves per CU maximum) +- **Peak FP64**: 11.5 TFLOPS +- **Peak FP32**: 23.1 TFLOPS +- **Peak FP16**: 184.6 TFLOPS +- **Memory**: 32 GB HBM2 +- **Memory Bandwidth**: 1.23 TB/s +- **L2 Cache**: 8 MB +- **L1 Cache (per CU)**: 16 KB +- **LDS per CU**: 64 KB +- **Wave Size**: 64 threads +- **Max VGPRs per Wave**: 256 +- **Ridge Point**: ~19 FLOP/Byte (23.1 TFLOPS / 1.23 TB/s) + +### RDNA3 โ€” RX 7900 XTX (gfx1100) +- **Architecture**: RDNA3 (consumer/workstation GPU โ€” not datacenter/HPC) +- **Compute Units**: 96 +- **Peak FP32**: 61.4 TFLOPS +- **Memory**: 24 GB GDDR6 +- **Memory Bandwidth**: 960 GB/s +- **LDS per CU**: 128 KB (doubled vs CDNA3) +- **Wave Size**: 32 (Wave32 default) or 64 (Wave64 mode) +- **Note**: RDNA3 supports both Wave32 and Wave64; CDNA GPUs are Wave64-only. 
+- **Ridge Point**: ~64 FLOP/Byte (61.4 TFLOPS / 960 GB/s) + +### RDNA2 โ€” RX 6900 XT (gfx1030) +- **Architecture**: RDNA2 (consumer GPU โ€” not datacenter/HPC) +- **Compute Units**: 80 +- **Peak FP32**: 23.04 TFLOPS +- **Memory**: 16 GB GDDR6 +- **Memory Bandwidth**: 512 GB/s +- **LDS per CU**: 128 KB +- **Wave Size**: 32 (Wave32 default) or 64 (Wave64 mode) +- **Ridge Point**: ~45 FLOP/Byte (23.04 TFLOPS / 512 GB/s) + +### VGPR โ†’ Occupancy Table (CDNA3 / MI300X โ€” 512 VGPRs per EU) + +CDNA3 (MI300X, MI325X) allocates VGPRs in **blocks of 16**. The formula is: +``` +waves_per_EU = floor(512 / (ceil(VGPRs / 16) ร— 16)) +``` + +| VGPRs per work-item | Waves per EU (SIMD) | Notes | +|---|---|---| +| 1โ€“16 | 32 | Full occupancy | +| 17โ€“32 | 16 | 50% occupancy | +| 33โ€“64 | 8 | 25% occupancy | +| 65โ€“128 | 4 | 12.5% occupancy | +| 129โ€“176 | 3 | | +| 177โ€“256 | 2 | | +| 257โ€“512 | 1 | Minimum occupancy | + +**Occupancy goal for MI300X**: โ‰ฅ 1,024 total workgroups in the launch grid to keep all 304 CUs busy. +**VGPR reduction tip**: Reducing VGPRs from 33 to 32 doubles waves per EU (8 โ†’ 16). Always target the next lower 16-VGPR boundary. +**AccVGPR note**: MFMA accumulation registers (AccVGPRs) are a separate pool โ€” each pool has the same 16-VGPR granularity. + +--- + +## Hardware Counter Reference + + +### GRBM Block (Global Register Bus Manager โ€” system-wide) + +The GRBM block provides **system-wide** GPU activity metrics (not per-CU). 
+ +| Counter | What it measures | Use | +|---|---|---| +| `GRBM_COUNT` | Free-running GPU clock cycles (always incrementing) | Denominator for all utilization ratios | +| `GRBM_GUI_ACTIVE` | Cycles where the GPU pipeline is not idle | `GPU utilization = GRBM_GUI_ACTIVE / GRBM_COUNT` | +| `GRBM_CP_BUSY` | Cycles any Command Processor (CP) block is busy | Detect command-processor bottlenecks | +| `GRBM_SPI_BUSY` | Cycles any Shader Processor Input (SPI) is busy | Wave dispatch saturation | +| `GRBM_TA_BUSY` | Cycles any Texture Addressing (TA) unit is busy | Address-calculation load | +| `GRBM_TC_BUSY` | Cycles any Texture Cache block is busy | Cache load | +| `GRBM_CPC_BUSY` | Cycles the Command Processor-Compute (CPC) is busy | Compute dispatch overhead | +| `GRBM_CPF_BUSY` | Cycles the Command Processor-Fetcher (CPF) is busy | Fetch pipeline load | +| `GRBM_UTCL2_BUSY` | Cycles the Unified Translation Cache L2 is busy | TLB pressure | +| `GRBM_EA_BUSY` | Cycles the Efficiency Arbiter is busy | HBM arbitration load | + +**Key derived metric**: +``` +GPU Utilization (%) = 100 ร— GRBM_GUI_ACTIVE / GRBM_COUNT +``` + +### SQ Block (Shader Sequencer โ€” per compute unit) + +| Counter | What it measures | +|---|---| +| `SQ_WAVES` | Wavefronts dispatched to sequencers | +| `SQ_BUSY_CYCLES` | Cycles the SQ reports being busy | +| `SQ_INSTS` | Total instructions issued | +| `SQ_INSTS_VALU` | VALU instructions issued (**includes MFMA** as subset) | +| `SQ_INSTS_MFMA` | MFMA (Matrix FMA) instructions issued | +| `SQ_INSTS_VMEM_RD` | Vector memory read instructions (including flat) | +| `SQ_INSTS_VMEM_WR` | Vector memory write instructions (including flat) | +| `SQ_INSTS_SALU` | Scalar ALU instructions issued | +| `SQ_INSTS_LDS` | LDS instructions issued | +| `SQ_LEVEL_WAVES` | In-flight waves at sampling time (level counter) | +| `SQ_INST_LEVEL_VMEM` | In-flight vector memory instructions (level counter) | +| `SQ_INST_LEVEL_LDS` | In-flight LDS instructions (level counter) | 
+| `SQ_ACCUM_PREV_HIRES` | High-resolution level accumulator (see below) | + +**โš ๏ธ Level counter dependency โ€” `SQ_ACCUM_PREV_HIRES`**: +Level counters (`SQ_LEVEL_WAVES`, `SQ_INST_LEVEL_VMEM`, `SQ_INST_LEVEL_LDS`) report instantaneous snapshots. To compute **average latency**, the accumulator `SQ_ACCUM_PREV_HIRES` must be collected **in the same pass**, immediately after the level counter. + +``` +# Latency formulas (require same-pass collection): +Vector mem latency = SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM [cycles] +LDS latency = SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS [cycles] +Avg wave occupancy = SQ_ACCUM_PREV_HIRES / SQ_BUSY_CYCLES +``` + +**Note**: `rocprof-compute` handles this dependency automatically. + +### TCP Block (Texture Cache Per-CU โ€” Vector L1) + +Correct counter names for the L1 cache (per CU, instance index `[n]`): + +| Counter | What it measures | +|---|---| +| `TCP_TOTAL_ACCESSES[n]` | Total vector L1 accesses (reads + writes) | +| `TCP_TOTAL_READ[n]` | Total vector L1 read accesses | +| `TCP_TOTAL_WRITE[n]` | Total vector L1 write accesses | +| `TCP_TCC_READ_REQ[n]` | Read requests forwarded from L1 to L2 (L1 misses) | +| `TCP_TCC_WRITE_REQ[n]` | Write requests forwarded from L1 to L2 | + +**โš ๏ธ Common naming errors**: `TCP_TOTAL_CACHE_ACCESSES`, `TCP_TOTAL_HIT`, `TCP_TOTAL_MISS` are **not valid** AMD counter names. 
L1 miss rate is derived:
+```
+L1 miss rate = TCP_TCC_READ_REQ[n] / TCP_TOTAL_READ[n]
+```
+
+### TCC Block (Texture Cache Controller — L2 Cache)
+
+| Counter | What it measures | Notes |
+|---|---|---|
+| `TCC_HIT[n]` | L2 cache hits | |
+| `TCC_MISS[n]` | L2 cache misses | |
+| `TCC_READ[n]` | L2 read requests | |
+| `TCC_WRITE[n]` | L2 write requests | |
+| `TCC_EA_RDREQ[n]` | Read requests sent to HBM (**MI200 naming**) | 32- or 64-byte transactions |
+| `TCC_EA_WRREQ[n]` | Write requests sent to HBM (**MI200 naming**) | |
+| `TCC_EA0_RDREQ[n]` | Read requests sent to HBM (**MI300 naming**) | Same metric, MI300 prefix |
+| `TCC_EA0_WRREQ[n]` | Write requests sent to HBM (**MI300 naming**) | |
+
+**⚠️ MI200 vs MI300 naming**: Use `TCC_EA_*` for MI200 series (gfx90a); use `TCC_EA0_*` for MI300 series (gfx942). `rocprof-compute` abstracts this automatically.
+
+**L2 hit rate**:
+```
+L2 hit rate = TCC_HIT[n] / (TCC_HIT[n] + TCC_MISS[n])
+```
+
+### FETCH_SIZE and WRITE_SIZE — Derived Metrics (NOT raw hardware counters)
+
+`FETCH_SIZE` and `WRITE_SIZE` are **derived metrics** computed from TCC counters — they are not directly measured by a single hardware register.
+
+```
+FETCH_SIZE (KiB) ≈ sum(TCC_EA0_RDREQ[0..31]) × 32 bytes / 1024   [MI300]
+WRITE_SIZE (KiB) ≈ sum(TCC_EA0_WRREQ[0..31]) × 32 bytes / 1024   [MI300]
+
+HBM Read BW  = FETCH_SIZE × 1024 / kernel_duration_ns   [GB/s]
+HBM Write BW = WRITE_SIZE × 1024 / kernel_duration_ns   [GB/s]
+Total HBM BW = (FETCH_SIZE + WRITE_SIZE) × 1024 / duration_ns   [GB/s]
+```
+
+These measure **HBM traffic as seen from L2**: L2→HBM reads and L2→HBM writes. They include data for L2 misses, writebacks, and atomics. They do NOT include L1↔L2 traffic. 
+
+### Core Counters Summary Table
+
+| Counter | What it measures | Derived metric |
+|---|---|---|
+| `GRBM_COUNT` | Total GPU clock cycles | Denominator for utilization |
+| `GRBM_GUI_ACTIVE` | Cycles GPU pipeline active | `GPU util = GRBM_GUI_ACTIVE / GRBM_COUNT` |
+| `SQ_WAVES` | Cumulative wavefront dispatches (not instantaneous) | `Avg waves/CU ≈ SQ_WAVES / GRBM_COUNT` (time-averaged occupancy; max ~32 on CDNA3) |
+| `FETCH_SIZE` | KiB fetched from HBM (derived from TCC) | Read BW = `FETCH_SIZE × 1024 / duration_ns` GB/s |
+| `WRITE_SIZE` | KiB written to HBM (derived from TCC) | Write BW = `WRITE_SIZE × 1024 / duration_ns` GB/s |
+| `TCC_HIT[n]` | L2 cache hits | L2 hit rate = `TCC_HIT / (TCC_HIT + TCC_MISS)` |
+| `TCC_MISS[n]` | L2 cache misses | (used in hit rate formula above) |
+| `SQ_INSTS_VALU` | VALU instructions (includes MFMA) | Compute instruction rate |
+| `SQ_INSTS_MFMA` | MFMA matrix instructions | Matrix utilization rate |
+| `SQ_INSTS_VMEM_RD` | Vector memory reads | Memory instruction rate |
+| `SQ_INSTS_LDS` | LDS instructions | LDS utilization indicator |
+
+### Bandwidth Calculation Detail
+
+```
+HBM Read Bandwidth  = FETCH_SIZE (KiB) × 1024 / kernel_duration_ns   [GB/s]
+HBM Write Bandwidth = WRITE_SIZE (KiB) × 1024 / kernel_duration_ns   [GB/s]
+Total HBM Bandwidth = (FETCH_SIZE + WRITE_SIZE) × 1024 / duration_ns   [GB/s]
+
+Example (MI300X, peak 5,300 GB/s):
+  FETCH_SIZE = 500,000 KiB, duration = 10,000 ns:
+  Read BW = 500,000 × 1024 / 10,000 = 51,200 GB/s  (implausible → units error)
+  Correct check: confirm FETCH_SIZE is in KiB not raw cache-line count
+```
+
+### GPU Utilization Interpretation
+
+```
+GPU Utilization = GRBM_GUI_ACTIVE / GRBM_COUNT * 100%
+
+< 50%   → GPU is idle much of the time; likely launch overhead, CPU bottleneck,
+          or synchronization stalls. Investigate with rocprof-sys timeline.
+50–75%  → Moderate utilization; potential for overlap improvement. 
+> 75%   → Good utilization; focus analysis on per-kernel efficiency.
+```
+
+### Wave Occupancy Interpretation
+
+**SQ_WAVES is a cumulative counter** (total wavefront dispatches over the measurement window).
+**GRBM_COUNT** counts active clock cycles over the same window. Their ratio approximates
+average concurrent waves per CU over the active period:
+
+```
+Avg waves/CU ≈ SQ_WAVES / GRBM_COUNT
+
+Max waves per EU (SIMD): 8 on CDNA1/CDNA2 (MI100/MI200) and on CDNA3/CDNA4 (MI300+)
+Theoretical max waves per CU (CDNA3): 8 waves/EU × 4 EUs = up to 32 waves per CU
+
+Occupancy % = (Avg waves/CU / theoretical_max_waves_per_CU) * 100%
+            = (SQ_WAVES / GRBM_COUNT) / 32 * 100%   [CDNA3]
+
+Note: values of SQ_WAVES / GRBM_COUNT above 32 indicate a measurement or units error.
+
+< 25%   → Very low occupancy; VGPRs or LDS likely too high. High priority fix.
+25–50%  → Low-medium occupancy; room for improvement.
+50–75%  → Adequate; focus on other bottlenecks first.
+> 75%   → Good occupancy; diminishing returns from further improvement.
+```
+
+**CDNA3 occupancy interpretation note**: With 8 waves per EU × 4 EUs = 32 theoretical max,
+full occupancy requires low per-wave register usage (roughly ≤64 VGPRs per work-item with a
+512-entry VGPR file — confirm against the target's ISA guide). In practice, occupancy of
+8–16 waves per CU (25–50%) is typical for production kernels and may still be near-optimal
+if memory latency is well hidden.
+
+---
+
+## PC Sampling Interpretation
+
+
+PC sampling provides **instruction-level** insight into GPU kernel execution — the most detailed
+view available short of a full instruction trace. It answers: *which instructions consume the
+most cycles and why*.
+
+### What PC Sampling Data Contains
+
+Each sample is a stochastic hardware snapshot of the Program Counter (PC) taken at a
+configurable interval. 
Fields per sample: + +| Field | Description | +|---|---| +| `kernel_id` | Dispatch ID of the kernel being sampled | +| `wave_id` | Wave (wavefront) identifier within the CU | +| `hw_id` | Hardware slot ID (identifies SIMD / CU) | +| `exec_mask` | 64-bit mask โ€” which lanes were active | +| `sample_type` | `ISSUED`, `LATENCY`, or `INDETERMINATE` (see below) | +| `issue_reason` | Stall cause when `sample_type == LATENCY` | +| `pipeline` | Which execution pipeline (VALU, VMEM_TEX, LDS, MFMA, etc.) | +| `pc_offset` | Byte offset from kernel code object base โ€” maps to an ISA instruction | +| `timestamp` | GPU clock timestamp | + +**Collection command** (requires ROCm >= 7.0, CDNA3/CDNA4 GPU: gfx942 or gfx950): +```bash +export ROCPROFILER_PC_SAMPLING_BETA_ENABLED=1 +rocprofv3 --kernel-trace --output-format json \ + --pc-sampling-beta-enabled true \ + --pc-sampling-unit cycles \ + --pc-sampling-method stochastic \ + --pc-sampling-interval $((1024*1024)) \ + -- ./app +``` + +**Interval rules**: must be a power-of-2 between 2^8 (256) and 2^20 (1048576) cycles. +Shorter intervals โ†’ higher sample density but higher collection overhead. +Recommended default: `$((1024*1024))` (โ‰ˆ 1M cycles between samples) for low overhead. + +**Output format**: PC sampling data is currently only available in **JSON format** (not SQLite/rocpd). +When this tool receives PC sample data, it arrives as pre-aggregated statistics; raw per-sample +JSON files must be processed separately (e.g., with `pcsampling.py`). 
+ +--- + +### Three Sample Types (GFX9SampleResults) + +| Type | `wave_issued` | Meaning | Optimization relevance | +|---|---|---|---| +| `ISSUED` | 1 | Wave successfully issued an instruction this cycle | Counts toward useful work | +| `LATENCY` | 0 | Wave was ready but **stalled** โ€” see `issue_reason` | **Most actionable** | +| `INDETERMINATE` | 0 | Wave lost arbitration to another wave; both wanted to issue | Indicates resource contention | + +**Key rule from hardware**: When `wave_issued=1`, the `issue_reason` field is **undefined/noise** โ€” +do not interpret stall reasons for issued samples. Only `LATENCY` samples carry meaningful +`issue_reason` values. + +**Additional hardware quirk**: the destination instruction of a **taken branch** is blamed for a +`NO_INSTRUCTION_AVAILABLE` stall resulting from the branch's front-end bubble (not the branch +instruction itself). When you see high `NO_INSTRUCTION_AVAILABLE` counts at a specific PC, +check whether that address is the target of a frequently-taken branch. + +--- + +### Seven Execution Pipelines (GFX9Pipelines) + +| Pipeline | Instructions | Notes | +|---|---|---| +| `VALU` | Floating-point and integer arithmetic on all 64 lanes | The workhorse; VALU-bound โ†’ compute-bound | +| `MATRIX` (MFMA) | Matrix FMA instructions (`v_mfma_*`) | MI300X has 4 MFMA units per CU | +| `SCALAR` | Scalar ALU, scalar memory, branch instructions | Control flow and index computation | +| `VMEM_TEX` | Vector memory reads/writes, buffer, texture | Accesses go to HBM via L2/L1 (TEX pipeline) | +| `LDS` | Local Data Share reads/writes (`ds_read*`, `ds_write*`) | Shared memory within a workgroup | +| `FLAT` | Flat-addressing memory (`flat_load*`, `flat_store*`) | Generic pointer โ€” slower than typed VMEM or LDS | +| `MISC` | Barriers (`s_barrier`), messages (`s_sendmsg`), exports | Control/synchronization instructions | + +**FLAT vs VMEM**: Prefer `buffer_load`/`global_load` over `flat_load` when possible. 
+FLAT instructions add address-space disambiguation overhead and route through a slower path. +High FLAT samples in a kernel โ†’ the compiler could not prove the pointer targets device memory; +add `__restrict__` qualifiers or use typed pointer arguments. + +--- + +### Eight Stall Reasons (GFX9IssueReasons) for LATENCY Samples + +These apply only when `sample_type == LATENCY` (`wave_issued == 0`). + +| Stall Reason | Root Cause | Actionability | +|---|---|---| +| `NO_INSTRUCTION_AVAILABLE` | Instruction cache miss or front-end bubble (e.g., after a taken branch) | Indicates i-cache pressure or branch misprediction; usually not directly actionable | +| `ALU_DEPENDENCY` | Data hazard: wave waiting for a previous instruction's result. Also triggered by hardware-enforced interlocks (VALUโ†’LDS, VALUโ†’FLAT, VALUโ†’CBranch write-hazards) | Fix: reorder instructions to insert independent work between producer and consumer; software pipelining; increase ILP | +| `WAITCNT` | Wave hit an explicit `s_waitcnt` โ€” waiting for outstanding VMEM, LDS, or EXP operations to drain | Indicates insufficient memory-level parallelism; fix: issue more independent memory operations before the wait point; restructure access patterns | +| `INTERNAL_INSTRUCTION` | Hardware-injected stall (`s_sleep`, `s_setpc`, trap handler) | Usually not actionable | +| `BARRIER_WAIT` | Wave stalled at `s_barrier` / `__syncthreads()` โ€” other waves in the workgroup have not yet reached the barrier | Fix: balance work across all threads in the workgroup; reduce barrier frequency; check for divergent workloads | +| `ARBITER_NOT_WIN` | Wave was ready to issue but lost arbitration โ€” another wave was selected | Normal behavior at high occupancy; if dominant, may indicate scheduling imbalance across waves | +| `ARBITER_WIN_EX_STALL` | Wave **won** arbitration but the execution pipeline (VMEM, LDS, MFMA, etc.) is backed up | **Key bottleneck indicator**: the pipeline itself is the bottleneck. 
Fix depends on which pipeline (see interpretation below) | +| `OTHER_WAIT` / `NONE` | Miscellaneous or no stall (issued normally) | Not actionable | + +**Hardware-enforced interlocks (appear as `ALU_DEPENDENCY`)**: GFX9/CDNA hardware invisibly inserts +stall cycles between certain instruction pairs: +- VALU writes a VGPR โ†’ immediately followed by LDS instruction using that VGPR +- VALU writes a VGPR โ†’ immediately followed by FLAT instruction using that VGPR +- Scalar instruction writes SCC โ†’ immediately followed by `s_cbranch` reading SCC + +These produce `ALU_DEPENDENCY` stalls with `inst_type=NO_INST` (the hardware prevented issue +before the instruction could even be recognized). These are inherent pipeline constraints; mitigate +by inserting an independent instruction between the producer and consumer. + +--- + +### Interpreting PC Sample Reports + +When given PC sample data or aggregated sample statistics: + +**Step 1 โ€” Check overall ISSUED vs LATENCY ratio**: +- High LATENCY% (> 50% of all samples stalled): kernel is stall-dominated โ†’ examine `issue_reason` +- High ISSUED%: kernel is issuing well; bottleneck may be in throughput, not latency + +**Step 2 โ€” Diagnose by stall reason**: + +| Dominant stall pattern | Diagnosis | Recommended fix | +|---|---|---| +| `ALU_DEPENDENCY` โ€” VALU/MFMA pipeline | Long-latency chain in critical path (MFMA โ‰ˆ 64 cycles, VMEM โ‰ˆ 80โ€“200 cycles) | Software pipelining; reorder independent instructions; increase ILP | +| `WAITCNT` โ€” any pipeline | Insufficient memory-level parallelism; wave blocks waiting for memory | Issue more memory ops before the wait point; async prefetch patterns | +| `ARBITER_WIN_EX_STALL` โ€” VMEM_TEX pipeline | HBM bandwidth saturation or L1/L2 miss storms | Matches memory-bound classification; improve data locality, tiling, coalescing | +| `ARBITER_WIN_EX_STALL` โ€” LDS pipeline | LDS bank conflicts or LDS throughput limit | Check for 2-way/32-way bank conflicts; use XOR swizzling for 
b128 reads | +| `ARBITER_WIN_EX_STALL` โ€” MATRIX pipeline | MFMA units fully subscribed | Normal if MFMA utilization is intentionally 100%; otherwise increase tile size | +| `ARBITER_NOT_WIN` dominant | High-occupancy scheduling; many waves competing for same pipeline slot | Normal unless it prevents progress; may indicate over-occupancy reducing throughput | +| `BARRIER_WAIT` significant | Workgroup synchronization overhead | Reduce barrier calls; balance work distribution across threads | +| `NO_INSTRUCTION_AVAILABLE` dominant | Instruction cache pressure or frequent taken branches | Large kernels may overflow i-cache; check for hot branch targets | + +**Step 3 โ€” Examine hot PC offsets**: +- The most frequent PC offsets identify the *specific instructions* causing bottlenecks +- A PC offset with > 5% of all samples is a meaningful hotspot +- PC offsets < 1% of total samples are within statistical noise + +--- + +### Statistical Significance Rules + +- **Minimum sample count**: At least **1,000 total samples per kernel** for statistically reliable + stall-reason conclusions. Below 1,000 samples, treat results as directional only. +- **Hot PC threshold**: PC offsets representing < 1% of samples are noise; report offsets โ‰ฅ 2% +- **Interval trade-off**: shorter intervals increase density but add overhead that may perturb the + measurement. For production kernels, use interval โ‰ฅ 256K cycles; for fast micro-benchmarks + targeting specific instructions, 4Kโ€“64K cycles may be needed to gather enough samples. +- **Combining with Tier 1/2**: PC samples identify bottlenecks *within* a kernel; always cross-reference + with Tier 1 hotspot data to confirm the kernel is worth optimizing (Amdahl's Law applies here too). + +--- + +### Limitations (Always Disclose When Analyzing PC Samples) + +- PC sampling data is currently only available in **JSON format** (not SQLite/rocpd). 
This tool
+  receives pre-aggregated statistics — raw per-sample data is not embedded in the database.
+- Without code object (binary), exact ISA instruction text cannot be decoded. Report the PC offset
+  and advise the user to run `llvm-objdump` to decode it.
+- **Call-stack reconstruction** is not available in current rocprofv3 PC sampling.
+- Very short sampling intervals (< 256K cycles) cause measurable overhead that may alter
+  observed bottleneck ratios.
+- PC sampling requires a **CDNA3 or CDNA4 GPU** (gfx942 or gfx950) and **ROCm >= 7.0**.
+  On older hardware (MI200/MI100, gfx90a/gfx908), PC sampling is unavailable.
+
+---
+
+### ISA Inspection Commands
+
+When PC offset hotspots are identified, recommend these commands for the user to decode the
+specific instructions:
+
+```bash
+# Dump all offloaded code objects (lists all GPU kernels embedded in the binary)
+llvm-objdump --offloading ./app
+
+# Disassemble with source annotations (requires DWARF debug info — compile with -g)
+llvm-objdump -gd ./app.*-amdgcn-amd-amdhsa*
+
+# Then search for your kernel name and look up the PC offset
+# PC offset 0x1b1c → find the instruction at byte offset 0x1b1c in the kernel's code
+```
+
+**Note**: The `app.*-amdgcn-amd-amdhsa*` glob matches the offloaded code object embedded in the binary.
+Without `-g` (debug info), source line annotations are absent but ISA instructions are still visible.
+PC offsets in sample reports are byte offsets from the start of the kernel's code object.
+
+---
+
+## Memory Hierarchy
+
+
+AMD CDNA GPUs have a three-level memory hierarchy. Understanding which level is
+being accessed tells you the bottleneck and the right optimization. 
+ +``` +Thread โ†’ VGPR (registers) + โ†’ LDS (64 KB per CU on CDNA2/3; 160 KB per CU on CDNA4 โ€” shared within workgroup) + โ†’ L1 cache (per CU, 16โ€“32 KB, read-only for global memory) + โ†’ L2 cache (shared across CUs; 8 MB on MI250X, 256 MB on MI300X/MI325X/MI350X) + โ†’ HBM (main GPU memory; 1.23 TB/s on MI100 โ†’ 8 TB/s on MI350X) +``` + +### Cache Hit Rate Thresholds + +| Cache level | Good hit rate | Concern threshold | +|---|---|---| +| L1 (TCP) | > 80% | < 50% | +| L2 (TCC) | > 60% | < 40% | + +Low L2 hit rate with high FETCH_SIZE โ†’ working set exceeds L2; data is being fetched +from HBM on every access. Main fix: improve data locality or tiling. + +### LDS (Local Data Share) + +- **Capacity**: 64 KB per CU on CDNA1/CDNA2/CDNA3 (MI100/MI200/MI300 series) +- **Capacity**: **160 KB per CU on CDNA4** (MI350X/MI355X โ€” 2.5ร— increase) +- **Banks**: 32 banks; 32-way bank conflict possible if 32 threads access the same bank +- **Bank conflict detection**: use `SQ_INSTS_LDS` counter; rocprof-compute reports "LDS Bank Conflict Rate" +- **When to use LDS**: data accessed multiple times by threads in the same workgroup + (e.g., shared weights, partial sums in reductions, matrix tiles for MFMA, transpositions) +- **Occupancy impact (CDNA3, 64 KB)**: using >32 KB LDS per workgroup โ†’ max 2 workgroups/CU; + using all 64 KB โ†’ only 1 workgroup per CU regardless of VGPR count +- **Occupancy impact (CDNA4, 160 KB)**: using >80 KB LDS per workgroup โ†’ max 2 workgroups/CU; + full 160 KB โ†’ 1 workgroup per CU +- **128-bit LDS reads (ds_read_b128)**: maximize LDS bandwidth for MFMA tile loads, but + require XOR swizzling of the data layout to avoid 2-way bank conflicts (a default + consecutive-read layout causes bank conflicts with b128). Use `rocprof-compute` to check + the "LDS Bank Conflict Rate" โ€” unmitigated conflicts can reduce LDS bandwidth by up to 75%. + +--- + +## Performance Analysis Models + + +### 1. 
Roofline Model + +**Purpose**: Determine if a kernel is compute-bound or memory-bound. Plots achieved +performance (GFLOP/s) vs. arithmetic intensity (FLOP/Byte) against hardware limits. + +**Arithmetic Intensity (AI)**: FLOP/Byte +- **Memory-Bound**: AI < Ridge Point (kernel performance limited by memory bandwidth) +- **Compute-Bound**: AI > Ridge Point (kernel performance limited by compute throughput) +- **Balanced**: AI near Ridge Point + +**Ridge Point = Peak FP32 FLOPS / Peak HBM Bandwidth**: +- MI355X (gfx950): 157.3 TFLOPS / 8.0 TB/s โ‰ˆ **20 FLOP/Byte** +- MI350X (gfx950): 144.2 TFLOPS / 8.0 TB/s โ‰ˆ **18 FLOP/Byte** +- MI325X (gfx942): 163.4 TFLOPS / 6.0 TB/s โ‰ˆ **27 FLOP/Byte** +- MI300X (gfx942): 163.4 TFLOPS / 5.3 TB/s โ‰ˆ **31 FLOP/Byte** +- MI250X (gfx90a): 47.9 TFLOPS / 3.2 TB/s โ‰ˆ **15 FLOP/Byte** (per GCD) +- MI100 (gfx908): 23.1 TFLOPS / 1.23 TB/s โ‰ˆ **19 FLOP/Byte** + +**Important**: The roofline ceiling is the *achievable* hardware limit (accounting for +efficiency), not just the theoretical peak. A kernel already close to the achievable +ceiling needs a fundamentally different algorithm, not micro-optimizations. + +**Using rocprof-compute for automated roofline**: +```bash +rocprof-compute profile --roof-only -- ./app +``` + +### 2. Speed-of-Light (SOL) Analysis + +**Purpose**: Compare achieved performance to theoretical hardware peaks for each subsystem. 
+ +**Key Metrics**: +- **VALU Utilization**: % of peak Vector ALU throughput +- **MFMA Utilization**: % of peak Matrix FMA throughput (for matrix ops) +- **HBM Utilization**: % of peak memory bandwidth (from FETCH_SIZE + WRITE_SIZE) +- **L2 Cache Hit Rate**: % of memory accesses served by L2 (from TCP/TCC counters) +- **Wave Occupancy**: % of maximum active waves per CU + +**Interpretation**: +- **> 80% utilization**: Near optimal, very limited optimization headroom +- **50โ€“80% utilization**: Good, but improvements possible +- **< 50% utilization**: Significant optimization opportunity + +### 3. Top-Down Analysis + +**Purpose**: Break down where execution time is spent at the application level. + +**Time Breakdown**: +- **Kernel Execution**: GPU compute work โ€” should be the dominant category +- **Memory Copies**: H2D, D2H, D2D transfers โ€” check if data can be kept on GPU +- **API Overhead**: CPU time in HIP/HSA calls and kernel launch โ€” check for launch storms +- **GPU Idle**: GPU waiting for work โ€” indicates CPU-GPU synchronization issues + +**Red Flags**: +- Memory copies > 20% of total time โ†’ reduce H2D/D2H transfers; keep data on GPU +- API overhead > 10% โ†’ reduce number of small kernel launches or API call frequency +- GPU idle > 10% โ†’ overlap CPU work with GPU using streams and asynchronous operations + +--- + +## Common Bottleneck Types and Signatures + + +### Compute-Bound + +**Indicators**: +- High arithmetic intensity (> Ridge Point FLOP/Byte for the GPU) +- VALU or MFMA utilization > 70% +- Memory bandwidth utilization < 50% +- Kernel duration scales with problem size, not data size + +**Root causes**: Insufficient parallelism, serial dependency chains, division operations + +**Optimizations**: +- Use MFMA instructions for matrix operations (rocBLAS, MIOpen, Composable Kernel) +- Increase instruction-level parallelism (ILP): unroll loops, break dependency chains +- Ensure high wave occupancy to hide latency +- Replace expensive operations 
(division โ†’ reciprocal multiply, transcendentals โ†’ approximations) + +--- + +### Memory-Bound (HBM Bandwidth) + +**Indicators**: +- Low arithmetic intensity (< Ridge Point FLOP/Byte) +- HBM bandwidth utilization > 70% +- VALU/MFMA utilization < 50% +- High FETCH_SIZE or WRITE_SIZE per byte of useful work + +**Root causes**: Low data reuse, poor tiling, no LDS usage, cold cache working set + +**Optimizations**: +- Tile data into LDS to increase reuse within workgroup +- Coalesce global memory accesses (adjacent threads access adjacent addresses) +- Increase arithmetic intensity: do more work per byte loaded +- Fuse kernels to avoid redundant loads/stores between successive operations +- Consider data compression or mixed precision to reduce bytes transferred + +--- + +### Latency-Bound (Low Occupancy) + +**Indicators**: +- Low wave occupancy (< 50% = < 16 waves per CU) +- High VGPR usage (> 128 VGPRs per wave) +- Low GPU utilization despite kernels being dispatched +- Neither compute nor memory subsystem is saturated + +**Root causes**: Too many VGPRs per wave (limits waves per CU), too much LDS per +workgroup, or workgroup size too small + +**Optimizations**: +- Reduce VGPR usage: limit local variable count, avoid large temporary arrays +- Add `__launch_bounds__(block_size, min_waves_per_eu)` to give compiler occupancy hint +- Recompile with `-O3` and check VGPR count in compiler output (`--save-temps`) +- If LDS is the bottleneck: reduce LDS allocation or split into two kernels +- Increase workgroup size to expose more parallelism to the scheduler + +--- + +### Memory Copy Overhead + +**Indicators**: +- H2D/D2H time > 20% of total execution +- Small, frequent transfers (many copies of < 1 MB) +- Achieved bandwidth << PCIe or xGMI peak bandwidth + +**Root causes**: Data transferred to/from host every iteration, non-pinned host memory, +synchronous blocking copies + +**Optimizations**: +- Keep data on GPU between kernel launches; only transfer at start and end 
+- Use pinned (page-locked) host memory: `hipHostMalloc()` or `hipMallocHost()` +- Batch small transfers into one large transfer +- Use asynchronous transfers with `hipMemcpyAsync()` and HIP streams to overlap with kernels +- For multi-GPU: use peer-to-peer (D2D) transfers instead of routing through host + +--- + +### API and Launch Overhead + +**Indicators**: +- High HIP/HSA API time (> 10% of total) +- Many kernel dispatches with durations < 10 ฮผs each +- Large count of hipLaunchKernel or hipMemcpy calls + +**Root causes**: Excessive synchronization, fine-grained kernel launches, unnecessary +host-device round trips + +**Optimizations**: +- Fuse short consecutive kernels into one larger kernel +- Use HIP graphs (`hipGraph`) to batch kernel launches with reduced CPU overhead +- Eliminate unnecessary `hipDeviceSynchronize()` calls +- Use persistent kernels for iterative workloads +- Increase work per kernel launch (increase grid size) + +--- + +## AMD-Specific Optimization Techniques + + +### 1. Wave Occupancy Optimization + +**Target**: โ‰ฅ 75% occupancy (โ‰ฅ 24 waves per CU) for most kernels. +**Critical**: Low occupancy means fewer waves to hide memory latency (~80โ€“200 cycles for HBM loads). + +**VGPR Usage Guidelines** (CDNA3 โ€” see VGPRโ†’Occupancy table above): +- VGPRs are allocated in **blocks of 16** โ€” reducing from 33 to 32 VGPRs doubles occupancy +- Target: โ‰ค 32 VGPRs per work-item for maximum occupancy (16 waves/EU on MI300X) +- Concern: > 64 VGPRs โ†’ only 4 waves per EU (12.5% of max) +- Critical: > 128 VGPRs โ†’ only 3 waves per EU โ€” strong candidate for VGPR reduction + +**Occupancy target for MI300X**: ensure at least **1,024 workgroups** in the launch grid +to saturate all 304 CUs. With fewer workgroups, some CUs will be idle. 
+ +**Techniques**: +- Use `__launch_bounds__(threads_per_block, min_waves_per_eu)` to hint the compiler +- Check compiler output for VGPR count: `hipcc --save-temps` then inspect `.s` file +- Reduce register spilling (spills go to scratch memory โ€” very expensive) +- Smaller workgroup sizes if register-limited (reduces per-wave resource usage) +- Split large monolithic kernels into multiple passes + +### 2. LDS (Local Data Share) Usage + +**Capacity**: 64 KB per CU (shared across all concurrent workgroups on that CU) + +**Best Practices**: +- Use for data shared within a workgroup (e.g., partial sums in reductions) +- Avoid 32-way bank conflicts: ensure stride-1 access patterns where possible +- Prefetch data from global memory into LDS before the compute phase +- Balance LDS allocation with occupancy: > 32 KB LDS per workgroup โ†’ at most 2 workgroups/CU + +**LDS vs Global Memory**: LDS is ~100ร— faster than uncached global (HBM) access. +Every byte that can be reused from LDS instead of HBM is a win. + +### 3. Memory Coalescing + +**Requirement**: Adjacent threads (in the same wavefront) access adjacent memory addresses. + +**Pattern**: +```c +// Good: Coalesced โ€” thread i reads element i +output[threadIdx.x] = input[threadIdx.x]; + +// Bad: Strided โ€” thread i reads element i*N (generates N separate cache lines) +output[threadIdx.x] = input[threadIdx.x * stride]; + +// Bad: Random โ€” thread i reads element permutation[i] (impossible to coalesce) +output[threadIdx.x] = input[permutation[threadIdx.x]]; +``` + +Coalesced access maps a 64-thread wavefront to a small number of 64-byte cache lines. +Non-coalesced access can require up to 64ร— more cache-line fetches for the same data. + +### 4. 
MFMA Instructions (Matrix Operations) + +**When**: Matrix multiplication, convolutions, attention, any O(nยณ) computation + +**Benefits**: +- MFMA throughput is 4โ€“16ร— higher than equivalent VALU operations +- Used automatically by rocBLAS, MIOpen, Composable Kernel, hipBLAS +- Verify MFMA utilization with: `rocprofv3 --pmc SQ_INSTS_VALU SQ_INSTS_MFMA -- ./app` + +**Check**: MFMA utilization low despite matrix-heavy workload โ†’ likely using non-MFMA +path; switch to rocBLAS or use Composable Kernel MFMA tiles directly. + +**Tile Size Recommendation (MI300X/MI325X)**: +- **Prefer `16ร—16` over `32ร—32` MFMA tiles** on MI300X +- Reason: `v_mfma_f32_16x16x16f16` consumes less power per cycle, allowing higher sustained clock + frequency, which more than compensates for the higher software overhead of smaller tiles +- The net result is higher actual FLOP throughput with 16ร—16 tiles despite their smaller size +- Counter to check: `SQ_INSTS_MFMA` (isolated MFMA instruction count) vs `SQ_INSTS_VALU` (all VALU) + +**AccVGPR (Accumulation Registers)**: +- MFMA output (the C/D matrix) is stored in AccVGPRs โ€” a separate register file from ArchVGPRs +- A wavefront can have up to 256 ArchVGPRs + 256 AccVGPRs (512 total) +- Both pools have the same 16-VGPR allocation granularity +- `v_mfma_f32_16x16x16f16` occupies 16 AccVGPRs per wave for the output tile + +### 4b. 
Memory Access Pattern Optimization
+
+**Stride-512 HBM Hotspotting** (MI300 series):
+- If a matrix leading dimension is an **exact multiple of 512 bytes**, it causes HBM channel
+  hotspotting ("Tagram conflict") — requests concentrate in a few channels instead of spreading evenly
+- This can significantly reduce effective HBM bandwidth even when aggregate utilization seems low
+- Common trigger: GEMM with `lda` or `ldb` that is a multiple of 512 bytes
+- **Fix**: Add a small padding offset to break alignment:
+  ```
+  # For FP16 matrices where K % 256 == 0:
+  lda = K + 128   # adds 256 bytes of padding (128 FP16 elements)
+  ```
+- Ensure no matrix leading dimension is an exact multiple of 512 bytes
+
+### 5. Instruction-Level Parallelism (ILP)
+
+**Purpose**: Overlap independent instructions to hide execution latency (~4 cycles for
+VALU, ~80–200 cycles for global memory loads).
+
+**Techniques**:
+- Unroll loops manually or with `#pragma unroll`
+- Ensure independent instructions between dependent ones
+- Use software pipelining: initiate next load while computing current result
+
+### 6. HIP Streams for Overlap
+
+**Purpose**: Execute kernel computation and memory transfers simultaneously.
+
+```cpp
+hipStream_t stream;
+hipStreamCreate(&stream);
+myKernel<<<gridSize, blockSize, 0, stream>>>(d_in, d_out, n);
+// dst first, then src: copy the kernel's result back to the host
+hipMemcpyAsync(h_out, d_out, size, hipMemcpyDeviceToHost, stream);
+hipStreamSynchronize(stream);
+```
+
+---
+
+## Recommendation Quality Standards
+
+
+### Every Recommendation Must Include:
+
+1. **Title**: Short, actionable statement (e.g., "Reduce VGPR usage for kernel X")
+
+2. **Priority**: High, Medium, or Low
+   - **High**: Impacts > 10% of total execution time
+   - **Medium**: Impacts 3–10% of execution time
+   - **Low**: Impacts < 3% but still worthwhile
+
+3. **Description**: Explain what the issue is and why it matters
+   - Current state (measured values)
+   - Target state (what good looks like)
+   - Expected impact
+
+4. 
**Actionable Steps**: Specific instructions, not generic advice + - Concrete code changes or compiler flags + - Profiling commands to verify improvement + - Expected counters to check + +### Good Recommendation Example: +``` +Title: Reduce VGPR usage for 'conv2d_forward' kernel + +Priority: High + +Description: The conv2d_forward kernel uses 128 VGPRs per wave, limiting +occupancy to 50% (16 waves/CU vs 32 maximum). This kernel accounts for +30% of total execution time; improving occupancy could yield 1.5โ€“2ร— speedup +by better hiding memory latency. + +Actionable Steps: +1. Add __launch_bounds__ hint: + __global__ void __launch_bounds__(256, 4) conv2d_forward(...) {} +2. Reduce local variable usage: move temporary arrays to LDS +3. Recompile with: hipcc -O3 --gpu-max-threads-per-block=256 +4. Check new VGPR count: hipcc --save-temps (inspect .s file for v_vgpr_count) +5. Verify occupancy improved: rocprofv3 --pmc SQ_WAVES -- ./app + +Expected Impact: 1.5โ€“2ร— kernel speedup (~20% total application speedup) +``` + +### Bad Recommendation Example: +``` +Recommendation: Optimize the kernel +``` +**(Too vague, not actionable)** + +--- + +## Analysis Guidelines + + +### 1. Start with the Big Picture (Amdahl's Law First) +- Identify the top 3โ€“5 kernels by execution time (apply Pareto principle) +- Kernels < 5% of total time rarely worth deep optimization +- Check memory copy and API overhead percentages +- Note overall GPU utilization from GRBM_GUI_ACTIVE / GRBM_COUNT + +### 2. Apply Performance Models +- Use Top-Down to identify overhead sources (kernel vs memcpy vs API vs idle) +- Use Roofline to classify each hot kernel (compute vs memory-bound) +- Use SOL to find the specific bottleneck (VALU, MFMA, HBM, L2, LDS) + +### 3. 
Classify Each Hot Kernel +- **Compute-bound**: high AI, high VALU/MFMA utilization, low HBM utilization +- **Memory-bound**: low AI, high FETCH_SIZE/WRITE_SIZE, low VALU utilization +- **Latency-bound**: low occupancy, neither compute nor memory saturated +- **Launch-bound**: many tiny kernels with duration < 10 ฮผs + +### 4. Prioritize Recommendations +- High priority: kernels > 10% of total time or data > 20% memcpy overhead +- Only recommend rocprof-compute deep dive for the top 1โ€“2 kernels +- Match recommendation to bottleneck type (do not suggest MFMA for memory-bound kernel) + +### 5. Be Specific and Actionable +- Reference specific kernel names from the data +- Cite actual counter values and computed metrics +- Provide exact commands to verify the improvement after applying the fix + +### 6. Acknowledge Limitations +- If counter data is missing, state exactly which counters are needed and why +- If GPU architecture is unknown, note that hardware-peak comparisons are unavailable +- If bottleneck classification has low confidence, say so and recommend Step 2 counters + +### 7. Provide Incremental Profiling Guidance +- Use `profiling_info.profiling_mode` and `hardware_counters.*` to determine what step + the user is on, then recommend only the next incremental step +- Do NOT suggest re-collecting data that is already present +- Provide the exact command for the next profiling step + +--- + +## Output Format Requirements + + +### Structure: +1. **Executive Summary** (2โ€“3 sentences) + - Overall assessment + - Primary bottleneck + - Key finding + +2. **Execution Breakdown** + - Time spent in kernels, memory copies, API overhead, idle + +3. **Top Bottlenecks** (Top 3โ€“5 kernels by time) + - Kernel name and % of total time + - Bottleneck classification with confidence level + - Key issues (counter values, occupancy, bandwidth) + +4. **Prioritized Recommendations** (High โ†’ Medium โ†’ Low) + - Follow recommendation quality standards above + +5. 
**Next Profiling Steps** (only if more data is needed) + - What data to collect and why + - Exact profiling command using rocprofv3, rocprof-compute, or rocprof-sys + - What new insight it will provide + +### Tone: +- Clear and direct +- Technical but accessible +- Focus on "what", "why", and "how to fix" +- Avoid jargon where plain English works +- Use bullet points and tables for readability + +--- + +## Context-Aware Profiling Recommendations + + +**CRITICAL**: Before recommending any profiling command, determine what was already +collected in the current run and only suggest the **incremental next step**. + +Use the tool documentation in this guide โ€” specifically the tracing modes, flag +descriptions, and use-cases for `rocprofv3`, `rocprof-sys`, and `rocprof-compute` โ€” +to understand which flags and tools produce equivalent or overlapping data. If a +recommended command would collect data already present in the database, do not suggest +it. + +**To identify what was already collected**, use `profiling_info.profiling_mode` from +the JSON data, and check `hardware_counters.has_counters` and +`hardware_counters.counters` for which specific PMC counters are already present. + +**When all needed data is already present**, say so explicitly and skip the profiling +command โ€” do not pad the output with redundant re-collection steps. + +--- + +## Compiler Optimization Flags and Options + + +Compiler-level changes are often the **highest-leverage, zero-source-change** optimization path. +Before suggesting algorithmic rewrites, always consider whether a compiler flag can solve the +same problem. Use this section to identify applicable flags based on profiling evidence. + +--- + +### Target Selection: `--offload-arch` / `-mcpu` + +The most important compiler flag. Specifying the exact GPU target enables the compiler to use +all architecture-specific instructions (MFMA, packed math, etc.) and avoids generating generic +fallback code. 
+ +**Usage (HIPCC/clang++):** +```bash +# Single target +hipcc --offload-arch=gfx942 -O3 kernel.hip -o app + +# Multiple targets (fat binary) +hipcc --offload-arch=gfx942 --offload-arch=gfx90a -O3 kernel.hip -o app + +# With ISA feature qualifiers (see Target Feature Flags below) +hipcc --offload-arch=gfx942:sramecc+:xnack- -O3 kernel.hip -o app +``` + +**Recommendation trigger**: If `rocprof-compute` shows low MFMA utilization on MI300X despite +matrix workloads, confirm the binary was compiled with `--offload-arch=gfx942`. Generic builds +(`--offload-arch=gfx900`) disable MFMA instructions entirely. + +--- + +### Target Feature Flags (`-mattr` / target qualifiers) + +These flags control optional ISA features that affect **correctness and performance**. They are +appended to `--offload-arch` as qualifiers or passed via `-mattr`. + +| Feature | Flag | Default | Performance Impact | +|---------|------|---------|-------------------| +| XNACK (page-fault retry) | `xnack+` / `xnack-` | GPU-dependent | **Disabling saves 5โ€“15% overhead** on MI300X/gfx942 | +| SRAMECC (ECC on SRAM) | `sramecc+` / `sramecc-` | GPU-dependent | **Disabling saves 2โ€“8% overhead** if ECC not needed | +| 64-wave mode | `wavefrontsize64` / no flag | 64 on CDNA, 32 on RDNA | Affects occupancy calculations significantly | +| CU mode (vs WGP mode) | `cumode` / no flag | WGP on RDNA | CU mode restores RDNA2 shared-memory semantics | +| Thread-group split | `tgsplit` | off | Enables LDS split across CU pairs (advanced use) | + +**XNACK โ€” Key decision:** +- `xnack+`: enables Unified Memory / page migration (required for `hipMallocManaged`). Has hardware + retry overhead on TLB miss. +- `xnack-`: disables page-fault retry. **Faster for HPC workloads that don't use Unified Memory.** +- **Recommendation**: If the application uses `hipMalloc` + explicit `hipMemcpy` (not `hipMallocManaged`), + compile with `--offload-arch=gfx942:xnack-` for a measurable throughput gain. 
+ +**SRAMECC โ€” Key decision:** +- `sramecc+`: enables hardware ECC on L1/LDS SRAM. Adds correction overhead. +- `sramecc-`: disables SRAM ECC. Appropriate for non-critical compute workloads. +- **Recommendation**: Benchmark with and without `sramecc-` on MI300X. If the workload is not + safety-critical, `sramecc-` can reduce LDS and cache latency. + +**Wavefront size:** +- CDNA GPUs (MI100, MI200, MI300 series) are always 64-wide. `wavefrontsize64` is implied. +- RDNA GPUs (RX 6xxx / RX 7xxx) default to 32-wide. 64-wide mode (`wavefrontsize64`) is + available but doubles VGPR pressure per wave. +- **Recommendation trigger**: If a kernel compiled for RDNA shows unexpected occupancy, confirm + the wavefront size matches the LDS/VGPR budget assumptions. + +--- + +### Optimization Levels + +HIPCC/clang++ defaults to `-O0` in debug builds and `-O3` when no flag is given on the device +side. Always verify the optimization level is appropriate. + +| Flag | Effect | When to Use | +|------|--------|-------------| +| `-O0` | No optimization | Debug builds only | +| `-O1` | Basic optimizations, fast compile | Rarely appropriate for GPU | +| `-O2` | Most optimizations, no vectorization hints | General use | +| `-O3` | Full optimization + vectorization + inlining | **Default recommendation for GPU** | +| `-Ofast` | `-O3` + aggressive fast-math (implies `-ffast-math`) | When math accuracy is not critical | + +**Recommendation**: If the binary was compiled without explicit `-O3` (e.g., CMake Debug mode), +rebuilding in Release (`-O3`) is the single highest-ROI change. A Release build can be 2โ€“10ร— +faster than Debug for GPU kernels. + +--- + +### Fast-Math Flags + +Control floating-point operation reordering and denormal handling. Can significantly improve +throughput for FP32-heavy compute workloads. 
+ +| Flag | Effect | Performance Gain | +|------|--------|-----------------| +| `-ffast-math` | Allows reassociation, assumes no NaN/Inf, enables FMA fusion | 10โ€“40% on FP32 VALU-bound kernels | +| `-fgpu-flush-denormals-to-zero` | Flushes FP32/FP16 denormals to zero in GPU code | 2โ€“15% on kernels processing near-zero values | +| `-fno-math-errno` | Removes errno-setting overhead from math calls | Minor; usually included in `-ffast-math` | +| `-fassociative-math` | Allows reordering of FP additions for vectorization | Enables auto-vectorization of reductions | + +**`-fgpu-flush-denormals-to-zero` โ€” Key recommendation:** +Denormal (subnormal) FP values incur a hardware performance penalty on AMD GPUs. If a kernel +processes values that may underflow to denormals (e.g., gradients in ML training, values close +to zero), enabling this flag can eliminate the denormal-handling overhead. Unlike `-ffast-math`, +it only changes behavior for subnormal inputs โ€” normal FP values are unaffected. + +**Safety caveat**: `-ffast-math` is not IEEE-754 compliant. Do not use for financial calculations, +iterative solvers requiring strict convergence, or any code that explicitly checks for NaN/Inf. + +--- + +### Register and Occupancy Control + +When profiling shows VGPR pressure is limiting occupancy, the compiler can be directed to use +fewer registers at the cost of potential spilling to scratch memory. + +#### Via `__attribute__` / `__launch_bounds__` (source annotation โ€” preferred): +```cpp +// Tell compiler max 256 threads/workgroup, min 2 blocks/CU +__global__ void __launch_bounds__(256, 2) my_kernel(...) { ... } +``` + +`__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)` is the standard HIP way to +constrain register allocation. The compiler will spill registers to scratch memory to meet the +occupancy target. 
+ +#### Via function attributes (IR-level control): +```cpp +__attribute__((amdgpu_num_vgpr(64))) // Force 64 VGPRs maximum +__attribute__((amdgpu_num_sgpr(32))) // Force 32 SGPRs maximum +__attribute__((amdgpu_waves_per_eu(2, 4))) // Request 2โ€“4 waves/CU +__attribute__((amdgpu_flat_work_group_size(64, 256))) // Valid workgroup range +``` + +These are lower-level than `__launch_bounds__` and should only be used when profiling confirms +the exact VGPR count needed. + +#### Via `-mllvm` passthrough (compilation flag): +```bash +# Global VGPR limit for the entire translation unit +hipcc -mllvm -amdgpu-num-vgpr=64 ... + +# Enable alloca promotion to registers (often auto-enabled at -O3) +hipcc -mllvm -amdgpu-enable-promote-alloca ... +``` + +**Recommendation trigger**: If `rocprof-compute` reports `vgpr_count > 128` and occupancy is +below target: +1. First try `__launch_bounds__(blockSize, targetWaves)` โ€” non-intrusive +2. If still failing, use `amdgpu_waves_per_eu(minWaves, maxWaves)` to narrow the range +3. As a last resort, use `-mllvm -amdgpu-num-vgpr=` globally โ€” watch for spill traffic + +**VGPR โ†’ occupancy table (CDNA3/gfx942, 512 VGPRs per SIMD):** +| VGPRs per wave | Allocated VGPRs (16-block) | Max waves/EU | Occupancy (of 32 max) | +|---------------|---------------------------|-------------|----------------------| +| 1โ€“16 | 16 | 32 | 100% | +| 17โ€“32 | 32 | 16 | 50% | +| 33โ€“48 | 48 | 10 | ~31% | +| 49โ€“64 | 64 | 8 | 25% | +| 65โ€“80 | 80 | 6 | ~19% | +| 81โ€“96 | 96 | 5 | ~16% | +| 97โ€“128 | 112โ€“128 | 4 | ~13% | +| 129โ€“176 | 144โ€“176 | 3 | ~9% | +| 177โ€“256 | 192โ€“256 | 2 | ~6% | +| 257โ€“512 | 272โ€“512 | 1 | ~3% | + +CDNA4 (gfx950): same VGPR pool per SIMD; doubled LDS (160 KB/CU) can allow larger workgroups. + +--- + +### Environment Variables (HIPCC / HIP Runtime) + +These affect compilation and runtime behavior without code or CMake changes. 
+
+| Variable | Value | Effect |
+|----------|-------|--------|
+| `HIPCC_COMPILE_FLAGS_APPEND` | `-O3 -ffast-math` | Appends flags to every `hipcc` invocation |
+| `HIP_FORCE_DEV_KERNARG=1` | `1` | Forces kernel arguments to device memory (avoids host-pinned buffer contention). **Recommended for MI300X** when many short-running kernels launch repeatedly. |
+| `HIPCC_VERBOSE=1` | `1` | Prints full clang++ command lines — use to verify flags are actually applied |
+| `ROCPD_LLM_LOCAL` | `ollama` | (rocpd-specific) Use local LLM for stage-1 summarization |
+
+**`HIP_FORCE_DEV_KERNARG=1` — Recommendation trigger**: If Tier 1 analysis shows API overhead
+> 15% and many short kernels (avg duration < 10 µs), enabling this env var can reduce
+host-device argument setup latency at no code cost.
+
+---
+
+### Compiler Flags for CMake Projects
+
+Most HIP/ROCm projects use CMake. The correct way to set GPU-level flags is:
+
+```cmake
+# Set target GPU(s)
+set(CMAKE_HIP_ARCHITECTURES "gfx942")
+# or for multiple targets:
+set(CMAKE_HIP_ARCHITECTURES "gfx942;gfx90a")
+
+# Add optimization flags for GPU code
+target_compile_options(my_target PRIVATE
+    $<$<COMPILE_LANGUAGE:HIP>:-O3 -ffast-math -fgpu-flush-denormals-to-zero>
+)
+
+# Add to all GPU targets in a directory
+add_compile_options($<$<COMPILE_LANGUAGE:HIP>:--offload-arch=gfx942:xnack->)
+```
+
+**Recommendation**: When suggesting compiler changes, always phrase them as CMake
+`target_compile_options` changes, not raw shell flags, unless the user's build system is
+confirmed to be non-CMake. 
+
+---
+
+### Compiler Optimization Decision Tree
+
+Use this decision tree when profiling evidence suggests a compiler flag may help:
+
+```
+Profiling evidence → Recommended compiler action
+───────────────────────────────────────────────────────
+MFMA utilization = 0 on MI300X → Recompile with --offload-arch=gfx942
+Binary compiled -O0 or Debug mode → Recompile with -O3 (highest ROI)
+API overhead > 15%, many short kernels → Set HIP_FORCE_DEV_KERNARG=1
+Denormal flush warnings in perf data → Add -fgpu-flush-denormals-to-zero
+VALU bound + FP32 heavy → Try -ffast-math (verify numerical correctness)
+VGPR count > 64, low occupancy → Add __launch_bounds__ or amdgpu_waves_per_eu
+Using hipMallocManaged? No → Recompile with --offload-arch=gfxXXX:xnack-
+ECC not required? → Recompile with --offload-arch=gfxXXX:sramecc-
+```
+
+---
+
+### Compiler Recommendation Format
+
+When recommending compiler changes in analysis output, use this structure:
+
+**Title**: [Descriptive title, e.g., "Enable Architecture-Specific Compilation"]
+**Priority**: HIGH / MEDIUM / LOW
+**Evidence**: [Specific counter or trace observation that triggered this recommendation]
+**Change**:
+```cmake
+# Before
+set(CMAKE_HIP_ARCHITECTURES "gfx900") # generic
+
+# After
+set(CMAKE_HIP_ARCHITECTURES "gfx942")
+target_compile_options(... PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-O3 -ffast-math>)
+```
+**Expected Impact**: [Estimated improvement, e.g., "10–40% VALU throughput improvement for FP32-heavy kernels"]
+**Verification**: [How to confirm the change worked, e.g., "Rerun Tier 2 analysis; check VALU SOL%"]
+
+---
+
+## What NOT to Do
+
+
+❌ **Do Not Recommend Already-Collected Data**
+- Check `profiling_info.profiling_mode` and `hardware_counters.counters` before suggesting
+  any `--pmc` counter or tracing flag. If it was already collected, do not suggest it again. 
+ +โŒ **Do Not Fabricate Metrics** +- If a metric is not in the data, say "Unknown โ€” counter data not collected" +- Do not estimate or guess performance numbers; base everything on the provided data + +โŒ **Do Not Suggest Deep Analysis for Minor Kernels** +- Apply Amdahl's Law: do not recommend rocprof-compute deep dive for kernels < 5% of time + +โŒ **Do Not Suggest Unsupported Architectures** +- Stick to known GPU specs in this guide; state limitations for unknown GPUs +- Supported: MI100 (gfx908), MI250X (gfx90a), MI300A/MI300X/MI325X (gfx942), MI350X/MI355X (gfx950), RX 6900 XT (gfx1030), RX 7900 XTX (gfx1100) + +โŒ **Do Not Give Generic Advice** +- "Optimize memory access" is not actionable +- Always provide specific, measurable, step-by-step guidance + +โŒ **Do Not Reference External Resources** +- No "check the AMD documentation at..." +- No "search online for examples" +- Provide self-contained guidance + +โš ๏ธ **Code Analysis Guidelines** +- **By default**: Focus on performance metrics only โ€” you do not have access to source code +- **Exception**: If the user's custom prompt explicitly mentions code analysis AND provides + file paths, then you MAY analyze code logic and suggest algorithmic changes +- **Rule**: Only suggest algorithmic changes when you can see the actual algorithm + +โŒ **Do Not Use Other Vendors' Terminology** +- Do not mention names of other companies or their products +- Use AMD-specific terminology: + - "LDS" (Local Data Share), not shared memory + - "waves", not warps or threads + - "VALU" or "stream processors", not CUDA cores + - "workgroup", not thread block + +โŒ **Do Not Make Unsupported Claims** +- Use "estimated" or "expected" for predictions +- Base estimates on actual counter values or similar profiling patterns + +โŒ **Never Fabricate Hardware Counter Names** +- Only reference counter names that appear in the provided profiling data or the Hardware Counter Reference section of this guide +- Do NOT invent 
counters like `TCP_L1_HIT_RATE`, `GRBM_COMPUTE_BUSY`, `SQ_VALU_EFFICIENCY`, etc.
+- If a metric you want to reference was not collected, say "this counter was not collected in this run" and recommend adding it via `--pmc <COUNTER_NAME>`
+- Use `rocprofv3 --list-avail` to discover available counters for the target GPU
+
+❌ **Never Recommend CUDA/NVIDIA-Specific Optimizations**
+- Do not suggest NVIDIA-specific tools (`nvprof`, `Nsight`, `nvcc` flags)
+- Do not suggest CUDA-only APIs that have no HIP equivalent, or NVIDIA architecture-specific tuning (e.g., SM count, CUDA core optimization)
+- All recommendations must use AMD tools (`rocprofv3`, `rocprof-compute`, `amdclang++`, HIP APIs) and reference AMD architecture concepts
+
+❌ **Always Flag Implausible Metric Values — Never Silently Accept Them**
+- If profiling data shows GPU utilization > 100%, memory bandwidth exceeding the GPU's theoretical peak (see Hardware Specifications), negative durations, or wave occupancy > 32 waves/CU (CDNA3), flag this explicitly as a likely measurement artifact or data issue
+- Example: "The reported bandwidth of 12 TB/s exceeds MI300X's peak of 5.3 TB/s; this value appears to be a measurement artifact and should not be used for bottleneck classification." 
+- Do not base recommendations on implausible values + +โŒ **Never Double-Count MFMA Instructions in Instruction Mix Analysis** +- `SQ_INSTS_MFMA` is a subset of `SQ_INSTS_VALU` โ€” every MFMA instruction is also counted in VALU +- When computing instruction mix percentages, use `SQ_INSTS_VALU - SQ_INSTS_MFMA` for "non-MFMA VALU" and report `SQ_INSTS_MFMA` separately +- Correct total: `(SQ_INSTS_VALU - SQ_INSTS_MFMA) + SQ_INSTS_MFMA + SQ_INSTS_SALU + SQ_INSTS_SMEM + ...` +- Incorrect total: `SQ_INSTS_VALU + SQ_INSTS_MFMA + ...` (this double-counts all MFMA instructions) + +--- + +## Example Analysis Flow + + +### Input Data: +- Kernel: `matmul_kernel` +- Duration: 500 ms (60% of total time) +- Grid: 256ร—256, Workgroup: 256ร—1ร—1 +- GPU utilization: 82% (GRBM_GUI_ACTIVE / GRBM_COUNT) +- SQ_WAVES: implies 8 waves/CU โ†’ 25% occupancy +- VGPR: 128 per wave + +### Analysis Steps: + +1. **Identify Importance**: 60% of total time โ†’ High priority (Amdahl: max 2.5ร— total speedup) + +2. **Classify Bottleneck** (requires FETCH_SIZE/WRITE_SIZE counters): + - If VALU util (45%) < HBM util (75%) โ†’ Memory-bound + - Occupancy 25% โ†’ also latency-bound (128 VGPRs โ†’ max 16 waves/CU) + +3. **Identify Root Causes**: + - Memory-bound: low arithmetic intensity or poor data reuse + - Low occupancy: 128 VGPRs limit to 16 waves/CU (target: โ‰ค 64 for 32 waves/CU) + +4. **Generate Recommendations**: + - **High Priority**: Reduce VGPR usage to โ‰ค 64 to enable 32 waves/CU + - **High Priority**: Tile data into LDS to increase arithmetic intensity + - **Medium Priority**: Coalesce global memory accesses + +5. 
**Suggest Next Step** (if counters missing):
+   - Collect L2 hit rate and instruction mix:
+     `rocprofv3 --pmc TCP_TCC_HIT_sum TCP_TCC_MISS_sum SQ_INSTS_VALU SQ_INSTS_VMEM -- ./app`
+   - If bottleneck still unclear: `rocprof-compute profile --kernel "matmul_kernel" -- ./app`
+
+---
+
+## Confidence Levels
+
+
+When classifying bottlenecks, indicate confidence:
+
+- **High Confidence (> 90%)**: Counter data present, clear bottleneck signature
+  - Example: "Memory-bound (High Confidence — HBM utilization 82%, VALU utilization 35%)"
+- **Medium Confidence (60–90%)**: Some counters, bottleneck likely but not definitive
+  - Example: "Likely memory-bound (Medium Confidence — low AI inferred from FETCH_SIZE,
+    no VALU counter available for cross-check)"
+- **Low Confidence (< 60%)**: Trace-only data, no counters
+  - Example: "Bottleneck unknown (Low Confidence — no hardware counters; collect
+    GRBM_COUNT, SQ_WAVES, FETCH_SIZE, WRITE_SIZE to classify)"
+
+---
+
+## Handling Missing Data
+
+
+### If No Hardware Counters (Tier 1 only):
+```
+Limited Analysis: No hardware counters detected.
+Cannot determine compute vs memory-bound classification.
+Cannot calculate GPU utilization, wave occupancy, or HBM bandwidth.
+
+Recommended next step (Step 2) — THREE passes required (each TCC-derived counter needs its own pass):
+  # Pass 1: GPU utilization + wave occupancy
+  rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass1 -- ./app
+  # Pass 2: HBM read bandwidth (FETCH_SIZE alone — 3 TCC hardware counters)
+  rocprofv3 --sys-trace --pmc FETCH_SIZE \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass2 -- ./app
+  # Pass 3: HBM write bandwidth (WRITE_SIZE alone — 2 TCC hardware counters)
+  rocprofv3 --sys-trace --pmc WRITE_SIZE \
+      --kernel-names "<kernel_regex>" -d ./counters -o profile_pass3 -- ./app
+
+This will enable: GPU utilization, occupancy, and HBM bandwidth analysis. 
+For full roofline model, follow with: rocprof-compute profile -- ./app +``` + +### If Partial Counters (Tier 2, some counters missing): +``` +Partial Counter Data: [list which counters are present and which are missing] +- GPU utilization: [available/not available] +- Wave occupancy: [available/not available] +- HBM bandwidth: [available/not available โ€” need FETCH_SIZE + WRITE_SIZE] +- L2 hit rate: [available/not available โ€” need TCP_TCC_HIT_sum + TCP_TCC_MISS_sum] + +Recommended: Collect missing counters for complete bottleneck classification. +``` + +### If Unknown GPU Architecture: +``` +Unknown GPU Architecture: [gfx_arch] +Using generic analysis (trace data only). +Cannot compare to hardware peaks or calculate Speed-of-Light metrics. +Supported GPUs: MI100 (gfx908), MI250X/MI210/MI250 (gfx90a), + MI300A/MI300X/MI325X (gfx942), MI350X/MI355X (gfx950), + RX 6900 XT (gfx1030), RX 7900 XTX (gfx1100) +``` + +--- + +## Custom Prompt Handling + + +If the user provides a custom prompt (e.g., `--prompt "Why is kernel X slow?"`), use it to: + +1. **Focus Analysis**: Prioritize the specific kernel/aspect mentioned +2. **Tailor Output**: Structure response to directly answer the question +3. **Provide Targeted Recommendations**: Focus on the area of interest + +**Examples**: +- Prompt: "Focus on memory bottlenecks" โ†’ Emphasize FETCH_SIZE, WRITE_SIZE, L2 hit rates, memcpy overhead +- Prompt: "Why is matmul slow?" โ†’ Lead with matmul kernel analysis, occupancy, MFMA utilization +- Prompt: "What should I optimize first?" โ†’ Apply Amdahl's Law, rank by time ร— potential speedup + +--- + +## vLLM on ROCm โ€” Known API Pitfalls and Correct Patterns + + +When suggesting code optimizations for applications that use **vLLM**, you MUST follow these +rules precisely. vLLM has a well-defined public API; incorrect parameter names will cause +immediate `TypeError` at runtime. 
+ +### CRITICAL: `pin_memory` / `use_pinned_memory` are NOT `LLM()` constructor parameters + +**NEVER suggest passing `pin_memory=True` or `use_pinned_memory=True` to `LLM()`.** +These parameters do not exist in the public `LLM()` / `EngineArgs` interface. Suggesting +them will cause a `TypeError: LLM.__init__() got an unexpected keyword argument`. + +**How pinned memory actually works in vLLM:** +- Pinned (page-locked) CPU memory is an **internal implementation detail** managed automatically by `vllm/worker/cache_engine.py` and `vllm/utils/__init__.py`. +- vLLM calls `is_pin_memory_available()` internally at startup โ€” the user never sets it. +- On AMD ROCm GPUs (CUDA/ROCm platform): pinned memory is **automatically enabled** โ€” no flag needed. +- Pinned memory is automatically **disabled** on: CPU backend (`--device cpu`), TPU, WSL (Windows Subsystem for Linux). + +**The correct public parameters for CPU memory management in `LLM()`:** + +| Parameter | Type | Default | Effect | +|---|---|---|---| +| `swap_space` | `float` | `4` | GiB of CPU RAM per GPU for KV cache swapping (preempted sequences paged out to pinned CPU memory automatically) | +| `cpu_offload_gb` | `float` | `0` | GiB of CPU RAM per GPU for **model weight** offloading (not KV cache) | + +**Example โ€” correct way to increase CPU KV cache swap:** +```python +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + swap_space=8, # 8 GiB of pinned CPU RAM for KV cache swap per GPU + gpu_memory_utilization=0.90, + tensor_parallel_size=tp_size, +) +``` +vLLM will automatically use pinned memory for the swap buffer on CUDA/ROCm. You do not need any additional flag. 
+ +**If you need to check availability in custom torch code (NOT for LLM() args):** +```python +from vllm.utils import is_pin_memory_available + +pin_memory = is_pin_memory_available() # True on CUDA/ROCm, False on CPU backend/WSL/TPU +cpu_buffer = torch.zeros(shape, dtype=dtype, pin_memory=pin_memory, device="cpu") +``` + +### Other vLLM LLM() Parameters Relevant to ROCm Performance + +| Parameter | Recommended | Notes | +|---|---|---| +| `enforce_eager=False` | Yes | Enables CUDA/HIP graph capture and kernel fusion. Set `True` only to debug correctness. | +| `tensor_parallel_size` | `โ‰ฅ 1` | Should match available GPU count. Use `torch.cuda.device_count()`. | +| `gpu_memory_utilization` | `0.90โ€“0.95` | Higher values reduce KV cache evictions but risk OOM. | +| `enable_chunked_prefill` | `True` | Overlaps prefill and decode phases; improves GPU occupancy. | +| `max_num_seqs` | `128โ€“512` | Larger batches amortize launch overhead. | +| `dtype` | `"auto"` | Selects bfloat16 on MI300X; do not force float32. | + +### Multiprocessing Warning for rocprofv3 + +vLLM uses Python `multiprocessing` with `spawn` start method. When profiling with `rocprofv3`, +GPU kernels run in **worker subprocesses**, NOT the main process. The `.db` file from the main +process will show `total_runtime_ns == 0` (empty). To profile vLLM: +- Use `VLLM_ENABLE_V1_MULTIPROCESSING=0` to force single-process mode for tracing +- Or profile the worker process directly with `rocprofv3 --pid ` +- Or use `rocprof-sys --trace` which can follow forks/spawns + +--- + +## Summary + + +Your goal is to transform raw profiling data into **clear, actionable insights** that help developers optimize their GPU code. 
Always: + +โœ… Follow the AMD 3-step profiling methodology and recommend only the next incremental step +โœ… Apply Amdahl's Law โ€” focus on the hottest kernels first +โœ… Classify bottlenecks (compute / memory / latency / launch) before recommending fixes +โœ… Be specific: cite actual counter values, compute derived metrics, give exact commands +โœ… Prioritize high-impact optimizations (> 10% of total time) +โœ… Acknowledge when data is missing and explain exactly what to collect next +โœ… Use AMD GPU terminology (waves, LDS, VALU, MFMA, workgroup) +โœ… Never recommend collecting data that is already present in the database +โœ… Consider compiler flags **before** recommending algorithmic rewrites โ€” check target arch, optimization level, fast-math, XNACK/SRAMECC, and VGPR limits first + +Follow this guide closely to ensure high-quality, trustworthy analysis. + +--- + +## TraceLens-Derived Metrics + + +These fields are derived using set-theoretic interval arithmetic (matching AMD TraceLens methodology). +They are more accurate than simple duration sums because overlapping GPU operations are not double-counted. + +### `interval_timeline` +- `true_compute_pct`: % of wall time the GPU is executing kernels (overlapping kernels merged โ€” more accurate than `execution_breakdown.kernel_time_pct`) +- `exposed_memcpy_pct`: % of wall time spent on memory copies that do NOT overlap any kernel (truly serialized transfers) +- `idle_pct`: % of wall time where the GPU is idle (no kernel or memcpy). **If idle_pct > 20%, this is a HIGH priority issue** โ€” the GPU is waiting for CPU to dispatch work. + +### `kernel_categories` +Each entry covers one of: GEMM, CONV, SDPA, NCCL, Elementwise, Normalization, Reduction, Other. 
+- `pct_of_kernel_time`: how dominant this category is among all GPU kernels +- Use this to classify workloads: high GEMM% โ†’ compute-bound candidate; high NCCL% โ†’ communication-bound; high Other% โ†’ custom/unclassified kernels +- A workload that is 60%+ GEMM is a strong candidate for MFMA/rocBLAS optimization + +### `short_kernels` +- `wasted_pct_of_kernel_time`: % of kernel time consumed by kernels below the `threshold_us` (default 10ฮผs) +- **If wasted_pct > 5%**, recommend kernel fusion or hipGraph batching +- Common cause: many small elementwise ops that could be fused; excessive hipDeviceSynchronize() calls between tiny kernels +- Top offenders list (kernel names sanitized) shows which kernels to target first + +### How to use these fields +When answering a `--prompt` question about bottlenecks, prioritize: +1. If `idle_pct > 20` โ†’ lead with GPU IDLE recommendation +2. If `wasted_pct > 5` AND short kernels are the dominant category โ†’ recommend fusion +3. If NCCL category dominates โ†’ mention communication bottleneck even if not yet Tier 2 diagnosed +4. Cross-reference `interval_timeline.true_compute_pct` with `execution_breakdown.kernel_time_pct` โ€” a large gap indicates significant kernel overlap (good for throughput but may hide serial stalls) diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md new file mode 100644 index 00000000000..8f7032a9e08 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/ai_analysis/docs/SCHEMA_CHANGELOG.md @@ -0,0 +1,642 @@ +# ROCpd AI Analysis Output - JSON Schema Changelog + +This document tracks all changes to the JSON output schema for `rocpd analyze --format json` +and the `rocpd.ai_analysis` Python API. 
+ +## Versioning Policy + +The schema follows **Semantic Versioning** (`MAJOR.MINOR.PATCH`): + +| Change type | Version bump | Example | +|---|---|---| +| New required field, renamed field, type change, removed field | **MAJOR** | `0.x.x` โ†’ `1.0.0` | +| New optional field added | **MINOR** | `0.1.x` โ†’ `0.2.0` | +| Description/example correction, no structural change | **PATCH** | `0.1.0` โ†’ `0.1.1` | + +> **Beta notice**: While `MAJOR` is `0` the schema is in beta. Minor versions may include +> breaking changes without a MAJOR bump. Consumers should pin to an exact version during beta. + +**Compatibility rule**: A consumer written for schema version `0.x.x` MUST continue to work +on any `0.y.z` output where `y >= x` (except during MAJOR=0 beta where minor may break). +MAJOR version changes always require consumer updates. + +## How to Check the Schema Version + +Every JSON output document contains a top-level `schema_version` field: + +```json +{ + "schema_version": "0.1.0", + ... +} +``` + +**Recommended consumer pattern**: + +```python +import json + +with open("analysis.json") as f: + data = json.load(f) + +ver = data["schema_version"] +major, minor, _ = (int(x) for x in ver.split(".")) +if major != 0 or minor < 1: + raise RuntimeError( + f"Unsupported schema version {ver!r}. " + "Expected 0.1.x. See SCHEMA_CHANGELOG.md for migration guidance." 
+    )
+```
+
+## Schema File Naming
+
+A single schema file covers all emitted versions via its `schema_version` enum:
+
+```
+rocpd/ai_analysis/docs/
+├── analysis-output.schema.json   ← single schema; schema_version enum lists all valid values
+│     Tier 1/2 output emits: "0.1.0"
+│     Tier 0 (source-only) output emits: "0.2.0"
+│     Tier 1/2 with TraceLens fields emits: "0.3.0"
+│     All valid values: ["0.1.0", "0.2.0", "0.3.0"]
+│     New versions are added to the enum without breaking consumers
+├── SCHEMA_CHANGELOG.md           ← this file
+├── AI_ANALYSIS_API.md            ← Python API documentation
+└── LLM_REFERENCE_GUIDE.md        ← copy of share/llm-reference-guide.md (for reference)
+```
+
+The current schema can always be located programmatically:
+
+```python
+import importlib.resources as pkg_resources
+schema_path = pkg_resources.files("rocpd.ai_analysis") / "docs" / "analysis-output.schema.json"
+```
+
+---
+
+## Version History
+
+---
+
+## v0.3.1 — 2026-03-12
+
+**No schema changes.** Schema file validator corrections, Python 3.6 compatibility fixes,
+and LLM hardening only.
+
+**Schema file corrections (v0.2.0 spec was already correct; JSON file had bugs):**
+
+The `analysis-output.schema.json` file was corrected to match the already-documented
+v0.2.0 specification.
The emitted JSON format was never wrong; only the validator was: + +| Schema file bug | Fix | +|---|---| +| `profiling_info.profiling_mode` enum missing `"source_only"` | Added `"source_only"` as first enum value | +| `profiling_info.analysis_tier` `minimum` was `1` | Lowered to `0` to allow Tier 0 documents | +| `execution_breakdown` type was `"object"` only | Changed to `["object", "null"]` so source-only documents validate | +| `tier0` property not declared in `properties` object | Added full `tier0` property definition with all 14 sub-fields | +| `$id` embedded a version string (`"rocpd-ai-analysis-output-v0.1.0"`) | Changed to `"rocpd-ai-analysis-output"` (stable; version is in `schema_version` field) | + +Tier 0 JSON output (schema_version `"0.2.0"`) now passes `jsonschema.validate()` against +the schema file. 28 JSON schema conformance tests added (was 17): 11 new tests cover +Tier 0 source-only output and combined (Tier 0 + Tier 1/2) output validation. + +**Python 3.6 compatibility (`re.Pattern` annotation):** + +`tracelens_port.py` used `re.Pattern` in a module-level type annotation +(`_CATEGORY_PATTERNS: List[Tuple[str, re.Pattern]]`). Python 3.6 evaluates these +annotations eagerly at import time; `re.Pattern` was added in Python 3.7. This caused +an `AttributeError` on RHEL 8.8 (Python 3.6.8) that cascaded into all tests importing +`analyze.py` or `llm_analyzer.py`. Fixed by changing the annotation to `Any` (already +imported from `typing`). + +`test_analyze_schema.py` used `import importlib.resources` which also requires Python 3.7. +Fixed with a `try/except ImportError` shim that falls back to `pkgutil.get_data()`. + +**`ROCPD_LLM_PRIVATE_HEADERS` dict validation:** + +After `json.loads()`, the parsed result is now validated to be a `dict` before +`headers.update()` is called. A non-dict JSON value (e.g. `"[1,2,3]"`) previously +raised an opaque `TypeError`; it now raises a `ValueError` with a clear message +showing the expected format. 
+
+**Stream chunk accumulation (`LLMConversation`):**
+
+Both `_stream_anthropic` and `_stream_openai` now accumulate response chunks with
+`chunks.append(text)` + `"".join(chunks)` instead of `result += chunk` string
+concatenation, avoiding O(n²) memory allocation for long responses.
+
+---
+
+## v0.3.0 (2026-03-11)
+
+### New Fields (additive — old consumers should ignore unknown top-level keys)
+
+- `interval_timeline` (object): GPU wall-time breakdown using set-theoretic interval arithmetic
+  (TraceLens methodology). More accurate than `execution_breakdown` which sums raw durations.
+  Fields: `total_wall_ns`, `true_compute_ns/pct`, `exposed_memcpy_ns/pct`, `idle_ns/pct`.
+
+- `kernel_categories` (array): Kernel execution time aggregated by TraceLens op category
+  (GEMM, CONV, SDPA, NCCL, Elementwise, Normalization, Reduction, Other).
+  Fields per entry: `category`, `count`, `total_ns`, `pct_of_kernel_time`, `avg_duration_ns`, `pct_of_total_time`.
+
+- `short_kernels` (object): Short kernel analysis — kernels below 10μs threshold.
+  Fields: `threshold_us`, `total_kernels`, `short_kernel_count`, `short_kernel_pct`,
+  `wasted_ns`, `wasted_pct_of_kernel_time`, `histogram`, `top_offenders`.
+
+### Versioning Policy
+Tier 1/2 runs now emit `schema_version: "0.3.0"` when tracelens fields are present.
+Tier 0 source-only runs remain at `schema_version: "0.2.0"`.
+Prior `"0.1.0"` documents are unaffected.
+
+---
+
+### v0.2.1 — 2026-03-10
+
+**No schema changes.** Security, correctness, and LLM-layer bug fixes only.
+
+This release documents behavioral changes that affect output values and API
+consumers without altering the JSON document structure or field names.
+
+**Output value guarantees (metadata field):**
+- `analysis_version` in `metadata` now always reflects the schema version string
+  (e.g. `"0.1.0"` for Tier 1/2 documents, `"0.2.0"` for Tier 0 source-only
+  documents).
The value was already correct in practice but is now explicitly
+  documented as schema-tied. Consumer code should continue to read
+  `schema_version` (not `analysis_version`) for compatibility checks.
+
+**`execution_breakdown.api_overhead_pct` is now guaranteed ≥ 0:**
+- `compute_time_breakdown()` now applies `max(0.0, ...)` to the raw `overhead_percent`
+  before returning. In some traces where kernel + memcpy time marginally exceeded the
+  computed total runtime (timestamp rounding), this field could previously be a small
+  negative value. It is now always non-negative in both CLI JSON output and the
+  Python API `ExecutionBreakdown.api_overhead_pct` field.
+
+**`memory_analysis[direction].bandwidth_bytes_per_sec` and `bandwidth_gbps` now use actual sizes:**
+- `analyze_memory_copies()` now reads the `size` column from `memory_copies` rows.
+  Previously `total_bytes` was always 0 and bandwidth was not computed. Consumers
+  that previously saw `bandwidth_gbps: 0` for all directions may now see non-zero
+  values, and the "Low memory bandwidth" recommendation (< 10 GB/s) can now fire
+  based on real measurements.
+
+**`recommendations[].commands[].full_command` kernel names are now shell-safe:**
+- In the "Compute Bottleneck" recommendation, `--kernel-names` arguments in
+  `full_command` strings are now wrapped with `shlex.quote()`. Kernel names
+  containing shell metacharacters (single quotes, semicolons, spaces) are properly
+  escaped. The `args[].value` field is unchanged (stores the raw kernel name for
+  display purposes).
+
+**LLM API calls now include `timeout=120`:**
+- All Anthropic and OpenAI API calls include an explicit 120-second timeout.
+  Previously calls could hang indefinitely. A timed-out call is caught and recorded
+  as a non-fatal warning; local analysis results are still returned.
+
+**Tier 0 webview XSS protection:**
+- `</script>` sequences in the embedded JSON payload of `_format_tier0_webview()`
+  are now escaped to `<\/script>`.
This prevents a crafted kernel name or LLM + explanation from breaking out of the `", r"<\/script>").replace(" +
+
+
+ + AI Performance Analysis +
+
{header_badges_html}
+
+ +
+
+
+
Runtime:{total_ms:,.2f} ms
+
Kernels:{len(hotspots or [])}
+
Tier:{_h(tier_label)}
+
Generated:{analysis_date}
+ {_db_pill_html} +
+
+ +
+ + +
+
+ 📊 +

Overview

+ Tier {tier} +
+
+

{_h(assessment)}

+
+
+
{_bn_icon}Bottleneck
+
Primary Bottleneck
+
{_h(bn_display)}
+
Confidence: {confidence}%
+
+
+
Duration
+
Total Runtime
+
{total_ms:,.2f}
+
milliseconds • {len(hotspots or [])} kernels
+
+
+
💻{_kpi_kernel_lbl}
+
Kernel Execution
+
{kernel_pct:.1f}%
+
{kernel_ms:,.2f} ms active compute
+
+
+
{_tier_icon}{_tier_status_lbl}
+
Analysis Tier
+
{tier}
+
{'Hardware counters available' if has_counters else 'Trace-level only'}
+
+
+ {findings_html} +
+
+ + +
+
+ +

Execution Breakdown

+
+
+
+
+
+
+
+
+
+
Kernel  {kernel_pct:.1f}%
+
Memory Copies  {memcpy_pct:.1f}%
+
API Overhead  {overhead_pct:.1f}%
+
GPU Idle  {idle_pct:.1f}%
+
+
+
+
Kernel Execution
+
+
{kernel_pct:.1f}% {kernel_ms:,.2f} ms
+
+
+
Memory Copies
+
+
{memcpy_pct:.1f}% {memcpy_ms:,.2f} ms
+
+
+
API Overhead
+
+
{overhead_pct:.1f}% {overhead_ms:,.2f} ms
+
+
+
GPU Idle
+
+
{idle_pct:.1f}% {idle_ms:,.2f} ms
+
+
+
+
+ + +
+
+ 💡 +

Optimization Recommendations

+ {_recs_badge_html} +
+
+ {recs_html} +
+
+ +{hotspots_html} +{mem_html} + + +
+
+ 🔬 +

Hardware Counters

+ {_hw_badge_html} +
+
+ {hw_inner} +
+
+ +
+ +
+

Generated by rocpd analyze — AMD ROCm GPU Performance Analysis • {analysis_date}

+
+ + + + + + +""" + + # --- Kernel category breakdown card (TraceLens) --- + if kernel_categories: + cat_rows_html = "" + for cat in kernel_categories: + avg_us = cat["avg_duration_ns"] / 1_000 + pct = cat["pct_of_kernel_time"] + bar_w = max(2, int(pct * 2)) # scale to max 200px + cat_rows_html += ( + f'{cat["category"]}' + f'{cat["count"]}' + f'
' + f" {pct:.1f}%" + f"{avg_us:.1f}μs" + ) + category_card = ( + '\n
' + '\n
' + '\n Kernel Category Breakdown (TraceLens)' + '\n ' + "\n
" + '\n
' + '\n ' + "\n " + "\n " + cat_rows_html + "" + "\n
CategoryKernels% of Kernel TimeAvg Duration
" + "\n
" + "\n
" + ) + html = html.replace("", category_card + "\n") + + return html + + +# --------------------------------------------------------------------------- +# Tier 0 format helpers +# --------------------------------------------------------------------------- + + +def _tier0_recommendations_text( + recommendations: List[Dict[str, Any]], width: int = 80 +) -> List[str]: + """Render Tier 0 recommendations as text lines (same format as Tier 1/2).""" + lines = [] + for rec in recommendations: + pri = rec.get("priority", "INFO") + cat = rec.get("category", "") + issue = rec.get("issue", "") + suggestion = rec.get("suggestion", "") + impact = rec.get("estimated_impact", "") + actions = rec.get("actions", []) + commands = rec.get("commands", []) + + lines.append(f"[{pri}] {cat}") + lines.append("โ”€" * width) + lines.append(f" Issue: {issue}") + lines.append("") + if suggestion: + lines.append(f" Suggestion: {suggestion}") + for action in actions: + lines.append(f" {action}") + lines.append("") + if impact: + lines.append(f" Estimated Impact: {impact}") + lines.append("") + if commands: + lines.append(" Recommended Commands:") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f" [{tool}] {desc}") + if flags: + lines.append(f" Flags: {' '.join(flags)}") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f" Args: {' '.join(arg_strs)}") + if full_command: + lines.append(f" $ {full_command}") + lines.append("") + lines.append("") + return lines + + +def _format_tier0_text(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as plain text.""" + width = 80 + lines = [] + lines.append("=" * width) + lines.append("ROCPD AI PROFILING PLAN (TIER 0: SOURCE CODE ANALYSIS)".center(width)) + 
lines.append("=" * width) + lines.append(f"Source Directory: {tier0_result.source_dir}") + lines.append(f"Analysis Date: {tier0_result.analysis_timestamp}") + lines.append(f"Programming Model: {tier0_result.programming_model}") + lines.append( + f"Files Scanned: {tier0_result.files_scanned} " + f"(skipped: {tier0_result.files_skipped})" + ) + lines.append("") + + # Kernels + lines.append("โ”" * width) + lines.append("DETECTED GPU KERNELS".center(width)) + lines.append("โ”" * width) + lines.append(f" Total kernels found: {tier0_result.kernel_count}") + if tier0_result.detected_kernels: + for k in tier0_result.detected_kernels[:20]: + lines.append( + f" โ€ข {k['name']} ({k.get('launch_type', '')}) " + f"{k.get('file', '').split('/')[-1]}:{k.get('line', '')}" + ) + if len(tier0_result.detected_kernels) > 20: + lines.append(f" ... and {len(tier0_result.detected_kernels) - 20} more") + else: + lines.append(" No GPU kernels detected in source.") + lines.append("") + + # Patterns by severity + lines.append("โ”" * width) + lines.append("DETECTED PATTERNS".center(width)) + lines.append("โ”" * width) + if tier0_result.detected_patterns: + for p in tier0_result.detected_patterns: + sev = p.get("severity", "info").upper() + cat = p.get("category", "") + desc = p.get("description", "") + count = p.get("count", 0) + lines.append(f" [{sev}] {cat} โ€” {desc} (ร—{count})") + else: + lines.append(" No significant patterns detected.") + lines.append("") + + # Risk areas + if tier0_result.risk_areas: + lines.append("โ”" * width) + lines.append("RISK AREAS".center(width)) + lines.append("โ”" * width) + for risk in tier0_result.risk_areas: + lines.append(f" โš  {risk}") + lines.append("") + + # ROCTx + if tier0_result.already_instrumented: + lines.append( + f" โœ“ ROCTx markers detected ({tier0_result.roctx_marker_count} markers)" + ) + lines.append("") + + # Recommended counters + if tier0_result.suggested_counters: + lines.append("โ”" * width) + lines.append("SUGGESTED HARDWARE 
COUNTERS".center(width)) + lines.append("โ”" * width) + lines.append(" " + " ".join(tier0_result.suggested_counters)) + lines.append("") + + # Recommendations + lines.append("โ”" * width) + lines.append("PROFILING RECOMMENDATIONS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.extend(_tier0_recommendations_text(tier0_result.recommendations, width)) + + # Suggested first command + if tier0_result.suggested_first_command: + lines.append("โ”" * width) + lines.append("START HERE โ€” SUGGESTED FIRST COMMAND".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(f" $ {tier0_result.suggested_first_command}") + lines.append("") + + # LLM explanation + if tier0_result.llm_explanation: + lines.append("โ”" * width) + lines.append("AI-ENHANCED INSIGHTS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(tier0_result.llm_explanation) + lines.append("") + + lines.append("=" * width) + lines.append("Analysis complete.".center(width)) + lines.append("=" * width) + + return "\n".join(lines) + + +def _tier0_to_dict(tier0_result: Any) -> Dict[str, Any]: + """Convert SourceAnalysisResult to a JSON-serializable dict for the tier0 field.""" + return { + "source_dir": tier0_result.source_dir, + "analysis_timestamp": tier0_result.analysis_timestamp, + "programming_model": tier0_result.programming_model, + "files_scanned": tier0_result.files_scanned, + "files_skipped": tier0_result.files_skipped, + "kernel_count": tier0_result.kernel_count, + "detected_kernels": tier0_result.detected_kernels, + "detected_patterns": tier0_result.detected_patterns, + "risk_areas": tier0_result.risk_areas, + "already_instrumented": tier0_result.already_instrumented, + "roctx_marker_count": tier0_result.roctx_marker_count, + "recommendations": _build_recommendations_json(tier0_result.recommendations), + "suggested_counters": tier0_result.suggested_counters, + "suggested_first_command": tier0_result.suggested_first_command, + 
"llm_explanation": tier0_result.llm_explanation, + } + + +def _format_tier0_json(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as schema v0.2.0 JSON.""" + import json as _json + + doc: Dict[str, Any] = { + "schema_version": "0.2.0", + "metadata": { + "rocpd_version": _ROCPD_VERSION, + "analysis_version": "0.2.0", # schema version, not module version + "database_file": None, + "analysis_timestamp": tier0_result.analysis_timestamp, + "analysis_duration_ms": 0, + "custom_prompt": None, + }, + "profiling_info": { + "total_duration_ns": 0, + "profiling_mode": "source_only", + "analysis_tier": 0, + "gpus": [], + }, + "summary": { + "overall_assessment": ( + f"Static analysis of {tier0_result.files_scanned} source files found " + f"{tier0_result.kernel_count} GPU kernels. " + f"Programming model: {tier0_result.programming_model}. " + f"See recommendations for next profiling steps." + ), + "primary_bottleneck": "unknown", + "confidence": 0.0, + "key_findings": tier0_result.risk_areas, + }, + "tier0": _tier0_to_dict(tier0_result), + "execution_breakdown": None, + "hotspots": [], + "memory_analysis": {}, + "hardware_counters": {"has_counters": False, "metrics": None, "counters": None}, + "recommendations": _build_recommendations_json(tier0_result.recommendations), + "warnings": [], + "errors": [], + "llm_enhanced_explanation": tier0_result.llm_explanation, + } + return _json.dumps(doc, indent=2) + + +def _format_tier0_markdown(tier0_result: Any) -> str: + """Format Tier 0 source-only analysis as Markdown.""" + lines = [] + lines.append("# ROCpd AI Profiling Plan โ€” Tier 0: Source Code Analysis") + lines.append("") + lines.append(f"**Source Directory:** `{tier0_result.source_dir}`") + lines.append(f"**Analysis Date:** {tier0_result.analysis_timestamp}") + lines.append(f"**Programming Model:** {tier0_result.programming_model}") + lines.append("**Analysis Tier:** 0 (Source Code Analysis)") + lines.append("") + + lines.append("## Detected Kernels") + 
lines.append("") + lines.append(f"**Total GPU kernels found:** {tier0_result.kernel_count}") + lines.append("") + if tier0_result.detected_kernels: + lines.append("| Kernel | Launch Type | File | Line |") + lines.append("|--------|-------------|------|------|") + for k in tier0_result.detected_kernels[:20]: + fname = k.get("file", "").split("/")[-1] + lines.append( + f"| `{k['name']}` | {k.get('launch_type', '')} | {fname} | {k.get('line', '')} |" + ) + if len(tier0_result.detected_kernels) > 20: + lines.append( + f"\n*... and {len(tier0_result.detected_kernels) - 20} more kernels*" + ) + else: + lines.append("*No GPU kernels detected in source.*") + lines.append("") + + lines.append("## Detected Patterns") + lines.append("") + if tier0_result.detected_patterns: + lines.append("| Severity | Category | Description | Count |") + lines.append("|----------|----------|-------------|-------|") + for p in tier0_result.detected_patterns: + sev = p.get("severity", "info") + lines.append( + f"| **{sev.upper()}** | {p.get('category', '')} | {p.get('description', '')} | {p.get('count', 0)} |" + ) + else: + lines.append("*No significant patterns detected.*") + lines.append("") + + if tier0_result.risk_areas: + lines.append("## Risk Areas") + lines.append("") + for risk in tier0_result.risk_areas: + lines.append(f"- โš  {risk}") + lines.append("") + + if tier0_result.suggested_counters: + lines.append("## Suggested Hardware Counters") + lines.append("") + lines.append("```") + lines.append(" ".join(tier0_result.suggested_counters)) + lines.append("```") + lines.append("") + + lines.append("## Profiling Recommendations") + lines.append("") + priority_emoji = {"HIGH": "๐Ÿ”ด", "MEDIUM": "๐ŸŸก", "LOW": "๐ŸŸข", "INFO": "๐Ÿ”ต"} + for rec in tier0_result.recommendations: + pri = rec.get("priority", "INFO") + cat = rec.get("category", "") + emoji = priority_emoji.get(pri, "โ€ข") + lines.append(f"### {emoji} [{pri}] {cat}") + lines.append("") + lines.append(f"**Issue:** {rec.get('issue', 
'')}") + lines.append("") + lines.append(f"**Suggestion:** {rec.get('suggestion', '')}") + actions = rec.get("actions", []) + if actions: + lines.append("") + for action in actions: + lines.append(f"{action}") + impact = rec.get("estimated_impact", "") + if impact: + lines.append("") + lines.append(f"**Estimated Impact:** {impact}") + commands = rec.get("commands", []) + if commands: + lines.append("") + lines.append("**Recommended Commands:**") + lines.append("") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f"*{tool}* โ€” {desc}") + if flags: + lines.append(f"- Flags: `{' '.join(flags)}`") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f"- Args: `{' '.join(arg_strs)}`") + if full_command: + lines.append(f"```bash\n{full_command}\n```") + lines.append("") + lines.append("") + + if tier0_result.suggested_first_command: + lines.append("## Start Here โ€” Suggested First Command") + lines.append("") + lines.append("```bash") + lines.append(tier0_result.suggested_first_command) + lines.append("```") + lines.append("") + + if tier0_result.llm_explanation: + lines.append("## AI-Enhanced Insights") + lines.append("") + lines.append(tier0_result.llm_explanation) + lines.append("") + + lines.append("---") + lines.append( + f"*Generated by rocpd analyze (Tier 0) \u2022 {tier0_result.analysis_timestamp}*" + ) + return "\n".join(lines) + + +def _format_tier0_webview(tier0_result: Any) -> str: + """Generate a self-contained AMD-themed HTML Tier 0 report (identical design system as Tier 1/2).""" + import html as _html + import json as _json + + def _h(v: Any) -> str: + return _html.escape(str(v), quote=True) + + SEV_FG = { + "high": "#e84040", + "medium": "#f08432", + "low": "#caa828", + 
"info": "#4d8ef2", + } + SEV_BG = { + "high": "rgba(232,64,64,.13)", + "medium": "rgba(240,132,50,.13)", + "low": "rgba(202,168,40,.13)", + "info": "rgba(77,142,242,.13)", + } + PRIORITY = { + "HIGH": ("#e84040", "#2a0808"), + "MEDIUM": ("#f08432", "#2a1600"), + "LOW": ("#caa828", "#241e08"), + "INFO": ("#4d8ef2", "#081428"), + } + PRIORITY_ICON = { + "HIGH": "🔴", + "MEDIUM": "🟠", + "LOW": "🟡", + "INFO": "ℹ", + } + + analysis_date = tier0_result.analysis_timestamp + src_dir = str(tier0_result.source_dir) + src_display = src_dir[-45:] if len(src_dir) > 45 else src_dir + + # โ”€โ”€ Counts โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + recs = tier0_result.recommendations or [] + n_high = sum(1 for r in recs if r.get("priority") == "HIGH") + n_medium = sum(1 for r in recs if r.get("priority") == "MEDIUM") + n_low = sum(1 for r in recs if r.get("priority") == "LOW") + n_info = sum(1 for r in recs if r.get("priority") == "INFO") + + _badge_parts = [] + if n_high: + _badge_parts.append( + f'● {n_high} Critical' + ) + if n_medium: + _badge_parts.append( + f'● {n_medium} Warning' + ) + if n_low: + _badge_parts.append(f'● {n_low} Low') + if n_info: + _badge_parts.append( + f'● {n_info} Info' + ) + header_badges_html = " ".join(_badge_parts) + + _recs_badge_html = "" + if n_high: + _recs_badge_html += ( + f'{n_high} Critical ' + ) + if n_medium: + _recs_badge_html += ( + f'{n_medium} Warning' + ) + + # โ”€โ”€ Recommendations HTML (same .r-card format as Tier 1/2) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + recs_parts = [] + for ri, rec in enumerate(recs): + p = rec.get("priority", "INFO") + cat = rec.get("category", "") + fg, _ = PRIORITY.get(p, ("#888", "#1a1a2a")) + picon = PRIORITY_ICON.get(p, "ℹ") + actions_li = "".join(f"
  • {_h(a)}
  • " for a in rec.get("actions", [])) + actions_html = f'
      {actions_li}
    ' if actions_li else "" + impact = rec.get("estimated_impact", "") + impact_html = ( + f'

    ⚡ Expected impact: {_h(impact)}

    ' + if impact + else "" + ) + cmds_parts = [] + for ci, cmd in enumerate(rec.get("commands", [])): + fc = cmd.get("full_command", "") + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + if not fc: + continue + cid = f"c{ri}_{ci}" + cmds_parts.append( + f'
    ' + f'{_h(tool)}' + f'{_h(desc)}' + f'
    ' + f"{_h(fc)}" + f'' + f"
    " + ) + cmds_html = "".join(cmds_parts) + issue_txt = rec.get("issue", "") + suggest = rec.get("suggestion", "") + recs_parts.append( + f'
    ' + f'
    ' + f'{picon}' + f'{_h(p)}' + f'{_h(cat)}' + f'' + f"
    " + f'
    ' + f'

    Issue: {_h(issue_txt)}

    ' + f'

    What to do: {_h(suggest)}

    ' + f"{actions_html}{impact_html}{cmds_html}" + f"
    " + ) + recs_html = ( + "".join(recs_parts) + or '

    No recommendations โ€” workload looks well-optimized.

    ' + ) + + # โ”€โ”€ Kernels table โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + kernel_rows = [] + for i, k in enumerate(tier0_result.detected_kernels[:50]): + fname = _h(k.get("file", "").split("/")[-1]) + kernel_rows.append( + f"" + f"{i + 1}" + f'{_h(k.get("name", ""))}' + f'{_h(k.get("launch_type", ""))}' + f"{fname}" + f'{_h(str(k.get("line", "")))}' + f"" + ) + if kernel_rows: + kernels_section = ( + '
    ' + '
    ' + '💻' + "

    Detected GPU Kernels

    " + f'{tier0_result.kernel_count} found' + "
    " + '
    ' + '' + "" + "" + "" + "" + "" + "" + "" + "" + "".join(kernel_rows) + "" + "
    #Kernel NameLaunch TypeFileLine ⇅
    " + ) + else: + kernels_section = ( + '
    ' + '
    💻' + "

    Detected GPU Kernels

    " + '

    No GPU kernels detected in the source directory.

    ' + "
    " + ) + + # โ”€โ”€ Patterns table โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + pattern_rows = [] + for pat in tier0_result.detected_patterns: + sev = pat.get("severity", "info").lower() + sfg = SEV_FG.get(sev, "#6b7280") + sbg = SEV_BG.get(sev, "rgba(107,114,128,.13)") + pattern_rows.append( + f"" + f'{_h(sev.upper())}' + f'{_h(pat.get("category", ""))}' + f'{_h(pat.get("description", ""))}' + f'{pat.get("count", 0)}' + f"" + ) + if pattern_rows: + patterns_section = ( + '
    ' + '
    ' + '📊' + "

    Detected Performance Patterns

    " + f'{len(tier0_result.detected_patterns)} found' + "
    " + '
    ' + '' + "" + "" + "" + "" + "" + "" + "" + "".join(pattern_rows) + "" + "
    SeverityCategoryDescriptionCount ⇅
    " + ) + else: + patterns_section = "" + + # โ”€โ”€ Risk areas โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + risk_li = "".join(f"
  • {_h(r)}
  • " for r in tier0_result.risk_areas) + risk_section = "" + if risk_li: + risk_section = ( + '
    ' + '
    ' + '' + "

    Risk Areas

    " + f'{len(tier0_result.risk_areas)}' + "
    " + '
    ' + f'
      {risk_li}
    ' + "
    " + ) + + # โ”€โ”€ Suggested counters โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + ctr_badges = " ".join( + f'{_h(c)}' + for c in tier0_result.suggested_counters + ) + counters_section = "" + if tier0_result.suggested_counters: + collect_cmd = ( + "rocprofv3 --sys-trace --pmc " + + " ".join(tier0_result.suggested_counters) + + " -- ./your_app" + ) + counters_section = ( + '
    ' + '
    ' + '🔬' + "

    Suggested Hardware Counters

    " + f'{len(tier0_result.suggested_counters)} counters' + "
    " + '
    ' + '

    ' + "Collect these counters to enable Tier 2 (hardware-level) analysis:

    " + f'

    {ctr_badges}

    ' + f'
    ' + f"{_h(collect_cmd)}" + f'' + "
    " + "
    " + ) + + # โ”€โ”€ Start Here โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + start_here_section = "" + if tier0_result.suggested_first_command: + fc = tier0_result.suggested_first_command + start_here_section = ( + '
    ' + '
    ' + '' + "

    Start Here

    " + 'Recommended First Step' + "
    " + '
    ' + '

    ' + "Run this command to collect profiling data for Tier 1/2 analysis:

    " + f'
    ' + f"{_h(fc)}" + f'' + "
    " + "
    " + ) + + # โ”€โ”€ LLM section โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + llm_section = "" + if tier0_result.llm_explanation: + llm_section = ( + '
    ' + '
    ' + '🤖' + "

    AI-Enhanced Insights

    " + 'LLM' + "
    " + '
    ' + f'
    {_h(tier0_result.llm_explanation)}
    ' + "
    " + ) + + # โ”€โ”€ KPI grid โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + n_risks = len(tier0_result.risk_areas) + risk_kpi_cls = "kpi-warn" if n_risks > 0 else "kpi-ok" + risk_kpi_label = "Needs Attention" if n_risks > 0 else "None Found" + model_upper = _h(tier0_result.programming_model.upper()) + assessment_txt = ( + f"Static source analysis of {tier0_result.files_scanned} file(s) found " + f"{tier0_result.kernel_count} GPU kernel(s). " + f"Programming model: {tier0_result.programming_model}. " + "See recommendations below for the suggested profiling workflow." + ) + n_patterns = len(tier0_result.detected_patterns) + + payload = _json.dumps(_tier0_to_dict(tier0_result)) + payload = payload.replace("", r"<\/script>").replace(" +
    +
    +
    + + AI Profiling Plan +
    +
    {header_badges_html}
    +
    + +
    +
    +
    +
    Source:{_h(src_display)}
    +
    Kernels:{tier0_result.kernel_count}
    +
    Tier:0 (Source)
    +
    Generated:{_h(analysis_date)}
    +
    Model:{_h(tier0_result.programming_model)}
    +
    +
    + +
    + + +
    +
    + 📊 +

    Overview

    + Tier 0 +
    +
    +

    {_h(assessment_txt)}

    +
    +
    +
    💻Detected
    +
    GPU Kernels
    +
    {tier0_result.kernel_count}
    +
    {tier0_result.files_scanned} file(s) scanned
    +
    +
    +
    🧰Model
    +
    Programming Model
    +
    {model_upper}
    +
    {tier0_result.files_scanned} files • {tier0_result.files_skipped} skipped
    +
    +
    +
    📊Found
    +
    Patterns Detected
    +
    {n_patterns}
    +
    potential issues identified
    +
    +
    +
    {risk_kpi_label}
    +
    Risk Areas
    +
    {n_risks}
    +
    {"requires profiling to confirm" if n_risks > 0 else "no obvious risk areas"}
    +
    +
    +
    +
    + + +
    +
    + 💡 +

    Profiling Recommendations

    + {_recs_badge_html} +
    +
    + {recs_html} +
    +
    + +{kernels_section} +{patterns_section} +{risk_section} +{counters_section} +{start_here_section} +{llm_section} + +
    + +
    +

    Generated by rocpd analyze (Tier 0) — AMD ROCm GPU Performance Analysis • {_h(analysis_date)}

    +
    + + + + + + +""" + + +def format_analysis_output( + time_breakdown: Dict[str, Any], + hotspots: List[Dict[str, Any]], + memory_analysis: Dict[str, Dict[str, Any]], + recommendations: List[Dict[str, Any]], + hardware_counters: Optional[Dict[str, Any]] = None, + database_path: str = "", + output_format: str = "text", + tier0_result: Optional[Any] = None, + source_only: bool = False, + interval_timeline: Optional[ + Dict[str, Any] + ] = None, # NEW (TraceLens) โ€” logic in Task 4 + kernel_categories: Optional[List[Any]] = None, # NEW (TraceLens) โ€” logic in Task 4 + short_kernels: Optional[Dict[str, Any]] = None, # NEW (TraceLens) โ€” logic in Task 4 + custom_prompt: Optional[str] = None, +) -> str: + """ + Format analysis results for display. + + Args: + time_breakdown: Time distribution metrics + hotspots: Top kernel hotspots + memory_analysis: Memory copy analysis + recommendations: Performance recommendations + database_path: Path to analyzed database + output_format: Output format (text, json, markdown, webview) + tier0_result: Optional Tier 0 source analysis result + source_only: True when no database was provided (Tier 0 only) + + Returns: + Formatted string output + """ + # Source-only mode: dispatch entirely to Tier 0 formatters + if source_only and tier0_result is not None: + if output_format == "json": + return _format_tier0_json(tier0_result) + if output_format == "markdown": + return _format_tier0_markdown(tier0_result) + if output_format == "webview": + return _format_tier0_webview(tier0_result) + return _format_tier0_text(tier0_result) + + if output_format == "json": + output = _format_as_json( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + custom_prompt=custom_prompt, + ) + # Combined mode: 
embed tier0 into JSON document + if tier0_result is not None: + import json as _json + + try: + doc = _json.loads(output) + doc["tier0"] = _tier0_to_dict(tier0_result) + output = _json.dumps(doc, indent=2) + except Exception: + pass # Tier0 embedding into combined JSON is non-fatal; return Tier1/2 output unchanged + return output + + if output_format == "markdown": + output = _format_as_markdown( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + ) + if tier0_result is not None: + output += "\n\n---\n\n## Tier 0: Source Code Analysis\n\n" + output += _format_tier0_markdown(tier0_result) + return output + + if output_format == "webview": + return _format_as_webview( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + interval_timeline=interval_timeline, + kernel_categories=kernel_categories, + short_kernels=short_kernels, + ) + + # Default: text + lines = [] + width = 80 + + # Header + lines.append("=" * width) + lines.append("ROCPD AI PERFORMANCE ANALYSIS".center(width)) + lines.append("=" * width) + if database_path: + lines.append(f"Database: {database_path}") + lines.append(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + total_runtime_ms = time_breakdown.get("total_runtime", 0) / 1e6 + lines.append(f"Total Runtime: {total_runtime_ms:,.2f} ms") + lines.append("") + + # Time Breakdown + lines.append("โ”" * width) + lines.append("TIME BREAKDOWN".center(width)) + lines.append("โ”" * width) + lines.append("") + + def make_bar(percent: float, bar_width: int = 30) -> str: + """Create a visual percentage bar.""" + filled = int(percent / 100.0 * bar_width) 
+ return "โ–ˆ" * filled + + kernel_pct = time_breakdown.get("kernel_percent", 0) + memcpy_pct = time_breakdown.get("memcpy_percent", 0) + overhead_pct = time_breakdown.get("overhead_percent", 0) + + kernel_time_ms = time_breakdown.get("total_kernel_time", 0) / 1e6 + memcpy_time_ms = time_breakdown.get("total_memcpy_time", 0) / 1e6 + overhead_time_ms = ( + max(0.0, total_runtime_ms - kernel_time_ms - memcpy_time_ms) + if total_runtime_ms > 0 + else 0 + ) + + lines.append( + f" Kernel Execution: {kernel_time_ms:10,.2f} ms ({kernel_pct:5.1f}%) {make_bar(kernel_pct)}" + ) + lines.append( + f" Memory Copies: {memcpy_time_ms:10,.2f} ms ({memcpy_pct:5.1f}%) {make_bar(memcpy_pct)}" + ) + lines.append( + f" API Overhead: {overhead_time_ms:10,.2f} ms ({overhead_pct:5.1f}%) {make_bar(overhead_pct)}" + ) + lines.append("") + + # Hotspots + if hotspots: + lines.append("โ”" * width) + lines.append("HOTSPOTS".center(width)) + lines.append("โ”" * width) + lines.append("") + lines.append(f"Top {len(hotspots)} Kernels by Duration:") + lines.append("") + + # Table header + lines.append( + f" # {'Kernel Name':<30} {'Calls':>6} {'Total (ms)':>10} {'Avg (ฮผs)':>9} {'% Total':>7}" + ) + lines.append("โ”€" * width) + + # Table rows + for i, kernel in enumerate(hotspots, 1): + name = kernel.get("name", "unknown") + if len(name) > 30: + name = name[:27] + "..." 
+ + calls = kernel.get("calls", 0) + total_ms = kernel.get("total_duration", 0) / 1e6 + avg_us = kernel.get("avg_duration", 0) / 1e3 + percent = kernel.get("percent_of_total", 0) + + lines.append( + f"{i:2} {name:<30} {calls:6} {total_ms:10,.2f} {avg_us:9,.1f} {percent:6.1f}%" + ) + + lines.append("") + + # Memory Analysis + if memory_analysis: + lines.append("โ”" * width) + lines.append("MEMORY COPY ANALYSIS".center(width)) + lines.append("โ”" * width) + lines.append("") + + # Table header + lines.append( + f"{'Direction':<20} {'Count':>6} {'Total Size':>12} {'Duration':>10} {'Bandwidth':>10}" + ) + lines.append("โ”€" * width) + + # Table rows + for direction, stats in memory_analysis.items(): + count = stats.get("count", 0) + total_bytes = stats.get("total_bytes", 0) + duration_ms = stats.get("total_duration", 0) / 1e6 + bandwidth_gbps = stats.get("bandwidth_bytes_per_sec", 0) / 1e9 + + # Format size + if total_bytes >= 1e9: + size_str = f"{total_bytes / 1e9:.1f} GB" + elif total_bytes >= 1e6: + size_str = f"{total_bytes / 1e6:.1f} MB" + elif total_bytes >= 1e3: + size_str = f"{total_bytes / 1e3:.1f} KB" + else: + size_str = f"{total_bytes:.0f} B" + + lines.append( + f"{direction:<20} {count:6} {size_str:>12} {duration_ms:9,.2f} ms {bandwidth_gbps:8.2f} GB/s" + ) + + lines.append("") + + # Hardware Counters (Tier 2) + if hardware_counters and hardware_counters.get("has_counters"): + lines.append("โ”" * width) + lines.append("HARDWARE COUNTERS (Tier 2)".center(width)) + lines.append("โ”" * width) + lines.append("") + + metrics = hardware_counters.get("metrics", {}) + counters = hardware_counters.get("counters", {}) + + # Display derived metrics + if metrics: + lines.append("Derived Metrics:") + lines.append("") + + if "gpu_utilization_percent" in metrics: + util_pct = metrics["gpu_utilization_percent"] + lines.append( + f" GPU Utilization: {util_pct:6.1f}% {make_bar(util_pct)}" + ) + + if "avg_waves" in metrics: + avg_waves = metrics["avg_waves"] + max_waves = 
metrics.get("max_waves", 0) + lines.append(f" Avg Wave Occupancy: {avg_waves:6.1f} waves") + lines.append(f" Max Wave Occupancy: {max_waves:6.1f} waves") + + lines.append("") + + # Display raw counters + if counters: + lines.append("Collected Counters:") + lines.append("") + lines.append( + f"{'Counter Name':<25} {'Avg Value':>15} {'Min Value':>15} {'Max Value':>15}" + ) + lines.append("โ”€" * width) + + for counter_name, stats in counters.items(): + avg = stats.get("avg_value", 0) + min_val = stats.get("min_value", 0) + max_val = stats.get("max_value", 0) + + lines.append( + f"{counter_name:<25} {avg:15,.1f} {min_val:15,.1f} {max_val:15,.1f}" + ) + + lines.append("") + + # TraceLens: Kernel Category Breakdown + if kernel_categories: + lines.append("") + lines.append("โ”" * width) + lines.append("KERNEL CATEGORY BREAKDOWN (TraceLens)".center(width)) + lines.append("โ”" * width) + lines.append("") + max_pct = max((c["pct_of_kernel_time"] for c in kernel_categories), default=1) + bar_width = 30 + for cat in kernel_categories: + pct = cat["pct_of_kernel_time"] + bar = "โ–ˆ" * int(bar_width * pct / max(max_pct, 1)) + cnt = cat["count"] + avg_us = cat["avg_duration_ns"] / 1_000 + lines.append( + f" {cat['category']:<15} {bar:<30} {pct:5.1f}% ({cnt} kernels, avg {avg_us:.1f}ฮผs)" + ) + lines.append("") + + # TraceLens: Short Kernel Analysis + if short_kernels and short_kernels.get("short_kernel_count", 0) > 0: + lines.append("โ”" * width) + lines.append("SHORT KERNEL ANALYSIS (TraceLens)".center(width)) + lines.append("โ”" * width) + lines.append("") + thresh = short_kernels.get("threshold_us", 10) + count = short_kernels["short_kernel_count"] + wasted = short_kernels["wasted_pct_of_kernel_time"] + lines.append( + f" {count} kernels below {thresh}ฮผs threshold โ€” {wasted:.1f}% of kernel time wasted" + ) + if short_kernels.get("histogram"): + hist_str = " Histogram: " + " ".join( + f"[{b['bucket_label']}]: {b['count']}" for b in short_kernels["histogram"] + ) + 
lines.append(hist_str) + if short_kernels.get("top_offenders"): + lines.append(" Top offenders:") + for off in short_kernels["top_offenders"][:5]: + lines.append( + f" {off['name'][:50]:<52} ร—{off['count']} avg {off['avg_us']:.1f}ฮผs" + ) + lines.append("") + + # Recommendations + lines.append("โ”" * width) + lines.append("RECOMMENDATIONS".center(width)) + lines.append("โ”" * width) + lines.append("") + + for rec in recommendations: + priority = rec.get("priority", "INFO") + category = rec.get("category", "") + issue = rec.get("issue", "") + suggestion = rec.get("suggestion", "") + actions = rec.get("actions", []) + commands = rec.get("commands", []) + estimated_impact = rec.get("estimated_impact", "") + + lines.append(f"[{priority}] {category}") + lines.append("โ”€" * width) + lines.append(f" Issue: {issue}") + lines.append("") + if suggestion: + lines.append(f" Suggestion: {suggestion}") + if actions: + for action in actions: + lines.append(f" {action}") + lines.append("") + if estimated_impact: + lines.append(f" Estimated Impact: {estimated_impact}") + lines.append("") + if commands: + lines.append(" Recommended Commands:") + for cmd in commands: + tool = cmd.get("tool", "") + desc = cmd.get("description", "") + full_command = cmd.get("full_command", "") + flags = cmd.get("flags", []) + args = cmd.get("args", []) + lines.append(f" [{tool}] {desc}") + if flags: + lines.append(f" Flags: {' '.join(flags)}") + if args: + arg_strs = [] + for a in args: + name = a.get("name", "") + value = a.get("value") + arg_strs.append(f"{name} {value}" if value is not None else name) + lines.append(f" Args: {' '.join(arg_strs)}") + if full_command: + lines.append(f" $ {full_command}") + lines.append("") + lines.append("") + + # Footer + lines.append("=" * width) + lines.append("Analysis complete.".center(width)) + lines.append("=" * width) + + return "\n".join(lines) + + +def analyze_source_code( + source_dir: str, + prompt: Optional[str] = None, + llm: Optional[str] = None, + 
llm_api_key: Optional[str] = None, + llm_model: Optional[str] = None, + verbose: bool = False, +) -> Any: + """ + Run Tier 0 static source code analysis. + + Args: + source_dir: Path to source directory + prompt: Optional user question to guide analysis + llm: LLM provider ("anthropic", "openai") + llm_api_key: API key for LLM provider + llm_model: Override LLM model name + verbose: Enable verbose logging + + Returns: + SourceAnalysisResult from ai_analysis.api + """ + from pathlib import Path as _Path + from .ai_analysis.source_analyzer import SourceAnalyzer + from .ai_analysis.api import _plan_to_source_result + + _src_path = _Path(source_dir) + if not _src_path.exists() or not _src_path.is_dir(): + from .ai_analysis.exceptions import SourceDirectoryNotFoundError + + raise SourceDirectoryNotFoundError( + f"Source directory not found or not a directory: {source_dir}" + ) + + if verbose: + print(f"[Tier0] Scanning source directory: {source_dir}") + + scanner = SourceAnalyzer(_src_path, verbose=verbose) + plan = scanner.analyze() + + if verbose: + print( + f"[Tier0] Scanned {plan.files_scanned} files, " + f"{plan.kernel_count} kernels, model: {plan.programming_model}" + ) + + # Convert ProfilingPlan โ†’ SourceAnalysisResult dataclass + result = _plan_to_source_result(plan) + + if llm: + _prev = os.environ.get("ROCPD_LLM_MODEL") + try: + from .ai_analysis.llm_analyzer import LLMAnalyzer + + if llm_model: + os.environ["ROCPD_LLM_MODEL"] = llm_model + try: + analyzer = LLMAnalyzer(provider=llm, api_key=llm_api_key, verbose=verbose) + from .ai_analysis.llm_analyzer import ( + AnalysisContext as _AnalysisContext, + ) + + _llm_ctx = _AnalysisContext(tier=0, custom_prompt=prompt) + _mdl = llm_model or os.environ.get("ROCPD_LLM_MODEL", "") + _mdl_str = f" ({_mdl})" if _mdl else "" + print( + f" Contacting {llm}{_mdl_str} for source analysis โ€” please wait...", + file=sys.stderr, + flush=True, + ) + result.llm_explanation = analyzer.analyze_source_with_llm( + result, 
custom_prompt=prompt, context=_llm_ctx + ) + finally: + if llm_model: + if _prev is None: + os.environ.pop("ROCPD_LLM_MODEL", None) + else: + os.environ["ROCPD_LLM_MODEL"] = _prev + except Exception as e: + print(f"โš ๏ธ Tier 0 LLM enhancement failed: {e}", file=sys.stderr) + + return result + + +def analyze_performance( + connection: Optional[RocpdImportData], + prompt: Optional[str] = None, + top_kernels: int = 10, + min_duration: float = 0.0, + output_format: str = "text", + database_path: str = "", + llm: Optional[str] = None, + llm_api_key: Optional[str] = None, + llm_model: Optional[str] = None, + llm_thinking: Optional[int] = None, + verbose: bool = False, + source_dir: Optional[str] = None, + _collect_result: Optional[Dict[str, Any]] = None, + **kwargs: Any, +) -> str: + """ + Main analysis orchestrator that runs all analyses and formats output. + + Args: + connection: RocpdImportData database connection + prompt: Optional custom analysis prompt + top_kernels: Number of top kernels to analyze + min_duration: Minimum kernel duration threshold + output_format: Output format (text, json, markdown) + database_path: Path to database file + llm: LLM provider (anthropic or openai) + llm_api_key: API key for LLM provider + verbose: Enable verbose logging + **kwargs: Additional arguments + + Returns: + Formatted analysis output string + """ + # ------------------------------------------------------------------ + # Tier 0 โ€” static source code analysis (optional) + # ------------------------------------------------------------------ + tier0_result = None + if source_dir: + tier0_result = analyze_source_code( + source_dir=source_dir, + prompt=prompt, + llm=llm, + llm_api_key=llm_api_key, + llm_model=llm_model, + verbose=verbose, + ) + + # ------------------------------------------------------------------ + # Tier 1/2 โ€” database analysis (only when a connection is provided) + # ------------------------------------------------------------------ + source_only = 
connection is None + if not source_only: + time_breakdown = compute_time_breakdown(connection) + hotspots = identify_hotspots( + connection, top_n=top_kernels, min_duration=min_duration + ) + memory_analysis = analyze_memory_copies(connection) + hardware_counters = analyze_hardware_counters(connection) # Tier 2 + already_collected = _detect_already_collected(connection) + # TraceLens-derived analysis (Phase 1) + interval_timeline = compute_interval_timeline(connection) + kernel_categories = analyze_kernels_by_category( + connection, interval_timeline["total_wall_ns"] + ) + short_kernels_data = analyze_short_kernels(connection) + # Generate recommendations (redundant re-collection commands are filtered out) + recommendations = generate_recommendations( + time_breakdown, + hotspots, + memory_analysis, + hardware_counters, + already_collected=already_collected, + short_kernels=short_kernels_data, # NEW + interval_timeline=interval_timeline, # NEW + ) + else: + time_breakdown = {} + hotspots = [] + memory_analysis = {} + hardware_counters = {} + already_collected = frozenset() + interval_timeline = {} + kernel_categories = [] + short_kernels_data = {} + recommendations = tier0_result.recommendations if tier0_result else [] + + # Format output + output = format_analysis_output( + time_breakdown=time_breakdown, + hotspots=hotspots, + memory_analysis=memory_analysis, + recommendations=recommendations, + hardware_counters=hardware_counters, + database_path=database_path, + output_format=output_format, + tier0_result=tier0_result, + source_only=source_only, + interval_timeline=interval_timeline, # NEW (TraceLens) + kernel_categories=kernel_categories, # NEW (TraceLens) + short_kernels=short_kernels_data, # NEW (TraceLens) + custom_prompt=prompt, + ) + + # Expose structured results to caller (used by interactive mode) + if _collect_result is not None: + _collect_result["recommendations"] = recommendations + _collect_result["tier0_result"] = tier0_result + 
_collect_result["database_path"] = database_path + + # LLM enhancement (if enabled) โ€” only for Tier 1/2; Tier 0 LLM runs in analyze_source_code() + if llm and not source_only: + # Initialize before try so the finally block can always reference these names safely. + _prev_model_env = os.environ.get("ROCPD_LLM_MODEL") + try: + if verbose: + print(f"[LLM] Enabling {llm} enhancement...") + + from .ai_analysis.llm_analyzer import LLMAnalyzer + + # If caller provided --llm-model, set it in the environment so + # LLMAnalyzer._call_anthropic/_call_openai can pick it up. + # We restore the original value afterwards. + if llm_model: + os.environ["ROCPD_LLM_MODEL"] = llm_model + + _mdl = llm_model or os.environ.get("ROCPD_LLM_MODEL", "") + _mdl_str = f" ({_mdl})" if _mdl else "" + print( + f" Contacting {llm}{_mdl_str} for trace analysis โ€” please wait...", + file=sys.stderr, + flush=True, + ) + + # Initialize LLM analyzer + analyzer = LLMAnalyzer( + provider=llm, + api_key=llm_api_key, + verbose=verbose, + thinking_budget_tokens=llm_thinking, + ) + + # Prepare data for LLM + analysis_data = { + "gpu": {"name": "AMD GPU", "arch": "unknown"}, # TODO: Extract from DB + "execution_breakdown": { + "kernel_time_pct": time_breakdown.get("kernel_percent", 0), + "memcpy_time_pct": time_breakdown.get("memcpy_percent", 0), + "api_overhead_pct": time_breakdown.get("overhead_percent", 0), + }, + "kernels": [ + { + "name": h.get("name", "unknown"), + "dispatch_count": h.get("calls", 0), + "pct_total_time": h.get("percent_of_total", 0), + "avg_duration_ns": h.get("avg_duration", 0), + } + for h in hotspots[:5] # Top 5 kernels + ], + "memory_ops": { + direction: { + "count": data.get("count", 0), + "total_bytes": data.get("total_bytes", 0), + "bandwidth_gbps": data.get("bandwidth_bytes_per_sec", 0) / 1e9, + } + for direction, data in memory_analysis.items() + }, + "has_counters": bool(hardware_counters), + "has_pc_sampling": False, + } + + # Build analysis context for guide filtering + 
from .ai_analysis.llm_analyzer import AnalysisContext as _AnalysisContext + + _has_ctr = bool(hardware_counters and hardware_counters.get("has_counters")) + _summary = _build_summary(time_breakdown, hotspots, _has_ctr) + _llm_ctx = _AnalysisContext( + tier=2 if _has_ctr else 1, + has_counters=_has_ctr, + bottleneck_type=_summary.get("primary_bottleneck"), + gpu_arch=None, # reserved for future per-GPU filtering + custom_prompt=prompt, + ) + + # Get LLM enhancement + llm_explanation = analyzer.analyze_with_llm( + analysis_data=analysis_data, + custom_prompt=prompt, + context=_llm_ctx, + ) + + # Append LLM explanation to output + if output_format == "text": + output += "\n\n" + "=" * 80 + "\n" + output += ( + "AI-ENHANCED EXPLANATION (powered by {})".format(llm.upper()).center( + 80 + ) + + "\n" + ) + output += "=" * 80 + "\n\n" + output += llm_explanation + output += "\n\n" + "=" * 80 + "\n" + elif output_format == "json": + # Parse JSON and add LLM explanation + import json + + try: + output_dict = json.loads(output) + output_dict["llm_enhanced_explanation"] = llm_explanation + output = json.dumps(output_dict, indent=2) + except (json.JSONDecodeError, ValueError, KeyError) as _je: + print( + f"Warning: Could not embed LLM explanation in JSON output: {_je}", + file=sys.stderr, + ) + + if verbose: + print("[LLM] Enhancement complete") + + except Exception as e: + # Always show LLM failures on console (even without --verbose) + import sys + + error_msg = f"โš ๏ธ LLM enhancement failed: {e}" + print(error_msg, file=sys.stderr) + + # Also add to output file + warning_msg = ( + f"\n\n{error_msg}\n(Analysis completed with local results only)\n" + ) + if output_format == "text": + output += warning_msg + + # Show full traceback only in verbose mode + if verbose: + import traceback + + traceback.print_exc() + + finally: + # Restore the ROCPD_LLM_MODEL env var to its previous state + if llm_model: + if _prev_model_env is None: + os.environ.pop("ROCPD_LLM_MODEL", None) + 
def _is_code_change_rec(rec: Dict[str, Any]) -> bool:
    """Return True if this recommendation suggests source-code modifications.

    A recommendation is considered a code-change candidate when any of its
    action strings contains (case-insensitively) a keyword associated with
    HIP API usage, kernel-launch tuning, or memory-access restructuring.
    """
    # Keyword table is runtime data: kept verbatim. Trailing spaces on some
    # entries ("add ", "batch ", ...) deliberately avoid substring false
    # positives such as "additional" or "batched" prefixing longer words.
    keywords = (
        "replace ",
        "convert ",
        "add ",
        "insert ",
        "remove ",
        "delete ",
        "change ",
        "modify ",
        "update ",
        "use hip",
        "hipstream",
        "hipmemcpy",
        "hiplaunchkernel",
        "block size",
        "blockdim",
        "thread block",
        "merge kernel",
        "fuse kernel",
        "combine kernel",
        "async",
        "hipstreamcreate",
        "batch ",
        "coalesce",
        "stride",
        "unroll",
        "pragma ",
        "#pragma",
        "__launch_bounds__",
        "wave32",
        "wave64",
    )
    return any(
        marker in action.lower()
        for action in rec.get("actions", [])
        for marker in keywords
    )
def _call_llm_for_code(
    provider: str,
    api_key: Optional[str],
    model: Optional[str],
    prompt: str,
) -> str:
    """Call Anthropic or OpenAI to generate code-change suggestions.

    Args:
        provider: "anthropic", "openai", or "gpt" (alias for openai).
        api_key: Explicit API key; falls back to the provider's env variable.
        model: Explicit model name; falls back to ROCPD_LLM_MODEL, then a
            provider-specific default.
        prompt: Full prompt text sent as a single user message.

    Returns:
        The raw text of the model's reply.

    Raises:
        ImportError: The provider's SDK package is not installed.
        ValueError: No API key is available, or the provider is unknown.
    """
    # Both providers take the same single-turn chat payload.
    chat = [{"role": "user", "content": prompt}]

    if provider == "anthropic":
        try:
            import anthropic
        except ImportError:
            raise ImportError(
                "anthropic package not installed. Run: pip install anthropic"
            )
        token = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not token:
            raise ValueError(
                "No Anthropic API key. Set ANTHROPIC_API_KEY or pass --llm-api-key."
            )
        chosen = model or os.environ.get("ROCPD_LLM_MODEL", "claude-sonnet-4-20250514")
        reply = anthropic.Anthropic(api_key=token).messages.create(
            model=chosen,
            max_tokens=4096,
            messages=chat,
        )
        return reply.content[0].text

    if provider in ("openai", "gpt"):
        try:
            import openai
        except ImportError:
            raise ImportError("openai package not installed. Run: pip install openai")
        token = api_key or os.environ.get("OPENAI_API_KEY")
        if not token:
            raise ValueError(
                "No OpenAI API key. Set OPENAI_API_KEY or pass --llm-api-key."
            )
        chosen = model or os.environ.get("ROCPD_LLM_MODEL", "gpt-4-turbo-preview")
        client = openai.OpenAI(api_key=token)
        # Newer OpenAI models accept only max_completion_tokens; older ones
        # only max_tokens. Try the new parameter first, then fall back.
        try:
            reply = client.chat.completions.create(
                model=chosen,
                messages=chat,
                max_completion_tokens=4096,
            )
        except Exception:
            reply = client.chat.completions.create(
                model=chosen,
                messages=chat,
                max_tokens=4096,
            )
        return reply.choices[0].message.content

    raise ValueError(f"Unknown LLM provider: {provider!r}")
{action}") + if impact: + print(f"\n {Y}Estimated Impact:{N} {impact}") + print() + + if not source_dir: + print(f" {DIM}Tip: run with --source-dir to enable AI code editing.{N}\n") + return + + # โ”€โ”€ Find GPU source files โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + source_files: List[str] = [] + for ext in ("*.hip", "*.cpp", "*.cu", "*.cuh", "*.h"): + source_files.extend( + _glob.glob(_os.path.join(source_dir, "**", ext), recursive=True) + ) + source_files = [f for f in source_files if _os.path.isfile(f)] + + if not source_files: + print(f" {DIM}No GPU source files found in {source_dir}/{N}\n") + return + + # โ”€โ”€ Auto-detect LLM provider from environment if not explicitly set โ”€โ”€โ”€โ”€โ”€ + if not llm_provider: + if os.environ.get("ANTHROPIC_API_KEY"): + llm_provider = "anthropic" + elif os.environ.get("OPENAI_API_KEY"): + llm_provider = "openai" + + # โ”€โ”€ No LLM configured: show manual steps and offer $EDITOR โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if not llm_provider: + print( + f" {DIM}To enable AI code editing, set ANTHROPIC_API_KEY (or OPENAI_API_KEY) in your" + f" environment, or pass --llm anthropic to rocpd analyze.{N}" + ) + print(f"\n {Y}Manual steps:{N}") + for i, action in enumerate(actions, 1): + print(f" {i}. {action}") + editor = _os.environ.get("EDITOR", "") + if editor and source_files: + try: + ans = input(f"\n Open source files in {editor}? [y/N]: ").strip().lower() + except (EOFError, KeyboardInterrupt): + ans = "n" + if ans in ("y", "yes"): + import subprocess + + subprocess.run([editor] + source_files[:3]) + print() + return + + # โ”€โ”€ Ask user before invoking LLM โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + try: + ans = ( + input( + f" {Y}Would you like the AI to apply this change to your source code? 
[y/N]: {N}" + ) + .strip() + .lower() + ) + except (EOFError, KeyboardInterrupt): + print() + return + if ans not in ("y", "yes"): + print() + return + + # โ”€โ”€ Read source files โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + MAX_FILES = 5 + MAX_FILE_SIZE = 50_000 # bytes per file + + print(f"\n {DIM}Reading source files...{N}") + file_contents: Dict[str, str] = {} + for fpath in source_files[:MAX_FILES]: + try: + with open(fpath, "r", encoding="utf-8", errors="replace") as fh: + file_contents[fpath] = fh.read(MAX_FILE_SIZE) + except OSError: + pass + + if not file_contents: + print(f" {R}Could not read source files.{N}\n") + return + + # โ”€โ”€ Build LLM prompt โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + files_text = "\n\n".join( + f"=== {_os.path.relpath(fp, source_dir)} ===\n{content}" + for fp, content in file_contents.items() + ) + changes_text = "\n".join(f"- {a}" for a in actions) + + llm_prompt = ( + "You are a GPU performance optimization expert. The following GPU source files " + "have a performance issue that needs to be fixed.\n\n" + f"ISSUE: {issue}\n" + f"SUGGESTION: {suggestion}\n" + f"REQUIRED CHANGES:\n{changes_text}\n\n" + f"SOURCE FILES:\n{files_text}\n\n" + "OUTPUT INSTRUCTIONS:\n" + "For each file that needs modification, output EXACTLY this format:\n" + "MODIFY_FILE: \n" + "<<\n" + "ORIGINAL\n" + "<<\n" + "REPLACEMENT\n\n" + "Only output sections that need to change. Be precise โ€” the ORIGINAL block must " + "match exactly what appears in the file (used for find-and-replace). 
" + "If no changes are needed, output: NO_CHANGES_NEEDED" + ) + + print(f" {DIM}Calling {llm_provider} for code change suggestions...{N}") + + try: + llm_response = _call_llm_for_code( + provider=llm_provider, + api_key=llm_api_key, + model=llm_model, + prompt=llm_prompt, + ) + except Exception as exc: + print(f" {R}LLM error: {exc}{N}\n") + return + + if "NO_CHANGES_NEEDED" in llm_response: + print(f" {G}AI analysis: no code changes are needed for this issue.{N}\n") + return + + # โ”€โ”€ Parse MODIFY_FILE blocks โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + patches: List[tuple] = [] + pattern = re.compile( + r"MODIFY_FILE:\s*(\S+)\s*<< 80: + print(f" {DIM} ... ({len(diff) - 80} more lines){N}") + valid_patches.append((abs_path, rel_path, orig_content, new_content)) + + if not valid_patches: + print() + return + + print() + try: + ans = input(f" {Y}Apply these changes? [y/N]: {N}").strip().lower() + except (EOFError, KeyboardInterrupt): + print() + return + + if ans not in ("y", "yes"): + print(f" {DIM}Changes not applied.{N}\n") + return + + # โ”€โ”€ Apply with backup โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + applied = 0 + for abs_path, rel_path, orig_content, new_content in valid_patches: + backup_path = abs_path + ".rocpd.bak" + try: + shutil.copy2(abs_path, backup_path) + with open(abs_path, "w", encoding="utf-8") as fh: + fh.write(new_content) + print( + f" {G}โœ“ Applied: {rel_path} (backup: {_os.path.basename(backup_path)}){N}" + ) + applied += 1 + except OSError as exc: + print(f" {R}โœ— Failed to write {rel_path}: {exc}{N}") + + if applied: + print( + f"\n {G}โœ“ {applied} file(s) modified. 
def _get_app_path_from_db(database_path: str) -> str:
    """
    Extract the profiled application's executable path from a rocpd database.

    rocprofv3 writes the process command into the ``command`` column of the
    per-process ``rocpd_info_process_*`` tables.

    Args:
        database_path: Path to the rocpd SQLite database ("" is tolerated).

    Returns:
        The (stripped) command string, or "" if the database cannot be read
        or has no entry. This is deliberately best-effort: any error is
        swallowed and reported as "".
    """
    if not database_path:
        return ""
    try:
        import sqlite3 as _sqlite3

        con = _sqlite3.connect(database_path)
        # BUGFIX: the previous version only closed the connection when no row
        # was found — the early `return` inside the loop (and any exception)
        # leaked the SQLite handle. try/finally guarantees closure.
        try:
            # Find all rocpd_info_process_* tables
            tables = con.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'rocpd_info_process_%'"
            ).fetchall()
            for (tname,) in tables:
                row = con.execute(
                    f'SELECT command FROM "{tname}" WHERE command IS NOT NULL LIMIT 1'
                ).fetchone()
                if row and row[0]:
                    return row[0].strip()
        finally:
            con.close()
    except Exception:
        # Best-effort lookup: an unreadable/corrupt database yields "".
        pass
    return ""
arguments for AI analysis. + + Args: + parser: Argument parser to add arguments to + + Returns: + Function to process parsed arguments + """ + analysis_options = parser.add_argument_group("Analysis options") + + analysis_options.add_argument( + "--source-dir", + type=str, + default=None, + dest="source_dir", + help=( + "Path to GPU application source directory for Tier 0 static analysis. " + "Scans .hip/.cpp/.cu files and generates a profiling plan. " + "Can be used alone (no -i required) or alongside -i for combined analysis." + ), + ) + + analysis_options.add_argument( + "--prompt", + type=str, + default=None, + help="Custom analysis prompt/question to guide analysis (e.g., 'Why is my matmul kernel slow?')", + ) + + analysis_options.add_argument( + "--top-kernels", + type=int, + default=10, + help="Number of top kernels to analyze (default: 10)", + ) + + analysis_options.add_argument( + "--format", + type=str, + choices=["text", "json", "markdown", "webview"], + default="text", + help="Output format: text, json, markdown, or webview (default: text). " + "File extension is set automatically: .txt, .json, .md, .html", + ) + + analysis_options.add_argument( + "--min-duration", + type=float, + default=0.0, + help="Minimum kernel duration threshold in microseconds (filter out short kernels)", + ) + + # LLM Enhancement Options + llm_options = parser.add_argument_group( + "LLM enhancement options (optional)", + "Enable natural language explanations via Anthropic Claude or OpenAI GPT. " + "Requires API key - see https://console.anthropic.com/ or https://platform.openai.com/api-keys", + ) + + llm_options.add_argument( + "--llm", + type=str, + choices=["anthropic", "openai", "private"], + default=None, + help="Enable LLM-powered analysis enhancement. Choices: 'anthropic' (Claude), 'openai' (GPT), " + "or 'private' (any OpenAI-compatible private/enterprise server). " + "Requires API key set via environment variable or --llm-api-key option. 
" + "For 'private': set ROCPD_LLM_PRIVATE_URL, ROCPD_LLM_PRIVATE_MODEL, and optionally " + "ROCPD_LLM_PRIVATE_HEADERS (JSON). " + "Local analysis always runs first; LLM provides additional natural language insights.", + ) + + llm_options.add_argument( + "--llm-api-key", + type=str, + default=None, + help="API key for LLM provider. Alternatively, set environment variable: " + "ANTHROPIC_API_KEY for Anthropic Claude, or OPENAI_API_KEY for OpenAI GPT. " + "Example: --llm anthropic --llm-api-key sk-ant-... " + "Or: export ANTHROPIC_API_KEY='sk-ant-...' && rocpd analyze --llm anthropic", + ) + + llm_options.add_argument( + "--llm-model", + type=str, + default=None, + help="Override the LLM model name. Defaults to claude-sonnet-4-20250514 for Anthropic " + "and gpt-4-turbo-preview for OpenAI. Can also be set via ROCPD_LLM_MODEL environment " + "variable (--llm-model takes precedence). " + "Examples: --llm-model claude-opus-4-6, --llm-model gpt-4o", + ) + + llm_options.add_argument( + "--verbose", + action="store_true", + default=False, + help="Enable verbose logging (shows LLM API calls, reference guide loading, etc.)", + ) + + analysis_options.add_argument( + "--interactive", + "-I", + metavar="RUN_COMMAND", + type=str, + default=None, + dest="interactive", + help=( + "Launch the 7-phase interactive profiling + optimization workflow. " + "RUN_COMMAND is the full command used to run your GPU application. " + 'Example: --interactive "./my_gpu_app --batch-size 64". ' + "The workflow automatically wraps your command with rocprofv3, collects " + "a trace, analyzes bottlenecks with AI, and offers to apply optimizations." + ), + ) + + analysis_options.add_argument( + "--resume-session", + type=str, + default=None, + dest="resume_session", + help=( + "Resume a previous interactive session by session ID or file path. 
" + "Example: --resume-session 2026-03-10_14-23-01_myapp" + ), + ) + + llm_options.add_argument( + "--llm-thinking", + metavar="TOKENS", + type=int, + default=None, + dest="llm_thinking", + help=( + "Enable extended thinking for deeper LLM analysis. Specify the thinking " + "budget in tokens (e.g. --llm-thinking 8000). Only available with the " + "Anthropic provider and compatible models (claude-opus-4, " + "claude-sonnet-4-5, claude-3-7-sonnet). Adds latency but improves " + "analysis quality for complex traces with multiple interacting " + "bottlenecks. Requires --llm anthropic. Also configurable via the " + "ROCPD_LLM_THINKING environment variable (set to token count)." + ), + ) + + llm_options.add_argument( + "--llm-compact-every", + metavar="N", + type=int, + default=10, + dest="llm_compact_every", + help=( + "Compact the LLM conversation context every N assistant turns by summarizing " + "older messages (default: 10). Lower values use less memory; higher values " + "preserve more context. Only applies to --interactive sessions." + ), + ) + + llm_options.add_argument( + "--llm-local", + type=str, + choices=["ollama"], + default=None, + dest="llm_local", + help=( + "Local LLM provider for Stage 1 source summarization (before online LLM). " + "Choices: 'ollama'. Requires Ollama running at localhost:11434. " + "Set ROCPD_LLM_LOCAL_URL to override endpoint." + ), + ) + + llm_options.add_argument( + "--llm-local-model", + type=str, + default=None, + dest="llm_local_model", + help=( + "Model name for local LLM (default: codellama:13b). " + "Can also be set via ROCPD_LLM_LOCAL_MODEL environment variable." + ), + ) + + llm_options.add_argument( + "--llm-private-url", + type=str, + default=None, + dest="llm_private_url", + help=( + "Base URL for a private/enterprise OpenAI-compatible LLM server " + "(used with --llm private). E.g. https://my-apim.example.com/openai/deployments/gpt4. " + "Can also be set via ROCPD_LLM_PRIVATE_URL environment variable." 
def execute(
    input: Optional[RocpdImportData],
    config: Optional[output_config.output_config] = None,
    **kwargs: Any,
) -> Optional[RocpdImportData]:
    """
    Execute AI analysis on rocpd database and/or source directory.

    Dispatches between three modes:
      1. 7-phase workflow mode (``--interactive RUN_COMMAND``): delegates
         entirely to WorkflowSession and returns early.
      2. Plain analysis: runs analyze_performance and writes/prints output.
      3. Post-analysis interactive session (``interactive`` truthy):
         analysis runs first (LLM suppressed), then _run_interactive_session
         takes over using the saved credentials.

    Args:
        input: RocpdImportData object with database connection, or None for source-only mode
        config: Optional output configuration
        **kwargs: Analysis parameters (may include source_dir for Tier 0)

    Returns:
        The input RocpdImportData object (for chaining), or None in source-only mode
    """
    # Update config if provided
    if config is not None:
        config = config.update(**kwargs)
    else:
        config = output_config.output_config(**kwargs)

    # Get database path for display
    # NOTE(review): relies on the private RocpdImportData._paths attribute;
    # hasattr-guarded so older importer versions degrade to "".
    database_path = ""
    if input is not None and hasattr(input, "_paths") and input._paths:
        database_path = (
            input._paths[0] if isinstance(input._paths, list) else str(input._paths)
        )

    # Pop interactive before passing to analyze_performance (it doesn't accept it)
    interactive = kwargs.pop("interactive", None)

    # 7-phase workflow mode: triggered when --interactive is provided with a RUN_COMMAND
    if interactive and isinstance(interactive, str):
        from rocpd.ai_analysis.interactive import WorkflowSession  # type: ignore[import]

        source_paths: list = []
        source_dir = kwargs.get("source_dir")
        if source_dir:
            source_paths.append(source_dir)
        ws = WorkflowSession(
            app_command=interactive,
            source_paths=source_paths,
            llm_provider=kwargs.get("llm"),
            llm_api_key=kwargs.get("llm_api_key")
            or os.environ.get("ANTHROPIC_API_KEY")
            or os.environ.get("OPENAI_API_KEY"),
            llm_model=kwargs.get("llm_model"),
        )
        ws.run()
        # Workflow mode owns the whole lifecycle; skip normal analysis.
        return input

    # Map 'format' CLI key -> 'output_format' parameter expected by analyze_performance
    if "format" in kwargs:
        kwargs["output_format"] = kwargs.pop("format")

    # Inject private-server CLI args into env so downstream code picks them up.
    # setdefault: an already-exported env var wins over the CLI flag.
    if kwargs.get("llm_private_url"):
        os.environ.setdefault("ROCPD_LLM_PRIVATE_URL", kwargs["llm_private_url"])
    if kwargs.get("llm_private_model"):
        os.environ.setdefault("ROCPD_LLM_PRIVATE_MODEL", kwargs["llm_private_model"])

    # In interactive mode: skip the upfront LLM call entirely - the user will
    # trigger LLM requests explicitly via [p] and [o] inside the session.
    # Save credentials first so _run_interactive_session can still use them.
    _interactive_llm_provider = kwargs.get("llm")
    _interactive_llm_api_key = kwargs.get("llm_api_key")
    _interactive_llm_model = kwargs.get("llm_model")
    if interactive:
        kwargs.pop("llm", None)
        kwargs.pop("llm_model", None)
        kwargs.pop("llm_api_key", None)
        kwargs.pop("llm_thinking", None)

    # Collect structured results so interactive mode can build its command menu
    result_store: Dict[str, Any] = {}

    # Run analysis
    # NOTE(review): leftover CLI keys (resume_session, llm_local*,
    # llm_private_*, llm_compact_every) are still forwarded here — presumably
    # analyze_performance accepts **kwargs; confirm against its signature.
    output = analyze_performance(
        connection=input,
        database_path=database_path,
        _collect_result=result_store,
        **kwargs,
    )

    # Determine file extension based on output format
    _ext_map = {"json": ".json", "markdown": ".md", "webview": ".html", "text": ".txt"}
    _fmt = kwargs.get("output_format", "text")
    _ext = _ext_map.get(_fmt, ".txt")

    # Handle output
    if config and config.output_file and config.output_path:
        base = config.output_file
        # Append the format extension if the base name doesn't already have it
        if not base.endswith(_ext):
            base = base + _ext
        output_file = os.path.join(config.output_path, base)
        os.makedirs(config.output_path, exist_ok=True)
        with open(output_file, "w") as f:
            f.write(output)
        print(f"Analysis written to: {output_file}")
        if _fmt == "text":
            print(
                "Tip: use --format webview for an interactive HTML report, "
                "--format json for machine-readable output, "
                "or --format markdown for Markdown."
            )
    else:
        print(output)

    # -- Interactive mode ----------------------------------------------------
    if interactive:
        _run_interactive_session(
            recommendations=result_store.get("recommendations", []),
            tier0_result=result_store.get("tier0_result"),
            database_path=result_store.get("database_path", database_path),
            source_dir=kwargs.get("source_dir", ""),
            llm_provider=_interactive_llm_provider,
            llm_api_key=_interactive_llm_api_key,
            llm_model=_interactive_llm_model,
            llm_local=kwargs.get("llm_local"),
            llm_local_model=kwargs.get("llm_local_model"),
            resume_session=kwargs.get("resume_session"),
            compact_every=kwargs.get("llm_compact_every", 10),
        )

    return input
def main(argv=None) -> int:
    """
    Command-line entry point for standalone execution.

    Args:
        argv: Optional argument list; argparse falls back to sys.argv when None.

    Returns:
        Exit code: 0 on success, 1 on any failure.
    """
    cli = argparse.ArgumentParser(
        prog="rocpd.analyze",
        description="AI-powered performance analysis for GPU traces",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    cli.add_argument(
        "-i",
        "--input",
        nargs="+",
        type=str,
        required=True,
        help="Input rocpd database file(s)",
    )

    # Shared output options, then the analysis-specific flags. add_args hands
    # back a callable that turns the parsed namespace into execute() kwargs.
    output_config.add_args(cli)
    to_kwargs = add_args(cli)

    opts = cli.parse_args(argv)

    try:
        # Open the database(s) and run the analysis pipeline.
        db = RocpdImportData(opts.input)
        analysis_kwargs = to_kwargs(db, opts)
        execute(db, **analysis_kwargs)
        return 0

    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
b/projects/rocprofiler-sdk/source/lib/python/rocpd/tracelens_port.py new file mode 100644 index 00000000000..39f00453713 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/tracelens_port.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### + +""" +TraceLens-derived analysis algorithms for rocpd. + +Ports interval arithmetic, kernel categorization, and short kernel detection +from AMD TraceLens (https://github.com/AMD-AGI/TraceLens). + +All functions read from an existing RocpdImportData connection and return +plain dict / list structures. No output formatting. No ai_analysis imports. + +Call order dependency: + timeline = compute_interval_timeline(conn) + categories = analyze_kernels_by_category(conn, timeline["total_wall_ns"]) + short = analyze_short_kernels(conn) +""" + +import re +from typing import Any, Dict, List, Tuple + +from .importer import RocpdImportData, execute_statement + +__all__ = [ + "compute_interval_timeline", + "categorize_kernel_name", + "analyze_kernels_by_category", + "analyze_short_kernels", +] + +# --------------------------------------------------------------------------- +# Kernel category patterns (matching TraceLens kernel_name_parser.py) +# Order matters: first match wins. 
# ---------------------------------------------------------------------------
# Kernel category patterns (matching TraceLens kernel_name_parser.py)
# Order matters: first match wins, so e.g. "implicit_gemm_conv" is CONV, not
# GEMM. The regex strings are runtime data and kept verbatim.
# ---------------------------------------------------------------------------
_CATEGORY_PATTERNS: List[Tuple[str, Any]] = [
    ("CONV", re.compile(r"conv|winograd|implicit_gemm_conv", re.IGNORECASE)),
    ("GEMM", re.compile(r"gemm|gemv|xdlops_gemm|Cijk_|rocblas_gemm", re.IGNORECASE)),
    (
        "SDPA",
        re.compile(
            r"flash_attention|fmha|scaled_dot_product|FlashAttention", re.IGNORECASE
        ),
    ),
    (
        "NCCL",
        re.compile(
            r"ncclKernel|rccl|AllReduce|AllGather|ReduceScatter|Broadcast",
            re.IGNORECASE,
        ),
    ),
    (
        "Elementwise",
        re.compile(
            r"vectorized_elementwise|aten_add|aten_mul|relu|gelu|silu", re.IGNORECASE
        ),
    ),
    (
        "Normalization",
        re.compile(r"layer_norm|batch_norm|group_norm|rms_norm", re.IGNORECASE),
    ),
    ("Reduction", re.compile(r"reduce|softmax|sum_|amax", re.IGNORECASE)),
]


def categorize_kernel_name(name: str) -> str:
    """Map a kernel name to a TraceLens op category.

    Returns one of: GEMM, CONV, SDPA, NCCL, Elementwise, Normalization,
    Reduction, Other. The first pattern that matches wins; unmatched names
    fall through to "Other".
    """
    return next(
        (label for label, rx in _CATEGORY_PATTERNS if rx.search(name)),
        "Other",
    )
+ """ + if not intervals: + return [] + sorted_ivs = sorted(intervals, key=lambda x: x[0]) + merged = [sorted_ivs[0]] + for start, end in sorted_ivs[1:]: + prev_start, prev_end = merged[-1] + if start <= prev_end: + merged[-1] = (prev_start, max(prev_end, end)) + else: + merged.append((start, end)) + return merged + + +def _total_ns(intervals: List[Tuple[int, int]]) -> int: + """Sum the duration of a list of non-overlapping intervals.""" + return sum(end - start for start, end in intervals) + + +def _subtract_intervals( + a: List[Tuple[int, int]], b: List[Tuple[int, int]] +) -> List[Tuple[int, int]]: + """Return intervals in *a* that do not overlap with any interval in *b*. + + Both inputs must already be merged (non-overlapping, sorted). + Implements set difference A โˆ’ B for interval sets. + """ + result = [] + b_idx = 0 + for a_start, a_end in a: + cur_start = a_start + while b_idx < len(b) and b[b_idx][1] <= cur_start: + b_idx += 1 + j = b_idx + while j < len(b) and b[j][0] < a_end: + b_start, b_end = b[j] + if cur_start < b_start: + result.append((cur_start, b_start)) + cur_start = max(cur_start, b_end) + j += 1 + if cur_start < a_end: + result.append((cur_start, a_end)) + return result + + +# --------------------------------------------------------------------------- +# Public analysis functions +# --------------------------------------------------------------------------- + + +def compute_interval_timeline(connection: RocpdImportData) -> Dict[str, Any]: + """Compute accurate GPU timeline using set-theoretic interval arithmetic. 
def compute_interval_timeline(connection: RocpdImportData) -> Dict[str, Any]:
    """Compute accurate GPU timeline using set-theoretic interval arithmetic.

    Unlike compute_time_breakdown() which sums raw durations and double-counts
    overlapping periods, this function uses merged interval sets to compute:
      - true_compute_ns: kernel time with overlaps removed
      - exposed_memcpy_ns: memcpy time NOT overlapping any kernel
      - idle_ns: wall time minus all GPU activity

    total_wall_ns is defined as MAX(end) - MIN(start) across the union of
    kernels and memory_copies - matching compute_time_breakdown()'s definition.

    Edge cases:
      - Empty kernels table -> true_compute_ns=0, true_compute_pct=0.0
      - Empty memory_copies -> exposed_memcpy_ns=0, exposed_memcpy_pct=0.0
      - total_wall_ns<=0 -> the all-zero result dict
    """

    def _load_intervals(table: str) -> List[Tuple[int, int]]:
        # Best-effort load: a missing table (e.g. older schema) yields [].
        # `table` is an internal constant, not user input, so the f-string
        # SQL is safe.
        try:
            rows = execute_statement(
                connection, f"SELECT start, end FROM {table}", ()
            ).fetchall()
        except Exception:
            return []
        return [
            (int(r[0]), int(r[1]))
            for r in rows
            if r[0] is not None and r[1] is not None
        ]

    kernel_intervals = _load_intervals("kernels")
    memcpy_intervals = _load_intervals("memory_copies")

    # Single shared empty result (previously duplicated inline twice).
    empty_result: Dict[str, Any] = {
        "total_wall_ns": 0,
        "true_compute_ns": 0,
        "true_compute_pct": 0.0,
        "exposed_memcpy_ns": 0,
        "exposed_memcpy_pct": 0.0,
        "idle_ns": 0,
        "idle_pct": 0.0,
    }

    # Compute wall time across union of both tables
    all_starts = [s for s, _ in kernel_intervals] + [s for s, _ in memcpy_intervals]
    all_ends = [e for _, e in kernel_intervals] + [e for _, e in memcpy_intervals]
    if not all_starts:
        return empty_result

    total_wall_ns = max(all_ends) - min(all_starts)
    if total_wall_ns <= 0:
        return empty_result

    # Merge intervals within each set
    merged_kernels = _merge_intervals(kernel_intervals)
    merged_memcpy = _merge_intervals(memcpy_intervals)

    # Compute metrics
    true_compute_ns = _total_ns(merged_kernels)
    exposed_memcpy = _subtract_intervals(merged_memcpy, merged_kernels)
    exposed_memcpy_ns = _total_ns(exposed_memcpy)

    # Idle = wall minus union of all activity
    all_activity = _merge_intervals(merged_kernels + merged_memcpy)
    active_ns = _total_ns(all_activity)
    idle_ns = max(0, total_wall_ns - active_ns)

    def _pct(v: int) -> float:
        return round(100.0 * v / total_wall_ns, 2)

    return {
        "total_wall_ns": total_wall_ns,
        "true_compute_ns": true_compute_ns,
        "true_compute_pct": _pct(true_compute_ns),
        "exposed_memcpy_ns": exposed_memcpy_ns,
        "exposed_memcpy_pct": _pct(exposed_memcpy_ns),
        "idle_ns": idle_ns,
        "idle_pct": _pct(idle_ns),
    }
+ + Edge cases: + - Empty kernels table โ†’ [] + - total_wall_ns==0 โ†’ pct_of_total_time=0.0 for all categories + """ + try: + rows = execute_statement( + connection, "SELECT name, duration FROM kernels", () + ).fetchall() + except Exception: + return [] + + if not rows: + return [] + + # Aggregate by category + cat_totals: Dict[str, Dict[str, Any]] = {} + total_kernel_ns = 0 + for name, duration in rows: + if name is None or duration is None: + continue + category = categorize_kernel_name(str(name)) + dur = int(duration) + total_kernel_ns += dur + if category not in cat_totals: + cat_totals[category] = {"count": 0, "total_ns": 0} + cat_totals[category]["count"] += 1 + cat_totals[category]["total_ns"] += dur + + if not cat_totals: + return [] + + result = [] + for category, data in cat_totals.items(): + count = data["count"] + total_ns = data["total_ns"] + avg_ns = total_ns // count if count > 0 else 0 + pct_kernel = ( + round(100.0 * total_ns / total_kernel_ns, 2) if total_kernel_ns > 0 else 0.0 + ) + pct_wall = ( + round(100.0 * total_ns / total_wall_ns, 2) if total_wall_ns > 0 else 0.0 + ) + result.append( + { + "category": category, + "count": count, + "total_ns": total_ns, + "pct_of_kernel_time": pct_kernel, + "avg_duration_ns": avg_ns, + "pct_of_total_time": pct_wall, + } + ) + + return sorted(result, key=lambda x: x["total_ns"], reverse=True) + + +def analyze_short_kernels( + connection: RocpdImportData, + threshold_us: float = 10.0, +) -> Dict[str, Any]: + """Identify kernels below threshold_us microseconds (TraceLens short-kernel analysis). + + threshold_us defaults to 10ฮผs and is not configurable via CLI in Phase 1. 
+ + Edge cases: + - No kernels below threshold โ†’ short_kernel_count=0, histogram=[], top_offenders=[] + - Empty kernels table โ†’ same as above + - total_kernel_time==0 โ†’ wasted_pct_of_kernel_time=0.0 + """ + threshold_ns = int(threshold_us * 1_000) + + try: + all_rows = execute_statement( + connection, "SELECT name, duration FROM kernels", () + ).fetchall() + except Exception: + all_rows = [] + + total_kernels = len(all_rows) + total_kernel_ns = sum(int(r[1]) for r in all_rows if r[1] is not None) + + # Filter short kernels + short_rows = [ + (str(r[0]), int(r[1])) + for r in all_rows + if r[0] is not None and r[1] is not None and int(r[1]) < threshold_ns + ] + + short_count = len(short_rows) + wasted_ns = sum(d for _, d in short_rows) + short_pct = ( + round(100.0 * short_count / total_kernels, 2) if total_kernels > 0 else 0.0 + ) + wasted_pct = ( + round(100.0 * wasted_ns / total_kernel_ns, 2) if total_kernel_ns > 0 else 0.0 + ) + + # Histogram buckets (matching TraceLens short kernel histogram) + buckets = [ + ("0-1ฮผs", 0, 1_000), + ("1-5ฮผs", 1_000, 5_000), + (f"5-{int(threshold_us)}ฮผs", 5_000, threshold_ns), + ] + histogram = [ + {"bucket_label": label, "count": sum(1 for _, d in short_rows if lo <= d < hi)} + for label, lo, hi in buckets + if any(lo <= d < hi for _, d in short_rows) + ] + + # Top offenders by total wasted time + offender_map: Dict[str, Dict[str, Any]] = {} + for name, dur in short_rows: + if name not in offender_map: + offender_map[name] = {"count": 0, "total_wasted_ns": 0} + offender_map[name]["count"] += 1 + offender_map[name]["total_wasted_ns"] += dur + + top_offenders = sorted( + [ + { + "name": name, + "count": data["count"], + "avg_us": round(data["total_wasted_ns"] / data["count"] / 1_000, 3), + "total_wasted_ns": data["total_wasted_ns"], + } + for name, data in offender_map.items() + ], + key=lambda x: x["total_wasted_ns"], + reverse=True, + )[:10] + + return { + "threshold_us": threshold_us, + "total_kernels": total_kernels, + 
"short_kernel_count": short_count, + "short_kernel_pct": short_pct, + "wasted_ns": wasted_ns, + "wasted_pct_of_kernel_time": wasted_pct, + "histogram": histogram, + "top_offenders": top_offenders, + } diff --git a/projects/rocprofiler-sdk/source/lib/python/utilities.cmake b/projects/rocprofiler-sdk/source/lib/python/utilities.cmake index 8f51fe971b9..4d9486297c6 100644 --- a/projects/rocprofiler-sdk/source/lib/python/utilities.cmake +++ b/projects/rocprofiler-sdk/source/lib/python/utilities.cmake @@ -178,7 +178,9 @@ function(rocprofiler_rocpd_python_bindings _VERSION) query.py schema.py summary.py - time_window.py) + time_window.py + tracelens_port.py + analyze.py) foreach(_SOURCE ${rocpd_PYTHON_SOURCES}) configure_file(${CMAKE_CURRENT_LIST_DIR}/${_SOURCE} @@ -189,6 +191,33 @@ function(rocprofiler_rocpd_python_bindings _VERSION) COMPONENT rocpd) endforeach() + # Copy ai_analysis directory and its contents (including subdirectories). Includes + # *.py modules, *.md docs, *.json schema files, and *.png assets (e.g. + # ai_analysis/share/amd_rocm_logo.png used by interactive.py banner). Excludes + # ai_analysis/tests/ โ€” test-only files should not be installed into site-packages as + # they are not runtime assets and can cause import side-effects. + file( + GLOB_RECURSE + rocpd_AI_ANALYSIS_FILES + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.py" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.md" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.json" + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/*.png") + list(FILTER rocpd_AI_ANALYSIS_FILES EXCLUDE REGEX + "${CMAKE_CURRENT_LIST_DIR}/ai_analysis/tests/.*") + + foreach(_AI_FILE ${rocpd_AI_ANALYSIS_FILES}) + file(RELATIVE_PATH _REL_PATH "${CMAKE_CURRENT_LIST_DIR}" "${_AI_FILE}") + get_filename_component(_REL_DIR "${_REL_PATH}" DIRECTORY) + # Use file(COPY) instead of configure_file so binary assets (e.g. PNG) are handled + # correctly without text substitution or EPERM on binary data. 
+ file(COPY ${_AI_FILE} DESTINATION ${rocpd_PYTHON_OUTPUT_DIRECTORY}/${_REL_DIR}) + install( + FILES ${rocpd_PYTHON_OUTPUT_DIRECTORY}/${_REL_PATH} + DESTINATION ${rocpd_PYTHON_INSTALL_DIRECTORY}/${_REL_DIR} + COMPONENT rocpd) + endforeach() + add_library(rocprofiler-sdk-rocpd-python-bindings-${_VERSION} MODULE) target_sources( rocprofiler-sdk-rocpd-python-bindings-${_VERSION} diff --git a/projects/rocprofiler-sdk/source/scripts/format-deps.py b/projects/rocprofiler-sdk/source/scripts/format-deps.py index 98af5bb2390..b6420f9f451 100755 --- a/projects/rocprofiler-sdk/source/scripts/format-deps.py +++ b/projects/rocprofiler-sdk/source/scripts/format-deps.py @@ -25,7 +25,6 @@ import argparse import os -import sys class FormatSource(argparse.Action): @@ -118,6 +117,10 @@ def __call__(self, parser, namespace, values, option_string=None): "-p", "--python", nargs=0, help="format python files", action=FormatPython ) parser.add_argument( - "-a", "--all", nargs=0, help="format cmake, source and python files", action=FormatAll + "-a", + "--all", + nargs=0, + help="format cmake, source and python files", + action=FormatAll, ) parser.parse_args() diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py index 7a3be745bad..b7c7ff2d5b7 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pytest_utils/perfetto_reader.py @@ -349,10 +349,10 @@ def extract_tp_data(self, **kwargs): counter_track.name as track_name, ROW_NUMBER() OVER window AS rn FROM counter JOIN counter_track ON counter.track_id = counter_track.id - WHERE counter_track.name LIKE '%SCRATCH MEMORY%' + WHERE counter_track.name LIKE '%SCRATCH MEMORY%' WINDOW window AS (PARTITION BY counter.value, track_id ORDER BY counter.ts) ) - SELECT + SELECT slice_id, track_id, 'scratch_memory' as category, diff --git 
a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
index ef10bb7720f..4372dc1a963 100644
--- a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
+++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/CMakeLists.txt
@@ -18,6 +18,27 @@ set(rocprofv3-rocpd-env
 find_package(MPI)
 find_package(Python3 REQUIRED)
 
+# Helper: copy a test script to the build directory at build time (not just at configure
+# time). Unlike configure_file(COPYONLY), this registers a proper build-time dependency
+# so that editing the source file and re-running cmake --build is sufficient to pick up
+# the change without re-running cmake.
+#
+# Each call creates an add_custom_command (triggered by the file dependency) and a
+# lightweight ALL custom target that forces it to run on every build.
+# NOTE(review): the sentence above overstates this - copy_if_different behind an
+# add_custom_command re-runs only when ${_SRC} is newer than ${_DST}; the ALL
+# target merely makes the rule part of the default build. Confirm intended wording.
+function(rocpd_stage_test_script _SRC _DST)
+    # NOTE(review): NAME_WE-derived target names will collide if two staged
+    # destinations ever share a basename - verify uniqueness as call sites grow.
+    get_filename_component(_tgt "${_DST}" NAME_WE)
+    add_custom_command(
+        OUTPUT "${_DST}"
+        COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${_SRC}" "${_DST}"
+        DEPENDS "${_SRC}"
+        COMMENT "Staging ${_SRC} -> build dir")
+    add_custom_target(rocpd-stage-${_tgt} ALL DEPENDS "${_DST}")
+    # NOTE(review): CMAKE_CONFIGURE_DEPENDS forces a cmake re-run whenever ${_SRC}
+    # changes, which contradicts the "without re-running cmake" rationale above;
+    # presumably kept as a safety net - confirm whether it can be dropped.
+    set_property(
+        DIRECTORY
+        APPEND
+        PROPERTY CMAKE_CONFIGURE_DEPENDS "${_SRC}")
+endfunction()
+
 if(MPI_FOUND)
     set(MULTIPROC_IS_DISABLED OFF)
     set(MULTIPROC_LAUNCHER ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2
@@ -382,3 +403,294 @@
     FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}"
     DISABLED "${MULTIPROC_IS_DISABLED}"
     FIXTURES_REQUIRED rocprofv3-test-rocpd-merge-generation-using-package-multiproc)
+
+#########################################################################################
+#
+# AI analysis module tests
+#
+#########################################################################################
+
+# Test the analyze --help flag works
+rocprofiler_add_integration_execute_test(
+    rocprofv3-test-rocpd-analyze-help
+    COMMAND
${Python3_EXECUTABLE} -m rocpd analyze --help + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Test standalone module help +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-module-analyze-help + COMMAND ${Python3_EXECUTABLE} -m rocpd.analyze --help + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Test analyze on existing database +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze + COMMAND ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --top-kernels 5 + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with JSON output format (also a fixture for schema validation test) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-json + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --format json -o + analysis_results -d ${CMAKE_CURRENT_BINARY_DIR}/rocpd-analyze-output + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_SETUP rocprofv3-test-rocpd-analyze-json + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with custom top-kernels +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-top-kernels + COMMAND ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --top-kernels 20 + DEPENDS 
rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze with custom prompt (no LLM, just metadata) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-custom-prompt + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data/out_results.db --prompt + "Why is this kernel slow?" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd) + +# Test analyze on multiproc database +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-multiproc + COMMAND + ${Python3_EXECUTABLE} -m rocpd analyze -i + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data-multiproc/out_mp_0_results.db + ${CMAKE_CURRENT_BINARY_DIR}/rocpd-input-data-multiproc/out_mp_1_results.db + --top-kernels 5 + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED "${MULTIPROC_IS_DISABLED}" + FIXTURES_REQUIRED rocprofv3-test-rocpd-multiproc) + +# Test AI analysis Python API import +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-ai-analysis-api + COMMAND + ${Python3_EXECUTABLE} -c + "from rocpd.ai_analysis import analyze_database; print('AI analysis API imported successfully')" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_analyze.py + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_standalone.py) + +# Test analyze unit tests (run from build 
dir to avoid conftest.py issues) +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-unit-tests + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# JSON schema tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_analyze_schema.py + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_schema_standalone.py) + +# Unit schema tests: validate the schema file structure and synthetic JSON output. No +# database fixture needed - uses synthetic data generated in-process. +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-schema-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_analyze_schema_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Integration schema validation: parse the real JSON output produced by the analyze-json +# test and assert schema_version, required fields, and commands structure. 
+rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-analyze-schema-validate + COMMAND + ${Python3_EXECUTABLE} -c + "import json, pkgutil; \ + path = '${CMAKE_CURRENT_BINARY_DIR}/rocpd-analyze-output/analysis_results.json'; \ + d = json.load(open(path)); \ + schema = json.loads(pkgutil.get_data('rocpd.ai_analysis', 'docs/analysis-output.schema.json')); \ + allowed = schema['properties']['schema_version']['enum']; \ + assert d.get('schema_version') in allowed, 'Bad schema_version ' + repr(d.get('schema_version')) + ', expected one of ' + repr(allowed); \ + assert all(k in d for k in ('metadata','recommendations','hardware_counters','hotspots')); \ + cmds = [c for r in d['recommendations'] for c in r.get('commands', [])]; \ + assert len(cmds) > 0, 'No commands found in recommendations'; \ + assert all(c['tool'] in ('rocprofv3','rocprof-sys','rocprof-compute') for c in cmds); \ + print('Schema OK: version=' + str(d['schema_version']) + ', recs=' + str(len(d['recommendations'])) + ', commands=' + str(len(cmds)))" + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED rocprofv3-test-rocpd-analyze-json) + +######################################################################################### +# +# AI analysis API unit tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_ai_analysis_standalone.py + ${CMAKE_CURRENT_BINARY_DIR}/test_ai_analysis_standalone.py) + +# AI analysis API unit tests (run from build dir to avoid conftest.py issues) These tests +# do not require a GPU trace fixture โ€” they test the Python API in isolation. 
+rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-ai-analysis-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_ai_analysis_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# LLM guide context-aware filtering unit tests +# +######################################################################################### + +rocpd_stage_test_script(${CMAKE_CURRENT_SOURCE_DIR}/test_guide_filter_standalone.py + ${CMAKE_CURRENT_BINARY_DIR}/test_guide_filter_standalone.py) + +# Guide filtering unit tests โ€” no GPU trace fixture required; no LLM API calls made. +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-guide-filter-unit-tests + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_BINARY_DIR}/test_guide_filter_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +######################################################################################### +# +# AI analysis module sub-package unit tests +# Source: source/lib/python/rocpd/ai_analysis/tests/ +# Tests are mock-based; no GPU trace fixture or LLM API key required. +# Optional-provider tests (anthropic/openai) are skipped when packages are absent. 
+# +######################################################################################### + +set(_AI_ANALYSIS_TESTS_DIR + "${CMAKE_CURRENT_SOURCE_DIR}/../../../source/lib/python/rocpd/ai_analysis/tests") + +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_api_standalone.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_api_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_llm_conversation.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_llm_conversation_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_local_llm.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_local_llm_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_interactive.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_interactive_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_workflow.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_workflow_standalone.py") +rocpd_stage_test_script("${_AI_ANALYSIS_TESTS_DIR}/test_tracelens_port.py" + "${CMAKE_CURRENT_BINARY_DIR}/test_tracelens_port_standalone.py") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-api-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_api_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-llm-conversation-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_llm_conversation_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-local-llm-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + 
${CMAKE_CURRENT_BINARY_DIR}/test_local_llm_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-interactive-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_interactive_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-workflow-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_workflow_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +rocprofiler_add_integration_execute_test( + rocprofv3-test-rocpd-tracelens-port-unit-tests + COMMAND ${Python3_EXECUTABLE} -m pytest --noconftest -x + ${CMAKE_CURRENT_BINARY_DIR}/test_tracelens_port_standalone.py + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 120 + LABELS "integration-tests;rocpd;pytest" + ENVIRONMENT "${rocprofv3-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py new file mode 100644 index 00000000000..521f1bbd2ad --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_ai_analysis_standalone.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+###############################################################################
+
+"""
+Standalone unit tests for the rocpd ai_analysis module.
+
+These tests do NOT require a real GPU trace database.
+They DO require the rocpd package to be importable (needs the built libpyrocpd
+C extension). Run with the system-installed rocpd path first, then the source
+path for the edited Python modules:
+
+    ROCPD_SYS=$(python3 -c "import site; print(site.getsitepackages()[-1])")
+    ROCPD_SRC=/projects/rocprofiler-sdk/source/lib/python
+    PYTHONPATH="${ROCPD_SYS}:${ROCPD_SRC}" pytest --noconftest test_ai_analysis_standalone.py -v
+
+IMPORTANT: ROCPD_SYS must come BEFORE ROCPD_SRC in PYTHONPATH to avoid a
+circular import of libpyrocpd.
+"""
+
+import json
+# NOTE(review): 'sys' appears unused in this chunk - confirm against the rest
+# of the file before removing.
+import sys
+from pathlib import Path
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Helpers: build a minimal AnalysisResult without touching a real DB
+# ---------------------------------------------------------------------------
+
+
+def _make_minimal_result():
+    """Build an AnalysisResult with empty/zero payloads for serialization tests.
+
+    The numeric fixture values are deliberately simple (1_000_000 ns wall time,
+    80% kernel time) so later assertions can reference them exactly; no
+    database connection is opened. Imports are deferred into the function so
+    collection does not require the rocpd package.
+    """
+    from rocpd.ai_analysis.api import (
+        AnalysisResult,
+        AnalysisMetadata,
+        ProfilingInfo,
+        AnalysisSummary,
+        ExecutionBreakdown,
+        RecommendationSet,
+    )
+
+    result = AnalysisResult(
+        metadata=AnalysisMetadata(
+            rocpd_version="6.3.0",
+            database_file="test.db",
+            analysis_timestamp="2025-01-01T00:00:00",
+        ),
+        profiling_info=ProfilingInfo(
+            total_duration_ns=1_000_000,
+            profiling_mode="sys_trace_only",
+            analysis_tier=1,
+        ),
+        summary=AnalysisSummary(
+            overall_assessment="Test analysis",
+            primary_bottleneck="unknown",
+            confidence=0.5,
+            key_findings=["Kernel time: 80.0%"],
+        ),
+        execution_breakdown=ExecutionBreakdown(
+            kernel_time_ns=800_000,
+            kernel_time_pct=80.0,
+            memcpy_time_ns=0,
+            memcpy_time_pct=0.0,
+        ),
+        recommendations=RecommendationSet(),
+    )
+    return result
+
+
+def _attach_raw(
+    result,
+    *,
time_breakdown=None, + hotspots=None, + memory_analysis=None, + recommendations_raw=None, + hardware_counters=None, + database_path="test.db", +): + """Attach a _raw dict to an AnalysisResult for to_json()/to_webview() tests.""" + result._raw = { + "time_breakdown": time_breakdown + or { + "total_kernel_time": 800_000, + "total_memcpy_time": 0, + "total_runtime": 1_000_000, + "kernel_percent": 80.0, + "memcpy_percent": 0.0, + "overhead_percent": 20.0, + }, + "hotspots": hotspots + or [ + { + "name": "test_kernel", + "calls": 10, + "total_duration": 800_000, + "avg_duration": 80_000, + "min_duration": 75_000, + "max_duration": 90_000, + "percent_of_total": 80.0, + } + ], + "memory_analysis": memory_analysis or {}, + "recommendations_raw": recommendations_raw or [], + "hardware_counters": hardware_counters or {"has_counters": False}, + "database_path": database_path, + } + return result + + +# =========================================================================== +# Tests: OutputFormat enum (AIA-003) +# =========================================================================== + + +class TestOutputFormat: + def test_has_python_object(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.PYTHON_OBJECT.value == "python_object" + + def test_has_json(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.JSON.value == "json" + + def test_has_text(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.TEXT.value == "text" + + def test_has_markdown(self): + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.MARKDOWN.value == "markdown" + + def test_has_webview(self): + """AIA-003: WEBVIEW must be present in OutputFormat.""" + from rocpd.ai_analysis.api import OutputFormat + + assert OutputFormat.WEBVIEW.value == "webview" + + def test_five_members(self): + from rocpd.ai_analysis.api import OutputFormat + + assert len(list(OutputFormat)) == 5 + + +# 
=========================================================================== +# Tests: Exceptions (AIA-008, AIA-010, AIA-011) +# =========================================================================== + + +class TestExceptions: + def test_missing_data_error_optional_list(self): + """AIA-010: missing_tables should be Optional[List[str]].""" + from rocpd.ai_analysis.exceptions import MissingDataError + + # Both None and a list should work + err_no_list = MissingDataError("msg") + assert err_no_list.missing_tables == [] + err_with_list = MissingDataError("msg", ["kernels"]) + assert err_with_list.missing_tables == ["kernels"] + + def test_unsupported_gpu_error_optional_str(self): + """AIA-010: gpu_arch should be Optional[str].""" + from rocpd.ai_analysis.exceptions import UnsupportedGPUError + + err_no_arch = UnsupportedGPUError("msg") + assert err_no_arch.gpu_arch is None + err_with_arch = UnsupportedGPUError("msg", "gfx906") + assert err_with_arch.gpu_arch == "gfx906" + + def test_reference_guide_not_found_shows_all_paths(self): + """AIA-008: ReferenceGuideNotFoundError must list all attempted paths.""" + from rocpd.ai_analysis.exceptions import ReferenceGuideNotFoundError + + paths = ["/path/one/guide.md", "/path/two/guide.md", "/path/three/guide.md"] + err = ReferenceGuideNotFoundError(paths) + msg = str(err) + for p in paths: + assert p in msg, f"Path '{p}' not found in error message" + assert err.attempted_paths == paths + + def test_reference_guide_exported_from_init(self): + """AIA-011: ReferenceGuideNotFoundError must be importable from rocpd.ai_analysis.""" + from rocpd.ai_analysis import ReferenceGuideNotFoundError + + assert ReferenceGuideNotFoundError is not None + + def test_all_exceptions_exported(self): + """Verify all documented exceptions are accessible from the public API.""" + import rocpd.ai_analysis as m + + for name in [ + "AnalysisError", + "DatabaseNotFoundError", + "DatabaseCorruptedError", + "MissingDataError", + "UnsupportedGPUError", + 
"LLMAuthenticationError", + "LLMRateLimitError", + "ReferenceGuideNotFoundError", + ]: + assert hasattr(m, name), f"{name} not exported from rocpd.ai_analysis" + + +# =========================================================================== +# Tests: validate_database (AIA-013) +# =========================================================================== + + +class TestValidateDatabase: + def test_raises_for_missing_file(self): + """validate_database() must raise DatabaseNotFoundError for missing file.""" + from rocpd.ai_analysis import validate_database, DatabaseNotFoundError + + with pytest.raises(DatabaseNotFoundError): + validate_database(Path("/nonexistent/path/to/trace.db")) + + +# =========================================================================== +# Tests: AnalysisResult serialization (AIA-004) +# =========================================================================== + + +class TestAnalysisResultSerialization: + def test_to_dict_returns_dict(self): + result = _make_minimal_result() + d = result.to_dict() + assert isinstance(d, dict) + assert "metadata" in d + assert "recommendations" in d + + def test_to_json_without_raw_raises_runtime_error(self): + """to_json() without _raw must raise RuntimeError (not silently produce non-schema JSON).""" + import pytest + + result = _make_minimal_result() + # No _raw attached โ†’ must raise so callers know output would be non-schema-conformant + with pytest.raises(RuntimeError, match="Raw analysis data not available"): + result.to_json() + + def test_to_json_with_raw_returns_schema_conformant_json(self): + """AIA-004: to_json() with _raw must include schema_version.""" + result = _attach_raw(_make_minimal_result()) + j = result.to_json() + parsed = json.loads(j) + # schema-conformant output includes schema_version + assert "schema_version" in parsed, "JSON output missing schema_version field" + assert parsed["schema_version"] == "0.1.0" + + def test_to_webview_raises_without_raw(self): + """to_webview() 
must raise RuntimeError if _raw is not attached.""" + result = _make_minimal_result() + with pytest.raises(RuntimeError, match="analyze_database"): + result.to_webview() + + def test_to_webview_with_raw_returns_html(self): + """AIA-004: to_webview() with _raw must return HTML string.""" + result = _attach_raw(_make_minimal_result()) + html = result.to_webview() + assert isinstance(html, str) + assert " 1000 # must be a real HTML document + + +# =========================================================================== +# Tests: _convert_result_to_llm_format (AIA-006) +# =========================================================================== + + +class TestConvertResultToLlmFormat: + def test_returns_real_kernel_data(self): + """AIA-006: kernels list must not be empty when hotspots are present.""" + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _attach_raw( + _make_minimal_result(), + hotspots=[ + { + "name": "conv2d", + "calls": 5, + "total_duration": 500_000, + "avg_duration": 100_000, + "percent_of_total": 50.0, + } + ], + ) + llm_data = _convert_result_to_llm_format(result) + assert len(llm_data["kernels"]) == 1 + assert llm_data["kernels"][0]["name"] == "conv2d" + + def test_returns_empty_kernels_without_raw(self): + """Without _raw, kernels defaults to empty list (graceful degradation).""" + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _make_minimal_result() + llm_data = _convert_result_to_llm_format(result) + assert llm_data["kernels"] == [] + + def test_has_execution_breakdown(self): + from rocpd.ai_analysis.api import _convert_result_to_llm_format + + result = _make_minimal_result() + llm_data = _convert_result_to_llm_format(result) + assert "execution_breakdown" in llm_data + assert "kernel_time_pct" in llm_data["execution_breakdown"] + + +# =========================================================================== +# Tests: _build_analysis_result key mapping (AIA-002) +# 
=========================================================================== + + +class TestBuildAnalysisResultKeyMapping: + """Verify that recommendation keys from generate_recommendations() are mapped correctly.""" + + def _make_raw_rec(self, priority="HIGH"): + return { + "priority": priority, + "category": "Low Occupancy", + "issue": "Average wave occupancy is very low", + "suggestion": "Increase occupancy by reducing VGPR usage", + "estimated_impact": "15-20% performance improvement", + "actions": ["Compile with -O3", "Reduce local arrays"], + "commands": [], + } + + def test_high_priority_bucketing(self): + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("HIGH")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + custom_prompt=None, + ) + assert len(result.recommendations.high_priority) == 1 + rec = result.recommendations.high_priority[0] + assert rec.title == "Average wave occupancy is very low" + assert rec.description == "Increase occupancy by reducing VGPR usage" + assert rec.estimated_impact == "15-20% performance improvement" + assert rec.next_steps == ["Compile with -O3", "Reduce local arrays"] + assert rec.priority == "high" # normalized to lowercase + + def test_medium_priority_bucketing(self): + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("MEDIUM")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + 
custom_prompt=None, + ) + assert len(result.recommendations.medium_priority) == 1 + + def test_info_bucketed_as_medium(self): + """INFO priority should be placed in medium_priority bucket.""" + from rocpd.ai_analysis.api import _build_analysis_result + + result = _build_analysis_result( + time_breakdown={ + "total_kernel_time": 0, + "total_memcpy_time": 0, + "total_runtime": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + }, + hotspots=[], + memory_analysis={}, + recommendations=[self._make_raw_rec("INFO")], + hardware_counters={"has_counters": False}, + database_path=Path("test.db"), + custom_prompt=None, + ) + assert len(result.recommendations.medium_priority) == 1 + + +# =========================================================================== +# Tests: Bug-fix regression tests (Tasks 1-4) +# =========================================================================== + + +class TestBugFixes: + """ + Regression tests covering security, correctness, and LLM-layer bug fixes + from code review Tasks 1-4. Each test is tagged with the fix ID it covers. 
+ """ + + # ------------------------------------------------------------------ + # C-1: shlex.quote in full_command + # ------------------------------------------------------------------ + + def test_kernel_name_shell_quoted_in_full_command(self): + """C-1: full_command strings must use shlex.quote() for kernel names with shell metacharacters.""" + import shlex + from rocpd.analyze import generate_recommendations + + dangerous_name = "kernel'; rm -rf / #" + hotspots = [ + { + "name": dangerous_name, + "percent_of_total": 60.0, + "calls": 100, + "avg_duration": 100_000, + } + ] + time_breakdown = { + "kernel_percent": 70, + "memcpy_percent": 5, + "overhead_percent": 5, + "total_kernel_time": 1_000_000, + "total_runtime": 1_500_000, + } + recs = generate_recommendations(time_breakdown, hotspots, {}, []) + compute_recs = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert compute_recs, "Expected a compute bottleneck recommendation" + + quoted_name = shlex.quote(dangerous_name) + # The kernel name is scoped via rocprof-compute (rocprofv3 collects general + # PMC counters without kernel filtering, so the name only appears in the + # rocprof-compute command where shlex.quote is applied). 
+ kernel_cmds = [ + cmd + for cmd in compute_recs[0]["commands"] + if cmd.get("tool") == "rocprof-compute" + ] + assert kernel_cmds, "Expected at least one rocprof-compute command" + for cmd in kernel_cmds: + full = cmd["full_command"] + # The properly shell-quoted form of the kernel name must appear + assert quoted_name in full, ( + f"Expected shlex.quote({dangerous_name!r}) == {quoted_name!r} " + f"in full_command, got: {full}" + ) + # The raw (unquoted) name must not appear verbatim (i.e., not word-split) + assert f" {dangerous_name} " not in full and not full.endswith( + f" {dangerous_name}" + ), f"Raw unquoted kernel name found in full_command: {full}" + + # ------------------------------------------------------------------ + # C-6: overhead_percent clamped at zero + # ------------------------------------------------------------------ + + def test_overhead_percent_clamped_at_zero(self): + """C-6: overhead_percent must never be negative even when kernel+memcpy > total.""" + from unittest.mock import patch, MagicMock + from rocpd.analyze import compute_time_breakdown + + # Simulate a result row where overhead would come out negative: + # total_kernel=900, total_memcpy=200, total_runtime=1000 → overhead=-10% + mock_result = (900, 200, 1000, 90.0, 20.0, -10.0) + mock_conn = MagicMock() + with patch("rocpd.analyze.execute_statement") as mock_exec: + mock_exec.return_value.fetchone.return_value = mock_result + result = compute_time_breakdown(mock_conn) + + assert ( + result["overhead_percent"] == 0.0 + ), f"Expected 0.0, got {result['overhead_percent']}" + assert result["kernel_percent"] == 90.0 + assert result["memcpy_percent"] == 20.0 + + # ------------------------------------------------------------------ + # C-7: Tier 0 webview XSS escaping + # ------------------------------------------------------------------ + + def test_tier0_webview_script_tag_escaped(self): + """C-7: <script> in tier0 JSON payload must be escaped to prevent XSS.""" + from datetime import datetime 
+ from rocpd.analyze import _format_tier0_webview + from rocpd.ai_analysis.api import SourceAnalysisResult + + result = SourceAnalysisResult( + source_dir="/tmp/test", + analysis_timestamp=datetime.now().isoformat(), + programming_model="HIP", + files_scanned=1, + files_skipped=0, + detected_kernels=[], + kernel_count=0, + detected_patterns=[], + risk_areas=[], + already_instrumented=False, + roctx_marker_count=0, + recommendations=[], + suggested_counters=[], + suggested_first_command="rocprofv3 --sys-trace -- ./app", + llm_explanation="Normal text <script>alert('xss')</script> more text", + ) + + html = _format_tier0_webview(result) + # The unescaped <script> tag must not appear verbatim in the HTML + assert "<script>alert" not in html, "<script> not escaped in tier0 webview payload" + + # ------------------------------------------------------------------ + # I-1: Bottleneck classification not mislead by has_counters alone + # ------------------------------------------------------------------ + + def test_bottleneck_classification_not_mislead_by_counters(self): + """I-1: has_counters=True alone should not produce 'compute' bottleneck.""" + from pathlib import Path + from rocpd.ai_analysis.api import _build_analysis_result + + # Balanced breakdown — kernel% is only 40%, well below the 70% threshold + time_breakdown = { + "kernel_percent": 40.0, + "memcpy_percent": 15.0, + "overhead_percent": 10.0, + "total_kernel_time": 400_000, + "total_memcpy_time": 150_000, + "total_runtime": 1_000_000, + } + hardware_counters = {"has_counters": True} + + result = _build_analysis_result( + time_breakdown=time_breakdown, + hotspots=[{"name": "k1", "percent_of_total": 40.0}], + memory_analysis={}, + recommendations=[], + hardware_counters=hardware_counters, + database_path=Path("/tmp/fake.db"), + custom_prompt=None, + ) + + assert ( + result.summary.primary_bottleneck == "mixed" + ), f"Expected 'mixed' bottleneck, got {result.summary.primary_bottleneck!r}" + + # ------------------------------------------------------------------ + # I-3: AnalysisContext(tier=0) passed to LLM in analyze_source() + # 
------------------------------------------------------------------ + + def test_analyze_source_passes_analysis_context_to_llm(self, tmp_path): + """I-3: analyze_source() must pass AnalysisContext(tier=0) to analyze_source_with_llm.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.api import analyze_source + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + # Create a minimal hip file so SourceAnalyzer has something to scan + (tmp_path / "test.hip").write_text("__global__ void myKernel() {}") + + mock_analyzer = MagicMock() + mock_analyzer.analyze_source_with_llm.return_value = "LLM result" + + with patch("rocpd.ai_analysis.api.LLMAnalyzer", return_value=mock_analyzer): + analyze_source( + tmp_path, enable_llm=True, llm_provider="anthropic", llm_api_key="fake" + ) + + assert ( + mock_analyzer.analyze_source_with_llm.called + ), "analyze_source_with_llm was not called" + call_kwargs = mock_analyzer.analyze_source_with_llm.call_args + # Accept both positional and keyword arg style + kwargs = call_kwargs[1] if call_kwargs[1] else {} + context = kwargs.get("context") + if context is None and call_kwargs[0]: + # Unlikely but check positional args too + for arg in call_kwargs[0]: + if isinstance(arg, AnalysisContext): + context = arg + break + + assert ( + context is not None + ), "context= argument not passed to analyze_source_with_llm" + assert isinstance( + context, AnalysisContext + ), f"Expected AnalysisContext, got {type(context)}" + assert context.tier == 0, f"Expected tier=0, got {context.tier}" + + # ------------------------------------------------------------------ + # I-4: LLMAnalyzer construction without API key does not raise + # ------------------------------------------------------------------ + + def test_llm_analyzer_construction_without_api_key_does_not_raise(self): + """I-4: LLMAnalyzer() must not raise LLMAuthenticationError at construction time.""" + import os + from unittest.mock import patch + from 
rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from rocpd.ai_analysis.exceptions import LLMAuthenticationError + + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("ANTHROPIC_API_KEY", None) + try: + LLMAnalyzer(provider="anthropic") + except LLMAuthenticationError: + pytest.fail( + "LLMAnalyzer raised LLMAuthenticationError at construction time; " + "authentication should be deferred until the first API call" + ) + + # ------------------------------------------------------------------ + # I-5: self.model honored in LLMAnalyzer + # ------------------------------------------------------------------ + + def test_llm_analyzer_model_parameter_honored(self): + """I-5: LLMAnalyzer(model='my-model') must use that model in the API call.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + custom_model = "claude-haiku-4-5-20251001" + analyzer = LLMAnalyzer( + provider="anthropic", api_key="sk-test", model=custom_model + ) + + mock_client = MagicMock() + mock_client.messages.create.return_value = MagicMock( + content=[MagicMock(text="ok")] + ) + + with patch("anthropic.Anthropic", return_value=mock_client): + analyzer._call_anthropic("sys", "user") + + assert mock_client.messages.create.called, "messages.create was not called" + used_model = mock_client.messages.create.call_args[1].get("model") + assert ( + used_model == custom_model + ), f"Expected model {custom_model!r}, got {used_model!r}" + + # ------------------------------------------------------------------ + # P-2: Timeout added to LLM calls + # ------------------------------------------------------------------ + + def test_llm_calls_have_timeout(self): + """P-2: All Anthropic LLM API calls must include a timeout parameter.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer(provider="anthropic", api_key="sk-test") + + mock_client = MagicMock() + 
mock_client.messages.create.return_value = MagicMock( + content=[MagicMock(text="ok")] + ) + + with patch("anthropic.Anthropic", return_value=mock_client): + analyzer._call_anthropic("sys", "user") + + call_kwargs = mock_client.messages.create.call_args[1] + assert ( + "timeout" in call_kwargs + ), "timeout parameter missing from Anthropic API call" + assert ( + call_kwargs["timeout"] == 120 + ), f"Expected timeout=120, got {call_kwargs['timeout']}" + + # ------------------------------------------------------------------ + # I-12: analyze_source_code raises on missing source_dir + # ------------------------------------------------------------------ + + def test_analyze_source_code_raises_on_missing_dir(self): + """I-12: analyze_source_code() must raise SourceDirectoryNotFoundError for non-existent dir.""" + from rocpd.analyze import analyze_source_code + from rocpd.ai_analysis.exceptions import SourceDirectoryNotFoundError + + with pytest.raises(SourceDirectoryNotFoundError): + analyze_source_code(source_dir="/nonexistent/path/xyz_no_exist_123") + + # ------------------------------------------------------------------ + # I-9: ReferenceGuideNotFoundError with list not string + # ------------------------------------------------------------------ + + def test_reference_guide_not_found_error_with_list(self): + """I-9: ReferenceGuideNotFoundError must accept List[str] and produce readable message.""" + from rocpd.ai_analysis.exceptions import ReferenceGuideNotFoundError + + paths = [ + "share/rocprofiler-sdk/llm-reference-guide.md", + "~/.config/rocpd/guide.md", + ] + err = ReferenceGuideNotFoundError(paths) + msg = str(err) + + # Both paths should appear intact in the error message + assert ( + "share/rocprofiler-sdk/llm-reference-guide.md" in msg + ), f"First path missing from error message: {msg}" + assert ( + "~/.config/rocpd/guide.md" in msg + ), f"Second path missing from error message: {msg}" + # Guard against the old bug where a bare string was iterated 
char-by-char + assert ( + "o\n - p" not in msg + ), "Characters are being joined โ€” bare string was passed instead of list" + + # ------------------------------------------------------------------ + # M-8: Source scanner truncation warning + # ------------------------------------------------------------------ + + def test_source_scanner_truncation_warning(self, tmp_path): + """M-8: SourceAnalyzer must add a risk_area warning when _MAX_FILES limit is hit.""" + from rocpd.ai_analysis.source_analyzer import SourceAnalyzer, _MAX_FILES + + # Create more files than _MAX_FILES (use .hip extension so they are scanned) + for i in range(_MAX_FILES + 5): + (tmp_path / f"kernel_{i}.hip").write_text(f"__global__ void k{i}() {{}}") + + scanner = SourceAnalyzer(tmp_path) + plan = scanner.analyze() + + truncation_warnings = [ + r for r in plan.risk_areas if "truncat" in r.lower() or "limit" in r.lower() + ] + assert ( + truncation_warnings + ), f"Expected a truncation warning in risk_areas, got: {plan.risk_areas}" + + +# =========================================================================== +# Tests: Extended thinking / --llm-thinking flag (Task 22) +# =========================================================================== + + +class TestLLMThinking: + """Tests for extended thinking support via thinking_budget_tokens.""" + + def test_llm_thinking_parameter_stored(self): + """thinking_budget_tokens passed to __init__ must be stored on the instance.""" + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer(provider="anthropic", thinking_budget_tokens=8000) + assert ( + analyzer.thinking_budget_tokens == 8000 + ), f"Expected thinking_budget_tokens=8000, got {analyzer.thinking_budget_tokens!r}" + + def test_llm_thinking_defaults_to_none(self): + """When thinking_budget_tokens is not supplied, the attribute must be None.""" + import os + from unittest.mock import patch + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + # Ensure env var 
is absent so it doesn't override the default + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("ROCPD_LLM_THINKING", None) + analyzer = LLMAnalyzer(provider="anthropic") + + assert ( + analyzer.thinking_budget_tokens is None + ), f"Expected thinking_budget_tokens=None, got {analyzer.thinking_budget_tokens!r}" + + def test_llm_thinking_openai_raises(self): + """analyze_with_llm() must raise ValueError when provider=openai and thinking is set.""" + from unittest.mock import patch, MagicMock + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + analyzer = LLMAnalyzer( + provider="openai", + api_key="sk-test", + thinking_budget_tokens=8000, + ) + + # analyze_with_llm() should raise before any API call is made + with pytest.raises( + ValueError, + match="Extended thinking is only supported with the Anthropic provider", + ): + # Patch openai to avoid ImportError; the ValueError should fire before the actual call + with patch.dict("sys.modules", {"openai": MagicMock()}): + analyzer.analyze_with_llm( + {"has_counters": False, "has_pc_sampling": False}, + custom_prompt=None, + ) + + def test_llm_thinking_env_var(self): + """ROCPD_LLM_THINKING env var must set thinking_budget_tokens on construction.""" + import os + from unittest.mock import patch + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + + with patch.dict(os.environ, {"ROCPD_LLM_THINKING": "5000"}): + analyzer = LLMAnalyzer(provider="anthropic") + + assert analyzer.thinking_budget_tokens == 5000, ( + f"Expected thinking_budget_tokens=5000 from env var, " + f"got {analyzer.thinking_budget_tokens!r}" + ) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + 
sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py new file mode 100644 index 00000000000..52fdc466983 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze.py @@ -0,0 +1,1194 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +############################################################################### + +""" +Tests for the AI analysis module (analyze.py). 
+ +Covers: + - Public API exports and imports + - generate_recommendations: all 6 Tier-1 rules + 2 Tier-2 rules + boundaries + - _build_summary: all bottleneck classification branches + - _build_hw_counters_json: with/without counters + - _build_warnings_json: both cases + - _build_recommendations_json: stable IDs, duplicate dedup, unknown category + - _format_as_json: correct value mapping, idle time, Tier 2, bandwidth conversion + - format_analysis_output: text, json, and markdown formats +""" + +import json +import sys +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _empty_breakdown(**overrides): + """Return a time_breakdown dict with all fields zeroed unless overridden.""" + base = { + "total_runtime": 0, + "total_kernel_time": 0, + "total_memcpy_time": 0, + "kernel_percent": 0.0, + "memcpy_percent": 0.0, + "overhead_percent": 0.0, + } + base.update(overrides) + return base + + +def _make_hotspot( + name="k", + calls=10, + total=1_000_000, + pct=10.0, + avg=100_000, + min_d=90_000, + max_d=110_000, +): + return { + "name": name, + "calls": calls, + "total_duration": total, + "avg_duration": avg, + "min_duration": min_d, + "max_duration": max_d, + "percent_of_total": pct, + } + + +def _hw_counters(avg_waves=None, gpu_util=None): + """Build a hardware_counters dict for Tier 2 tests.""" + metrics = {} + if avg_waves is not None: + metrics["avg_waves"] = avg_waves + metrics["max_waves"] = avg_waves * 2 + metrics["min_waves"] = avg_waves / 2 + if gpu_util is not None: + metrics["gpu_utilization_percent"] = gpu_util + return {"has_counters": True, "metrics": metrics, "counters": {}, "per_kernel": {}} + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def test_analyze_module_import(): + 
"""Verify analyze module can be imported.""" + from rocpd import analyze + + assert hasattr(analyze, "compute_time_breakdown") + assert hasattr(analyze, "identify_hotspots") + assert hasattr(analyze, "analyze_memory_copies") + assert hasattr(analyze, "generate_recommendations") + assert hasattr(analyze, "format_analysis_output") + assert hasattr(analyze, "add_args") + assert hasattr(analyze, "execute") + assert hasattr(analyze, "main") + + +def test_analyze_module_has_all(): + """Verify analyze module exports expected functions.""" + from rocpd import analyze + + expected_exports = [ + "compute_time_breakdown", + "identify_hotspots", + "analyze_memory_copies", + "generate_recommendations", + "format_analysis_output", + "analyze_performance", + "add_args", + "execute", + "main", + ] + for export in expected_exports: + assert export in analyze.__all__, f"Missing export: {export}" + + +# --------------------------------------------------------------------------- +# generate_recommendations โ€“ Tier 1 rules +# --------------------------------------------------------------------------- + + +def test_rule1_high_memcpy_fires(): + """Rule 1: memcpy_percent > 20 triggers 'Memory Transfer' HIGH recommendation.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=25), [], {}) + matches = [r for r in recs if r["category"] == "Memory Transfer"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "25.0%" in matches[0]["issue"] + + +def test_rule1_memcpy_boundary_does_not_fire(): + """Rule 1: memcpy_percent exactly 20 does NOT trigger (threshold is >20).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=20), [], {}) + assert not any(r["category"] == "Memory Transfer" for r in recs) + + +def test_rule2_api_overhead_fires(): + """Rule 2: overhead_percent > 15 triggers 'API Overhead' MEDIUM recommendation.""" + 
from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(overhead_percent=20), [], {}) + matches = [r for r in recs if r["category"] == "API Overhead"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "20.0%" in matches[0]["issue"] + + +def test_rule2_overhead_boundary_does_not_fire(): + """Rule 2: overhead_percent exactly 15 does NOT trigger (threshold is >15).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(overhead_percent=15), [], {}) + assert not any(r["category"] == "API Overhead" for r in recs) + + +def test_rule3_dominant_kernel_fires(): + """Rule 3: single kernel > 50% triggers 'Compute Bottleneck' HIGH recommendation.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="dominant_kernel", pct=60.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + matches = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "dominant_kernel" in matches[0]["issue"] + + +def test_rule3_dominant_kernel_boundary_does_not_fire(): + """Rule 3: top kernel exactly 50% does NOT trigger (threshold is >50).""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(pct=50.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + assert not any(r["category"] == "Compute Bottleneck" for r in recs) + + +def test_rule3_uses_hotspot_name_in_commands(): + """Rule 3: the kernel name appears in the rocprofv3 command's full_command.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="my_matmul", pct=75.0)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + matches = [r for r in recs if r["category"] == "Compute Bottleneck"] + assert matches + cmds = matches[0].get("commands", []) + assert any("my_matmul" in 
c.get("full_command", "") for c in cmds) + + +def test_rule4_many_small_kernels_fires(): + """Rule 4: >1000 total calls with avg <10μs triggers 'Launch Overhead'.""" + from rocpd.analyze import generate_recommendations + + # 10 kernels × 200 calls = 2000 launches (> 1000 threshold); avg must be < 10μs. + # Need avg < 10μs = 10_000 ns, so total_kernel_time < 2000 * 10_000 = 20_000_000 + td = _empty_breakdown(total_kernel_time=10_000_000) # avg = 5μs + hotspots = [_make_hotspot(name=f"k{i}", calls=200) for i in range(10)] + recs = generate_recommendations(td, hotspots, {}) + matches = [r for r in recs if r["category"] == "Launch Overhead"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "2000" in matches[0]["issue"] + + +def test_rule4_many_calls_but_large_kernels_does_not_fire(): + """Rule 4: >1000 calls but avg >= 10μs does NOT trigger.""" + from rocpd.analyze import generate_recommendations + + # 2000 calls but avg = 50ms >> 10μs + td = _empty_breakdown(total_kernel_time=100_000_000_000) + hotspots = [_make_hotspot(name=f"k{i}", calls=200) for i in range(10)] + recs = generate_recommendations(td, hotspots, {}) + assert not any(r["category"] == "Launch Overhead" for r in recs) + + +def test_rule4_few_calls_does_not_fire(): + """Rule 4: <= 1000 total calls does NOT trigger even if each is short.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(total_kernel_time=1_000_000) + hotspots = [_make_hotspot(calls=100)] # only 100 calls + recs = generate_recommendations(td, hotspots, {}) + assert not any(r["category"] == "Launch Overhead" for r in recs) + + +def test_rule5_low_bandwidth_fires(): + """Rule 5: bandwidth < 10 GB/s triggers 'Memory Bandwidth' MEDIUM recommendation.""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 5e9, "avg_bytes": 1024}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + matches = [r for r in 
recs if r["category"] == "Memory Bandwidth"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "Host-to-Device" in matches[0]["issue"] + assert "5.00 GB/s" in matches[0]["issue"] + + +def test_rule5_high_bandwidth_does_not_fire(): + """Rule 5: bandwidth >= 10 GB/s does NOT trigger.""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 50e9, "avg_bytes": 1024}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + assert not any(r["category"] == "Memory Bandwidth" for r in recs) + + +def test_rule5_zero_bandwidth_does_not_fire(): + """Rule 5: bandwidth == 0 does NOT trigger (guard: bandwidth_gbps > 0).""" + from rocpd.analyze import generate_recommendations + + mem = {"Host-to-Device": {"bandwidth_bytes_per_sec": 0, "avg_bytes": 0}} + recs = generate_recommendations(_empty_breakdown(), [], mem) + assert not any(r["category"] == "Memory Bandwidth" for r in recs) + + +def test_rule5_multiple_directions(): + """Rule 5: each low-bandwidth direction generates its own recommendation.""" + from rocpd.analyze import generate_recommendations + + mem = { + "Host-to-Device": {"bandwidth_bytes_per_sec": 2e9, "avg_bytes": 512}, + "Device-to-Host": {"bandwidth_bytes_per_sec": 3e9, "avg_bytes": 512}, + } + recs = generate_recommendations(_empty_breakdown(), [], mem) + bw_recs = [r for r in recs if r["category"] == "Memory Bandwidth"] + assert len(bw_recs) == 2 + directions = {r["issue"].split()[0] for r in bw_recs} + assert "Host-to-Device" in directions + assert "Device-to-Host" in directions + + +def test_rule6_default_info_fires_when_no_rules_trigger(): + """Rule 6: INFO/Performance recommendation emitted when no rules fire.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(), [], {}) + assert len(recs) == 1 + assert recs[0]["priority"] == "INFO" + assert recs[0]["category"] == "Performance" + assert 
len(recs[0].get("commands", [])) > 0 + + +def test_rule6_default_suppressed_when_any_rule_fires(): + """Rule 6: default INFO NOT emitted when at least one other rule fires.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations(_empty_breakdown(memcpy_percent=25), [], {}) + assert not any(r["priority"] == "INFO" for r in recs) + + +def test_multiple_rules_fire_simultaneously(): + """Multiple Tier-1 rules can fire at once; all appear in recommendations.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(memcpy_percent=30, overhead_percent=20) + recs = generate_recommendations(td, [], {}) + categories = {r["category"] for r in recs} + assert "Memory Transfer" in categories + assert "API Overhead" in categories + + +# --------------------------------------------------------------------------- +# generate_recommendations โ€“ Tier 2 rules +# --------------------------------------------------------------------------- + + +def test_tier2_low_occupancy_fires(): + """Tier 2: avg_waves > 0 and < 16 triggers 'Low Occupancy' HIGH.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(avg_waves=8.0) + ) + matches = [r for r in recs if r["category"] == "Low Occupancy"] + assert len(matches) == 1 + assert matches[0]["priority"] == "HIGH" + assert "8.0" in matches[0]["issue"] + + +def test_tier2_low_occupancy_boundary_does_not_fire(): + """Tier 2: avg_waves exactly 16 does NOT trigger (threshold is < 16).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(avg_waves=16.0) + ) + assert not any(r["category"] == "Low Occupancy" for r in recs) + + +def test_tier2_zero_waves_does_not_fire(): + """Tier 2: avg_waves == 0 does NOT trigger (guard: avg_waves > 0).""" + from rocpd.analyze import generate_recommendations + + recs = 
generate_recommendations(_empty_breakdown(), [], {}, _hw_counters(avg_waves=0)) + assert not any(r["category"] == "Low Occupancy" for r in recs) + + +def test_tier2_low_gpu_utilization_fires(): + """Tier 2: gpu_utilization_percent > 0 and < 70 triggers 'GPU Utilization' MEDIUM.""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(gpu_util=50.0) + ) + matches = [r for r in recs if r["category"] == "GPU Utilization"] + assert len(matches) == 1 + assert matches[0]["priority"] == "MEDIUM" + assert "50.0%" in matches[0]["issue"] + + +def test_tier2_gpu_utilization_boundary_does_not_fire(): + """Tier 2: gpu_utilization exactly 70% does NOT trigger (threshold is < 70).""" + from rocpd.analyze import generate_recommendations + + recs = generate_recommendations( + _empty_breakdown(), [], {}, _hw_counters(gpu_util=70.0) + ) + assert not any(r["category"] == "GPU Utilization" for r in recs) + + +def test_tier2_not_activated_when_no_counters(): + """Tier 2 rules do NOT fire when has_counters=False.""" + from rocpd.analyze import generate_recommendations + + hw = {"has_counters": False} + recs = generate_recommendations(_empty_breakdown(), [], {}, hardware_counters=hw) + assert not any(r["category"] in ("Low Occupancy", "GPU Utilization") for r in recs) + + +def test_tier2_commands_use_valid_tools(): + """Tier 2 recommendations include commands with valid tool names.""" + from rocpd.analyze import generate_recommendations + + VALID_TOOLS = {"rocprofv3", "rocprof-sys", "rocprof-compute"} + recs = generate_recommendations( + _empty_breakdown(), + [], + {}, + hardware_counters=_hw_counters(avg_waves=4.0, gpu_util=40.0), + ) + for rec in recs: + for cmd in rec.get("commands", []): + assert cmd["tool"] in VALID_TOOLS, f"Invalid tool: {cmd['tool']!r}" + + +# --------------------------------------------------------------------------- +# Existing tests (preserved) +# 
--------------------------------------------------------------------------- + + +def test_recommendation_structure(): + """Test that recommendations have the expected structure.""" + from rocpd.analyze import generate_recommendations + + recommendations = generate_recommendations(_empty_breakdown(), [], {}) + assert isinstance(recommendations, list) + assert len(recommendations) > 0 + rec = recommendations[0] + for field in ("priority", "category", "issue", "suggestion"): + assert field in rec + assert rec["priority"] in ["HIGH", "MEDIUM", "LOW", "INFO"] + + +def test_high_memcpy_recommendation(): + """Test that high memory copy overhead triggers recommendation.""" + from rocpd.analyze import generate_recommendations + + td = _empty_breakdown(memcpy_percent=35) + recs = generate_recommendations(td, [], {}) + memcpy_recs = [r for r in recs if "Memory Transfer" in r.get("category", "")] + assert len(memcpy_recs) > 0 + assert memcpy_recs[0]["priority"] == "HIGH" + + +def test_hotspot_recommendation(): + """Test that dominant kernel triggers recommendation.""" + from rocpd.analyze import generate_recommendations + + hotspots = [_make_hotspot(name="test_kernel", pct=60)] + recs = generate_recommendations(_empty_breakdown(), hotspots, {}) + compute_recs = [r for r in recs if "Compute Bottleneck" in r.get("category", "")] + assert len(compute_recs) > 0 + assert "test_kernel" in compute_recs[0]["issue"] + + +# --------------------------------------------------------------------------- +# _build_summary โ€“ all bottleneck classification branches +# --------------------------------------------------------------------------- + + +def test_summary_memory_transfer_high_confidence(): + """memcpy_pct > 30 โ†’ memory_transfer with confidence 0.85.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 35, "kernel_percent": 50, "overhead_percent": 15}, [], False + ) + assert result["primary_bottleneck"] == "memory_transfer" + assert 
result["confidence"] == 0.85 + + +def test_summary_memory_transfer_medium_confidence(): + """memcpy_pct 20-30 โ†’ memory_transfer with confidence 0.70.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 25, "kernel_percent": 60, "overhead_percent": 15}, [], False + ) + assert result["primary_bottleneck"] == "memory_transfer" + assert result["confidence"] == 0.70 + + +def test_summary_latency_bottleneck(): + """overhead_pct > 25 (memcpy < 20) โ†’ latency with confidence 0.75.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 60, "overhead_percent": 30}, [], False + ) + assert result["primary_bottleneck"] == "latency" + assert result["confidence"] == 0.75 + + +def test_summary_compute_with_counters(): + """kernel_pct > 70 + has_counters=True โ†’ compute with confidence 0.80.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], True + ) + assert result["primary_bottleneck"] == "compute" + assert result["confidence"] == 0.80 + + +def test_summary_compute_without_counters(): + """kernel_pct > 70 + has_counters=False โ†’ compute with confidence 0.60.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], False + ) + assert result["primary_bottleneck"] == "compute" + assert result["confidence"] == 0.60 + + +def test_summary_mixed_bottleneck(): + """Low percentages all round โ†’ mixed with confidence 0.50.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 50, "overhead_percent": 10}, [], False + ) + assert result["primary_bottleneck"] == "mixed" + assert result["confidence"] == 0.50 + + +def test_summary_top_kernel_in_findings(): + """Top kernel name from hotspots[0] appears in key_findings.""" + from 
rocpd.analyze import _build_summary + + hotspots = [_make_hotspot(name="gemm_kernel")] + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, + hotspots, + False, + ) + assert any("gemm_kernel" in f for f in result["key_findings"]) + + +def test_summary_empty_hotspots_shows_na(): + """Empty hotspots โ†’ top kernel reported as 'N/A' in key_findings.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 5, "kernel_percent": 80, "overhead_percent": 5}, [], False + ) + assert any("N/A" in f for f in result["key_findings"]) + + +def test_summary_counters_finding_present(): + """has_counters=True adds counter-data finding; False adds Tier 1 note.""" + from rocpd.analyze import _build_summary + + bd = {"memcpy_percent": 5, "kernel_percent": 50, "overhead_percent": 5} + with_hw = _build_summary(bd, [], True) + without_hw = _build_summary(bd, [], False) + assert any("Hardware counter" in f for f in with_hw["key_findings"]) + assert any("Tier 1" in f for f in without_hw["key_findings"]) + + +def test_summary_has_required_keys(): + """Summary dict contains all required schema keys.""" + from rocpd.analyze import _build_summary + + result = _build_summary( + {"memcpy_percent": 10, "kernel_percent": 60, "overhead_percent": 10}, [], False + ) + for key in ( + "overall_assessment", + "primary_bottleneck", + "confidence", + "key_findings", + ): + assert key in result, f"Missing key: {key!r}" + assert isinstance(result["key_findings"], list) + assert isinstance(result["confidence"], float) + + +# --------------------------------------------------------------------------- +# _build_hw_counters_json +# --------------------------------------------------------------------------- + + +def test_hw_counters_no_counters_structure(): + """has_counters=False returns the correct minimal structure.""" + from rocpd.analyze import _build_hw_counters_json + + result = _build_hw_counters_json({"has_counters": 
False}) + assert result == {"has_counters": False, "metrics": None, "counters": None} + + +def test_hw_counters_empty_dict(): + """Empty dict (no has_counters key) treated as no counters.""" + from rocpd.analyze import _build_hw_counters_json + + result = _build_hw_counters_json({}) + assert result["has_counters"] is False + + +def test_hw_counters_with_metrics(): + """has_counters=True maps all metric fields correctly.""" + from rocpd.analyze import _build_hw_counters_json + + hw = { + "has_counters": True, + "metrics": { + "gpu_utilization_percent": 75.5, + "avg_waves": 32.0, + "max_waves": 64.0, + "min_waves": 8.0, + }, + "counters": {}, + } + result = _build_hw_counters_json(hw) + assert result["has_counters"] is True + m = result["metrics"] + assert m["gpu_utilization_pct"] == 75.5 + assert m["avg_waves"] == 32.0 + assert m["max_waves"] == 64.0 + assert m["min_waves"] == 8.0 + + +def test_hw_counters_with_counter_data(): + """Counter stats are mapped with correct types.""" + from rocpd.analyze import _build_hw_counters_json + + hw = { + "has_counters": True, + "metrics": {}, + "counters": { + "GRBM_COUNT": { + "sample_count": 100, + "avg_value": 1000.0, + "min_value": 900.0, + "max_value": 1100.0, + "total_value": 100_000.0, + } + }, + } + result = _build_hw_counters_json(hw) + ctr = result["counters"]["GRBM_COUNT"] + assert ctr["sample_count"] == 100 + assert isinstance(ctr["sample_count"], int) + assert ctr["avg_value"] == 1000.0 + assert isinstance(ctr["avg_value"], float) + + +# --------------------------------------------------------------------------- +# _build_warnings_json +# --------------------------------------------------------------------------- + + +def test_warnings_no_counters_emits_warning(): + """has_counters=False โ†’ one warning with 'warning' severity.""" + from rocpd.analyze import _build_warnings_json + + warnings = _build_warnings_json(has_counters=False) + assert len(warnings) == 1 + assert warnings[0]["severity"] == "warning" + assert 
"Tier 1" in warnings[0]["message"] + assert "recommendation" in warnings[0] + + +def test_warnings_with_counters_is_empty(): + """has_counters=True → empty warnings list.""" + from rocpd.analyze import _build_warnings_json + + assert _build_warnings_json(has_counters=True) == [] + + +# --------------------------------------------------------------------------- +# _build_recommendations_json – stable IDs, dedup, unknown category +# --------------------------------------------------------------------------- + + +def _simple_rec(category, priority="INFO"): + return {"category": category, "priority": priority, "issue": "x", "suggestion": "y"} + + +def test_recs_json_stable_ids_for_known_categories(): + """Known categories get their stable ROCPD-*-001 IDs.""" + from rocpd.analyze import _build_recommendations_json + + expected = { + "Low Occupancy": "ROCPD-OCCUPANCY-001", + "GPU Utilization": "ROCPD-UTILIZATION-001", + "Memory Transfer": "ROCPD-MEMCPY-001", + "API Overhead": "ROCPD-API-001", + "Compute Bottleneck": "ROCPD-COMPUTE-001", + "Launch Overhead": "ROCPD-LAUNCH-001", + "Memory Bandwidth": "ROCPD-MEMBW-001", + "Performance": "ROCPD-INFO-001", + } + recs = [_simple_rec(cat) for cat in expected] + out = _build_recommendations_json(recs) + by_cat = {r["category"]: r["id"] for r in out} + for cat, expected_id in expected.items(): + assert ( + by_cat[cat] == expected_id + ), f"{cat}: expected {expected_id}, got {by_cat[cat]}" + + +def test_recs_json_duplicate_category_gets_incremented_id(): + """Two recs with the same category → IDs end in 001 and 002.""" + from rocpd.analyze import _build_recommendations_json + + recs = [_simple_rec("Memory Transfer"), _simple_rec("Memory Transfer")] + out = _build_recommendations_json(recs) + assert out[0]["id"] == "ROCPD-MEMCPY-001" + assert out[1]["id"] == "ROCPD-MEMCPY-002" + + +def test_recs_json_unknown_category_generates_id(): + """Unknown category generates a ROCPD-...-001 style ID from the name.""" + from 
rocpd.analyze import _build_recommendations_json + + out = _build_recommendations_json([_simple_rec("Custom Analysis")]) + assert out[0]["id"].startswith("ROCPD-") + assert out[0]["id"].endswith("-001") + + +def test_recs_json_preserves_all_fields(): + """_build_recommendations_json preserves all expected fields.""" + from rocpd.analyze import _build_recommendations_json + + rec = { + "category": "Performance", + "priority": "INFO", + "issue": "test issue", + "suggestion": "test suggestion", + "actions": ["do this"], + "estimated_impact": "5%", + "commands": [ + { + "tool": "rocprofv3", + "full_command": "rocprofv3 -- ./app", + "description": "d", + "flags": [], + "args": [], + } + ], + } + out = _build_recommendations_json([rec]) + assert out[0]["priority"] == "INFO" + assert out[0]["issue"] == "test issue" + assert out[0]["actions"] == ["do this"] + assert len(out[0]["commands"]) == 1 + + +def test_recs_json_empty_input_returns_empty(): + """Empty input list returns empty output list.""" + from rocpd.analyze import _build_recommendations_json + + assert _build_recommendations_json([]) == [] + + +# --------------------------------------------------------------------------- +# _format_as_json โ€“ value mapping correctness +# --------------------------------------------------------------------------- + + +def test_format_json_time_breakdown_values(): + """_format_as_json maps time_breakdown keys correctly into execution_breakdown.""" + from rocpd.analyze import _format_as_json + + td = { + "total_runtime": 1_000_000_000, + "total_kernel_time": 800_000_000, + "total_memcpy_time": 100_000_000, + "kernel_percent": 80.0, + "memcpy_percent": 10.0, + "overhead_percent": 5.0, + } + doc = json.loads(_format_as_json(td, [], {}, [])) + eb = doc["execution_breakdown"] + assert eb["total_runtime_ns"] == 1_000_000_000 + assert eb["kernel_time_ns"] == 800_000_000 + assert eb["memcpy_time_ns"] == 100_000_000 + assert eb["kernel_time_pct"] == 80.0 + assert eb["memcpy_time_pct"] == 
10.0 + assert eb["api_overhead_pct"] == 5.0 + + +def test_format_json_idle_time_calculation(): + """Idle time = total − kernel − memcpy − api_overhead, clamped to 0.""" + from rocpd.analyze import _format_as_json + + td = { + "total_runtime": 1_000_000_000, # 1 s + "total_kernel_time": 600_000_000, # 600 ms + "total_memcpy_time": 200_000_000, # 200 ms + "kernel_percent": 60.0, + "memcpy_percent": 20.0, + "overhead_percent": 10.0, # 100 ms + } + doc = json.loads(_format_as_json(td, [], {}, [])) + eb = doc["execution_breakdown"] + # api_overhead_ns = 10% of 1_000_000_000 = 100_000_000 + assert eb["api_overhead_ns"] == 100_000_000 + # idle = 1_000_000_000 - 600_000_000 - 200_000_000 - 100_000_000 = 100_000_000 + assert eb["idle_time_ns"] == 100_000_000 + + +def test_format_json_idle_time_clamped_to_zero(): + """Idle time never goes negative (clamped to 0).""" + from rocpd.analyze import _format_as_json + + # kernel + memcpy already exceed total_runtime + td = { + "total_runtime": 100_000_000, + "total_kernel_time": 80_000_000, + "total_memcpy_time": 30_000_000, # overflows + "kernel_percent": 80.0, + "memcpy_percent": 30.0, + "overhead_percent": 5.0, + } + doc = json.loads(_format_as_json(td, [], {}, [])) + assert doc["execution_breakdown"]["idle_time_ns"] >= 0 + + +def test_format_json_hotspot_field_mapping(): + """Hotspot fields are mapped with correct names and types.""" + from rocpd.analyze import _format_as_json + + hotspots = [ + _make_hotspot( + name="conv_fwd", + calls=5, + total=400_000_000, + avg=80_000_000, + min_d=60_000_000, + max_d=100_000_000, + pct=40.0, + ), + ] + doc = json.loads(_format_as_json(_empty_breakdown(), hotspots, {}, [])) + hs = doc["hotspots"][0] + assert hs["rank"] == 1 + assert hs["name"] == "conv_fwd" + assert hs["calls"] == 5 + assert hs["total_duration_ns"] == 400_000_000 + assert hs["avg_duration_ns"] == 80_000_000.0 + assert hs["min_duration_ns"] == 60_000_000 + assert hs["max_duration_ns"] == 100_000_000 + assert 
hs["pct_of_total"] == 40.0 + + +def test_format_json_hotspot_rank_increments(): + """Multiple hotspots get ranks 1, 2, 3 in order.""" + from rocpd.analyze import _format_as_json + + hotspots = [_make_hotspot(name=f"k{i}") for i in range(3)] + doc = json.loads(_format_as_json(_empty_breakdown(), hotspots, {}, [])) + ranks = [h["rank"] for h in doc["hotspots"]] + assert ranks == [1, 2, 3] + + +def test_format_json_memory_bandwidth_gbps_conversion(): + """bandwidth_bytes_per_sec is correctly converted to bandwidth_gbps.""" + from rocpd.analyze import _format_as_json + + mem = { + "Host-to-Device": { + "count": 10, + "total_bytes": 0, + "total_duration": 0, + "avg_bytes": 0, + "avg_duration": 0, + "bandwidth_bytes_per_sec": 50e9, # 50 GB/s + } + } + doc = json.loads(_format_as_json(_empty_breakdown(), [], mem, [])) + bw = doc["memory_analysis"]["Host-to-Device"]["bandwidth_gbps"] + assert abs(bw - 50.0) < 0.001 + + +def test_format_json_analysis_tier_with_counters(): + """analysis_tier=2 and hardware_counters.has_counters=True when counters present.""" + from rocpd.analyze import _format_as_json + + hw = {"has_counters": True, "metrics": {}, "counters": {}} + doc = json.loads( + _format_as_json(_empty_breakdown(), [], {}, [], hardware_counters=hw) + ) + assert doc["profiling_info"]["analysis_tier"] == 2 + assert doc["hardware_counters"]["has_counters"] is True + + +def test_format_json_analysis_tier_without_counters(): + """analysis_tier=1 and hardware_counters.has_counters=False when no counters.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["profiling_info"]["analysis_tier"] == 1 + assert doc["hardware_counters"]["has_counters"] is False + + +def test_format_json_database_path_in_metadata(): + """database_file in metadata reflects the database_path argument.""" + from rocpd.analyze import _format_as_json + + doc = json.loads( + _format_as_json(_empty_breakdown(), [], {}, [], 
database_path="/data/trace.db") + ) + assert doc["metadata"]["database_file"] == "/data/trace.db" + + +def test_format_json_schema_version(): + """JSON output always carries schema_version = '0.1.0'.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["schema_version"] == "0.1.0" + + +def test_format_json_analysis_version_in_metadata(): + """metadata.analysis_version = '0.1.0'.""" + from rocpd.analyze import _format_as_json + + doc = json.loads(_format_as_json(_empty_breakdown(), [], {}, [])) + assert doc["metadata"]["analysis_version"] == "0.1.0" + + +# --------------------------------------------------------------------------- +# format_analysis_output โ€“ text, json, markdown +# --------------------------------------------------------------------------- + + +def _full_sample_data(): + td = { + "total_runtime": 1_200_000_000, + "total_kernel_time": 1_000_000_000, + "total_memcpy_time": 200_000_000, + "kernel_percent": 83.3, + "memcpy_percent": 16.7, + "overhead_percent": 0.0, + } + hotspots = [_make_hotspot(name="kernel_1", calls=100, total=500_000_000, pct=50.0)] + memory_analysis = { + "Host-to-Device": { + "count": 10, + "total_bytes": 1_048_576, + "total_duration": 100_000_000, + "avg_bytes": 104_857, + "avg_duration": 10_000_000, + "bandwidth_bytes_per_sec": 10_485_760, + } + } + recommendations = [ + { + "priority": "INFO", + "category": "Test", + "issue": "Test issue", + "suggestion": "Test suggestion", + "actions": ["Action 1"], + "estimated_impact": "5%", + "commands": [], + } + ] + return td, hotspots, memory_analysis, recommendations + + +def test_format_output_text(): + """Text format contains all expected section headers and data.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output( + td, hs, mem, recs, output_format="text", database_path="/test/db.db" + ) + assert isinstance(out, str) + assert 
"ROCPD AI PERFORMANCE ANALYSIS" in out + assert "TIME BREAKDOWN" in out + assert "HOTSPOTS" in out + assert "MEMORY COPY ANALYSIS" in out + assert "RECOMMENDATIONS" in out + assert "kernel_1" in out + assert "Host-to-Device" in out + + +def test_format_output_text_empty_data(): + """Text format with all-zero data still produces valid output.""" + from rocpd.analyze import format_analysis_output + + out = format_analysis_output(_empty_breakdown(), [], {}, [], output_format="text") + assert isinstance(out, str) + assert "ROCPD AI PERFORMANCE ANALYSIS" in out + + +def test_format_output_json(): + """JSON format returns valid parseable JSON with required top-level keys.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output(td, hs, mem, recs, output_format="json") + doc = json.loads(out) + for key in ( + "schema_version", + "metadata", + "hotspots", + "recommendations", + "execution_breakdown", + "hardware_counters", + ): + assert key in doc, f"Missing key: {key!r}" + + +def test_format_output_markdown(): + """Markdown format returns well-structured markdown document.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + out = format_analysis_output( + td, hs, mem, recs, output_format="markdown", database_path="/test/db.db" + ) + assert isinstance(out, str) + assert out.startswith("# ROCpd AI Performance Analysis") + assert "## Time Breakdown" in out + assert "## Top Kernel Hotspots" in out + assert "## Memory Copy Analysis" in out + assert "## Recommendations" in out + assert "kernel_1" in out + assert "Host-to-Device" in out + + +def test_format_output_markdown_no_hotspots(): + """Markdown format omits hotspot section when list is empty.""" + from rocpd.analyze import format_analysis_output + + td, _, mem, recs = _full_sample_data() + out = format_analysis_output(td, [], mem, recs, output_format="markdown") + assert "## Top Kernel Hotspots" not in 
out + + +def test_format_output_markdown_no_memory(): + """Markdown format omits memory section when analysis is empty.""" + from rocpd.analyze import format_analysis_output + + td, hs, _, recs = _full_sample_data() + out = format_analysis_output(td, hs, {}, recs, output_format="markdown") + assert "## Memory Copy Analysis" not in out + + +def test_format_output_markdown_with_hardware_counters(): + """Markdown format includes Tier 2 section when hardware counters present.""" + from rocpd.analyze import format_analysis_output + + td, hs, mem, recs = _full_sample_data() + hw = { + "has_counters": True, + "metrics": { + "gpu_utilization_percent": 65.0, + "avg_waves": 24.0, + "max_waves": 48.0, + }, + "counters": {}, + } + out = format_analysis_output( + td, hs, mem, recs, hardware_counters=hw, output_format="markdown" + ) + assert "## Hardware Counters (Tier 2)" in out + assert "65.0%" in out + + +def test_format_output_unknown_format_falls_back_to_text(): + """Unrecognized format falls back to text output.""" + from rocpd.analyze import format_analysis_output + + out = format_analysis_output(_empty_breakdown(), [], {}, [], output_format="xml") + assert "ROCPD AI PERFORMANCE ANALYSIS" in out + + +# --------------------------------------------------------------------------- +# _filter_rec_commands: PMC counter filtering +# --------------------------------------------------------------------------- + + +def _pmc_cmd( + counters="GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES", extra_flags=None, extra_args=None +): + """Build a minimal rocprofv3 recommendation command with a --pmc arg.""" + flags = ["--sys-trace"] + (extra_flags or []) + args = [ + {"name": "--pmc", "value": counters}, + {"name": "-d", "value": "./output"}, + {"name": "-o", "value": "profile"}, + ] + (extra_args or []) + return { + "tool": "rocprofv3", + "description": "Collect hardware counters", + "flags": flags, + "args": args, + "full_command": ( + f"rocprofv3 --sys-trace --pmc {counters} -d ./output -o profile 
-- ./app" + ), + } + + +def test_filter_pmc_all_counters_already_collected_drops_command(): + """When every --pmc counter is already in pmc_events, the command is dropped.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([_pmc_cmd()], already) + assert result == [], "Command with all counters already collected should be dropped" + + +def test_filter_pmc_partial_counters_already_collected_updates_arg(): + """When some --pmc counters are already collected, only new ones remain.""" + from rocpd.analyze import _filter_rec_commands + + # GRBM_COUNT already collected; GRBM_GUI_ACTIVE and SQ_WAVES are new + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + pmc_arg = next(a for a in result[0]["args"] if a.get("name") == "--pmc") + remaining = set(pmc_arg["value"].split()) + assert remaining == {"GRBM_GUI_ACTIVE", "SQ_WAVES"} + assert "GRBM_COUNT" not in pmc_arg["value"] + + +def test_filter_pmc_partial_updates_full_command(): + """full_command reflects the reduced counter list after partial stripping.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + assert "GRBM_COUNT" not in result[0]["full_command"] + assert "GRBM_GUI_ACTIVE" in result[0]["full_command"] + assert "SQ_WAVES" in result[0]["full_command"] + + +def test_filter_pmc_no_counters_collected_keeps_command_unchanged(): + """When already_collected is empty, the command is returned unchanged.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset() + cmd = _pmc_cmd() + result = _filter_rec_commands([cmd], already) + assert len(result) == 1 + assert result[0] is cmd # exact same object, no copy + + +def 
test_filter_pmc_description_note_added(): + """A note listing removed PMC counters is appended to description.""" + from rocpd.analyze import _filter_rec_commands + + already = frozenset({"--sys-trace", "pmc:GRBM_COUNT"}) + result = _filter_rec_commands([_pmc_cmd()], already) + assert len(result) == 1 + assert "GRBM_COUNT" in result[0]["description"] + assert "Already collected" in result[0]["description"] + + +def test_filter_pmc_kernel_names_alone_not_meaningful(): + """--kernel-names is a scope filter; command with only scope+output args is dropped.""" + from rocpd.analyze import _filter_rec_commands + + cmd = _pmc_cmd( + counters="GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES", + extra_args=[{"name": "--kernel-names", "value": "my_kernel"}], + ) + cmd["full_command"] = ( + "rocprofv3 --sys-trace --pmc GRBM_COUNT GRBM_GUI_ACTIVE SQ_WAVES" + ' --kernel-names "my_kernel" -d ./output -o profile -- ./app' + ) + # All three counters already collected + sys-trace โ†’ nothing new + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([cmd], already) + assert result == [], "Command with only scope+output args remaining should be dropped" + + +def test_filter_pmc_rocprof_compute_always_kept(): + """rocprof-compute commands are never dropped, even when counters are collected.""" + from rocpd.analyze import _filter_rec_commands + + compute_cmd = { + "tool": "rocprof-compute", + "description": "Roofline model analysis", + "flags": [], + "args": [{"name": "profile", "value": None}], + "full_command": "rocprof-compute profile -- ./app", + } + already = frozenset( + {"--sys-trace", "pmc:GRBM_COUNT", "pmc:GRBM_GUI_ACTIVE", "pmc:SQ_WAVES"} + ) + result = _filter_rec_commands([compute_cmd], already) + assert len(result) == 1 + assert result[0] is compute_cmd + + +# --------------------------------------------------------------------------- +# Entry point +# 
--------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py new file mode 100644 index 00000000000..6d762460f6d --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_analyze_schema.py @@ -0,0 +1,562 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+############################################################################### + +""" +Tests for the AI analysis JSON schema (analysis-output.schema.json). + +Validates: + - The schema file is present, parseable, and structurally correct. + - rocpd analyze --format json output conforms to the schema. + - Recommendations contain the structured commands array. +""" + +import json +import os +import sys +import tempfile + +try: + import importlib.resources as pkg_resources +except ImportError: # Python 3.6 + import pkgutil as _pkgutil + + class pkg_resources: # type: ignore[no-redef] + """Minimal shim so _load_schema() works on Python 3.6.""" + + class _Traversable: + def __init__(self, package, resource): + self._package = package + self._resource = resource + + def read_text(self, encoding="utf-8"): + data = _pkgutil.get_data(self._package, self._resource) + return data.decode(encoding) if data is not None else "" + + class _Package: + def __init__(self, package): + self._package = package + + def joinpath(self, resource): + return pkg_resources._Traversable(self._package, resource) + + @staticmethod + def files(package): + return pkg_resources._Package(package) + + +import pytest + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# The version emitted by Tier 1/2 analysis (no TraceLens fields). +# This constant is only used to verify the schema enum includes the Tier 1/2 version; +# conformance tests derive allowed versions from the loaded schema enum directly. 
+TIER12_SCHEMA_VERSION = "0.1.0" + +REQUIRED_TOP_LEVEL = [ + "schema_version", + "metadata", + "profiling_info", + "summary", + "execution_breakdown", + "hotspots", + "memory_analysis", + "hardware_counters", + "recommendations", + "warnings", + "errors", +] + +COMMAND_TOOLS = {"rocprofv3", "rocprof-sys", "rocprof-compute"} + + +def _load_schema(): + """Load the schema JSON from the installed package.""" + schema_text = ( + pkg_resources.files("rocpd.ai_analysis") + .joinpath("docs/analysis-output.schema.json") + .read_text(encoding="utf-8") + ) + return json.loads(schema_text) + + +def _make_synthetic_json_output(): + """Generate a minimal JSON analysis document using the public API.""" + from rocpd.analyze import format_analysis_output, generate_recommendations + + # Keys must match what compute_time_breakdown() actually returns. + time_breakdown = { + "kernel_percent": 50.0, + "memcpy_percent": 30.0, + "overhead_percent": 15.0, + "total_runtime": 100_000_000, + "total_kernel_time": 50_000_000, + "total_memcpy_time": 30_000_000, + } + hotspots = [ + { + "name": "test_kernel", + "total_duration": 45_000_000, + "calls": 10, # matches identify_hotspots() key (COUNT(*) as calls) + "avg_duration": 4_500_000, + "min_duration": 4_000_000, + "max_duration": 5_000_000, + } + ] + # Keys must match the actual return shape of analyze_memory_copies(): + # count, total_bytes, total_duration, avg_bytes, avg_duration, bandwidth_bytes_per_sec + memory_analysis = { + "Host-to-Device": { + "count": 5, + "total_bytes": 5120, + "total_duration": 30_000_000, + "avg_bytes": 1024.0, + "avg_duration": 6_000_000.0, + "bandwidth_bytes_per_sec": 1e9, + } + } + recommendations = generate_recommendations(time_breakdown, hotspots, memory_analysis) + output = format_analysis_output( + time_breakdown, + hotspots, + memory_analysis, + recommendations, + output_format="json", + ) + return json.loads(output) + + +# --------------------------------------------------------------------------- +# 
Schema file tests +# --------------------------------------------------------------------------- + + +def test_schema_file_is_readable(): + """Schema file can be located and read through the package.""" + text = ( + pkg_resources.files("rocpd.ai_analysis") + .joinpath("docs/analysis-output.schema.json") + .read_text(encoding="utf-8") + ) + assert len(text) > 0, "Schema file is empty" + + +def test_schema_file_is_valid_json(): + """Schema file is valid JSON.""" + schema = _load_schema() + assert isinstance(schema, dict), "Schema root must be a JSON object" + + +def test_schema_file_has_json_schema_keyword(): + """Schema file declares a JSON Schema dialect.""" + from urllib.parse import urlparse + + schema = _load_schema() + assert "$schema" in schema, "Schema must contain $schema keyword" + parsed = urlparse(schema["$schema"]) + assert ( + parsed.netloc == "json-schema.org" + ), f"$schema must point to json-schema.org, got netloc={parsed.netloc!r}" + + +def test_schema_file_version_enum(): + """schema_version property enum includes the Tier 1/2 version (0.1.0).""" + schema = _load_schema() + props = schema.get("properties", {}) + assert "schema_version" in props, "schema_version must be in properties" + enum_vals = props["schema_version"].get("enum", []) + assert TIER12_SCHEMA_VERSION in enum_vals, ( + f"schema_version enum must include {TIER12_SCHEMA_VERSION!r}, " + f"got {enum_vals!r}" + ) + + +def test_schema_file_required_fields(): + """Schema requires all expected top-level fields.""" + schema = _load_schema() + required = schema.get("required", []) + for field in REQUIRED_TOP_LEVEL: + assert field in required, f"Required field missing from schema: {field!r}" + + +def test_schema_file_defines_recommendation_command(): + """Schema $defs contains a recommendation_command definition.""" + schema = _load_schema() + defs = schema.get("$defs", {}) + assert "recommendation_command" in defs, "$defs must define recommendation_command" + cmd_def = 
defs["recommendation_command"] + required_cmd_fields = {"tool", "description", "flags", "args", "full_command"} + defined = set(cmd_def.get("properties", {}).keys()) + missing = required_cmd_fields - defined + assert not missing, f"recommendation_command missing properties: {missing}" + + +def test_schema_file_tool_enum(): + """recommendation_command.tool is an enum of the three ROCm tools.""" + schema = _load_schema() + cmd_props = schema["$defs"]["recommendation_command"]["properties"] + tool_enum = set(cmd_props["tool"].get("enum", [])) + assert ( + tool_enum == COMMAND_TOOLS + ), f"tool enum must be {COMMAND_TOOLS}, got {tool_enum}" + + +# --------------------------------------------------------------------------- +# JSON output conformance tests (using synthetic data) +# --------------------------------------------------------------------------- + + +def test_json_output_schema_version(): + """format_analysis_output JSON output carries a schema_version in the allowed enum.""" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + doc = _make_synthetic_json_output() + assert ( + doc.get("schema_version") in allowed + ), f"schema_version {doc.get('schema_version')!r} not in allowed enum {allowed}" + + +def test_json_output_required_fields_present(): + """All required top-level fields are present in JSON output.""" + doc = _make_synthetic_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Required field missing from JSON output: {field!r}" + + +def test_json_output_metadata_fields(): + """metadata object contains expected sub-fields.""" + doc = _make_synthetic_json_output() + meta = doc["metadata"] + for field in ( + "rocpd_version", + "analysis_version", + "database_file", + "analysis_timestamp", + ): + assert field in meta, f"metadata missing field: {field!r}" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + assert ( + meta["analysis_version"] in allowed + ), 
f"metadata.analysis_version {meta['analysis_version']!r} not in allowed enum {allowed}" + + +def test_json_output_hardware_counters_has_flag(): + """hardware_counters always contains has_counters boolean.""" + doc = _make_synthetic_json_output() + hw = doc["hardware_counters"] + assert "has_counters" in hw, "hardware_counters must have has_counters" + assert isinstance(hw["has_counters"], bool) + + +def test_json_output_recommendations_are_list(): + """recommendations is a list.""" + doc = _make_synthetic_json_output() + assert isinstance(doc["recommendations"], list) + + +def test_json_output_recommendation_required_fields(): + """Each recommendation has required fields: id, priority, category, issue, suggestion.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for field in ("id", "priority", "category", "issue", "suggestion"): + assert field in rec, f"recommendations[{i}] missing field {field!r}" + assert rec["priority"] in ( + "HIGH", + "MEDIUM", + "LOW", + "INFO", + ), f"recommendations[{i}] has invalid priority {rec['priority']!r}" + + +def test_json_output_recommendations_have_commands(): + """Recommendations include a commands array.""" + doc = _make_synthetic_json_output() + recs_with_commands = [r for r in doc["recommendations"] if r.get("commands")] + assert ( + len(recs_with_commands) > 0 + ), "At least one recommendation must have a non-empty commands array" + + +def test_json_output_command_structure(): + """Each command object has all required fields with correct types.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for j, cmd in enumerate(rec.get("commands", [])): + loc = f"recommendations[{i}].commands[{j}]" + assert "tool" in cmd, f"{loc} missing 'tool'" + assert "description" in cmd, f"{loc} missing 'description'" + assert "flags" in cmd, f"{loc} missing 'flags'" + assert "args" in cmd, f"{loc} missing 'args'" + assert "full_command" in cmd, f"{loc} missing 
'full_command'" + assert ( + cmd["tool"] in COMMAND_TOOLS + ), f"{loc} tool {cmd['tool']!r} not in {COMMAND_TOOLS}" + assert isinstance(cmd["flags"], list), f"{loc} flags must be a list" + assert isinstance(cmd["args"], list), f"{loc} args must be a list" + assert isinstance( + cmd["full_command"], str + ), f"{loc} full_command must be a string" + assert ( + cmd["tool"] in cmd["full_command"] + ), f"{loc} full_command must start with tool name" + + +def test_json_output_command_args_structure(): + """Each arg in commands.args has name and value fields.""" + doc = _make_synthetic_json_output() + for i, rec in enumerate(doc["recommendations"]): + for j, cmd in enumerate(rec.get("commands", [])): + for k, arg in enumerate(cmd.get("args", [])): + loc = f"recommendations[{i}].commands[{j}].args[{k}]" + assert "name" in arg, f"{loc} missing 'name'" + assert "value" in arg, f"{loc} missing 'value'" + assert isinstance(arg["name"], str), f"{loc} name must be a string" + # value may be str or None + assert arg["value"] is None or isinstance( + arg["value"], str + ), f"{loc} value must be str or null" + + +def test_json_output_validates_against_schema(): + """JSON output passes jsonschema validation against analysis-output.schema.json.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"JSON output failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Tier 0 (source-only) JSON output helpers +# --------------------------------------------------------------------------- + +_MINIMAL_HIP_SOURCE = """\ +__global__ void my_kernel(float* x) { *x = 1.0f; } +void run() { + hipLaunchKernelGGL(my_kernel, dim3(1), dim3(64), 0, 0, nullptr); + hipMemcpy(nullptr, nullptr, 0, hipMemcpyHostToDevice); +} 
+""" + +TIER0_SCHEMA_VERSION = "0.2.0" + + +def _make_synthetic_tier0_json_output(): + """Generate a Tier 0 (source-only) JSON document via format_analysis_output.""" + from rocpd.analyze import analyze_source_code, format_analysis_output + + with tempfile.TemporaryDirectory() as tmpdir: + hip_file = os.path.join(tmpdir, "test.cpp") + with open(hip_file, "w") as fh: + fh.write(_MINIMAL_HIP_SOURCE) + + tier0_result = analyze_source_code(tmpdir) + output = format_analysis_output( + {}, + [], + {}, + [], + output_format="json", + tier0_result=tier0_result, + source_only=True, + ) + return json.loads(output) + + +def _make_synthetic_combined_json_output(): + """Generate a combined (Tier 0 + Tier 1/2) JSON document.""" + from rocpd.analyze import ( + analyze_source_code, + format_analysis_output, + generate_recommendations, + ) + + time_breakdown = { + "kernel_percent": 50.0, + "memcpy_percent": 30.0, + "overhead_percent": 15.0, + "total_runtime": 100_000_000, + "total_kernel_time": 50_000_000, + "total_memcpy_time": 30_000_000, + } + hotspots = [ + { + "name": "test_kernel", + "total_duration": 45_000_000, + "calls": 10, + "avg_duration": 4_500_000, + "min_duration": 4_000_000, + "max_duration": 5_000_000, + } + ] + memory_analysis = { + "Host-to-Device": { + "count": 5, + "total_bytes": 5120, + "total_duration": 30_000_000, + "avg_bytes": 1024.0, + "avg_duration": 6_000_000.0, + "bandwidth_bytes_per_sec": 1e9, + } + } + recommendations = generate_recommendations(time_breakdown, hotspots, memory_analysis) + + with tempfile.TemporaryDirectory() as tmpdir: + hip_file = os.path.join(tmpdir, "test.cpp") + with open(hip_file, "w") as fh: + fh.write(_MINIMAL_HIP_SOURCE) + + tier0_result = analyze_source_code(tmpdir) + output = format_analysis_output( + time_breakdown, + hotspots, + memory_analysis, + recommendations, + output_format="json", + tier0_result=tier0_result, + source_only=False, + ) + return json.loads(output) + + +# 
--------------------------------------------------------------------------- +# Tier 0 (source-only) schema conformance tests +# --------------------------------------------------------------------------- + + +def test_tier0_json_output_schema_version(): + """Tier 0 JSON output has schema_version in the allowed enum.""" + schema = _load_schema() + allowed = schema["properties"]["schema_version"]["enum"] + doc = _make_synthetic_tier0_json_output() + assert ( + doc.get("schema_version") in allowed + ), f"tier0 schema_version {doc.get('schema_version')!r} not in allowed enum {allowed}" + assert ( + doc.get("schema_version") == TIER0_SCHEMA_VERSION + ), f"tier0 schema_version should be {TIER0_SCHEMA_VERSION!r}" + + +def test_tier0_json_output_required_fields_present(): + """All required top-level fields are present in Tier 0 JSON output.""" + doc = _make_synthetic_tier0_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Tier 0 JSON missing required field: {field!r}" + + +def test_tier0_json_output_execution_breakdown_is_null(): + """execution_breakdown is null in source-only (Tier 0) output.""" + doc = _make_synthetic_tier0_json_output() + assert ( + doc["execution_breakdown"] is None + ), "execution_breakdown must be null in Tier 0 source-only output" + + +def test_tier0_json_output_profiling_mode_is_source_only(): + """profiling_info.profiling_mode is 'source_only' in Tier 0 output.""" + doc = _make_synthetic_tier0_json_output() + assert ( + doc["profiling_info"]["profiling_mode"] == "source_only" + ), "Tier 0 profiling_mode must be 'source_only'" + + +def test_tier0_json_output_analysis_tier_is_zero(): + """profiling_info.analysis_tier is 0 in Tier 0 source-only output.""" + doc = _make_synthetic_tier0_json_output() + assert doc["profiling_info"]["analysis_tier"] == 0, "Tier 0 analysis_tier must be 0" + + +def test_tier0_json_output_has_tier0_field(): + """Tier 0 JSON output includes a top-level 'tier0' object.""" + doc = 
_make_synthetic_tier0_json_output() + assert "tier0" in doc, "Tier 0 JSON output must include a 'tier0' field" + tier0 = doc["tier0"] + assert isinstance(tier0, dict), "'tier0' must be a JSON object" + for field in ("source_dir", "programming_model", "files_scanned", "kernel_count"): + assert field in tier0, f"tier0 missing field {field!r}" + + +def test_tier0_json_output_validates_against_schema(): + """Tier 0 JSON output passes jsonschema validation.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_tier0_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"Tier 0 JSON failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Combined (Tier 0 + Tier 1/2) schema conformance tests +# --------------------------------------------------------------------------- + + +def test_combined_json_output_has_tier0_field(): + """Combined (Tier 0 + Tier 1/2) JSON output includes a top-level 'tier0' object.""" + doc = _make_synthetic_combined_json_output() + assert "tier0" in doc, "Combined JSON output must include a 'tier0' field" + assert isinstance(doc["tier0"], dict), "'tier0' must be a JSON object" + + +def test_combined_json_output_tier12_required_fields_present(): + """Combined JSON output has all required Tier 1/2 top-level fields.""" + doc = _make_synthetic_combined_json_output() + for field in REQUIRED_TOP_LEVEL: + assert field in doc, f"Combined JSON missing required field: {field!r}" + + +def test_combined_json_output_execution_breakdown_not_null(): + """execution_breakdown is non-null in combined (Tier 0 + Tier 1/2) output.""" + doc = _make_synthetic_combined_json_output() + assert ( + doc["execution_breakdown"] is not None + ), "execution_breakdown must not be null in combined output" + + +def 
test_combined_json_output_validates_against_schema(): + """Combined (Tier 0 + Tier 1/2) JSON output passes jsonschema validation.""" + jsonschema = pytest.importorskip("jsonschema", reason="jsonschema not installed") + schema = _load_schema() + doc = _make_synthetic_combined_json_output() + try: + jsonschema.validate(instance=doc, schema=schema) + except jsonschema.ValidationError as exc: + pytest.fail(f"Combined JSON failed schema validation: {exc.message}") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + # Use --noconftest to avoid loading conftest.py which requires rocprofiler_sdk module + exit_code = pytest.main(["--noconftest", "-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py new file mode 100644 index 00000000000..25e3b2888e9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/rocpd/test_guide_filter_standalone.py @@ -0,0 +1,528 @@ +#!/usr/bin/env python3 +############################################################################### +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +############################################################################### +""" +Standalone unit tests for LLM reference guide context-aware filtering. + +These tests do NOT require a GPU trace database or real LLM credentials. 
+Run with: + ROCPD_SYS=$(python3 -c "import site; print(site.getsitepackages()[-1])") + PYTHONPATH="${ROCPD_SYS}" pytest --noconftest test_guide_filter_standalone.py -v +""" + +import sys + +import pytest + +# --------------------------------------------------------------------------- +# Group A: AnalysisContext defaults and construction (5 tests) +# --------------------------------------------------------------------------- + + +class TestAnalysisContextDefaults: + + def test_default_tier_is_1(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.tier == 1 + + def test_default_has_counters_false(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.has_counters is False + + def test_default_nullable_fields_are_none(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext() + assert ctx.bottleneck_type is None + assert ctx.gpu_arch is None + assert ctx.custom_prompt is None + + def test_explicit_values_preserved(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + ctx = AnalysisContext( + tier=2, + has_counters=True, + bottleneck_type="compute", + gpu_arch="gfx942", + custom_prompt="why is my kernel slow?", + ) + assert ctx.tier == 2 + assert ctx.has_counters is True + assert ctx.bottleneck_type == "compute" + assert ctx.gpu_arch == "gfx942" + assert ctx.custom_prompt == "why is my kernel slow?" 
+ + def test_dataclass_equality(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + a = AnalysisContext(tier=1, has_counters=False) + b = AnalysisContext(tier=1, has_counters=False) + assert a == b + + +# --------------------------------------------------------------------------- +# Group B: _select_tags logic (14 tests) +# --------------------------------------------------------------------------- + + +class TestSelectTags: + + def _tags(self, **kwargs): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext, _select_tags + + return _select_tags(AnalysisContext(**kwargs)) + + def test_tier1_no_counters_gives_always_and_tier1_only(self): + tags = self._tags(tier=1, has_counters=False) + assert tags == {"always", "tier1"} + + def test_tier2_value_adds_tier2_even_without_flag(self): + tags = self._tags(tier=2, has_counters=False) + assert "tier2" in tags + assert "tier1" in tags + + def test_has_counters_true_adds_tier2_regardless_of_tier_field(self): + tags = self._tags(tier=1, has_counters=True) + assert "tier2" in tags + + def test_tier0_gives_always_source_compiler_not_tier1_or_tier2(self): + tags = self._tags(tier=0) + assert "always" in tags + assert "source" in tags + assert "compiler" in tags + assert "tier1" not in tags + assert "tier2" not in tags + + def test_bottleneck_compute_adds_compiler(self): + tags = self._tags(tier=1, bottleneck_type="compute") + assert "compiler" in tags + + def test_bottleneck_memory_adds_compiler(self): + tags = self._tags(tier=1, bottleneck_type="memory") + assert "compiler" in tags + + def test_bottleneck_latency_does_not_add_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="latency") + assert "compiler" not in tags + + def test_bottleneck_mixed_does_not_add_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="mixed") + assert "compiler" not in tags + + def test_custom_prompt_compiler_keyword_adds_compiler(self): + tags = self._tags(tier=1, 
custom_prompt="check compiler flags") + assert "compiler" in tags + + def test_custom_prompt_build_keyword_adds_compiler(self): + tags = self._tags(tier=1, custom_prompt="build options to try") + assert "compiler" in tags + + def test_custom_prompt_memory_keyword_does_not_add_compiler(self): + tags = self._tags(tier=1, custom_prompt="memory bottleneck analysis") + assert "compiler" not in tags + + def test_custom_prompt_none_does_not_add_compiler(self): + tags = self._tags(tier=1, custom_prompt=None) + assert "compiler" not in tags + + def test_full_tier2_compute_bottleneck_has_all_tags(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="compute") + assert tags == {"always", "tier1", "tier2", "compiler"} + + def test_full_tier2_latency_bottleneck_has_no_compiler(self): + tags = self._tags(tier=2, has_counters=True, bottleneck_type="latency") + assert tags == {"always", "tier1", "tier2"} + + +# --------------------------------------------------------------------------- +# Group C: _filter_guide section parsing (12 tests) +# --------------------------------------------------------------------------- + + +class TestFilterGuide: + + def _filter(self, guide, tags): + from rocpd.ai_analysis.llm_analyzer import _filter_guide + + return _filter_guide(guide, tags) + + def _make_guide(self, *sections): + """Build a mini guide string from (title, tag_or_None, content) tuples.""" + parts = ["# LLM Reference Guide\n\nIntro block with no tag.\n"] + for title, tag, content in sections: + tag_line = f"\n" if tag else "" + parts.append(f"## {title}\n{tag_line}{content}\n") + return "\n".join(parts) + + def test_always_tagged_section_included_when_always_in_tags(self): + guide = self._make_guide(("Critical", "always", "critical content")) + result = self._filter(guide, {"always"}) + assert "critical content" in result + + def test_tier2_section_excluded_when_only_tier1_in_tags(self): + guide = self._make_guide( + ("HW Counters", "tier2", "counter content"), + 
("Workflow", "tier1", "workflow content"), + ) + result = self._filter(guide, {"always", "tier1"}) + assert "counter content" not in result + assert "workflow content" in result + + def test_tier2_section_included_when_tier2_in_tags(self): + guide = self._make_guide(("HW Counters", "tier2", "counter content")) + result = self._filter(guide, {"always", "tier1", "tier2"}) + assert "counter content" in result + + def test_section_with_no_tag_always_included(self): + guide = self._make_guide(("Untagged Section", None, "untagged content")) + result = self._filter(guide, {"always"}) + assert "untagged content" in result + + def test_section_with_multiple_tags_included_on_any_match(self): + guide = ( + "# Guide\n\n## Multi\n\nmulti content\n" + ) + result = self._filter(guide, {"always", "tier2"}) + assert "multi content" in result + + def test_empty_guide_returns_empty_string(self): + result = self._filter("", {"always"}) + assert result == "" + + def test_guide_with_zero_tagged_sections_returns_full_content(self): + guide = self._make_guide( + ("Alpha", None, "alpha content"), + ("Beta", None, "beta content"), + ) + result = self._filter(guide, {"always"}) + assert "alpha content" in result + assert "beta content" in result + + def test_tag_comment_with_extra_whitespace_parsed_correctly(self): + guide = ( + "# Guide\n\n## Section\n\nspaced content\n" + ) + result = self._filter(guide, {"tier2"}) + assert "spaced content" in result + + def test_unknown_tag_excludes_section(self): + guide = self._make_guide(("Future", "future_tag", "future content")) + result = self._filter(guide, {"always", "tier1", "tier2"}) + assert "future content" not in result + + def test_tag_comment_on_line2_still_found(self): + guide = ( + "# Guide\n\n## Section\n\n\nline2 tag content\n" + ) + result = self._filter(guide, {"tier1"}) + assert "line2 tag content" in result + + def test_tag_comment_beyond_scan_window_treated_as_no_tag(self): + # Tag comment on line 5 (beyond first-3-line scan) โ†’ 
treated as no tag โ†’ included + guide = ( + "# Guide\n\n## Section\nline1\nline2\nline3\nline4\n" + "\nlate tag content\n" + ) + result = self._filter(guide, {"always"}) + assert "late tag content" in result + + def test_multiple_sections_ordering_preserved(self): + guide = self._make_guide( + ("First", "always", "first content"), + ("Second", "tier2", "second content"), + ("Third", "always", "third content"), + ) + result = self._filter(guide, {"always"}) + assert result.index("first content") < result.index("third content") + assert "second content" not in result + + +# --------------------------------------------------------------------------- +# Group D: _build_system_prompt integration (4 tests) +# --------------------------------------------------------------------------- + + +class TestBuildSystemPrompt: + + def _make_analyzer(self): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from unittest.mock import patch + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always Section\n\nalways content\n\n" + "## Tier2 Section\n\ntier2 content\n\n" + "## Compiler Section\n\ncompiler content\n" + ), + ): + return LLMAnalyzer(provider="anthropic", api_key="fake-key") + + def test_context_none_returns_full_guide(self): + analyzer = self._make_analyzer() + prompt = analyzer._build_system_prompt(context=None) + assert "always content" in prompt + assert "tier2 content" in prompt + assert "compiler content" in prompt + + def test_tier1_context_excludes_tier2_and_compiler(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=1, has_counters=False) + prompt = analyzer._build_system_prompt(context=ctx) + assert "always content" in prompt + assert "tier2 content" not in prompt + assert "compiler content" not in prompt + + def test_tier2_context_includes_tier2_excludes_compiler(self): + from rocpd.ai_analysis.llm_analyzer import 
AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=2, has_counters=True, bottleneck_type="latency") + prompt = analyzer._build_system_prompt(context=ctx) + assert "tier2 content" in prompt + assert "compiler content" not in prompt + + def test_returned_prompt_is_always_non_empty(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer = self._make_analyzer() + ctx = AnalysisContext(tier=1) + prompt = analyzer._build_system_prompt(context=ctx) + assert len(prompt) > 0 + + +# --------------------------------------------------------------------------- +# Group D continued: context propagation through public methods (3 tests) +# --------------------------------------------------------------------------- + + +class TestAnalyzeWithLLMContextParam: + + def _make_analyzer_capturing_prompt(self): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer + from unittest.mock import patch + + captured = {} + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always\n\nalways text\n\n" + "## Tier2\n\ntier2 text\n" + ), + ): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + + def fake_call(system_prompt, user_prompt): + captured["system_prompt"] = system_prompt + return "fake llm response" + + analyzer._call_anthropic = fake_call + return analyzer, captured + + def test_analyze_with_llm_context_filters_guide(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext + + analyzer, captured = self._make_analyzer_capturing_prompt() + ctx = AnalysisContext(tier=1, has_counters=False) + analyzer.analyze_with_llm(analysis_data={}, context=ctx) + assert "tier2 text" not in captured["system_prompt"] + assert "always text" in captured["system_prompt"] + + def test_analyze_with_llm_no_context_uses_full_guide(self): + analyzer, captured = self._make_analyzer_capturing_prompt() + analyzer.analyze_with_llm(analysis_data={}) + assert "tier2 text" in 
captured["system_prompt"] + + def test_analyze_source_with_llm_context_filters_guide(self): + from rocpd.ai_analysis.llm_analyzer import AnalysisContext, LLMAnalyzer + from unittest.mock import patch + from rocpd.ai_analysis.api import SourceAnalysisResult + + captured = {} + + with patch.object( + LLMAnalyzer, + "_load_reference_guide", + return_value=( + "# Guide\n\n## Always\n\nalways text\n\n" + "## Compiler\n\ncompiler text\n" + ), + ): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + + def fake_call(system_prompt, user_prompt): + captured["system_prompt"] = system_prompt + return "fake source response" + + analyzer._call_anthropic = fake_call + + ctx = AnalysisContext(tier=0) # Tier 0 โ†’ compiler tag active + minimal_result = SourceAnalysisResult( + source_dir="/tmp", + analysis_timestamp="2026-01-01T00:00:00", + programming_model="HIP", + files_scanned=0, + files_skipped=0, + detected_kernels=[], + kernel_count=0, + detected_patterns=[], + risk_areas=[], + already_instrumented=False, + roctx_marker_count=0, + recommendations=[], + suggested_counters=[], + suggested_first_command="", + ) + analyzer.analyze_source_with_llm(minimal_result, context=ctx) + assert "compiler text" in captured["system_prompt"] + + +# --------------------------------------------------------------------------- +# Group F: public API export (2 tests) +# --------------------------------------------------------------------------- + + +class TestPublicExport: + + def test_analysis_context_importable_from_package(self): + from rocpd.ai_analysis import AnalysisContext + + ctx = AnalysisContext(tier=2) + assert ctx.tier == 2 + + def test_analysis_context_in_all(self): + import rocpd.ai_analysis as pkg + + assert "AnalysisContext" in pkg.__all__ + + +# --------------------------------------------------------------------------- +# Group E: end-to-end with real guide file (6 tests) +# --------------------------------------------------------------------------- + + +class 
TestEndToEndWithRealGuide: + """ + Load the actual llm-reference-guide.md and verify filtering behaviour. + These tests do NOT call any external LLM API. + """ + + def _build_prompt(self, **ctx_kwargs): + from rocpd.ai_analysis.llm_analyzer import LLMAnalyzer, AnalysisContext + from unittest.mock import patch + + guide = ( + LLMAnalyzer.__module__ + and __import__( + "rocpd.ai_analysis.llm_analyzer", fromlist=["get_reference_guide_path"] + ) + .get_reference_guide_path() + .read_text() + ) + with patch.object(LLMAnalyzer, "_load_reference_guide", return_value=guide): + analyzer = LLMAnalyzer(provider="anthropic", api_key="fake") + ctx = AnalysisContext(**ctx_kwargs) + return analyzer._build_system_prompt(context=ctx) + + def test_tier1_excludes_compiler_section(self): + prompt = self._build_prompt(tier=1, has_counters=False) + assert "Compiler Optimization Flags" not in prompt + + def test_tier2_latency_excludes_compiler_section(self): + prompt = self._build_prompt(tier=2, has_counters=True, bottleneck_type="latency") + assert "Compiler Optimization Flags" not in prompt + + def test_tier0_includes_compiler_section(self): + prompt = self._build_prompt(tier=0) + assert "Compiler Optimization Flags" in prompt + + def test_bottleneck_compute_includes_compiler_section(self): + prompt = self._build_prompt(tier=2, has_counters=True, bottleneck_type="compute") + assert "Compiler Optimization Flags" in prompt + + def test_critical_requirements_always_present(self): + for tier in (0, 1, 2): + prompt = self._build_prompt(tier=tier) + assert "CRITICAL REQUIREMENTS" in prompt, f"Missing in tier {tier}" + + def test_always_tagged_sections_present_in_every_tier(self): + always_markers = [ + "Your Role", + "Output Format Requirements", + "What NOT to Do", + "Summary", + ] + for tier in (0, 1, 2): + prompt = self._build_prompt(tier=tier) + for marker in always_markers: + assert marker in prompt, f"'{marker}' missing for tier {tier}" + + +# 
--------------------------------------------------------------------------- +# Group F: guide file integrity (2 tests) +# --------------------------------------------------------------------------- + + +class TestGuideIntegrity: + """Validate that the real llm-reference-guide.md is correctly tagged.""" + + KNOWN_TAGS = {"always", "tier1", "tier2", "compiler", "source", "tracelens_metrics"} + # The intro block (before the first ## section) is intentionally untagged + UNTAGGED_ALLOWED_PREFIXES = ("LLM Reference Guide",) + + @classmethod + def _sections(cls): + """Return list of (title, tag_or_None) for every ## section.""" + import re + from rocpd.ai_analysis.llm_analyzer import get_reference_guide_path + + text = get_reference_guide_path().read_text() + tag_re = re.compile(r"") + results = [] + for raw in re.split(r"\n(?=## )", text): + if not raw.startswith("## "): + continue + title = raw.splitlines()[0][3:].strip() + head = "\n".join(raw.splitlines()[:3]) + match = tag_re.search(head) + tag = match.group(1).strip() if match else None + results.append((title, tag)) + return results + + def test_every_section_has_a_tag(self): + """No ## section should be accidentally left without a rocpd-context tag.""" + untagged = [ + title + for title, tag in self._sections() + if tag is None + and not any(title.startswith(p) for p in self.UNTAGGED_ALLOWED_PREFIXES) + ] + assert untagged == [], f"Sections missing rocpd-context tag: {untagged}" + + def test_all_tags_are_from_known_vocabulary(self): + """Catch typos in tag names e.g. 'tier_2' instead of 'tier2'.""" + bad = [] + for title, tag in self._sections(): + if tag is None: + continue + for t in (t.strip() for t in tag.split(",")): + if t not in self.KNOWN_TAGS: + bad.append((title, t)) + assert bad == [], f"Unknown tags found: {bad}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"]))