shivasurya · shivasurya · Oct 24, 2025 · Oct 24, 2025
diff --git a/class_memory.csv b/class_memory.csv
@@ -0,0 +1 @@
+timestamp,rss_mb,vsz_mb
diff --git a/perf_tools/.gitignore b/perf_tools/.gitignore
@@ -0,0 +1,31 @@
+# Ignore all benchmark output files
+*.csv
+*.png
+*.log
+
+# Ignore specific benchmark runs
+benchmark.*
+*_benchmark.*
+*_memory.*
+class_*
+function_*
+test_*
+
+# Keep the tool scripts tracked: "benchmark.*" and "*_memory.*" above
+# would otherwise ignore them as well
+!benchmark.sh
+!monitor_memory.sh
+!plot_memory.py
+
+# Allow example files if we add them later
+!example_output.png
+!example_data.csv
+
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+
+# OS specific
+.DS_Store
+Thumbs.db
diff --git a/perf_tools/README.md b/perf_tools/README.md
@@ -0,0 +1,226 @@
+# Performance Profiling Tools for Code-Pathfinder
+
+A collection of tools to measure and visualize memory usage and performance of code-pathfinder queries.
+
+## Quick Start
+
+### 1. Basic Usage (Easiest)
+
+```bash
+cd perf_tools
+./benchmark.sh
+```
+
+This runs the default benchmark: a function definition query against a SaltStack checkout expected at `~/src/shivasurya/salt` (use `-p` to point at a different project).
+
+### 2. Custom Query
+
+```bash
+./benchmark.sh -q "FROM class_definition AS cd SELECT cd" -o class_results
+```
+
+### 3. Different Project
+
+```bash
+./benchmark.sh -p /path/to/your/project -q "FROM function_definition AS fd SELECT fd"
+```
+
+## Command Line Options
+
+```
+Usage: ./benchmark.sh [options]
+
+Options:
+  -p, --project DIR     Project directory to analyze (default: ~/src/shivasurya/salt)
+  -q, --query QUERY     Query to run (default: 'FROM function_definition AS fd SELECT fd')
+  -o, --output NAME     Output file prefix (default: 'benchmark')
+  -b, --binary PATH     Path to pathfinder binary (default: ../sourcecode-parser/build/go/pathfinder)
+  -h, --help            Show this help message
+```
+
+## Examples
+
+### Compare Class vs Function Queries
+
+```bash
+# Run class definition benchmark
+./benchmark.sh -q "FROM class_definition AS cd SELECT cd" -o class_benchmark
+
+# Run function definition benchmark
+./benchmark.sh -q "FROM function_definition AS fd SELECT fd" -o function_benchmark
+
+# Compare the PNG graphs!
+open class_benchmark.png function_benchmark.png
+```
+
+### Test Different Codebases
+
+```bash
+# Test on your own project
+./benchmark.sh -p ~/myproject -o myproject_benchmark
+
+# Test on multiple projects
+for proj in project1 project2 project3; do
+    ./benchmark.sh -p ~/repos/$proj -o ${proj}_benchmark
+done
+```
+
+## Output Files
+
+Each benchmark run creates up to 3 files:
+
+1. **`{name}.csv`** - Raw memory usage data (timestamp, RSS, VSZ)
+2. **`{name}.png`** - Memory usage graph with timeline (only when Python 3 with matplotlib and pandas is available)
+3. **`{name}.log`** - Query execution log
+
+Example:
+```
+benchmark.csv  - Memory data points
+benchmark.png  - Visual graph
+benchmark.log  - Execution log
+```
+
+## Understanding the Results
+
+### Memory Metrics
+
+- **RSS (Resident Set Size)**: Actual physical memory used (most important)
+- **VSZ (Virtual Memory Size)**: Total virtual memory allocated
+
+### Graph Interpretation
+
+```
+Memory Usage Over Time
+│
+│   Peak: 2943.6 MB
+│   Avg: 2813.4 MB
+│
+│ 3000 MB ├─────────────────────── Flat line (good!)
+│         │       ╱────────────────
+│ 2000 MB │      ╱
+│         │     ╱  Parsing phase
+│ 1000 MB │    ╱
+│         │   ╱
+│    0 MB └──────────────────────►
+          0s  20s  40s  60s  80s
+```
+
+**Good patterns:**
+- ✅ Rapid rise then flat = efficient memory use
+- ✅ Stable plateau = no memory leaks
+
+**Bad patterns:**
+- ❌ Continuous rise = possible memory leak
+- ❌ Spikes during query = inefficient allocations
+
+## Requirements
+
+### Required
+- Bash shell
+- Built pathfinder binary (run `cd ../sourcecode-parser && gradle buildGo`)
+
+### Optional
+- Python 3 with matplotlib and pandas for graph generation
+  ```bash
+  pip3 install matplotlib pandas
+  ```
+
+## Manual Mode (Advanced)
+
+If you want more control, use the individual scripts:
+
+### 1. Run Query with Monitoring
+
+```bash
+# Start the query in the background and capture its PID
+../sourcecode-parser/build/go/pathfinder query --project ~/salt --query "..." &
+PID=$!
+
+# Monitor memory from the same shell ($PID is not set in other terminals)
+./fast_monitor.sh $PID memory_data.csv
+```
+
+### 2. Generate Graph
+
+```bash
+python3 plot_memory.py memory_data.csv
+# Creates: memory_data.png
+```
+
+## Scripts Overview
+
+| Script | Purpose |
+|--------|---------|
+| `benchmark.sh` | **Main tool** - Easy-to-use wrapper |
+| `fast_monitor.sh` | Monitors process memory (100ms sampling) |
+| `monitor_memory.sh` | Slower monitoring (500ms sampling) |
+| `plot_memory.py` | Generates memory usage graphs |
+
+## Comparing Optimizations
+
+To measure the impact of performance optimizations:
+
+```bash
+# Before optimization
+git checkout main
+cd sourcecode-parser && gradle clean buildGo && cd ../perf_tools
+./benchmark.sh -o before_optimization
+
+# After optimization
+git checkout feature-branch
+cd sourcecode-parser && gradle clean buildGo && cd ../perf_tools
+./benchmark.sh -o after_optimization
+
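+# Compare results (peak RSS = largest value in the rss_mb column of each CSV)
+echo "Before: $(awk -F, 'NR>1 && $2+0>max {max=$2+0} END {print max}' before_optimization.csv) MB"
+echo "After:  $(awk -F, 'NR>1 && $2+0>max {max=$2+0} END {print max}' after_optimization.csv) MB"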
+```
+
+## Troubleshooting
+
+### "Pathfinder binary not found"
+
+Build the binary first:
+```bash
+cd ../sourcecode-parser
+gradle clean buildGo
+cd ../perf_tools
+```
+
+### "Python3 not found"
+
+The CSV data is still generated. You can:
+1. Install Python: `brew install python3`
+2. Use the CSV data with your own tools
+3. Run without graphs (CSV has all the data)
+
+### "Project directory not found"
+
+Specify the correct path:
+```bash
+./benchmark.sh -p /absolute/path/to/your/project
+```
+
+## Tips
+
+1. **Run multiple times**: Results can vary due to system load. Run 3 times and compare.
+
+2. **Close other apps**: For accurate results, close memory-heavy applications.
+
+3. **Use full paths**: When in doubt, use absolute paths for `-p` and `-b` options.
+
+4. **Compare similar queries**: Compare "class vs class" or "function vs function" for fair comparisons.
+
+## Contributing
+
+Found a bug or want to improve these tools? The scripts are simple bash/Python:
+
+- `benchmark.sh` - Main orchestration script
+- `fast_monitor.sh` - Memory sampling loop
+- `plot_memory.py` - matplotlib graphing
+
+Feel free to modify and improve!
+
+## License
+
+Same as code-pathfinder project (AGPL-3.0).
diff --git a/perf_tools/fast_monitor.sh b/perf_tools/fast_monitor.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+PID=$1
+OUTPUT=${2:-memory_usage.csv}
+
+echo "timestamp,rss_mb,vsz_mb" > $OUTPUT
+
+while kill -0 $PID 2>/dev/null; do
+    TIMESTAMP=$(date +%s.%N)
+    MEM=$(ps -p $PID -o rss=,vsz= 2>/dev/null | awk '{print $1/1024","$2/1024}')
+    if [ ! -z "$MEM" ]; then
+        echo "$TIMESTAMP,$MEM" >> $OUTPUT
+    fi
+    sleep 0.1  # Sample every 100ms instead of 500ms
+done
+
+echo "Memory monitoring complete. Data saved to $OUTPUT"
diff --git a/sourcecode-parser/cmd/query.go b/sourcecode-parser/cmd/query.go
@@ -139,7 +139,7 @@ func processQuery(input string, codeGraph *graph.CodeGraph, output string) (stri
 				result := make(map[string]interface{})
 				result["file"] = entityObject.File
 				result["line"] = entityObject.LineNumber
-				result["code"] = entityObject.CodeSnippet
+				result["code"] = entityObject.GetCodeSnippet()
 
 				results["result_set"] = append(results["result_set"].([]map[string]interface{}), result) //nolint:all
 			}
@@ -167,7 +167,7 @@ func processQuery(input string, codeGraph *graph.CodeGraph, output string) (stri
 			header += output + "\n"
 			result += header
 			result += "\n"
-			codeSnippetArray := strings.Split(entityObject.CodeSnippet, "\n")
+			codeSnippetArray := strings.Split(entityObject.GetCodeSnippet(), "\n")
 			for i := 0; i < len(codeSnippetArray); i++ {
 				lineNumber := color.New(color.FgCyan).SprintfFunc()("%4d", int(entityObject.LineNumber)+i)
 				result += fmt.Sprintf("%s%s %s %s\n", strings.Repeat("\t", 2), lineNumber, verticalLine, yellowCode(codeSnippetArray[i]))

diff --git a/sourcecode-parser/graph/parser_java.go b/sourcecode-parser/graph/parser_java.go
@@ -61,7 +61,7 @@ func parseJavaBinaryExpression(node *sitter.Node, sourceCode []byte, graph *Code
 		ID:               GenerateSha256(exprType + node.Content(sourceCode)),
 		Type:             exprType,
 		Name:             node.Content(sourceCode),
-		CodeSnippet:      node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:       node.StartPoint().Row + 1,
 		File:             file,
 		isJavaSourceFile: isJavaSourceFile,
@@ -73,7 +73,7 @@ func parseJavaBinaryExpression(node *sitter.Node, sourceCode []byte, graph *Code
 		ID:               GenerateSha256("binary_expression" + node.Content(sourceCode)),
 		Type:             "binary_expression",
 		Name:             node.Content(sourceCode),
-		CodeSnippet:      node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:       node.StartPoint().Row + 1,
 		File:             file,
 		isJavaSourceFile: isJavaSourceFile,
@@ -138,7 +138,7 @@ func parseJavaMethodDeclaration(node *sitter.Node, sourceCode []byte, graph *Cod
 		ID:                   methodID,
 		Type:                 "method_declaration",
 		Name:                 methodName,
-		CodeSnippet:          node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:           node.StartPoint().Row + 1,
 		Modifier:             extractVisibilityModifier(modifiers),
 		ReturnType:           returnType,
@@ -183,7 +183,7 @@ func parseJavaMethodInvocation(node *sitter.Node, sourceCode []byte, graph *Code
 		Type:                 "method_invocation",
 		Name:                 methodName,
 		IsExternal:           true,
-		CodeSnippet:          node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:           node.StartPoint().Row + 1,
 		MethodArgumentsValue: arguments,
 		File:                 file,
@@ -241,7 +241,7 @@ func parseJavaClassDeclaration(node *sitter.Node, sourceCode []byte, graph *Code
 		ID:               GenerateMethodID("class:"+className, []string{}, file),
 		Type:             "class_declaration",
 		Name:             className,
-		CodeSnippet:      node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:       node.StartPoint().Row + 1,
 		PackageName:      packageName,
 		Modifier:         extractVisibilityModifier(accessModifier),
@@ -264,7 +264,7 @@ func parseJavaBlockComment(node *sitter.Node, sourceCode []byte, graph *CodeGrap
 		commentNode := &Node{
 			ID:               GenerateMethodID(node.Content(sourceCode), []string{}, file),
 			Type:             "block_comment",
-			CodeSnippet:      commentContent,
+			SourceLocation:   &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 			LineNumber:       node.StartPoint().Row + 1,
 			File:             file,
 			isJavaSourceFile: true,
@@ -316,7 +316,7 @@ func parseJavaVariableDeclaration(node *sitter.Node, sourceCode []byte, graph *C
 		ID:               GenerateMethodID(variableName, []string{}, file),
 		Type:             "variable_declaration",
 		Name:             variableName,
-		CodeSnippet:      node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:       node.StartPoint().Row + 1,
 		Modifier:         extractVisibilityModifier(variableModifier),
 		DataType:         variableType,
@@ -364,7 +364,7 @@ func parseJavaObjectCreation(node *sitter.Node, sourceCode []byte, graph *CodeGr
 		ID:                GenerateMethodID(className, []string{strconv.Itoa(int(node.StartPoint().Row + 1))}, file),
 		Type:              "ClassInstanceExpr",
 		Name:              className,
-		CodeSnippet:       node.Content(sourceCode),
+		SourceLocation: &SourceLocation{File: file, StartByte: node.StartByte(), EndByte: node.EndByte()},
 		LineNumber:        node.StartPoint().Row + 1,
 		File:              file,
 		isJavaSourceFile:  true,